Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kbuild | 2
-rw-r--r--  arch/x86/Kconfig | 43
-rw-r--r--  arch/x86/Makefile | 3
-rw-r--r--  arch/x86/boot/compressed/cmdline.c | 4
-rw-r--r--  arch/x86/boot/compressed/early_serial_console.c | 4
-rw-r--r--  arch/x86/boot/compressed/eboot.c | 87
-rw-r--r--  arch/x86/boot/compressed/eboot.h | 6
-rw-r--r--  arch/x86/boot/compressed/misc.c | 31
-rw-r--r--  arch/x86/boot/compressed/misc.h | 27
-rw-r--r--  arch/x86/boot/header.S | 42
-rw-r--r--  arch/x86/boot/tools/build.c | 172
-rw-r--r--  arch/x86/crypto/Makefile | 14
-rw-r--r--  arch/x86/crypto/ablk_helper.c | 149
-rw-r--r--  arch/x86/crypto/aes_glue.c | 2
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S | 6
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 110
-rw-r--r--  arch/x86/crypto/camellia_glue.c | 355
-rw-r--r--  arch/x86/crypto/glue_helper.c | 307
-rw-r--r--  arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 704
-rw-r--r--  arch/x86/crypto/serpent_avx_glue.c | 636
-rw-r--r--  arch/x86/crypto/serpent_sse2_glue.c | 513
-rw-r--r--  arch/x86/crypto/sha1_ssse3_asm.S | 2
-rw-r--r--  arch/x86/crypto/sha1_ssse3_glue.c | 6
-rw-r--r--  arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 300
-rw-r--r--  arch/x86/crypto/twofish_avx_glue.c | 624
-rw-r--r--  arch/x86/crypto/twofish_glue_3way.c | 409
-rw-r--r--  arch/x86/ia32/ia32_signal.c | 4
-rw-r--r--  arch/x86/include/asm/acpi.h | 9
-rw-r--r--  arch/x86/include/asm/alternative.h | 74
-rw-r--r--  arch/x86/include/asm/amd_nb.h | 21
-rw-r--r--  arch/x86/include/asm/apic.h | 64
-rw-r--r--  arch/x86/include/asm/bitops.h | 9
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 4
-rw-r--r--  arch/x86/include/asm/crypto/ablk_helper.h | 31
-rw-r--r--  arch/x86/include/asm/crypto/aes.h (renamed from arch/x86/include/asm/aes.h) | 0
-rw-r--r--  arch/x86/include/asm/crypto/glue_helper.h | 115
-rw-r--r--  arch/x86/include/asm/crypto/serpent-avx.h | 32
-rw-r--r--  arch/x86/include/asm/crypto/serpent-sse2.h (renamed from arch/x86/include/asm/serpent.h) | 4
-rw-r--r--  arch/x86/include/asm/crypto/twofish.h | 46
-rw-r--r--  arch/x86/include/asm/dma-contiguous.h | 13
-rw-r--r--  arch/x86/include/asm/dma-mapping.h | 5
-rw-r--r--  arch/x86/include/asm/emergency-restart.h | 2
-rw-r--r--  arch/x86/include/asm/floppy.h | 2
-rw-r--r--  arch/x86/include/asm/ftrace.h | 2
-rw-r--r--  arch/x86/include/asm/gpio.h | 57
-rw-r--r--  arch/x86/include/asm/hypervisor.h | 1
-rw-r--r--  arch/x86/include/asm/iommu.h | 1
-rw-r--r--  arch/x86/include/asm/kvm.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 10
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 48
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 31
-rw-r--r--  arch/x86/include/asm/msr.h | 46
-rw-r--r--  arch/x86/include/asm/nmi.h | 6
-rw-r--r--  arch/x86/include/asm/paravirt.h | 41
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 2
-rw-r--r--  arch/x86/include/asm/pci_x86.h | 15
-rw-r--r--  arch/x86/include/asm/perf_event.h | 22
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 4
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 60
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 8
-rw-r--r--  arch/x86/include/asm/posix_types_32.h | 3
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 2
-rw-r--r--  arch/x86/include/asm/processor.h | 7
-rw-r--r--  arch/x86/include/asm/pvclock-abi.h | 1
-rw-r--r--  arch/x86/include/asm/realmode.h | 63
-rw-r--r--  arch/x86/include/asm/reboot.h | 4
-rw-r--r--  arch/x86/include/asm/sighandling.h | 2
-rw-r--r--  arch/x86/include/asm/smp.h | 5
-rw-r--r--  arch/x86/include/asm/sta2x11.h | 12
-rw-r--r--  arch/x86/include/asm/thread_info.h | 20
-rw-r--r--  arch/x86/include/asm/trampoline.h | 39
-rw-r--r--  arch/x86/include/asm/uaccess.h | 16
-rw-r--r--  arch/x86/include/asm/uaccess_32.h | 17
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 14
-rw-r--r--  arch/x86/include/asm/uprobes.h | 57
-rw-r--r--  arch/x86/include/asm/uv/uv_bau.h | 29
-rw-r--r--  arch/x86/include/asm/vga.h | 6
-rw-r--r--  arch/x86/include/asm/vmx.h | 6
-rw-r--r--  arch/x86/include/asm/word-at-a-time.h | 34
-rw-r--r--  arch/x86/include/asm/x2apic.h | 18
-rw-r--r--  arch/x86/include/asm/x86_init.h | 2
-rw-r--r--  arch/x86/include/asm/xen/events.h | 1
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h | 8
-rw-r--r--  arch/x86/include/asm/xen/page.h | 1
-rw-r--r--  arch/x86/kernel/Makefile | 3
-rw-r--r--  arch/x86/kernel/acpi/Makefile | 9
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 27
-rw-r--r--  arch/x86/kernel/acpi/realmode/.gitignore | 3
-rw-r--r--  arch/x86/kernel/acpi/realmode/Makefile | 59
-rw-r--r--  arch/x86/kernel/acpi/realmode/bioscall.S | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/copy.S | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/regs.c | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/video-bios.c | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/video-mode.c | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/video-vesa.c | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/video-vga.c | 1
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.lds.S | 62
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 33
-rw-r--r--  arch/x86/kernel/acpi/sleep.h | 2
-rw-r--r--  arch/x86/kernel/acpi/wakeup_rm.S | 12
-rw-r--r--  arch/x86/kernel/alternative.c | 19
-rw-r--r--  arch/x86/kernel/amd_nb.c | 11
-rw-r--r--  arch/x86/kernel/aperture_64.c | 6
-rw-r--r--  arch/x86/kernel/apic/apic.c | 36
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 76
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 9
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 50
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 48
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 51
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 347
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 30
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 23
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 11
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 68
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 82
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 39
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 45
-rw-r--r--  arch/x86/kernel/apm_32.c | 29
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 6
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 39
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 20
-rw-r--r--  arch/x86/kernel/cpu/common.c | 10
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 26
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 93
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 286
-rw-r--r--  arch/x86/kernel/cpu/mkcapflags.pl | 25
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 8
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 6
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 122
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h | 28
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 103
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 279
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 21
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.c | 1850
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.h | 424
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 16
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 4
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 2
-rw-r--r--  arch/x86/kernel/cpu/sched.c | 55
-rw-r--r--  arch/x86/kernel/dumpstack.c | 5
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 25
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 21
-rw-r--r--  arch/x86/kernel/e820.c | 53
-rw-r--r--  arch/x86/kernel/early_printk.c | 12
-rw-r--r--  arch/x86/kernel/entry_32.S | 13
-rw-r--r--  arch/x86/kernel/entry_64.S | 64
-rw-r--r--  arch/x86/kernel/ftrace.c | 102
-rw-r--r--  arch/x86/kernel/head32.c | 1
-rw-r--r--  arch/x86/kernel/head64.c | 1
-rw-r--r--  arch/x86/kernel/head_32.S | 5
-rw-r--r--  arch/x86/kernel/head_64.S | 4
-rw-r--r--  arch/x86/kernel/hpet.c | 66
-rw-r--r--  arch/x86/kernel/irq.c | 4
-rw-r--r--  arch/x86/kernel/kgdb.c | 8
-rw-r--r--  arch/x86/kernel/kvm.c | 64
-rw-r--r--  arch/x86/kernel/kvmclock.c | 15
-rw-r--r--  arch/x86/kernel/microcode_core.c | 66
-rw-r--r--  arch/x86/kernel/module.c | 34
-rw-r--r--  arch/x86/kernel/mpparse.c | 11
-rw-r--r--  arch/x86/kernel/nmi.c | 53
-rw-r--r--  arch/x86/kernel/nmi_selftest.c | 5
-rw-r--r--  arch/x86/kernel/paravirt.c | 2
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 34
-rw-r--r--  arch/x86/kernel/pci-dma.c | 28
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 8
-rw-r--r--  arch/x86/kernel/process.c | 34
-rw-r--r--  arch/x86/kernel/process_64.c | 12
-rw-r--r--  arch/x86/kernel/ptrace.c | 6
-rw-r--r--  arch/x86/kernel/quirks.c | 2
-rw-r--r--  arch/x86/kernel/reboot.c | 107
-rw-r--r--  arch/x86/kernel/setup.c | 24
-rw-r--r--  arch/x86/kernel/signal.c | 73
-rw-r--r--  arch/x86/kernel/smpboot.c | 150
-rw-r--r--  arch/x86/kernel/tboot.c | 7
-rw-r--r--  arch/x86/kernel/trampoline.c | 42
-rw-r--r--  arch/x86/kernel/trampoline_32.S | 83
-rw-r--r--  arch/x86/kernel/traps.c | 27
-rw-r--r--  arch/x86/kernel/tsc.c | 50
-rw-r--r--  arch/x86/kernel/uprobes.c | 675
-rw-r--r--  arch/x86/kernel/vm86_32.c | 6
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 12
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 44
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 56
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 1
-rw-r--r--  arch/x86/kernel/xsave.c | 12
-rw-r--r--  arch/x86/kvm/Kconfig | 1
-rw-r--r--  arch/x86/kvm/cpuid.c | 49
-rw-r--r--  arch/x86/kvm/cpuid.h | 9
-rw-r--r--  arch/x86/kvm/emulate.c | 546
-rw-r--r--  arch/x86/kvm/i8254.c | 31
-rw-r--r--  arch/x86/kvm/i8254.h | 7
-rw-r--r--  arch/x86/kvm/i8259.c | 17
-rw-r--r--  arch/x86/kvm/lapic.c | 225
-rw-r--r--  arch/x86/kvm/lapic.h | 11
-rw-r--r--  arch/x86/kvm/mmu.c | 658
-rw-r--r--  arch/x86/kvm/mmu_audit.c | 10
-rw-r--r--  arch/x86/kvm/mmutrace.h | 45
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 5
-rw-r--r--  arch/x86/kvm/pmu.c | 22
-rw-r--r--  arch/x86/kvm/svm.c | 21
-rw-r--r--  arch/x86/kvm/trace.h | 46
-rw-r--r--  arch/x86/kvm/vmx.c | 230
-rw-r--r--  arch/x86/kvm/x86.c | 399
-rw-r--r--  arch/x86/kvm/x86.h | 2
-rw-r--r--  arch/x86/lib/csum-wrappers_64.c | 2
-rw-r--r--  arch/x86/lib/msr-reg-export.c | 4
-rw-r--r--  arch/x86/lib/msr-reg.S | 10
-rw-r--r--  arch/x86/lib/usercopy.c | 101
-rw-r--r--  arch/x86/lib/usercopy_32.c | 41
-rw-r--r--  arch/x86/lib/usercopy_64.c | 48
-rw-r--r--  arch/x86/lib/x86-opcode-map.txt | 8
-rw-r--r--  arch/x86/mm/init.c | 21
-rw-r--r--  arch/x86/mm/ioremap.c | 4
-rw-r--r--  arch/x86/mm/numa.c | 32
-rw-r--r--  arch/x86/mm/numa_emulation.c | 4
-rw-r--r--  arch/x86/mm/pageattr.c | 2
-rw-r--r--  arch/x86/mm/pat.c | 98
-rw-r--r--  arch/x86/mm/srat.c | 7
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 4
-rw-r--r--  arch/x86/oprofile/op_model_amd.c | 4
-rw-r--r--  arch/x86/pci/acpi.c | 109
-rw-r--r--  arch/x86/pci/amd_bus.c | 7
-rw-r--r--  arch/x86/pci/bus_numa.c | 22
-rw-r--r--  arch/x86/pci/bus_numa.h | 3
-rw-r--r--  arch/x86/pci/common.c | 2
-rw-r--r--  arch/x86/pci/fixup.c | 3
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 372
-rw-r--r--  arch/x86/pci/mmconfig_32.c | 30
-rw-r--r--  arch/x86/pci/mmconfig_64.c | 52
-rw-r--r--  arch/x86/pci/mrst.c | 2
-rw-r--r--  arch/x86/pci/xen.c | 4
-rw-r--r--  arch/x86/platform/mrst/early_printk_mrst.c | 13
-rw-r--r--  arch/x86/platform/mrst/mrst.c | 2
-rw-r--r--  arch/x86/platform/olpc/olpc-xo15-sci.c | 6
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c | 454
-rw-r--r--  arch/x86/platform/uv/uv_irq.c | 7
-rw-r--r--  arch/x86/realmode/Makefile | 18
-rw-r--r--  arch/x86/realmode/init.c | 115
-rw-r--r--  arch/x86/realmode/rm/.gitignore | 3
-rw-r--r--  arch/x86/realmode/rm/Makefile | 82
-rw-r--r--  arch/x86/realmode/rm/bioscall.S | 1
-rw-r--r--  arch/x86/realmode/rm/copy.S | 1
-rw-r--r--  arch/x86/realmode/rm/header.S | 43
-rw-r--r--  arch/x86/realmode/rm/realmode.h | 21
-rw-r--r--  arch/x86/realmode/rm/realmode.lds.S | 76
-rw-r--r--  arch/x86/realmode/rm/reboot.S (renamed from arch/x86/kernel/reboot_32.S) | 113
-rw-r--r--  arch/x86/realmode/rm/regs.c | 1
-rw-r--r--  arch/x86/realmode/rm/stack.S | 19
-rw-r--r--  arch/x86/realmode/rm/trampoline_32.S | 74
-rw-r--r--  arch/x86/realmode/rm/trampoline_64.S (renamed from arch/x86/kernel/trampoline_64.S) | 148
-rw-r--r--  arch/x86/realmode/rm/trampoline_common.S | 7
-rw-r--r--  arch/x86/realmode/rm/video-bios.c | 1
-rw-r--r--  arch/x86/realmode/rm/video-mode.c | 1
-rw-r--r--  arch/x86/realmode/rm/video-vesa.c | 1
-rw-r--r--  arch/x86/realmode/rm/video-vga.c | 1
-rw-r--r--  arch/x86/realmode/rm/wakemain.c (renamed from arch/x86/kernel/acpi/realmode/wakemain.c) | 3
-rw-r--r--  arch/x86/realmode/rm/wakeup.h (renamed from arch/x86/kernel/acpi/realmode/wakeup.h) | 10
-rw-r--r--  arch/x86/realmode/rm/wakeup_asm.S (renamed from arch/x86/kernel/acpi/realmode/wakeup.S) | 131
-rw-r--r--  arch/x86/realmode/rmpiggy.S | 20
-rw-r--r--  arch/x86/syscalls/syscall_32.tbl | 1
-rw-r--r--  arch/x86/syscalls/syscall_64.tbl | 2
-rw-r--r--  arch/x86/tools/gen-insn-attr-x86.awk | 14
-rw-r--r--  arch/x86/tools/relocs.c | 19
-rw-r--r--  arch/x86/um/signal.c | 2
-rw-r--r--  arch/x86/um/sys_call_table_32.c | 4
-rw-r--r--  arch/x86/vdso/vdso32-setup.c | 6
-rw-r--r--  arch/x86/video/fbdev.c | 20
-rw-r--r--  arch/x86/xen/debugfs.c | 104
-rw-r--r--  arch/x86/xen/debugfs.h | 4
-rw-r--r--  arch/x86/xen/enlighten.c | 246
-rw-r--r--  arch/x86/xen/mmu.c | 62
-rw-r--r--  arch/x86/xen/p2m.c | 140
-rw-r--r--  arch/x86/xen/setup.c | 165
-rw-r--r--  arch/x86/xen/smp.c | 114
-rw-r--r--  arch/x86/xen/smp.h | 12
-rw-r--r--  arch/x86/xen/spinlock.c | 12
-rw-r--r--  arch/x86/xen/suspend.c | 2
-rw-r--r--  arch/x86/xen/xen-ops.h | 3
280 files changed, 12797 insertions, 5305 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 0e9dec6cadd..e5287d8517a 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -1,4 +1,3 @@
-
 obj-$(CONFIG_KVM) += kvm/
 
 # Xen paravirtualization support
@@ -7,6 +6,7 @@ obj-$(CONFIG_XEN) += xen/
 # lguest paravirtualization support
 obj-$(CONFIG_LGUEST_GUEST) += lguest/
 
+obj-y += realmode/
 obj-y += kernel/
 obj-y += mm/
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d6168994e11..c70684f859e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -32,6 +32,7 @@ config X86
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select ARCH_WANT_FRAME_POINTERS
 	select HAVE_DMA_ATTRS
+	select HAVE_DMA_CONTIGUOUS if !SWIOTLB
 	select HAVE_KRETPROBES
 	select HAVE_OPTPROBES
 	select HAVE_FTRACE_MCOUNT_RECORD
@@ -85,9 +86,18 @@ config X86
 	select GENERIC_SMP_IDLE_THREAD
 	select HAVE_ARCH_SECCOMP_FILTER
 	select BUILDTIME_EXTABLE_SORT
+	select GENERIC_CMOS_UPDATE
+	select CLOCKSOURCE_WATCHDOG
+	select GENERIC_CLOCKEVENTS
+	select ARCH_CLOCKSOURCE_DATA if X86_64
+	select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
+	select GENERIC_TIME_VSYSCALL if X86_64
+	select KTIME_SCALAR if X86_32
+	select GENERIC_STRNCPY_FROM_USER
+	select GENERIC_STRNLEN_USER
 
 config INSTRUCTION_DECODER
-	def_bool (KPROBES || PERF_EVENTS)
+	def_bool (KPROBES || PERF_EVENTS || UPROBES)
 
 config OUTPUT_FORMAT
 	string
@@ -99,23 +109,6 @@ config ARCH_DEFCONFIG
 	default "arch/x86/configs/i386_defconfig" if X86_32
 	default "arch/x86/configs/x86_64_defconfig" if X86_64
 
-config GENERIC_CMOS_UPDATE
-	def_bool y
-
-config CLOCKSOURCE_WATCHDOG
-	def_bool y
-
-config GENERIC_CLOCKEVENTS
-	def_bool y
-
-config ARCH_CLOCKSOURCE_DATA
-	def_bool y
-	depends on X86_64
-
-config GENERIC_CLOCKEVENTS_BROADCAST
-	def_bool y
-	depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
-
 config LOCKDEP_SUPPORT
 	def_bool y
 
@@ -166,10 +159,6 @@ config RWSEM_XCHGADD_ALGORITHM
 config GENERIC_CALIBRATE_DELAY
 	def_bool y
 
-config GENERIC_TIME_VSYSCALL
-	bool
-	default X86_64
-
 config ARCH_HAS_CPU_RELAX
 	def_bool y
 
@@ -236,13 +225,13 @@ config ARCH_HWEIGHT_CFLAGS
 	default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
 	default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
 
-config KTIME_SCALAR
-	def_bool X86_32
-
 config ARCH_CPU_PROBE_RELEASE
 	def_bool y
 	depends on HOTPLUG_CPU
 
+config ARCH_SUPPORTS_UPROBES
+	def_bool y
+
 source "init/Kconfig"
 source "kernel/Kconfig.freezer"
 
@@ -258,8 +247,6 @@ config ZONE_DMA
 
 	  If unsure, say Y.
 
-source "kernel/time/Kconfig"
-
 config SMP
 	bool "Symmetric multi-processing support"
 	---help---
@@ -1519,6 +1506,8 @@ config EFI_STUB
 	  This kernel feature allows a bzImage to be loaded directly
 	  by EFI firmware without the use of a bootloader.
 
+	  See Documentation/x86/efi-stub.txt for more information.
+
 config SECCOMP
 	def_bool y
 	prompt "Enable seccomp to safely compute untrusted bytecode"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1f252143455..b0c5276861e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -49,6 +49,9 @@ else
49 KBUILD_AFLAGS += -m64 49 KBUILD_AFLAGS += -m64
50 KBUILD_CFLAGS += -m64 50 KBUILD_CFLAGS += -m64
51 51
52 # Use -mpreferred-stack-boundary=3 if supported.
53 KBUILD_CFLAGS += $(call cc-option,-mno-sse -mpreferred-stack-boundary=3)
54
52 # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) 55 # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
53 cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) 56 cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
54 cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) 57 cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index cb62f786990..10f6b1178c6 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,5 +1,7 @@
1#include "misc.h" 1#include "misc.h"
2 2
3#ifdef CONFIG_EARLY_PRINTK
4
3static unsigned long fs; 5static unsigned long fs;
4static inline void set_fs(unsigned long seg) 6static inline void set_fs(unsigned long seg)
5{ 7{
@@ -19,3 +21,5 @@ int cmdline_find_option_bool(const char *option)
19{ 21{
20 return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option); 22 return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
21} 23}
24
25#endif
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
index 261e81fb958..d3d003cb548 100644
--- a/arch/x86/boot/compressed/early_serial_console.c
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -1,5 +1,9 @@
1#include "misc.h" 1#include "misc.h"
2 2
3#ifdef CONFIG_EARLY_PRINTK
4
3int early_serial_base; 5int early_serial_base;
4 6
5#include "../early_serial_console.c" 7#include "../early_serial_console.c"
8
9#endif
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 2c14e76bb4c..4e85f5f8583 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -16,6 +16,26 @@
 
 static efi_system_table_t *sys_table;
 
+static void efi_printk(char *str)
+{
+	char *s8;
+
+	for (s8 = str; *s8; s8++) {
+		struct efi_simple_text_output_protocol *out;
+		efi_char16_t ch[2] = { 0 };
+
+		ch[0] = *s8;
+		out = (struct efi_simple_text_output_protocol *)sys_table->con_out;
+
+		if (*s8 == '\n') {
+			efi_char16_t nl[2] = { '\r', 0 };
+			efi_call_phys2(out->output_string, out, nl);
+		}
+
+		efi_call_phys2(out->output_string, out, ch);
+	}
+}
+
 static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size,
 			      unsigned long *desc_size)
 {
@@ -531,8 +551,10 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
 				   EFI_LOADER_DATA,
 				   nr_initrds * sizeof(*initrds),
 				   &initrds);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc mem for initrds\n");
 		goto fail;
+	}
 
 	str = (char *)(unsigned long)hdr->cmd_line_ptr;
 	for (i = 0; i < nr_initrds; i++) {
@@ -575,32 +597,42 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
 
 		status = efi_call_phys3(boottime->handle_protocol,
 					image->device_handle, &fs_proto, &io);
-		if (status != EFI_SUCCESS)
+		if (status != EFI_SUCCESS) {
+			efi_printk("Failed to handle fs_proto\n");
 			goto free_initrds;
+		}
 
 		status = efi_call_phys2(io->open_volume, io, &fh);
-		if (status != EFI_SUCCESS)
+		if (status != EFI_SUCCESS) {
+			efi_printk("Failed to open volume\n");
 			goto free_initrds;
+		}
 	}
 
 	status = efi_call_phys5(fh->open, fh, &h, filename_16,
 				EFI_FILE_MODE_READ, (u64)0);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to open initrd file\n");
 		goto close_handles;
+	}
 
 	initrd->handle = h;
 
 	info_sz = 0;
 	status = efi_call_phys4(h->get_info, h, &info_guid,
 				&info_sz, NULL);
-	if (status != EFI_BUFFER_TOO_SMALL)
+	if (status != EFI_BUFFER_TOO_SMALL) {
+		efi_printk("Failed to get initrd info size\n");
 		goto close_handles;
+	}
 
 grow:
 	status = efi_call_phys3(sys_table->boottime->allocate_pool,
 				EFI_LOADER_DATA, info_sz, &info);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc mem for initrd info\n");
 		goto close_handles;
+	}
 
 	status = efi_call_phys4(h->get_info, h, &info_guid,
 				&info_sz, info);
@@ -612,8 +644,10 @@ grow:
 	file_sz = info->file_size;
 	efi_call_phys1(sys_table->boottime->free_pool, info);
 
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to get initrd info\n");
 		goto close_handles;
+	}
 
 	initrd->size = file_sz;
 	initrd_total += file_sz;
@@ -629,11 +663,14 @@ grow:
 	 */
 	status = high_alloc(initrd_total, 0x1000,
 			    &initrd_addr, hdr->initrd_addr_max);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc highmem for initrds\n");
 		goto close_handles;
+	}
 
 	/* We've run out of free low memory. */
 	if (initrd_addr > hdr->initrd_addr_max) {
+		efi_printk("We've run out of free low memory\n");
 		status = EFI_INVALID_PARAMETER;
 		goto free_initrd_total;
 	}
@@ -652,8 +689,10 @@ grow:
 			status = efi_call_phys3(fh->read,
 						initrds[j].handle,
 						&chunksize, addr);
-			if (status != EFI_SUCCESS)
+			if (status != EFI_SUCCESS) {
+				efi_printk("Failed to read initrd\n");
 				goto free_initrd_total;
+			}
 			addr += chunksize;
 			size -= chunksize;
 		}
@@ -674,7 +713,7 @@ free_initrd_total:
 	low_free(initrd_total, initrd_addr);
 
 close_handles:
-	for (k = j; k < nr_initrds; k++)
+	for (k = j; k < i; k++)
 		efi_call_phys1(fh->close, initrds[k].handle);
 free_initrds:
 	efi_call_phys1(sys_table->boottime->free_pool, initrds);
@@ -732,8 +771,10 @@ static efi_status_t make_boot_params(struct boot_params *boot_params,
 	options_size++;	/* NUL termination */
 
 	status = low_alloc(options_size, 1, &cmdline);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc mem for cmdline\n");
 		goto fail;
+	}
 
 	s1 = (u8 *)(unsigned long)cmdline;
 	s2 = (u16 *)options;
@@ -895,12 +936,16 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
 
 	status = efi_call_phys3(sys_table->boottime->handle_protocol,
 				handle, &proto, (void *)&image);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
 		goto fail;
+	}
 
 	status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc lowmem for boot params\n");
 		goto fail;
+	}
 
 	memset(boot_params, 0x0, 0x4000);
 
@@ -933,8 +978,10 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
 	if (status != EFI_SUCCESS) {
 		status = low_alloc(hdr->init_size, hdr->kernel_alignment,
 				   &start);
-		if (status != EFI_SUCCESS)
+		if (status != EFI_SUCCESS) {
+			efi_printk("Failed to alloc mem for kernel\n");
 			goto fail;
+		}
 	}
 
 	hdr->code32_start = (__u32)start;
@@ -945,19 +992,25 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
 	status = efi_call_phys3(sys_table->boottime->allocate_pool,
 				EFI_LOADER_DATA, sizeof(*gdt),
 				(void **)&gdt);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc mem for gdt structure\n");
 		goto fail;
+	}
 
 	gdt->size = 0x800;
 	status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc mem for gdt\n");
 		goto fail;
+	}
 
 	status = efi_call_phys3(sys_table->boottime->allocate_pool,
 				EFI_LOADER_DATA, sizeof(*idt),
 				(void **)&idt);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc mem for idt structure\n");
 		goto fail;
+	}
 
 	idt->size = 0;
 	idt->address = 0;
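
A note on efi_printk() above: UEFI's text output protocol takes UCS-2 strings, so the helper emits one character at a time, widening each ASCII byte and expanding '\n' to '\r\n' for the console. The same conversion in isolation (a standalone sketch; ucs2_out() is a hypothetical stand-in for the firmware's output_string call):

#include <stdint.h>

typedef uint16_t efi_char16_t;

/* Hypothetical sink standing in for out->output_string. */
extern void ucs2_out(const efi_char16_t *s);

/* Emit an ASCII string as UCS-2, expanding '\n' to '\r\n'. */
static void ascii_to_ucs2_console(const char *str)
{
	const char *s8;

	for (s8 = str; *s8; s8++) {
		efi_char16_t ch[2] = { 0 };	/* one char + NUL */

		ch[0] = (unsigned char)*s8;

		if (*s8 == '\n') {
			efi_char16_t nl[2] = { '\r', 0 };
			ucs2_out(nl);	/* CR first, then the LF below */
		}

		ucs2_out(ch);
	}
}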
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index 39251663e65..3b6e15627c5 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -58,4 +58,10 @@ struct efi_uga_draw_protocol {
 	void *blt;
 };
 
+struct efi_simple_text_output_protocol {
+	void *reset;
+	void *output_string;
+	void *test_string;
+};
+
 #endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 7116dcba0c9..88f7ff6da40 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -108,8 +108,6 @@ static void error(char *m);
  * This is set up by the setup-routine at boot-time
  */
 struct boot_params *real_mode;		/* Pointer to real-mode data */
-static int quiet;
-static int debug;
 
 void *memset(void *s, int c, size_t n);
 void *memcpy(void *dest, const void *src, size_t n);
@@ -170,15 +168,11 @@ static void serial_putchar(int ch)
 	outb(ch, early_serial_base + TXR);
 }
 
-void __putstr(int error, const char *s)
+void __putstr(const char *s)
 {
 	int x, y, pos;
 	char c;
 
-#ifndef CONFIG_X86_VERBOSE_BOOTUP
-	if (!error)
-		return;
-#endif
 	if (early_serial_base) {
 		const char *str = s;
 		while (*str) {
@@ -265,9 +259,9 @@ void *memcpy(void *dest, const void *src, size_t n)
 
 static void error(char *x)
 {
-	__putstr(1, "\n\n");
-	__putstr(1, x);
-	__putstr(1, "\n\n -- System halted");
+	error_putstr("\n\n");
+	error_putstr(x);
+	error_putstr("\n\n -- System halted");
 
 	while (1)
 		asm("hlt");
@@ -294,8 +288,7 @@ static void parse_elf(void *output)
 		return;
 	}
 
-	if (!quiet)
-		putstr("Parsing ELF... ");
+	debug_putstr("Parsing ELF... ");
 
 	phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);
 	if (!phdrs)
@@ -332,11 +325,6 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 {
 	real_mode = rmode;
 
-	if (cmdline_find_option_bool("quiet"))
-		quiet = 1;
-	if (cmdline_find_option_bool("debug"))
-		debug = 1;
-
 	if (real_mode->screen_info.orig_video_mode == 7) {
 		vidmem = (char *) 0xb0000;
 		vidport = 0x3b4;
@@ -349,8 +337,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 	cols = real_mode->screen_info.orig_video_cols;
 
 	console_init();
-	if (debug)
-		putstr("early console in decompress_kernel\n");
+	debug_putstr("early console in decompress_kernel\n");
 
 	free_mem_ptr     = heap;	/* Heap */
 	free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
@@ -369,11 +356,9 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 		error("Wrong destination address");
 #endif
 
-	if (!quiet)
-		putstr("\nDecompressing Linux... ");
+	debug_putstr("\nDecompressing Linux... ");
 	decompress(input_data, input_len, NULL, NULL, output, NULL, error);
 	parse_elf(output);
-	if (!quiet)
-		putstr("done.\nBooting the kernel.\n");
+	debug_putstr("done.\nBooting the kernel.\n");
 	return;
 }
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 3f19c81a620..0e6dc0ee0ee 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -24,9 +24,21 @@
 
 /* misc.c */
 extern struct boot_params *real_mode;		/* Pointer to real-mode data */
-void __putstr(int error, const char *s);
-#define putstr(__x)  __putstr(0, __x)
-#define puts(__x)  __putstr(0, __x)
+void __putstr(const char *s);
+#define error_putstr(__x)  __putstr(__x)
+
+#ifdef CONFIG_X86_VERBOSE_BOOTUP
+
+#define debug_putstr(__x)  __putstr(__x)
+
+#else
+
+static inline void debug_putstr(const char *s)
+{ }
+
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
 
 /* cmdline.c */
 int cmdline_find_option(const char *option, char *buffer, int bufsize);
@@ -36,4 +48,13 @@ int cmdline_find_option_bool(const char *option);
 extern int early_serial_base;
 void console_init(void);
 
+#else
+
+/* early_serial_console.c */
+static const int early_serial_base;
+static inline void console_init(void)
+{ }
+
+#endif
+
 #endif
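
The !CONFIG_EARLY_PRINTK stubs above lean on constant propagation: `static const int early_serial_base;` is a constant zero, so every `if (early_serial_base)` test in misc.c folds to `if (0)` and the serial-output paths disappear at compile time, with no #ifdef needed at the call sites. A minimal illustration of the pattern (FEATURE and the other names are hypothetical):

#ifdef FEATURE
extern int feature_base;		/* probed and set at runtime */
#else
static const int feature_base;		/* constant 0: guarded code is dropped */
#endif

/* Hypothetical I/O routine used by the guarded path. */
extern void do_io(int ch, int base);

static inline void feature_io(int ch)
{
	if (feature_base)		/* folds to "if (0)" when FEATURE is off */
		do_io(ch, feature_base);
}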
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index fde5bde3b60..9b9c6475b36 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -94,10 +94,10 @@ bs_die:
 
 	.section ".bsdata", "a"
 bugger_off_msg:
-	.ascii	"Direct booting from floppy is no longer supported.\r\n"
-	.ascii	"Please use a boot loader program instead.\r\n"
+	.ascii	"Direct floppy boot is not supported. "
+	.ascii	"Use a boot loader program instead.\r\n"
 	.ascii	"\n"
-	.ascii	"Remove disk and press any key to reboot . . .\r\n"
+	.ascii	"Remove disk and press any key to reboot ...\r\n"
 	.byte	0
 
 #ifdef CONFIG_EFI_STUB
@@ -111,7 +111,7 @@ coff_header:
 #else
 	.word	0x8664				# x86-64
 #endif
-	.word	2				# nr_sections
+	.word	3				# nr_sections
 	.long	0 				# TimeDateStamp
 	.long	0				# PointerToSymbolTable
 	.long	1				# NumberOfSymbols
@@ -158,8 +158,8 @@ extra_header_fields:
 #else
 	.quad	0				# ImageBase
 #endif
-	.long	0x1000				# SectionAlignment
-	.long	0x200				# FileAlignment
+	.long	0x20				# SectionAlignment
+	.long	0x20				# FileAlignment
 	.word	0				# MajorOperatingSystemVersion
 	.word	0				# MinorOperatingSystemVersion
 	.word	0				# MajorImageVersion
@@ -200,8 +200,10 @@ extra_header_fields:
 
 	# Section table
 section_table:
-	.ascii	".text"
-	.byte	0
+	#
+	# The offset & size fields are filled in by build.c.
+	#
+	.ascii	".setup"
 	.byte	0
 	.byte	0
 	.long	0
@@ -217,9 +219,8 @@ section_table:
 
 	#
 	# The EFI application loader requires a relocation section
-	# because EFI applications must be relocatable. But since
-	# we don't need the loader to fixup any relocs for us, we
-	# just create an empty (zero-length) .reloc section header.
+	# because EFI applications must be relocatable. The .reloc
+	# offset & size fields are filled in by build.c.
 	#
 	.ascii	".reloc"
 	.byte	0
@@ -233,6 +234,25 @@ section_table:
 	.word	0				# NumberOfRelocations
 	.word	0				# NumberOfLineNumbers
 	.long	0x42100040			# Characteristics (section flags)
+
+	#
+	# The offset & size fields are filled in by build.c.
+	#
+	.ascii	".text"
+	.byte	0
+	.byte	0
+	.byte	0
+	.long	0
+	.long	0x0				# startup_{32,64}
+	.long	0				# Size of initialized data
+						# on disk
+	.long	0x0				# startup_{32,64}
+	.long	0				# PointerToRelocations
+	.long	0				# PointerToLineNumbers
+	.word	0				# NumberOfRelocations
+	.word	0				# NumberOfLineNumbers
+	.long	0x60500020			# Characteristics (section flags)
+
 #endif /* CONFIG_EFI_STUB */
 
 	# Kernel attributes; used by setup. This is part 1 of the
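
For reference, each entry emitted above is one 40-byte (0x28) COFF section header; the layout below is a sketch following the PE/COFF specification (the struct and field names are illustrative), and the 0x08/0x0c/0x10/0x14 offsets are exactly the fields build.c patches per section:

#include <stdint.h>

/* One PE/COFF section table entry (sketch; offsets per the PE spec). */
struct pe_section_header {
	char	 name[8];		  /* 0x00: ".setup", ".reloc", ".text" */
	uint32_t virtual_size;		  /* 0x08: patched by build.c */
	uint32_t virtual_address;	  /* 0x0c: patched by build.c */
	uint32_t size_of_raw_data;	  /* 0x10: patched by build.c */
	uint32_t pointer_to_raw_data;	  /* 0x14: patched by build.c */
	uint32_t pointer_to_relocations;  /* 0x18 */
	uint32_t pointer_to_line_numbers; /* 0x1c */
	uint16_t number_of_relocations;	  /* 0x20 */
	uint16_t number_of_line_numbers;  /* 0x22 */
	uint32_t characteristics;	  /* 0x24: section flags */
};	/* sizeof == 0x28, matching the "section += 0x28" walk in build.c */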
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 3f61f6e2b46..4b8e165ee57 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -50,6 +50,8 @@ typedef unsigned int u32;
 u8 buf[SETUP_SECT_MAX*512];
 int is_big_kernel;
 
+#define PECOFF_RELOC_RESERVE 0x20
+
 /*----------------------------------------------------------------------*/
 
 static const u32 crctab32[] = {
@@ -133,11 +135,103 @@ static void usage(void)
 	die("Usage: build setup system [> image]");
 }
 
-int main(int argc, char ** argv)
-{
 #ifdef CONFIG_EFI_STUB
-	unsigned int file_sz, pe_header;
+
+static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+{
+	unsigned int pe_header;
+	unsigned short num_sections;
+	u8 *section;
+
+	pe_header = get_unaligned_le32(&buf[0x3c]);
+	num_sections = get_unaligned_le16(&buf[pe_header + 6]);
+
+#ifdef CONFIG_X86_32
+	section = &buf[pe_header + 0xa8];
+#else
+	section = &buf[pe_header + 0xb8];
 #endif
+
+	while (num_sections > 0) {
+		if (strncmp((char*)section, section_name, 8) == 0) {
+			/* section header size field */
+			put_unaligned_le32(size, section + 0x8);
+
+			/* section header vma field */
+			put_unaligned_le32(offset, section + 0xc);
+
+			/* section header 'size of initialised data' field */
+			put_unaligned_le32(size, section + 0x10);
+
+			/* section header 'file offset' field */
+			put_unaligned_le32(offset, section + 0x14);
+
+			break;
+		}
+		section += 0x28;
+		num_sections--;
+	}
+}
+
+static void update_pecoff_setup_and_reloc(unsigned int size)
+{
+	u32 setup_offset = 0x200;
+	u32 reloc_offset = size - PECOFF_RELOC_RESERVE;
+	u32 setup_size = reloc_offset - setup_offset;
+
+	update_pecoff_section_header(".setup", setup_offset, setup_size);
+	update_pecoff_section_header(".reloc", reloc_offset, PECOFF_RELOC_RESERVE);
+
+	/*
+	 * Modify .reloc section contents with a single entry. The
+	 * relocation is applied to offset 10 of the relocation section.
+	 */
+	put_unaligned_le32(reloc_offset + 10, &buf[reloc_offset]);
+	put_unaligned_le32(10, &buf[reloc_offset + 4]);
+}
+
+static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
+{
+	unsigned int pe_header;
+	unsigned int text_sz = file_sz - text_start;
+
+	pe_header = get_unaligned_le32(&buf[0x3c]);
+
+	/* Size of image */
+	put_unaligned_le32(file_sz, &buf[pe_header + 0x50]);
+
+	/*
+	 * Size of code: Subtract the size of the first sector (512 bytes)
+	 * which includes the header.
+	 */
+	put_unaligned_le32(file_sz - 512, &buf[pe_header + 0x1c]);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Address of entry point.
+	 *
+	 * The EFI stub entry point is +16 bytes from the start of
+	 * the .text section.
+	 */
+	put_unaligned_le32(text_start + 16, &buf[pe_header + 0x28]);
+#else
+	/*
+	 * Address of entry point. startup_32 is at the beginning and
+	 * the 64-bit entry point (startup_64) is always 512 bytes
+	 * after. The EFI stub entry point is 16 bytes after that, as
+	 * the first instruction allows legacy loaders to jump over
+	 * the EFI stub initialisation
+	 */
+	put_unaligned_le32(text_start + 528, &buf[pe_header + 0x28]);
+#endif /* CONFIG_X86_32 */
+
+	update_pecoff_section_header(".text", text_start, text_sz);
+}
+
+#endif /* CONFIG_EFI_STUB */
+
+int main(int argc, char ** argv)
+{
 	unsigned int i, sz, setup_sectors;
 	int c;
 	u32 sys_size;
@@ -163,6 +257,12 @@ int main(int argc, char ** argv)
 		die("Boot block hasn't got boot flag (0xAA55)");
 	fclose(file);
 
+#ifdef CONFIG_EFI_STUB
+	/* Reserve 0x20 bytes for .reloc section */
+	memset(buf+c, 0, PECOFF_RELOC_RESERVE);
+	c += PECOFF_RELOC_RESERVE;
+#endif
+
 	/* Pad unused space with zeros */
 	setup_sectors = (c + 511) / 512;
 	if (setup_sectors < SETUP_SECT_MIN)
@@ -170,6 +270,10 @@ int main(int argc, char ** argv)
 	i = setup_sectors*512;
 	memset(buf+c, 0, i-c);
 
+#ifdef CONFIG_EFI_STUB
+	update_pecoff_setup_and_reloc(i);
+#endif
+
 	/* Set the default root device */
 	put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]);
 
@@ -194,66 +298,8 @@ int main(int argc, char ** argv)
 	put_unaligned_le32(sys_size, &buf[0x1f4]);
 
 #ifdef CONFIG_EFI_STUB
-	file_sz = sz + i + ((sys_size * 16) - sz);
-
-	pe_header = get_unaligned_le32(&buf[0x3c]);
-
-	/* Size of image */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0x50]);
-
-	/*
-	 * Subtract the size of the first section (512 bytes) which
-	 * includes the header and .reloc section. The remaining size
-	 * is that of the .text section.
-	 */
-	file_sz -= 512;
-
-	/* Size of code */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0x1c]);
-
-#ifdef CONFIG_X86_32
-	/*
-	 * Address of entry point.
-	 *
-	 * The EFI stub entry point is +16 bytes from the start of
-	 * the .text section.
-	 */
-	put_unaligned_le32(i + 16, &buf[pe_header + 0x28]);
-
-	/* .text size */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0xb0]);
-
-	/* .text vma */
-	put_unaligned_le32(0x200, &buf[pe_header + 0xb4]);
-
-	/* .text size of initialised data */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0xb8]);
-
-	/* .text file offset */
-	put_unaligned_le32(0x200, &buf[pe_header + 0xbc]);
-#else
-	/*
-	 * Address of entry point. startup_32 is at the beginning and
-	 * the 64-bit entry point (startup_64) is always 512 bytes
-	 * after. The EFI stub entry point is 16 bytes after that, as
-	 * the first instruction allows legacy loaders to jump over
-	 * the EFI stub initialisation
-	 */
-	put_unaligned_le32(i + 528, &buf[pe_header + 0x28]);
-
-	/* .text size */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0xc0]);
-
-	/* .text vma */
-	put_unaligned_le32(0x200, &buf[pe_header + 0xc4]);
-
-	/* .text size of initialised data */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0xc8]);
-
-	/* .text file offset */
-	put_unaligned_le32(0x200, &buf[pe_header + 0xcc]);
-#endif /* CONFIG_X86_32 */
-#endif /* CONFIG_EFI_STUB */
+	update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz));
+#endif
 
 	crc = partial_crc32(buf, i, crc);
 	if (fwrite(buf, 1, i, stdout) != i)
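
The entry-point arithmetic in update_pecoff_text() deserves a worked example. Assuming a setup of 4 sectors (an illustrative value), .text starts at text_start = 4 * 512 = 0x800; the 32-bit EFI stub entry is then 0x800 + 16 = 0x810, and the 64-bit one 0x800 + 528 = 0xa10, since startup_64 sits 512 bytes into .text and the stub entry 16 bytes after it:

#include <stdio.h>

int main(void)
{
	unsigned int setup_sectors = 4;			/* illustrative */
	unsigned int text_start = setup_sectors * 512;	/* 0x800 */

	/* 32-bit: EFI stub entry is 16 bytes into .text. */
	printf("entry32 = %#x\n", text_start + 16);	/* 0x810 */

	/* 64-bit: startup_64 is 512 bytes into .text and the stub
	 * entry 16 bytes after that, hence build.c's "+ 528". */
	printf("entry64 = %#x\n", text_start + 528);	/* 0xa10 */
	return 0;
}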
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e191ac048b5..e908e5de82d 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,9 @@
 # Arch-specific CryptoAPI modules.
 #
 
+obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
+obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
+
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
@@ -12,8 +15,10 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
+obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
+obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
@@ -30,16 +35,11 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
+twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
+serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
-
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
-
-# enable AVX support only when $(AS) can actually assemble the instructions
-ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes)
-AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT
-CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT
-endif
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
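
With the build-time assembler probe for AVX gone, the decision moves to runtime: the AVX glue modules can test CPU features during module init and refuse to register otherwise. A sketch of that kind of probe in userspace terms (the bit positions follow the CPUID/XCR0 conventions; the function itself is illustrative, not the kernel's helper):

#include <cpuid.h>
#include <stdbool.h>

/* True when AVX is usable: CPU support + OS-enabled XSAVE + YMM state. */
static bool avx_usable(void)
{
	unsigned int eax, ebx, ecx, edx, lo, hi;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;
	if (!(ecx & (1u << 28)) || !(ecx & (1u << 27)))
		return false;			/* no AVX or no OSXSAVE */

	/* xgetbv(0): XCR0 bits 1 (SSE) and 2 (YMM) must both be set. */
	__asm__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0));
	return (lo & 6) == 6;
}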
diff --git a/arch/x86/crypto/ablk_helper.c b/arch/x86/crypto/ablk_helper.c
new file mode 100644
index 00000000000..43282fe04a8
--- /dev/null
+++ b/arch/x86/crypto/ablk_helper.c
@@ -0,0 +1,149 @@
+/*
+ * Shared async block cipher helpers
+ *
+ * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on aesni-intel_glue.c by:
+ *  Copyright (C) 2008, Intel Corp.
+ *    Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <crypto/algapi.h>
+#include <crypto/cryptd.h>
+#include <asm/i387.h>
+#include <asm/crypto/ablk_helper.h>
+
+int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
+		 unsigned int key_len)
+{
+	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+	int err;
+
+	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
+				    & CRYPTO_TFM_REQ_MASK);
+	err = crypto_ablkcipher_setkey(child, key, key_len);
+	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
+				    & CRYPTO_TFM_RES_MASK);
+	return err;
+}
+EXPORT_SYMBOL_GPL(ablk_set_key);
+
+int __ablk_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct blkcipher_desc desc;
+
+	desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+	desc.info = req->info;
+	desc.flags = 0;
+
+	return crypto_blkcipher_crt(desc.tfm)->encrypt(
+		&desc, req->dst, req->src, req->nbytes);
+}
+EXPORT_SYMBOL_GPL(__ablk_encrypt);
+
+int ablk_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct ablkcipher_request *cryptd_req =
+			ablkcipher_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+		return crypto_ablkcipher_encrypt(cryptd_req);
+	} else {
+		return __ablk_encrypt(req);
+	}
+}
+EXPORT_SYMBOL_GPL(ablk_encrypt);
+
+int ablk_decrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct ablkcipher_request *cryptd_req =
+			ablkcipher_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+		return crypto_ablkcipher_decrypt(cryptd_req);
+	} else {
+		struct blkcipher_desc desc;
+
+		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+		desc.info = req->info;
+		desc.flags = 0;
+
+		return crypto_blkcipher_crt(desc.tfm)->decrypt(
+			&desc, req->dst, req->src, req->nbytes);
+	}
+}
+EXPORT_SYMBOL_GPL(ablk_decrypt);
+
+void ablk_exit(struct crypto_tfm *tfm)
+{
+	struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cryptd_free_ablkcipher(ctx->cryptd_tfm);
+}
+EXPORT_SYMBOL_GPL(ablk_exit);
+
+int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
+{
+	struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+
+	ctx->cryptd_tfm = cryptd_tfm;
+	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
+		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ablk_init_common);
+
+int ablk_init(struct crypto_tfm *tfm)
+{
+	char drv_name[CRYPTO_MAX_ALG_NAME];
+
+	snprintf(drv_name, sizeof(drv_name), "__driver-%s",
+		 crypto_tfm_alg_driver_name(tfm));
+
+	return ablk_init_common(tfm, drv_name);
+}
+EXPORT_SYMBOL_GPL(ablk_init);
+
+MODULE_LICENSE("GPL");
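
With the helper in place, a SIMD cipher module only needs to point its async algorithm at these functions; the sketch below is modelled on how the SSE2/AVX glue code consumes the API (the "example" names and key sizes are illustrative):

static int example_ablk_init(struct crypto_tfm *tfm)
{
	/* Bind to the internal (synchronous) driver via cryptd. */
	return ablk_init_common(tfm, "__driver-ecb-example-cipher");
}

static struct crypto_alg example_async_alg = {
	.cra_name		= "ecb(example)",
	.cra_driver_name	= "ecb-example-async",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize		= 16,
	.cra_ctxsize		= sizeof(struct async_helper_ctx),
	.cra_type		= &crypto_ablkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_init		= example_ablk_init,
	.cra_exit		= ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize	= 16,
			.max_keysize	= 32,
			.setkey		= ablk_set_key,
			.encrypt	= ablk_encrypt,
			.decrypt	= ablk_decrypt,
		},
	},
};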
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
index 8efcf42a9d7..59b37deb8c8 100644
--- a/arch/x86/crypto/aes_glue.c
+++ b/arch/x86/crypto/aes_glue.c
@@ -5,7 +5,7 @@
 
 #include <linux/module.h>
 #include <crypto/aes.h>
-#include <asm/aes.h>
+#include <asm/crypto/aes.h>
 
 asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
 asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index be6d9e365a8..3470624d783 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -2460,10 +2460,12 @@ ENTRY(aesni_cbc_dec)
 	pxor IN3, STATE4
 	movaps IN4, IV
 #else
-	pxor (INP), STATE2
-	pxor 0x10(INP), STATE3
 	pxor IN1, STATE4
 	movaps IN2, IV
+	movups (INP), IN1
+	pxor IN1, STATE2
+	movups 0x10(INP), IN2
+	pxor IN2, STATE3
 #endif
 	movups STATE1, (OUTP)
 	movups STATE2, 0x10(OUTP)
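
The change above is an alignment fix: a legacy-SSE instruction like pxor with a memory operand requires 16-byte alignment and faults otherwise, while movups performs an unaligned load; going through a register first makes unaligned input pointers safe. The same distinction in intrinsics form (illustrative userspace code, not the kernel asm):

#include <emmintrin.h>

static __m128i xor_block(__m128i state, const void *p)
{
	/* _mm_load_si128 (movdqa) -- like a pxor memory operand -- would
	 * fault on an unaligned p; _mm_loadu_si128 (movdqu/movups) won't. */
	__m128i in = _mm_loadu_si128((const __m128i *)p);

	return _mm_xor_si128(state, in);
}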
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index ac7f5cd019e..34fdcff4d2c 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -30,7 +30,8 @@
30#include <crypto/ctr.h> 30#include <crypto/ctr.h>
31#include <asm/cpu_device_id.h> 31#include <asm/cpu_device_id.h>
32#include <asm/i387.h> 32#include <asm/i387.h>
33#include <asm/aes.h> 33#include <asm/crypto/aes.h>
34#include <asm/crypto/ablk_helper.h>
34#include <crypto/scatterwalk.h> 35#include <crypto/scatterwalk.h>
35#include <crypto/internal/aead.h> 36#include <crypto/internal/aead.h>
36#include <linux/workqueue.h> 37#include <linux/workqueue.h>
@@ -52,10 +53,6 @@
52#define HAS_XTS 53#define HAS_XTS
53#endif 54#endif
54 55
55struct async_aes_ctx {
56 struct cryptd_ablkcipher *cryptd_tfm;
57};
58
59/* This data is stored at the end of the crypto_tfm struct. 56/* This data is stored at the end of the crypto_tfm struct.
60 * It's a type of per "session" data storage location. 57 * It's a type of per "session" data storage location.
61 * This needs to be 16 byte aligned. 58 * This needs to be 16 byte aligned.
@@ -377,87 +374,6 @@ static int ctr_crypt(struct blkcipher_desc *desc,
377} 374}
378#endif 375#endif
379 376
380static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
381 unsigned int key_len)
382{
383 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
384 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
385 int err;
386
387 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
388 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
389 & CRYPTO_TFM_REQ_MASK);
390 err = crypto_ablkcipher_setkey(child, key, key_len);
391 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
392 & CRYPTO_TFM_RES_MASK);
393 return err;
394}
395
396static int ablk_encrypt(struct ablkcipher_request *req)
397{
398 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
399 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
400
401 if (!irq_fpu_usable()) {
402 struct ablkcipher_request *cryptd_req =
403 ablkcipher_request_ctx(req);
404 memcpy(cryptd_req, req, sizeof(*req));
405 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
406 return crypto_ablkcipher_encrypt(cryptd_req);
407 } else {
408 struct blkcipher_desc desc;
409 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
410 desc.info = req->info;
411 desc.flags = 0;
412 return crypto_blkcipher_crt(desc.tfm)->encrypt(
413 &desc, req->dst, req->src, req->nbytes);
414 }
415}
416
417static int ablk_decrypt(struct ablkcipher_request *req)
418{
419 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
420 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
421
422 if (!irq_fpu_usable()) {
423 struct ablkcipher_request *cryptd_req =
424 ablkcipher_request_ctx(req);
425 memcpy(cryptd_req, req, sizeof(*req));
426 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
427 return crypto_ablkcipher_decrypt(cryptd_req);
428 } else {
429 struct blkcipher_desc desc;
430 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
431 desc.info = req->info;
432 desc.flags = 0;
433 return crypto_blkcipher_crt(desc.tfm)->decrypt(
434 &desc, req->dst, req->src, req->nbytes);
435 }
436}
437
438static void ablk_exit(struct crypto_tfm *tfm)
439{
440 struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
441
442 cryptd_free_ablkcipher(ctx->cryptd_tfm);
443}
444
445static int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
446{
447 struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
448 struct cryptd_ablkcipher *cryptd_tfm;
449
450 cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
451 if (IS_ERR(cryptd_tfm))
452 return PTR_ERR(cryptd_tfm);
453
454 ctx->cryptd_tfm = cryptd_tfm;
455 tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
456 crypto_ablkcipher_reqsize(&cryptd_tfm->base);
457
458 return 0;
459}
460
461static int ablk_ecb_init(struct crypto_tfm *tfm) 377static int ablk_ecb_init(struct crypto_tfm *tfm)
462{ 378{
463 return ablk_init_common(tfm, "__driver-ecb-aes-aesni"); 379 return ablk_init_common(tfm, "__driver-ecb-aes-aesni");
@@ -613,7 +529,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
613 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); 529 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
614 struct aesni_rfc4106_gcm_ctx *child_ctx = 530 struct aesni_rfc4106_gcm_ctx *child_ctx =
615 aesni_rfc4106_gcm_ctx_get(cryptd_child); 531 aesni_rfc4106_gcm_ctx_get(cryptd_child);
616 u8 *new_key_mem = NULL; 532 u8 *new_key_align, *new_key_mem = NULL;
617 533
618 if (key_len < 4) { 534 if (key_len < 4) {
619 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); 535 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
@@ -637,9 +553,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
637 if (!new_key_mem) 553 if (!new_key_mem)
638 return -ENOMEM; 554 return -ENOMEM;
639 555
640 new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN); 556 new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
641 memcpy(new_key_mem, key, key_len); 557 memcpy(new_key_align, key, key_len);
642 key = new_key_mem; 558 key = new_key_align;
643 } 559 }
644 560
645 if (!irq_fpu_usable()) 561 if (!irq_fpu_usable())
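The hunk above also carries an allocation fix in rfc4106_set_key(): the old code overwrote new_key_mem, the only copy of the kmalloc() pointer, with its PTR_ALIGN()ed value, so the kfree() at the end of the function could be handed a pointer shifted past the start of the allocation. A minimal sketch of the corrected pattern (hypothetical helper name; 16 stands in for AESNI_ALIGN):

	#include <linux/errno.h>
	#include <linux/kernel.h>
	#include <linux/slab.h>
	#include <linux/string.h>
	#include <linux/types.h>

	static int copy_key_aligned(const u8 *key, unsigned int key_len)
	{
		u8 *mem, *aligned;

		/* over-allocate so an aligned view always fits */
		mem = kmalloc(key_len + 16 - 1, GFP_KERNEL);
		if (!mem)
			return -ENOMEM;

		aligned = PTR_ALIGN(mem, 16);	/* may point past mem */
		memcpy(aligned, key, key_len);

		/* ... hand "aligned" to code that needs the alignment ... */

		kfree(mem);	/* always free the pointer kmalloc() returned */
		return 0;
	}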
@@ -968,7 +884,7 @@ static struct crypto_alg aesni_algs[] = { {
968 .cra_priority = 400, 884 .cra_priority = 400,
969 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 885 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
970 .cra_blocksize = AES_BLOCK_SIZE, 886 .cra_blocksize = AES_BLOCK_SIZE,
971 .cra_ctxsize = sizeof(struct async_aes_ctx), 887 .cra_ctxsize = sizeof(struct async_helper_ctx),
972 .cra_alignmask = 0, 888 .cra_alignmask = 0,
973 .cra_type = &crypto_ablkcipher_type, 889 .cra_type = &crypto_ablkcipher_type,
974 .cra_module = THIS_MODULE, 890 .cra_module = THIS_MODULE,
@@ -989,7 +905,7 @@ static struct crypto_alg aesni_algs[] = { {
989 .cra_priority = 400, 905 .cra_priority = 400,
990 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 906 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
991 .cra_blocksize = AES_BLOCK_SIZE, 907 .cra_blocksize = AES_BLOCK_SIZE,
992 .cra_ctxsize = sizeof(struct async_aes_ctx), 908 .cra_ctxsize = sizeof(struct async_helper_ctx),
993 .cra_alignmask = 0, 909 .cra_alignmask = 0,
994 .cra_type = &crypto_ablkcipher_type, 910 .cra_type = &crypto_ablkcipher_type,
995 .cra_module = THIS_MODULE, 911 .cra_module = THIS_MODULE,
@@ -1033,7 +949,7 @@ static struct crypto_alg aesni_algs[] = { {
1033 .cra_priority = 400, 949 .cra_priority = 400,
1034 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 950 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1035 .cra_blocksize = 1, 951 .cra_blocksize = 1,
1036 .cra_ctxsize = sizeof(struct async_aes_ctx), 952 .cra_ctxsize = sizeof(struct async_helper_ctx),
1037 .cra_alignmask = 0, 953 .cra_alignmask = 0,
1038 .cra_type = &crypto_ablkcipher_type, 954 .cra_type = &crypto_ablkcipher_type,
1039 .cra_module = THIS_MODULE, 955 .cra_module = THIS_MODULE,
@@ -1098,7 +1014,7 @@ static struct crypto_alg aesni_algs[] = { {
1098 .cra_priority = 400, 1014 .cra_priority = 400,
1099 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1015 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1100 .cra_blocksize = 1, 1016 .cra_blocksize = 1,
1101 .cra_ctxsize = sizeof(struct async_aes_ctx), 1017 .cra_ctxsize = sizeof(struct async_helper_ctx),
1102 .cra_alignmask = 0, 1018 .cra_alignmask = 0,
1103 .cra_type = &crypto_ablkcipher_type, 1019 .cra_type = &crypto_ablkcipher_type,
1104 .cra_module = THIS_MODULE, 1020 .cra_module = THIS_MODULE,
@@ -1126,7 +1042,7 @@ static struct crypto_alg aesni_algs[] = { {
1126 .cra_priority = 400, 1042 .cra_priority = 400,
1127 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1043 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1128 .cra_blocksize = AES_BLOCK_SIZE, 1044 .cra_blocksize = AES_BLOCK_SIZE,
1129 .cra_ctxsize = sizeof(struct async_aes_ctx), 1045 .cra_ctxsize = sizeof(struct async_helper_ctx),
1130 .cra_alignmask = 0, 1046 .cra_alignmask = 0,
1131 .cra_type = &crypto_ablkcipher_type, 1047 .cra_type = &crypto_ablkcipher_type,
1132 .cra_module = THIS_MODULE, 1048 .cra_module = THIS_MODULE,
@@ -1150,7 +1066,7 @@ static struct crypto_alg aesni_algs[] = { {
1150 .cra_priority = 400, 1066 .cra_priority = 400,
1151 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1067 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1152 .cra_blocksize = AES_BLOCK_SIZE, 1068 .cra_blocksize = AES_BLOCK_SIZE,
1153 .cra_ctxsize = sizeof(struct async_aes_ctx), 1069 .cra_ctxsize = sizeof(struct async_helper_ctx),
1154 .cra_alignmask = 0, 1070 .cra_alignmask = 0,
1155 .cra_type = &crypto_ablkcipher_type, 1071 .cra_type = &crypto_ablkcipher_type,
1156 .cra_module = THIS_MODULE, 1072 .cra_module = THIS_MODULE,
@@ -1174,7 +1090,7 @@ static struct crypto_alg aesni_algs[] = { {
1174 .cra_priority = 400, 1090 .cra_priority = 400,
1175 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1091 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1176 .cra_blocksize = AES_BLOCK_SIZE, 1092 .cra_blocksize = AES_BLOCK_SIZE,
1177 .cra_ctxsize = sizeof(struct async_aes_ctx), 1093 .cra_ctxsize = sizeof(struct async_helper_ctx),
1178 .cra_alignmask = 0, 1094 .cra_alignmask = 0,
1179 .cra_type = &crypto_ablkcipher_type, 1095 .cra_type = &crypto_ablkcipher_type,
1180 .cra_module = THIS_MODULE, 1096 .cra_module = THIS_MODULE,
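With that, the last per-cipher copy of the cryptd boilerplate is gone: the ablk_set_key/ablk_encrypt/ablk_decrypt/ablk_init_common functions removed above now live once in the new arch/x86/crypto/ablk_helper.c (see the diffstat), and the contexts switch to the shared struct async_helper_ctx. Judging by the struct async_aes_ctx it replaces, the shared context is simply (sketch of asm/crypto/ablk_helper.h, not the verbatim header):

	/* per-tfm state needed by the shared async blkcipher helpers */
	struct async_helper_ctx {
		struct cryptd_ablkcipher *cryptd_tfm;
	};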
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 3306dc0b139..eeb2b3b743e 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -5,10 +5,6 @@
5 * 5 *
6 * Camellia parts based on code by: 6 * Camellia parts based on code by:
7 * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation) 7 * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation)
8 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
9 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
10 * CTR part based on code (crypto/ctr.c) by:
11 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
12 * 8 *
13 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by 10 * it under the terms of the GNU General Public License as published by
@@ -34,9 +30,9 @@
34#include <linux/module.h> 30#include <linux/module.h>
35#include <linux/types.h> 31#include <linux/types.h>
36#include <crypto/algapi.h> 32#include <crypto/algapi.h>
37#include <crypto/b128ops.h>
38#include <crypto/lrw.h> 33#include <crypto/lrw.h>
39#include <crypto/xts.h> 34#include <crypto/xts.h>
35#include <asm/crypto/glue_helper.h>
40 36
41#define CAMELLIA_MIN_KEY_SIZE 16 37#define CAMELLIA_MIN_KEY_SIZE 16
42#define CAMELLIA_MAX_KEY_SIZE 32 38#define CAMELLIA_MAX_KEY_SIZE 32
@@ -1312,307 +1308,128 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
1312 &tfm->crt_flags); 1308 &tfm->crt_flags);
1313} 1309}
1314 1310
1315static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, 1311static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
1316 void (*fn)(struct camellia_ctx *, u8 *, const u8 *),
1317 void (*fn_2way)(struct camellia_ctx *, u8 *, const u8 *))
1318{ 1312{
1319 struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 1313 u128 iv = *src;
1320 unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1321 unsigned int nbytes;
1322 int err;
1323
1324 err = blkcipher_walk_virt(desc, walk);
1325
1326 while ((nbytes = walk->nbytes)) {
1327 u8 *wsrc = walk->src.virt.addr;
1328 u8 *wdst = walk->dst.virt.addr;
1329
1330 /* Process two block batch */
1331 if (nbytes >= bsize * 2) {
1332 do {
1333 fn_2way(ctx, wdst, wsrc);
1334
1335 wsrc += bsize * 2;
1336 wdst += bsize * 2;
1337 nbytes -= bsize * 2;
1338 } while (nbytes >= bsize * 2);
1339
1340 if (nbytes < bsize)
1341 goto done;
1342 }
1343
1344 /* Handle leftovers */
1345 do {
1346 fn(ctx, wdst, wsrc);
1347
1348 wsrc += bsize;
1349 wdst += bsize;
1350 nbytes -= bsize;
1351 } while (nbytes >= bsize);
1352
1353done:
1354 err = blkcipher_walk_done(desc, walk, nbytes);
1355 }
1356
1357 return err;
1358}
1359
1360static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1361 struct scatterlist *src, unsigned int nbytes)
1362{
1363 struct blkcipher_walk walk;
1364
1365 blkcipher_walk_init(&walk, dst, src, nbytes);
1366 return ecb_crypt(desc, &walk, camellia_enc_blk, camellia_enc_blk_2way);
1367}
1368 1314
1369static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1315 camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
1370 struct scatterlist *src, unsigned int nbytes)
1371{
1372 struct blkcipher_walk walk;
1373
1374 blkcipher_walk_init(&walk, dst, src, nbytes);
1375 return ecb_crypt(desc, &walk, camellia_dec_blk, camellia_dec_blk_2way);
1376}
1377 1316
1378static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, 1317 u128_xor(&dst[1], &dst[1], &iv);
1379 struct blkcipher_walk *walk)
1380{
1381 struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1382 unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1383 unsigned int nbytes = walk->nbytes;
1384 u128 *src = (u128 *)walk->src.virt.addr;
1385 u128 *dst = (u128 *)walk->dst.virt.addr;
1386 u128 *iv = (u128 *)walk->iv;
1387
1388 do {
1389 u128_xor(dst, src, iv);
1390 camellia_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
1391 iv = dst;
1392
1393 src += 1;
1394 dst += 1;
1395 nbytes -= bsize;
1396 } while (nbytes >= bsize);
1397
1398 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
1399 return nbytes;
1400} 1318}
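/*
 * For reference: CBC decryption computes P[i] = D(C[i]) ^ C[i-1]. The
 * 2-way helper above may run in place (dst == src), so it saves C[0]
 * in "iv" before the parallel decrypt clobbers it, then XORs it into
 * dst[1]. Chaining dst[0] with the preceding ciphertext block (or the
 * IV) is left to the generic CBC-decrypt loop in glue_helper.c below.
 */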
1401 1319
1402static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1320static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
1403 struct scatterlist *src, unsigned int nbytes)
1404{ 1321{
1405 struct blkcipher_walk walk; 1322 be128 ctrblk;
1406 int err;
1407 1323
1408 blkcipher_walk_init(&walk, dst, src, nbytes); 1324 if (dst != src)
1409 err = blkcipher_walk_virt(desc, &walk); 1325 *dst = *src;
1410 1326
1411 while ((nbytes = walk.nbytes)) { 1327 u128_to_be128(&ctrblk, iv);
1412 nbytes = __cbc_encrypt(desc, &walk); 1328 u128_inc(iv);
1413 err = blkcipher_walk_done(desc, &walk, nbytes);
1414 }
1415 1329
1416 return err; 1330 camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
1417} 1331}
1418 1332
1419static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, 1333static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
1420 struct blkcipher_walk *walk) 1334 u128 *iv)
1421{ 1335{
1422 struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 1336 be128 ctrblks[2];
1423 unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1424 unsigned int nbytes = walk->nbytes;
1425 u128 *src = (u128 *)walk->src.virt.addr;
1426 u128 *dst = (u128 *)walk->dst.virt.addr;
1427 u128 ivs[2 - 1];
1428 u128 last_iv;
1429 1337
1430 /* Start of the last block. */ 1338 if (dst != src) {
1431 src += nbytes / bsize - 1; 1339 dst[0] = src[0];
1432 dst += nbytes / bsize - 1; 1340 dst[1] = src[1];
1433
1434 last_iv = *src;
1435
1436 /* Process two block batch */
1437 if (nbytes >= bsize * 2) {
1438 do {
1439 nbytes -= bsize * (2 - 1);
1440 src -= 2 - 1;
1441 dst -= 2 - 1;
1442
1443 ivs[0] = src[0];
1444
1445 camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
1446
1447 u128_xor(dst + 1, dst + 1, ivs + 0);
1448
1449 nbytes -= bsize;
1450 if (nbytes < bsize)
1451 goto done;
1452
1453 u128_xor(dst, dst, src - 1);
1454 src -= 1;
1455 dst -= 1;
1456 } while (nbytes >= bsize * 2);
1457
1458 if (nbytes < bsize)
1459 goto done;
1460 } 1341 }
1461 1342
1462 /* Handle leftovers */ 1343 u128_to_be128(&ctrblks[0], iv);
1463 for (;;) { 1344 u128_inc(iv);
1464 camellia_dec_blk(ctx, (u8 *)dst, (u8 *)src); 1345 u128_to_be128(&ctrblks[1], iv);
1465 1346 u128_inc(iv);
1466 nbytes -= bsize;
1467 if (nbytes < bsize)
1468 break;
1469 1347
1470 u128_xor(dst, dst, src - 1); 1348 camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
1471 src -= 1;
1472 dst -= 1;
1473 }
1474
1475done:
1476 u128_xor(dst, dst, (u128 *)walk->iv);
1477 *(u128 *)walk->iv = last_iv;
1478
1479 return nbytes;
1480} 1349}
1481 1350
1482static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1351static const struct common_glue_ctx camellia_enc = {
1483 struct scatterlist *src, unsigned int nbytes) 1352 .num_funcs = 2,
1484{ 1353 .fpu_blocks_limit = -1,
1485 struct blkcipher_walk walk; 1354
1486 int err; 1355 .funcs = { {
1487 1356 .num_blocks = 2,
1488 blkcipher_walk_init(&walk, dst, src, nbytes); 1357 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
1489 err = blkcipher_walk_virt(desc, &walk); 1358 }, {
1359 .num_blocks = 1,
1360 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
1361 } }
1362};
1490 1363
1491 while ((nbytes = walk.nbytes)) { 1364static const struct common_glue_ctx camellia_ctr = {
1492 nbytes = __cbc_decrypt(desc, &walk); 1365 .num_funcs = 2,
1493 err = blkcipher_walk_done(desc, &walk, nbytes); 1366 .fpu_blocks_limit = -1,
1494 } 1367
1368 .funcs = { {
1369 .num_blocks = 2,
1370 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
1371 }, {
1372 .num_blocks = 1,
1373 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
1374 } }
1375};
1495 1376
1496 return err; 1377static const struct common_glue_ctx camellia_dec = {
1497} 1378 .num_funcs = 2,
1379 .fpu_blocks_limit = -1,
1380
1381 .funcs = { {
1382 .num_blocks = 2,
1383 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
1384 }, {
1385 .num_blocks = 1,
1386 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
1387 } }
1388};
1498 1389
1499static inline void u128_to_be128(be128 *dst, const u128 *src) 1390static const struct common_glue_ctx camellia_dec_cbc = {
1500{ 1391 .num_funcs = 2,
1501 dst->a = cpu_to_be64(src->a); 1392 .fpu_blocks_limit = -1,
1502 dst->b = cpu_to_be64(src->b); 1393
1503} 1394 .funcs = { {
1395 .num_blocks = 2,
1396 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
1397 }, {
1398 .num_blocks = 1,
1399 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
1400 } }
1401};
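/*
 * Note that all four tables above set fpu_blocks_limit = -1: the 2-way
 * camellia assembler is integer-only, so the glue code must never open
 * an FPU section for it. A sketch of the gatekeeper declared in the
 * new asm/crypto/glue_helper.h (body reconstructed from its callers in
 * this patch, not copied verbatim):
 */
static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit,
				  struct blkcipher_desc *desc,
				  bool fpu_enabled, unsigned int nbytes)
{
	if (fpu_blocks_limit < 0)
		return false;		/* cipher never touches SSE/AVX */

	if (fpu_enabled)
		return true;		/* already inside an FPU section */

	/* only pay the FPU save/restore cost for large enough chunks */
	if (nbytes < bsize * (unsigned int)fpu_blocks_limit)
		return false;

	if (desc) {
		/* sleeping is not allowed while the FPU is held */
		desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
	}

	kernel_fpu_begin();
	return true;
}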
1504 1402
1505static inline void be128_to_u128(u128 *dst, const be128 *src) 1403static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1404 struct scatterlist *src, unsigned int nbytes)
1506{ 1405{
1507 dst->a = be64_to_cpu(src->a); 1406 return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
1508 dst->b = be64_to_cpu(src->b);
1509} 1407}
1510 1408
1511static inline void u128_inc(u128 *i) 1409static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1410 struct scatterlist *src, unsigned int nbytes)
1512{ 1411{
1513 i->b++; 1412 return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
1514 if (!i->b)
1515 i->a++;
1516} 1413}
1517 1414
1518static void ctr_crypt_final(struct blkcipher_desc *desc, 1415static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1519 struct blkcipher_walk *walk) 1416 struct scatterlist *src, unsigned int nbytes)
1520{ 1417{
1521 struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 1418 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
1522 u8 keystream[CAMELLIA_BLOCK_SIZE]; 1419 dst, src, nbytes);
1523 u8 *src = walk->src.virt.addr;
1524 u8 *dst = walk->dst.virt.addr;
1525 unsigned int nbytes = walk->nbytes;
1526 u128 ctrblk;
1527
1528 memcpy(keystream, src, nbytes);
1529 camellia_enc_blk_xor(ctx, keystream, walk->iv);
1530 memcpy(dst, keystream, nbytes);
1531
1532 be128_to_u128(&ctrblk, (be128 *)walk->iv);
1533 u128_inc(&ctrblk);
1534 u128_to_be128((be128 *)walk->iv, &ctrblk);
1535} 1420}
1536 1421
1537static unsigned int __ctr_crypt(struct blkcipher_desc *desc, 1422static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1538 struct blkcipher_walk *walk) 1423 struct scatterlist *src, unsigned int nbytes)
1539{ 1424{
1540 struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 1425 return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
1541 unsigned int bsize = CAMELLIA_BLOCK_SIZE; 1426 nbytes);
1542 unsigned int nbytes = walk->nbytes;
1543 u128 *src = (u128 *)walk->src.virt.addr;
1544 u128 *dst = (u128 *)walk->dst.virt.addr;
1545 u128 ctrblk;
1546 be128 ctrblocks[2];
1547
1548 be128_to_u128(&ctrblk, (be128 *)walk->iv);
1549
1550 /* Process two block batch */
1551 if (nbytes >= bsize * 2) {
1552 do {
1553 if (dst != src) {
1554 dst[0] = src[0];
1555 dst[1] = src[1];
1556 }
1557
1558 /* create ctrblks for parallel encrypt */
1559 u128_to_be128(&ctrblocks[0], &ctrblk);
1560 u128_inc(&ctrblk);
1561 u128_to_be128(&ctrblocks[1], &ctrblk);
1562 u128_inc(&ctrblk);
1563
1564 camellia_enc_blk_xor_2way(ctx, (u8 *)dst,
1565 (u8 *)ctrblocks);
1566
1567 src += 2;
1568 dst += 2;
1569 nbytes -= bsize * 2;
1570 } while (nbytes >= bsize * 2);
1571
1572 if (nbytes < bsize)
1573 goto done;
1574 }
1575
1576 /* Handle leftovers */
1577 do {
1578 if (dst != src)
1579 *dst = *src;
1580
1581 u128_to_be128(&ctrblocks[0], &ctrblk);
1582 u128_inc(&ctrblk);
1583
1584 camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks);
1585
1586 src += 1;
1587 dst += 1;
1588 nbytes -= bsize;
1589 } while (nbytes >= bsize);
1590
1591done:
1592 u128_to_be128((be128 *)walk->iv, &ctrblk);
1593 return nbytes;
1594} 1427}
1595 1428
1596static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1429static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1597 struct scatterlist *src, unsigned int nbytes) 1430 struct scatterlist *src, unsigned int nbytes)
1598{ 1431{
1599 struct blkcipher_walk walk; 1432 return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
1600 int err;
1601
1602 blkcipher_walk_init(&walk, dst, src, nbytes);
1603 err = blkcipher_walk_virt_block(desc, &walk, CAMELLIA_BLOCK_SIZE);
1604
1605 while ((nbytes = walk.nbytes) >= CAMELLIA_BLOCK_SIZE) {
1606 nbytes = __ctr_crypt(desc, &walk);
1607 err = blkcipher_walk_done(desc, &walk, nbytes);
1608 }
1609
1610 if (walk.nbytes) {
1611 ctr_crypt_final(desc, &walk);
1612 err = blkcipher_walk_done(desc, &walk, 0);
1613 }
1614
1615 return err;
1616} 1433}
1617 1434
1618static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) 1435static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
new file mode 100644
index 00000000000..4854f0f31e4
--- /dev/null
+++ b/arch/x86/crypto/glue_helper.c
@@ -0,0 +1,307 @@
1/*
2 * Shared glue code for 128bit block ciphers
3 *
4 * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
7 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
8 * CTR part based on code (crypto/ctr.c) by:
9 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
24 * USA
25 *
26 */
27
28#include <linux/module.h>
29#include <crypto/b128ops.h>
30#include <crypto/lrw.h>
31#include <crypto/xts.h>
32#include <asm/crypto/glue_helper.h>
33#include <crypto/scatterwalk.h>
34
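/*
 * For reference while reading this file: the dispatch types below are
 * defined in the new asm/crypto/glue_helper.h, sketched here from the
 * way this file and the cipher glue modules use them (see the header
 * for the authoritative definitions).
 */
typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
				       u128 *iv);

struct common_glue_func_entry {
	unsigned int num_blocks; /* blocks processed by one call of fn_u */
	union {
		common_glue_func_t ecb;
		common_glue_cbc_func_t cbc;
		common_glue_ctr_func_t ctr;
	} fn_u;
};

struct common_glue_ctx {
	unsigned int num_funcs;
	int fpu_blocks_limit; /* -1 means the cipher does not use the FPU */

	/* ordered widest batch first; the last entry must be 1-block */
	struct common_glue_func_entry funcs[];
};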
35static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
36 struct blkcipher_desc *desc,
37 struct blkcipher_walk *walk)
38{
39 void *ctx = crypto_blkcipher_ctx(desc->tfm);
40 const unsigned int bsize = 128 / 8;
41 unsigned int nbytes, i, func_bytes;
42 bool fpu_enabled = false;
43 int err;
44
45 err = blkcipher_walk_virt(desc, walk);
46
47 while ((nbytes = walk->nbytes)) {
48 u8 *wsrc = walk->src.virt.addr;
49 u8 *wdst = walk->dst.virt.addr;
50
51 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
52 desc, fpu_enabled, nbytes);
53
54 for (i = 0; i < gctx->num_funcs; i++) {
55 func_bytes = bsize * gctx->funcs[i].num_blocks;
56
57 /* Process multi-block batch */
58 if (nbytes >= func_bytes) {
59 do {
60 gctx->funcs[i].fn_u.ecb(ctx, wdst,
61 wsrc);
62
63 wsrc += func_bytes;
64 wdst += func_bytes;
65 nbytes -= func_bytes;
66 } while (nbytes >= func_bytes);
67
68 if (nbytes < bsize)
69 goto done;
70 }
71 }
72
73done:
74 err = blkcipher_walk_done(desc, walk, nbytes);
75 }
76
77 glue_fpu_end(fpu_enabled);
78 return err;
79}
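/*
 * funcs[] is ordered from the widest batch down to num_blocks == 1
 * (see common_glue_ctx above), so the loop always finishes the tail
 * with single-block calls. Worked example with the 8-way/1-way
 * serpent tables later in this patch (bsize = 16): a 112-byte chunk
 * skips the 8-way entry (112 < 8 * 16) and runs the 1-way function
 * seven times; a 144-byte chunk does one 8-way call and one 1-way
 * call.
 */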
80
81int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
82 struct blkcipher_desc *desc, struct scatterlist *dst,
83 struct scatterlist *src, unsigned int nbytes)
84{
85 struct blkcipher_walk walk;
86
87 blkcipher_walk_init(&walk, dst, src, nbytes);
88 return __glue_ecb_crypt_128bit(gctx, desc, &walk);
89}
90EXPORT_SYMBOL_GPL(glue_ecb_crypt_128bit);
91
92static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn,
93 struct blkcipher_desc *desc,
94 struct blkcipher_walk *walk)
95{
96 void *ctx = crypto_blkcipher_ctx(desc->tfm);
97 const unsigned int bsize = 128 / 8;
98 unsigned int nbytes = walk->nbytes;
99 u128 *src = (u128 *)walk->src.virt.addr;
100 u128 *dst = (u128 *)walk->dst.virt.addr;
101 u128 *iv = (u128 *)walk->iv;
102
103 do {
104 u128_xor(dst, src, iv);
105 fn(ctx, (u8 *)dst, (u8 *)dst);
106 iv = dst;
107
108 src += 1;
109 dst += 1;
110 nbytes -= bsize;
111 } while (nbytes >= bsize);
112
113 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
114 return nbytes;
115}
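/*
 * CBC encryption stays strictly sequential: each plaintext block is
 * XORed with the previous ciphertext block before it can be encrypted,
 * so there is nothing to feed a multi-block primitive. That is why
 * glue_cbc_encrypt_128bit() takes a single 1-block function and does
 * no FPU batching; the parallel paths are ECB, CTR and CBC decryption.
 */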
116
117int glue_cbc_encrypt_128bit(const common_glue_func_t fn,
118 struct blkcipher_desc *desc,
119 struct scatterlist *dst,
120 struct scatterlist *src, unsigned int nbytes)
121{
122 struct blkcipher_walk walk;
123 int err;
124
125 blkcipher_walk_init(&walk, dst, src, nbytes);
126 err = blkcipher_walk_virt(desc, &walk);
127
128 while ((nbytes = walk.nbytes)) {
129 nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk);
130 err = blkcipher_walk_done(desc, &walk, nbytes);
131 }
132
133 return err;
134}
135EXPORT_SYMBOL_GPL(glue_cbc_encrypt_128bit);
136
137static unsigned int
138__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
139 struct blkcipher_desc *desc,
140 struct blkcipher_walk *walk)
141{
142 void *ctx = crypto_blkcipher_ctx(desc->tfm);
143 const unsigned int bsize = 128 / 8;
144 unsigned int nbytes = walk->nbytes;
145 u128 *src = (u128 *)walk->src.virt.addr;
146 u128 *dst = (u128 *)walk->dst.virt.addr;
147 u128 last_iv;
148 unsigned int num_blocks, func_bytes;
149 unsigned int i;
150
151 /* Start of the last block. */
152 src += nbytes / bsize - 1;
153 dst += nbytes / bsize - 1;
154
155 last_iv = *src;
156
157 for (i = 0; i < gctx->num_funcs; i++) {
158 num_blocks = gctx->funcs[i].num_blocks;
159 func_bytes = bsize * num_blocks;
160
161 /* Process multi-block batch */
162 if (nbytes >= func_bytes) {
163 do {
164 nbytes -= func_bytes - bsize;
165 src -= num_blocks - 1;
166 dst -= num_blocks - 1;
167
168 gctx->funcs[i].fn_u.cbc(ctx, dst, src);
169
170 nbytes -= bsize;
171 if (nbytes < bsize)
172 goto done;
173
174 u128_xor(dst, dst, src - 1);
175 src -= 1;
176 dst -= 1;
177 } while (nbytes >= func_bytes);
178
179 if (nbytes < bsize)
180 goto done;
181 }
182 }
183
184done:
185 u128_xor(dst, dst, (u128 *)walk->iv);
186 *(u128 *)walk->iv = last_iv;
187
188 return nbytes;
189}
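/*
 * CBC decryption walks each chunk backwards: starting from the last
 * block lets the multi-block primitives work in place, because every
 * ciphertext block still needed as the chaining value of its successor
 * has not been overwritten yet. last_iv preserves the final ciphertext
 * block so it can become the chaining value for the next chunk.
 */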
190
191int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
192 struct blkcipher_desc *desc,
193 struct scatterlist *dst,
194 struct scatterlist *src, unsigned int nbytes)
195{
196 const unsigned int bsize = 128 / 8;
197 bool fpu_enabled = false;
198 struct blkcipher_walk walk;
199 int err;
200
201 blkcipher_walk_init(&walk, dst, src, nbytes);
202 err = blkcipher_walk_virt(desc, &walk);
203
204 while ((nbytes = walk.nbytes)) {
205 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
206 desc, fpu_enabled, nbytes);
207 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
208 err = blkcipher_walk_done(desc, &walk, nbytes);
209 }
210
211 glue_fpu_end(fpu_enabled);
212 return err;
213}
214EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
215
216static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
217 struct blkcipher_desc *desc,
218 struct blkcipher_walk *walk)
219{
220 void *ctx = crypto_blkcipher_ctx(desc->tfm);
221 u8 *src = (u8 *)walk->src.virt.addr;
222 u8 *dst = (u8 *)walk->dst.virt.addr;
223 unsigned int nbytes = walk->nbytes;
224 u128 ctrblk;
225 u128 tmp;
226
227 be128_to_u128(&ctrblk, (be128 *)walk->iv);
228
229 memcpy(&tmp, src, nbytes);
230 fn_ctr(ctx, &tmp, &tmp, &ctrblk);
231 memcpy(dst, &tmp, nbytes);
232
233 u128_to_be128((be128 *)walk->iv, &ctrblk);
234}
235EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
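/*
 * The trailing partial block (nbytes < 16) is bounced through a
 * full-size stack block: fn_ctr generates one more keystream block in
 * tmp, and only nbytes bytes are copied back out, so src and dst are
 * never accessed past the partial block.
 */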
236
237static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
238 struct blkcipher_desc *desc,
239 struct blkcipher_walk *walk)
240{
241 const unsigned int bsize = 128 / 8;
242 void *ctx = crypto_blkcipher_ctx(desc->tfm);
243 unsigned int nbytes = walk->nbytes;
244 u128 *src = (u128 *)walk->src.virt.addr;
245 u128 *dst = (u128 *)walk->dst.virt.addr;
246 u128 ctrblk;
247 unsigned int num_blocks, func_bytes;
248 unsigned int i;
249
250 be128_to_u128(&ctrblk, (be128 *)walk->iv);
251
252 /* Process multi-block batch */
253 for (i = 0; i < gctx->num_funcs; i++) {
254 num_blocks = gctx->funcs[i].num_blocks;
255 func_bytes = bsize * num_blocks;
256
257 if (nbytes >= func_bytes) {
258 do {
259 gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk);
260
261 src += num_blocks;
262 dst += num_blocks;
263 nbytes -= func_bytes;
264 } while (nbytes >= func_bytes);
265
266 if (nbytes < bsize)
267 goto done;
268 }
269 }
270
271done:
272 u128_to_be128((be128 *)walk->iv, &ctrblk);
273 return nbytes;
274}
275
276int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
277 struct blkcipher_desc *desc, struct scatterlist *dst,
278 struct scatterlist *src, unsigned int nbytes)
279{
280 const unsigned int bsize = 128 / 8;
281 bool fpu_enabled = false;
282 struct blkcipher_walk walk;
283 int err;
284
285 blkcipher_walk_init(&walk, dst, src, nbytes);
286 err = blkcipher_walk_virt_block(desc, &walk, bsize);
287
288 while ((nbytes = walk.nbytes) >= bsize) {
289 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
290 desc, fpu_enabled, nbytes);
291 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
292 err = blkcipher_walk_done(desc, &walk, nbytes);
293 }
294
295 glue_fpu_end(fpu_enabled);
296
297 if (walk.nbytes) {
298 glue_ctr_crypt_final_128bit(
299 gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
300 err = blkcipher_walk_done(desc, &walk, 0);
301 }
302
303 return err;
304}
305EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit);
306
307MODULE_LICENSE("GPL");
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
new file mode 100644
index 00000000000..504106bf04a
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -0,0 +1,704 @@
1/*
2 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
8 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27.file "serpent-avx-x86_64-asm_64.S"
28.text
29
30#define CTX %rdi
31
32/**********************************************************************
33 8-way AVX serpent
34 **********************************************************************/
35#define RA1 %xmm0
36#define RB1 %xmm1
37#define RC1 %xmm2
38#define RD1 %xmm3
39#define RE1 %xmm4
40
41#define tp %xmm5
42
43#define RA2 %xmm6
44#define RB2 %xmm7
45#define RC2 %xmm8
46#define RD2 %xmm9
47#define RE2 %xmm10
48
49#define RNOT %xmm11
50
51#define RK0 %xmm12
52#define RK1 %xmm13
53#define RK2 %xmm14
54#define RK3 %xmm15
55
56
57#define S0_1(x0, x1, x2, x3, x4) \
58 vpor x0, x3, tp; \
59 vpxor x3, x0, x0; \
60 vpxor x2, x3, x4; \
61 vpxor RNOT, x4, x4; \
62 vpxor x1, tp, x3; \
63 vpand x0, x1, x1; \
64 vpxor x4, x1, x1; \
65 vpxor x0, x2, x2;
66#define S0_2(x0, x1, x2, x3, x4) \
67 vpxor x3, x0, x0; \
68 vpor x0, x4, x4; \
69 vpxor x2, x0, x0; \
70 vpand x1, x2, x2; \
71 vpxor x2, x3, x3; \
72 vpxor RNOT, x1, x1; \
73 vpxor x4, x2, x2; \
74 vpxor x2, x1, x1;
75
76#define S1_1(x0, x1, x2, x3, x4) \
77 vpxor x0, x1, tp; \
78 vpxor x3, x0, x0; \
79 vpxor RNOT, x3, x3; \
80 vpand tp, x1, x4; \
81 vpor tp, x0, x0; \
82 vpxor x2, x3, x3; \
83 vpxor x3, x0, x0; \
84 vpxor x3, tp, x1;
85#define S1_2(x0, x1, x2, x3, x4) \
86 vpxor x4, x3, x3; \
87 vpor x4, x1, x1; \
88 vpxor x2, x4, x4; \
89 vpand x0, x2, x2; \
90 vpxor x1, x2, x2; \
91 vpor x0, x1, x1; \
92 vpxor RNOT, x0, x0; \
93 vpxor x2, x0, x0; \
94 vpxor x1, x4, x4;
95
96#define S2_1(x0, x1, x2, x3, x4) \
97 vpxor RNOT, x3, x3; \
98 vpxor x0, x1, x1; \
99 vpand x2, x0, tp; \
100 vpxor x3, tp, tp; \
101 vpor x0, x3, x3; \
102 vpxor x1, x2, x2; \
103 vpxor x1, x3, x3; \
104 vpand tp, x1, x1;
105#define S2_2(x0, x1, x2, x3, x4) \
106 vpxor x2, tp, tp; \
107 vpand x3, x2, x2; \
108 vpor x1, x3, x3; \
109 vpxor RNOT, tp, tp; \
110 vpxor tp, x3, x3; \
111 vpxor tp, x0, x4; \
112 vpxor x2, tp, x0; \
113 vpor x2, x1, x1;
114
115#define S3_1(x0, x1, x2, x3, x4) \
116 vpxor x3, x1, tp; \
117 vpor x0, x3, x3; \
118 vpand x0, x1, x4; \
119 vpxor x2, x0, x0; \
120 vpxor tp, x2, x2; \
121 vpand x3, tp, x1; \
122 vpxor x3, x2, x2; \
123 vpor x4, x0, x0; \
124 vpxor x3, x4, x4;
125#define S3_2(x0, x1, x2, x3, x4) \
126 vpxor x0, x1, x1; \
127 vpand x3, x0, x0; \
128 vpand x4, x3, x3; \
129 vpxor x2, x3, x3; \
130 vpor x1, x4, x4; \
131 vpand x1, x2, x2; \
132 vpxor x3, x4, x4; \
133 vpxor x3, x0, x0; \
134 vpxor x2, x3, x3;
135
136#define S4_1(x0, x1, x2, x3, x4) \
137 vpand x0, x3, tp; \
138 vpxor x3, x0, x0; \
139 vpxor x2, tp, tp; \
140 vpor x3, x2, x2; \
141 vpxor x1, x0, x0; \
142 vpxor tp, x3, x4; \
143 vpor x0, x2, x2; \
144 vpxor x1, x2, x2;
145#define S4_2(x0, x1, x2, x3, x4) \
146 vpand x0, x1, x1; \
147 vpxor x4, x1, x1; \
148 vpand x2, x4, x4; \
149 vpxor tp, x2, x2; \
150 vpxor x0, x4, x4; \
151 vpor x1, tp, x3; \
152 vpxor RNOT, x1, x1; \
153 vpxor x0, x3, x3;
154
155#define S5_1(x0, x1, x2, x3, x4) \
156 vpor x0, x1, tp; \
157 vpxor tp, x2, x2; \
158 vpxor RNOT, x3, x3; \
159 vpxor x0, x1, x4; \
160 vpxor x2, x0, x0; \
161 vpand x4, tp, x1; \
162 vpor x3, x4, x4; \
163 vpxor x0, x4, x4;
164#define S5_2(x0, x1, x2, x3, x4) \
165 vpand x3, x0, x0; \
166 vpxor x3, x1, x1; \
167 vpxor x2, x3, x3; \
168 vpxor x1, x0, x0; \
169 vpand x4, x2, x2; \
170 vpxor x2, x1, x1; \
171 vpand x0, x2, x2; \
172 vpxor x2, x3, x3;
173
174#define S6_1(x0, x1, x2, x3, x4) \
175 vpxor x0, x3, x3; \
176 vpxor x2, x1, tp; \
177 vpxor x0, x2, x2; \
178 vpand x3, x0, x0; \
179 vpor x3, tp, tp; \
180 vpxor RNOT, x1, x4; \
181 vpxor tp, x0, x0; \
182 vpxor x2, tp, x1;
183#define S6_2(x0, x1, x2, x3, x4) \
184 vpxor x4, x3, x3; \
185 vpxor x0, x4, x4; \
186 vpand x0, x2, x2; \
187 vpxor x1, x4, x4; \
188 vpxor x3, x2, x2; \
189 vpand x1, x3, x3; \
190 vpxor x0, x3, x3; \
191 vpxor x2, x1, x1;
192
193#define S7_1(x0, x1, x2, x3, x4) \
194 vpxor RNOT, x1, tp; \
195 vpxor RNOT, x0, x0; \
196 vpand x2, tp, x1; \
197 vpxor x3, x1, x1; \
198 vpor tp, x3, x3; \
199 vpxor x2, tp, x4; \
200 vpxor x3, x2, x2; \
201 vpxor x0, x3, x3; \
202 vpor x1, x0, x0;
203#define S7_2(x0, x1, x2, x3, x4) \
204 vpand x0, x2, x2; \
205 vpxor x4, x0, x0; \
206 vpxor x3, x4, x4; \
207 vpand x0, x3, x3; \
208 vpxor x1, x4, x4; \
209 vpxor x4, x2, x2; \
210 vpxor x1, x3, x3; \
211 vpor x0, x4, x4; \
212 vpxor x1, x4, x4;
213
214#define SI0_1(x0, x1, x2, x3, x4) \
215 vpxor x0, x1, x1; \
216 vpor x1, x3, tp; \
217 vpxor x1, x3, x4; \
218 vpxor RNOT, x0, x0; \
219 vpxor tp, x2, x2; \
220 vpxor x0, tp, x3; \
221 vpand x1, x0, x0; \
222 vpxor x2, x0, x0;
223#define SI0_2(x0, x1, x2, x3, x4) \
224 vpand x3, x2, x2; \
225 vpxor x4, x3, x3; \
226 vpxor x3, x2, x2; \
227 vpxor x3, x1, x1; \
228 vpand x0, x3, x3; \
229 vpxor x0, x1, x1; \
230 vpxor x2, x0, x0; \
231 vpxor x3, x4, x4;
232
233#define SI1_1(x0, x1, x2, x3, x4) \
234 vpxor x3, x1, x1; \
235 vpxor x2, x0, tp; \
236 vpxor RNOT, x2, x2; \
237 vpor x1, x0, x4; \
238 vpxor x3, x4, x4; \
239 vpand x1, x3, x3; \
240 vpxor x2, x1, x1; \
241 vpand x4, x2, x2;
242#define SI1_2(x0, x1, x2, x3, x4) \
243 vpxor x1, x4, x4; \
244 vpor x3, x1, x1; \
245 vpxor tp, x3, x3; \
246 vpxor tp, x2, x2; \
247 vpor x4, tp, x0; \
248 vpxor x4, x2, x2; \
249 vpxor x0, x1, x1; \
250 vpxor x1, x4, x4;
251
252#define SI2_1(x0, x1, x2, x3, x4) \
253 vpxor x1, x2, x2; \
254 vpxor RNOT, x3, tp; \
255 vpor x2, tp, tp; \
256 vpxor x3, x2, x2; \
257 vpxor x0, x3, x4; \
258 vpxor x1, tp, x3; \
259 vpor x2, x1, x1; \
260 vpxor x0, x2, x2;
261#define SI2_2(x0, x1, x2, x3, x4) \
262 vpxor x4, x1, x1; \
263 vpor x3, x4, x4; \
264 vpxor x3, x2, x2; \
265 vpxor x2, x4, x4; \
266 vpand x1, x2, x2; \
267 vpxor x3, x2, x2; \
268 vpxor x4, x3, x3; \
269 vpxor x0, x4, x4;
270
271#define SI3_1(x0, x1, x2, x3, x4) \
272 vpxor x1, x2, x2; \
273 vpand x2, x1, tp; \
274 vpxor x0, tp, tp; \
275 vpor x1, x0, x0; \
276 vpxor x3, x1, x4; \
277 vpxor x3, x0, x0; \
278 vpor tp, x3, x3; \
279 vpxor x2, tp, x1;
280#define SI3_2(x0, x1, x2, x3, x4) \
281 vpxor x3, x1, x1; \
282 vpxor x2, x0, x0; \
283 vpxor x3, x2, x2; \
284 vpand x1, x3, x3; \
285 vpxor x0, x1, x1; \
286 vpand x2, x0, x0; \
287 vpxor x3, x4, x4; \
288 vpxor x0, x3, x3; \
289 vpxor x1, x0, x0;
290
291#define SI4_1(x0, x1, x2, x3, x4) \
292 vpxor x3, x2, x2; \
293 vpand x1, x0, tp; \
294 vpxor x2, tp, tp; \
295 vpor x3, x2, x2; \
296 vpxor RNOT, x0, x4; \
297 vpxor tp, x1, x1; \
298 vpxor x2, tp, x0; \
299 vpand x4, x2, x2;
300#define SI4_2(x0, x1, x2, x3, x4) \
301 vpxor x0, x2, x2; \
302 vpor x4, x0, x0; \
303 vpxor x3, x0, x0; \
304 vpand x2, x3, x3; \
305 vpxor x3, x4, x4; \
306 vpxor x1, x3, x3; \
307 vpand x0, x1, x1; \
308 vpxor x1, x4, x4; \
309 vpxor x3, x0, x0;
310
311#define SI5_1(x0, x1, x2, x3, x4) \
312 vpor x2, x1, tp; \
313 vpxor x1, x2, x2; \
314 vpxor x3, tp, tp; \
315 vpand x1, x3, x3; \
316 vpxor x3, x2, x2; \
317 vpor x0, x3, x3; \
318 vpxor RNOT, x0, x0; \
319 vpxor x2, x3, x3; \
320 vpor x0, x2, x2;
321#define SI5_2(x0, x1, x2, x3, x4) \
322 vpxor tp, x1, x4; \
323 vpxor x4, x2, x2; \
324 vpand x0, x4, x4; \
325 vpxor tp, x0, x0; \
326 vpxor x3, tp, x1; \
327 vpand x2, x0, x0; \
328 vpxor x3, x2, x2; \
329 vpxor x2, x0, x0; \
330 vpxor x4, x2, x2; \
331 vpxor x3, x4, x4;
332
333#define SI6_1(x0, x1, x2, x3, x4) \
334 vpxor x2, x0, x0; \
335 vpand x3, x0, tp; \
336 vpxor x3, x2, x2; \
337 vpxor x2, tp, tp; \
338 vpxor x1, x3, x3; \
339 vpor x0, x2, x2; \
340 vpxor x3, x2, x2; \
341 vpand tp, x3, x3;
342#define SI6_2(x0, x1, x2, x3, x4) \
343 vpxor RNOT, tp, tp; \
344 vpxor x1, x3, x3; \
345 vpand x2, x1, x1; \
346 vpxor tp, x0, x4; \
347 vpxor x4, x3, x3; \
348 vpxor x2, x4, x4; \
349 vpxor x1, tp, x0; \
350 vpxor x0, x2, x2;
351
352#define SI7_1(x0, x1, x2, x3, x4) \
353 vpand x0, x3, tp; \
354 vpxor x2, x0, x0; \
355 vpor x3, x2, x2; \
356 vpxor x1, x3, x4; \
357 vpxor RNOT, x0, x0; \
358 vpor tp, x1, x1; \
359 vpxor x0, x4, x4; \
360 vpand x2, x0, x0; \
361 vpxor x1, x0, x0;
362#define SI7_2(x0, x1, x2, x3, x4) \
363 vpand x2, x1, x1; \
364 vpxor x2, tp, x3; \
365 vpxor x3, x4, x4; \
366 vpand x3, x2, x2; \
367 vpor x0, x3, x3; \
368 vpxor x4, x1, x1; \
369 vpxor x4, x3, x3; \
370 vpand x0, x4, x4; \
371 vpxor x2, x4, x4;
372
373#define get_key(i, j, t) \
374 vbroadcastss (4*(i)+(j))*4(CTX), t;
375
376#define K2(x0, x1, x2, x3, x4, i) \
377 get_key(i, 0, RK0); \
378 get_key(i, 1, RK1); \
379 get_key(i, 2, RK2); \
380 get_key(i, 3, RK3); \
381 vpxor RK0, x0 ## 1, x0 ## 1; \
382 vpxor RK1, x1 ## 1, x1 ## 1; \
383 vpxor RK2, x2 ## 1, x2 ## 1; \
384 vpxor RK3, x3 ## 1, x3 ## 1; \
385 vpxor RK0, x0 ## 2, x0 ## 2; \
386 vpxor RK1, x1 ## 2, x1 ## 2; \
387 vpxor RK2, x2 ## 2, x2 ## 2; \
388 vpxor RK3, x3 ## 2, x3 ## 2;
389
390#define LK2(x0, x1, x2, x3, x4, i) \
391 vpslld $13, x0 ## 1, x4 ## 1; \
392 vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
393 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
394 vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
395 vpslld $3, x2 ## 1, x4 ## 1; \
396 vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
397 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
398 vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
399 vpslld $13, x0 ## 2, x4 ## 2; \
400 vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
401 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
402 vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
403 vpslld $3, x2 ## 2, x4 ## 2; \
404 vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
405 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
406 vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
407 vpslld $1, x1 ## 1, x4 ## 1; \
408 vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
409 vpor x4 ## 1, x1 ## 1, x1 ## 1; \
410 vpslld $3, x0 ## 1, x4 ## 1; \
411 vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
412 vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
413 get_key(i, 1, RK1); \
414 vpslld $1, x1 ## 2, x4 ## 2; \
415 vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
416 vpor x4 ## 2, x1 ## 2, x1 ## 2; \
417 vpslld $3, x0 ## 2, x4 ## 2; \
418 vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
419 vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
420 get_key(i, 3, RK3); \
421 vpslld $7, x3 ## 1, x4 ## 1; \
422 vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
423 vpor x4 ## 1, x3 ## 1, x3 ## 1; \
424 vpslld $7, x1 ## 1, x4 ## 1; \
425 vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
426 vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
427 vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
428 vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
429 get_key(i, 0, RK0); \
430 vpslld $7, x3 ## 2, x4 ## 2; \
431 vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
432 vpor x4 ## 2, x3 ## 2, x3 ## 2; \
433 vpslld $7, x1 ## 2, x4 ## 2; \
434 vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
435 vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
436 vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
437 vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
438 get_key(i, 2, RK2); \
439 vpxor RK1, x1 ## 1, x1 ## 1; \
440 vpxor RK3, x3 ## 1, x3 ## 1; \
441 vpslld $5, x0 ## 1, x4 ## 1; \
442 vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
443 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
444 vpslld $22, x2 ## 1, x4 ## 1; \
445 vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
446 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
447 vpxor RK0, x0 ## 1, x0 ## 1; \
448 vpxor RK2, x2 ## 1, x2 ## 1; \
449 vpxor RK1, x1 ## 2, x1 ## 2; \
450 vpxor RK3, x3 ## 2, x3 ## 2; \
451 vpslld $5, x0 ## 2, x4 ## 2; \
452 vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
453 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
454 vpslld $22, x2 ## 2, x4 ## 2; \
455 vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
456 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
457 vpxor RK0, x0 ## 2, x0 ## 2; \
458 vpxor RK2, x2 ## 2, x2 ## 2;
459
460#define KL2(x0, x1, x2, x3, x4, i) \
461 vpxor RK0, x0 ## 1, x0 ## 1; \
462 vpxor RK2, x2 ## 1, x2 ## 1; \
463 vpsrld $5, x0 ## 1, x4 ## 1; \
464 vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
465 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
466 vpxor RK3, x3 ## 1, x3 ## 1; \
467 vpxor RK1, x1 ## 1, x1 ## 1; \
468 vpsrld $22, x2 ## 1, x4 ## 1; \
469 vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
470 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
471 vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
472 vpxor RK0, x0 ## 2, x0 ## 2; \
473 vpxor RK2, x2 ## 2, x2 ## 2; \
474 vpsrld $5, x0 ## 2, x4 ## 2; \
475 vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
476 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
477 vpxor RK3, x3 ## 2, x3 ## 2; \
478 vpxor RK1, x1 ## 2, x1 ## 2; \
479 vpsrld $22, x2 ## 2, x4 ## 2; \
480 vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
481 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
482 vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
483 vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
484 vpslld $7, x1 ## 1, x4 ## 1; \
485 vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
486 vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
487 vpsrld $1, x1 ## 1, x4 ## 1; \
488 vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
489 vpor x4 ## 1, x1 ## 1, x1 ## 1; \
490 vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
491 vpslld $7, x1 ## 2, x4 ## 2; \
492 vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
493 vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
494 vpsrld $1, x1 ## 2, x4 ## 2; \
495 vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
496 vpor x4 ## 2, x1 ## 2, x1 ## 2; \
497 vpsrld $7, x3 ## 1, x4 ## 1; \
498 vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
499 vpor x4 ## 1, x3 ## 1, x3 ## 1; \
500 vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
501 vpslld $3, x0 ## 1, x4 ## 1; \
502 vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
503 vpsrld $7, x3 ## 2, x4 ## 2; \
504 vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
505 vpor x4 ## 2, x3 ## 2, x3 ## 2; \
506 vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
507 vpslld $3, x0 ## 2, x4 ## 2; \
508 vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
509 vpsrld $13, x0 ## 1, x4 ## 1; \
510 vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
511 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
512 vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
513 vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
514 vpsrld $3, x2 ## 1, x4 ## 1; \
515 vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
516 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
517 vpsrld $13, x0 ## 2, x4 ## 2; \
518 vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
519 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
520 vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
521 vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
522 vpsrld $3, x2 ## 2, x4 ## 2; \
523 vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
524 vpor x4 ## 2, x2 ## 2, x2 ## 2;
525
526#define S(SBOX, x0, x1, x2, x3, x4) \
527 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
528 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
529 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
530 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
531
532#define SP(SBOX, x0, x1, x2, x3, x4, i) \
533 get_key(i, 0, RK0); \
534 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
535 get_key(i, 2, RK2); \
536 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
537 get_key(i, 3, RK3); \
538 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
539 get_key(i, 1, RK1); \
540 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
541
542#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
543 vpunpckldq x1, x0, t0; \
544 vpunpckhdq x1, x0, t2; \
545 vpunpckldq x3, x2, t1; \
546 vpunpckhdq x3, x2, x3; \
547 \
548 vpunpcklqdq t1, t0, x0; \
549 vpunpckhqdq t1, t0, x1; \
550 vpunpcklqdq x3, t2, x2; \
551 vpunpckhqdq x3, t2, x3;
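/* transpose_4x4 is a 4x4 transpose of 32-bit words: the dword unpacks
 * interleave the words of adjacent registers and the qword unpacks
 * complete the swap, so that afterwards xN holds word N of all four
 * input blocks. That is the layout the S-box macros above expect; two
 * such register sets give the 8-way parallelism. */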
552
553#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
554 vmovdqu (0*4*4)(in), x0; \
555 vmovdqu (1*4*4)(in), x1; \
556 vmovdqu (2*4*4)(in), x2; \
557 vmovdqu (3*4*4)(in), x3; \
558 \
559 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
560
561#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
562 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
563 \
564 vmovdqu x0, (0*4*4)(out); \
565 vmovdqu x1, (1*4*4)(out); \
566 vmovdqu x2, (2*4*4)(out); \
567 vmovdqu x3, (3*4*4)(out);
568
569#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
570 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
571 \
572 vpxor (0*4*4)(out), x0, x0; \
573 vmovdqu x0, (0*4*4)(out); \
574 vpxor (1*4*4)(out), x1, x1; \
575 vmovdqu x1, (1*4*4)(out); \
576 vpxor (2*4*4)(out), x2, x2; \
577 vmovdqu x2, (2*4*4)(out); \
578 vpxor (3*4*4)(out), x3, x3; \
579 vmovdqu x3, (3*4*4)(out);
580
581.align 8
582.global __serpent_enc_blk_8way_avx
583.type __serpent_enc_blk_8way_avx,@function;
584
585__serpent_enc_blk_8way_avx:
586 /* input:
587 * %rdi: ctx, CTX
588 * %rsi: dst
589 * %rdx: src
590 * %rcx: bool, if true: xor output
591 */
592
593 vpcmpeqd RNOT, RNOT, RNOT;
594
595 leaq (4*4*4)(%rdx), %rax;
596 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
597 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
598
599 K2(RA, RB, RC, RD, RE, 0);
600 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
601 S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
602 S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
603 S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
604 S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
605 S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
606 S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
607 S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
608 S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
609 S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
610 S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
611 S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
612 S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
613 S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
614 S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
615 S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
616 S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
617 S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
618 S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
619 S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
620 S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
621 S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
622 S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
623 S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
624 S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
625 S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
626 S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
627 S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
628 S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
629 S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
630 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
631 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
632
633 leaq (4*4*4)(%rsi), %rax;
634
635 testb %cl, %cl;
636 jnz __enc_xor8;
637
638 write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
639 write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
640
641 ret;
642
643__enc_xor8:
644 xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
645 xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
646
647 ret;
648
649.align 8
650.global serpent_dec_blk_8way_avx
651.type serpent_dec_blk_8way_avx,@function;
652
653serpent_dec_blk_8way_avx:
654 /* input:
655 * %rdi: ctx, CTX
656 * %rsi: dst
657 * %rdx: src
658 */
659
660 vpcmpeqd RNOT, RNOT, RNOT;
661
662 leaq (4*4*4)(%rdx), %rax;
663 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
664 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
665
666 K2(RA, RB, RC, RD, RE, 32);
667 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
668 SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
669 SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
670 SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
671 SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
672 SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
673 SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
674 SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
675 SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
676 SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
677 SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
678 SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
679 SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
680 SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
681 SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
682 SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
683 SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
684 SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
685 SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
686 SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
687 SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
688 SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
689 SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
690 SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
691 SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
692 SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
693 SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
694 SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
695 SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
696 SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
697 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
698 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
699
700 leaq (4*4*4)(%rsi), %rax;
701 write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
702 write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
703
704 ret;
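The two entry points above are consumed through the new asm/crypto/serpent-avx.h (see the diffstat). Judging from the glue code that follows, the header's interface is essentially (sketch, not the verbatim header):

	asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx,
						   u8 *dst, const u8 *src,
						   bool xor);
	asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx,
						 u8 *dst, const u8 *src);

	/* %rcx selects plain stores vs XOR-into-dst (used by CTR) */
	static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx,
						u8 *dst, const u8 *src)
	{
		__serpent_enc_blk_8way_avx(ctx, dst, src, false);
	}

	static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx,
						    u8 *dst, const u8 *src)
	{
		__serpent_enc_blk_8way_avx(ctx, dst, src, true);
	}

	static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx,
						u8 *dst, const u8 *src)
	{
		serpent_dec_blk_8way_avx(ctx, dst, src);
	}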
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
new file mode 100644
index 00000000000..b36bdac237e
--- /dev/null
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -0,0 +1,636 @@
1/*
2 * Glue Code for AVX assembler versions of Serpent Cipher
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Glue code based on serpent_sse2_glue.c by:
8 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/hardirq.h>
29#include <linux/types.h>
30#include <linux/crypto.h>
31#include <linux/err.h>
32#include <crypto/algapi.h>
33#include <crypto/serpent.h>
34#include <crypto/cryptd.h>
35#include <crypto/b128ops.h>
36#include <crypto/ctr.h>
37#include <crypto/lrw.h>
38#include <crypto/xts.h>
39#include <asm/xcr.h>
40#include <asm/xsave.h>
41#include <asm/crypto/serpent-avx.h>
42#include <asm/crypto/ablk_helper.h>
43#include <asm/crypto/glue_helper.h>
44
45static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
46{
47 u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
48 unsigned int j;
49
50 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
51 ivs[j] = src[j];
52
53 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
54
55 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
56 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
57}
58
59static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
60{
61 be128 ctrblk;
62
63 u128_to_be128(&ctrblk, iv);
64 u128_inc(iv);
65
66 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
67 u128_xor(dst, src, (u128 *)&ctrblk);
68}
69
70static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
71 u128 *iv)
72{
73 be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
74 unsigned int i;
75
76 for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
77 if (dst != src)
78 dst[i] = src[i];
79
80 u128_to_be128(&ctrblks[i], iv);
81 u128_inc(iv);
82 }
83
84 serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
85}
86
87static const struct common_glue_ctx serpent_enc = {
88 .num_funcs = 2,
89 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
90
91 .funcs = { {
92 .num_blocks = SERPENT_PARALLEL_BLOCKS,
93 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
94 }, {
95 .num_blocks = 1,
96 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
97 } }
98};
99
100static const struct common_glue_ctx serpent_ctr = {
101 .num_funcs = 2,
102 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
103
104 .funcs = { {
105 .num_blocks = SERPENT_PARALLEL_BLOCKS,
106 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
107 }, {
108 .num_blocks = 1,
109 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
110 } }
111};
112
113static const struct common_glue_ctx serpent_dec = {
114 .num_funcs = 2,
115 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
116
117 .funcs = { {
118 .num_blocks = SERPENT_PARALLEL_BLOCKS,
119 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
120 }, {
121 .num_blocks = 1,
122 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
123 } }
124};
125
126static const struct common_glue_ctx serpent_dec_cbc = {
127 .num_funcs = 2,
128 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
129
130 .funcs = { {
131 .num_blocks = SERPENT_PARALLEL_BLOCKS,
132 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
133 }, {
134 .num_blocks = 1,
135 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
136 } }
137};
138
139static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
140 struct scatterlist *src, unsigned int nbytes)
141{
142 return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
143}
144
145static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
146 struct scatterlist *src, unsigned int nbytes)
147{
148 return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
149}
150
151static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
152 struct scatterlist *src, unsigned int nbytes)
153{
154 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
155 dst, src, nbytes);
156}
157
158static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
159 struct scatterlist *src, unsigned int nbytes)
160{
161 return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
162 nbytes);
163}
164
165static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
166 struct scatterlist *src, unsigned int nbytes)
167{
168 return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
169}
170
171static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
172{
173 return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS,
174 NULL, fpu_enabled, nbytes);
175}
176
177static inline void serpent_fpu_end(bool fpu_enabled)
178{
179 glue_fpu_end(fpu_enabled);
180}
181
182struct crypt_priv {
183 struct serpent_ctx *ctx;
184 bool fpu_enabled;
185};
186
187static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
188{
189 const unsigned int bsize = SERPENT_BLOCK_SIZE;
190 struct crypt_priv *ctx = priv;
191 int i;
192
193 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
194
195 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
196 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
197 return;
198 }
199
200 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
201 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
202}
203
204static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
205{
206 const unsigned int bsize = SERPENT_BLOCK_SIZE;
207 struct crypt_priv *ctx = priv;
208 int i;
209
210 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
211
212 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
213 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
214 return;
215 }
216
217 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
218 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
219}
220
221struct serpent_lrw_ctx {
222 struct lrw_table_ctx lrw_table;
223 struct serpent_ctx serpent_ctx;
224};
225
226static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
227 unsigned int keylen)
228{
229 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
230 int err;
231
232 err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
233 SERPENT_BLOCK_SIZE);
234 if (err)
235 return err;
236
237 return lrw_init_table(&ctx->lrw_table, key + keylen -
238 SERPENT_BLOCK_SIZE);
239}
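
lrw_serpent_setkey() treats the supplied key as the serpent key followed by one block of LRW tweak material, which is why both key-size bounds for "lrw(serpent)" below are offset by SERPENT_BLOCK_SIZE; e.g. keylen = 48 means a 32-byte cipher key plus a 16-byte tweak key. A minimal sketch of the split (illustrative only):

#define SERPENT_BLOCK_SIZE 16

static void lrw_key_split(const unsigned char *key, unsigned int keylen,
			  const unsigned char **cipher_key,
			  unsigned int *cipher_keylen,
			  const unsigned char **tweak_key)
{
	*cipher_key = key;
	*cipher_keylen = keylen - SERPENT_BLOCK_SIZE;
	*tweak_key = key + keylen - SERPENT_BLOCK_SIZE;
}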
240
241static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
242 struct scatterlist *src, unsigned int nbytes)
243{
244 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
245 be128 buf[SERPENT_PARALLEL_BLOCKS];
246 struct crypt_priv crypt_ctx = {
247 .ctx = &ctx->serpent_ctx,
248 .fpu_enabled = false,
249 };
250 struct lrw_crypt_req req = {
251 .tbuf = buf,
252 .tbuflen = sizeof(buf),
253
254 .table_ctx = &ctx->lrw_table,
255 .crypt_ctx = &crypt_ctx,
256 .crypt_fn = encrypt_callback,
257 };
258 int ret;
259
260 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
261 ret = lrw_crypt(desc, dst, src, nbytes, &req);
262 serpent_fpu_end(crypt_ctx.fpu_enabled);
263
264 return ret;
265}
266
267static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
268 struct scatterlist *src, unsigned int nbytes)
269{
270 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
271 be128 buf[SERPENT_PARALLEL_BLOCKS];
272 struct crypt_priv crypt_ctx = {
273 .ctx = &ctx->serpent_ctx,
274 .fpu_enabled = false,
275 };
276 struct lrw_crypt_req req = {
277 .tbuf = buf,
278 .tbuflen = sizeof(buf),
279
280 .table_ctx = &ctx->lrw_table,
281 .crypt_ctx = &crypt_ctx,
282 .crypt_fn = decrypt_callback,
283 };
284 int ret;
285
286 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
287 ret = lrw_crypt(desc, dst, src, nbytes, &req);
288 serpent_fpu_end(crypt_ctx.fpu_enabled);
289
290 return ret;
291}
292
293static void lrw_exit_tfm(struct crypto_tfm *tfm)
294{
295 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
296
297 lrw_free_table(&ctx->lrw_table);
298}
299
300struct serpent_xts_ctx {
301 struct serpent_ctx tweak_ctx;
302 struct serpent_ctx crypt_ctx;
303};
304
305static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
306 unsigned int keylen)
307{
308 struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
309 u32 *flags = &tfm->crt_flags;
310 int err;
311
312 /* The key consists of two keys of equal size concatenated,
313 * therefore its total length must be even.
314 */
315 if (keylen % 2) {
316 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
317 return -EINVAL;
318 }
319
320 /* first half of xts-key is for crypt */
321 err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
322 if (err)
323 return err;
324
325 /* second half of xts-key is for tweak */
326 return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
327}
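
A sketch of the key layout this setkey enforces, for illustration: two serpent keys of equal size back to back, the first half keying the data cipher and the second half keying the tweak cipher, hence the keylen % 2 check.

static int xts_key_split(const unsigned char *key, unsigned int keylen,
			 const unsigned char **crypt_key,
			 const unsigned char **tweak_key)
{
	if (keylen % 2)
		return -1; /* -EINVAL in the kernel */

	*crypt_key = key;
	*tweak_key = key + keylen / 2;
	return 0;
}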
328
329static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
330 struct scatterlist *src, unsigned int nbytes)
331{
332 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
333 be128 buf[SERPENT_PARALLEL_BLOCKS];
334 struct crypt_priv crypt_ctx = {
335 .ctx = &ctx->crypt_ctx,
336 .fpu_enabled = false,
337 };
338 struct xts_crypt_req req = {
339 .tbuf = buf,
340 .tbuflen = sizeof(buf),
341
342 .tweak_ctx = &ctx->tweak_ctx,
343 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
344 .crypt_ctx = &crypt_ctx,
345 .crypt_fn = encrypt_callback,
346 };
347 int ret;
348
349 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
350 ret = xts_crypt(desc, dst, src, nbytes, &req);
351 serpent_fpu_end(crypt_ctx.fpu_enabled);
352
353 return ret;
354}
355
356static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
357 struct scatterlist *src, unsigned int nbytes)
358{
359 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
360 be128 buf[SERPENT_PARALLEL_BLOCKS];
361 struct crypt_priv crypt_ctx = {
362 .ctx = &ctx->crypt_ctx,
363 .fpu_enabled = false,
364 };
365 struct xts_crypt_req req = {
366 .tbuf = buf,
367 .tbuflen = sizeof(buf),
368
369 .tweak_ctx = &ctx->tweak_ctx,
370 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
371 .crypt_ctx = &crypt_ctx,
372 .crypt_fn = decrypt_callback,
373 };
374 int ret;
375
376 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
377 ret = xts_crypt(desc, dst, src, nbytes, &req);
378 serpent_fpu_end(crypt_ctx.fpu_enabled);
379
380 return ret;
381}
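
xts_crypt() encrypts the IV with tweak_ctx (via the XTS_TWEAK_CAST'd __serpent_encrypt above) to get the first tweak, then advances it once per block with a multiply-by-x in GF(2^128). The multiply itself lives in the generic crypto/xts.c and gf128mul code; an illustrative little-endian version of that doubling step, assuming the usual x^128 + x^7 + x^2 + x + 1 reduction polynomial, looks like:

#include <stdint.h>

static void xts_gf_double(uint8_t t[16])
{
	uint8_t carry = 0;
	int i;

	for (i = 0; i < 16; i++) {
		uint8_t next_carry = t[i] >> 7;

		t[i] = (uint8_t)((t[i] << 1) | carry);
		carry = next_carry;
	}
	if (carry)
		t[0] ^= 0x87;	/* fold x^128 back into the low byte */
}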
382
383static struct crypto_alg serpent_algs[10] = { {
384 .cra_name = "__ecb-serpent-avx",
385 .cra_driver_name = "__driver-ecb-serpent-avx",
386 .cra_priority = 0,
387 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
388 .cra_blocksize = SERPENT_BLOCK_SIZE,
389 .cra_ctxsize = sizeof(struct serpent_ctx),
390 .cra_alignmask = 0,
391 .cra_type = &crypto_blkcipher_type,
392 .cra_module = THIS_MODULE,
393 .cra_list = LIST_HEAD_INIT(serpent_algs[0].cra_list),
394 .cra_u = {
395 .blkcipher = {
396 .min_keysize = SERPENT_MIN_KEY_SIZE,
397 .max_keysize = SERPENT_MAX_KEY_SIZE,
398 .setkey = serpent_setkey,
399 .encrypt = ecb_encrypt,
400 .decrypt = ecb_decrypt,
401 },
402 },
403}, {
404 .cra_name = "__cbc-serpent-avx",
405 .cra_driver_name = "__driver-cbc-serpent-avx",
406 .cra_priority = 0,
407 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
408 .cra_blocksize = SERPENT_BLOCK_SIZE,
409 .cra_ctxsize = sizeof(struct serpent_ctx),
410 .cra_alignmask = 0,
411 .cra_type = &crypto_blkcipher_type,
412 .cra_module = THIS_MODULE,
413 .cra_list = LIST_HEAD_INIT(serpent_algs[1].cra_list),
414 .cra_u = {
415 .blkcipher = {
416 .min_keysize = SERPENT_MIN_KEY_SIZE,
417 .max_keysize = SERPENT_MAX_KEY_SIZE,
418 .setkey = serpent_setkey,
419 .encrypt = cbc_encrypt,
420 .decrypt = cbc_decrypt,
421 },
422 },
423}, {
424 .cra_name = "__ctr-serpent-avx",
425 .cra_driver_name = "__driver-ctr-serpent-avx",
426 .cra_priority = 0,
427 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
428 .cra_blocksize = 1,
429 .cra_ctxsize = sizeof(struct serpent_ctx),
430 .cra_alignmask = 0,
431 .cra_type = &crypto_blkcipher_type,
432 .cra_module = THIS_MODULE,
433 .cra_list = LIST_HEAD_INIT(serpent_algs[2].cra_list),
434 .cra_u = {
435 .blkcipher = {
436 .min_keysize = SERPENT_MIN_KEY_SIZE,
437 .max_keysize = SERPENT_MAX_KEY_SIZE,
438 .ivsize = SERPENT_BLOCK_SIZE,
439 .setkey = serpent_setkey,
440 .encrypt = ctr_crypt,
441 .decrypt = ctr_crypt,
442 },
443 },
444}, {
445 .cra_name = "__lrw-serpent-avx",
446 .cra_driver_name = "__driver-lrw-serpent-avx",
447 .cra_priority = 0,
448 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
449 .cra_blocksize = SERPENT_BLOCK_SIZE,
450 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
451 .cra_alignmask = 0,
452 .cra_type = &crypto_blkcipher_type,
453 .cra_module = THIS_MODULE,
454 .cra_list = LIST_HEAD_INIT(serpent_algs[3].cra_list),
455 .cra_exit = lrw_exit_tfm,
456 .cra_u = {
457 .blkcipher = {
458 .min_keysize = SERPENT_MIN_KEY_SIZE +
459 SERPENT_BLOCK_SIZE,
460 .max_keysize = SERPENT_MAX_KEY_SIZE +
461 SERPENT_BLOCK_SIZE,
462 .ivsize = SERPENT_BLOCK_SIZE,
463 .setkey = lrw_serpent_setkey,
464 .encrypt = lrw_encrypt,
465 .decrypt = lrw_decrypt,
466 },
467 },
468}, {
469 .cra_name = "__xts-serpent-avx",
470 .cra_driver_name = "__driver-xts-serpent-avx",
471 .cra_priority = 0,
472 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
473 .cra_blocksize = SERPENT_BLOCK_SIZE,
474 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
475 .cra_alignmask = 0,
476 .cra_type = &crypto_blkcipher_type,
477 .cra_module = THIS_MODULE,
478 .cra_list = LIST_HEAD_INIT(serpent_algs[4].cra_list),
479 .cra_u = {
480 .blkcipher = {
481 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
482 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
483 .ivsize = SERPENT_BLOCK_SIZE,
484 .setkey = xts_serpent_setkey,
485 .encrypt = xts_encrypt,
486 .decrypt = xts_decrypt,
487 },
488 },
489}, {
490 .cra_name = "ecb(serpent)",
491 .cra_driver_name = "ecb-serpent-avx",
492 .cra_priority = 500,
493 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
494 .cra_blocksize = SERPENT_BLOCK_SIZE,
495 .cra_ctxsize = sizeof(struct async_helper_ctx),
496 .cra_alignmask = 0,
497 .cra_type = &crypto_ablkcipher_type,
498 .cra_module = THIS_MODULE,
499 .cra_list = LIST_HEAD_INIT(serpent_algs[5].cra_list),
500 .cra_init = ablk_init,
501 .cra_exit = ablk_exit,
502 .cra_u = {
503 .ablkcipher = {
504 .min_keysize = SERPENT_MIN_KEY_SIZE,
505 .max_keysize = SERPENT_MAX_KEY_SIZE,
506 .setkey = ablk_set_key,
507 .encrypt = ablk_encrypt,
508 .decrypt = ablk_decrypt,
509 },
510 },
511}, {
512 .cra_name = "cbc(serpent)",
513 .cra_driver_name = "cbc-serpent-avx",
514 .cra_priority = 500,
515 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
516 .cra_blocksize = SERPENT_BLOCK_SIZE,
517 .cra_ctxsize = sizeof(struct async_helper_ctx),
518 .cra_alignmask = 0,
519 .cra_type = &crypto_ablkcipher_type,
520 .cra_module = THIS_MODULE,
521 .cra_list = LIST_HEAD_INIT(serpent_algs[6].cra_list),
522 .cra_init = ablk_init,
523 .cra_exit = ablk_exit,
524 .cra_u = {
525 .ablkcipher = {
526 .min_keysize = SERPENT_MIN_KEY_SIZE,
527 .max_keysize = SERPENT_MAX_KEY_SIZE,
528 .ivsize = SERPENT_BLOCK_SIZE,
529 .setkey = ablk_set_key,
530 .encrypt = __ablk_encrypt,
531 .decrypt = ablk_decrypt,
532 },
533 },
534}, {
535 .cra_name = "ctr(serpent)",
536 .cra_driver_name = "ctr-serpent-avx",
537 .cra_priority = 500,
538 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
539 .cra_blocksize = 1,
540 .cra_ctxsize = sizeof(struct async_helper_ctx),
541 .cra_alignmask = 0,
542 .cra_type = &crypto_ablkcipher_type,
543 .cra_module = THIS_MODULE,
544 .cra_list = LIST_HEAD_INIT(serpent_algs[7].cra_list),
545 .cra_init = ablk_init,
546 .cra_exit = ablk_exit,
547 .cra_u = {
548 .ablkcipher = {
549 .min_keysize = SERPENT_MIN_KEY_SIZE,
550 .max_keysize = SERPENT_MAX_KEY_SIZE,
551 .ivsize = SERPENT_BLOCK_SIZE,
552 .setkey = ablk_set_key,
553 .encrypt = ablk_encrypt,
554 .decrypt = ablk_encrypt,
555 .geniv = "chainiv",
556 },
557 },
558}, {
559 .cra_name = "lrw(serpent)",
560 .cra_driver_name = "lrw-serpent-avx",
561 .cra_priority = 500,
562 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
563 .cra_blocksize = SERPENT_BLOCK_SIZE,
564 .cra_ctxsize = sizeof(struct async_helper_ctx),
565 .cra_alignmask = 0,
566 .cra_type = &crypto_ablkcipher_type,
567 .cra_module = THIS_MODULE,
568 .cra_list = LIST_HEAD_INIT(serpent_algs[8].cra_list),
569 .cra_init = ablk_init,
570 .cra_exit = ablk_exit,
571 .cra_u = {
572 .ablkcipher = {
573 .min_keysize = SERPENT_MIN_KEY_SIZE +
574 SERPENT_BLOCK_SIZE,
575 .max_keysize = SERPENT_MAX_KEY_SIZE +
576 SERPENT_BLOCK_SIZE,
577 .ivsize = SERPENT_BLOCK_SIZE,
578 .setkey = ablk_set_key,
579 .encrypt = ablk_encrypt,
580 .decrypt = ablk_decrypt,
581 },
582 },
583}, {
584 .cra_name = "xts(serpent)",
585 .cra_driver_name = "xts-serpent-avx",
586 .cra_priority = 500,
587 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
588 .cra_blocksize = SERPENT_BLOCK_SIZE,
589 .cra_ctxsize = sizeof(struct async_helper_ctx),
590 .cra_alignmask = 0,
591 .cra_type = &crypto_ablkcipher_type,
592 .cra_module = THIS_MODULE,
593 .cra_list = LIST_HEAD_INIT(serpent_algs[9].cra_list),
594 .cra_init = ablk_init,
595 .cra_exit = ablk_exit,
596 .cra_u = {
597 .ablkcipher = {
598 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
599 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
600 .ivsize = SERPENT_BLOCK_SIZE,
601 .setkey = ablk_set_key,
602 .encrypt = ablk_encrypt,
603 .decrypt = ablk_decrypt,
604 },
605 },
606} };
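
The array registers each mode twice: an internal synchronous "__driver-*" blkcipher at priority 0, and the user-visible async wrapper at priority 500 that routes through cryptd when the FPU is unavailable (see the ablk_helper code this file now shares). On a kernel built with CONFIG_CRYPTO_USER_API_SKCIPHER, one way to exercise the resulting "ecb(serpent)" instance from userspace is AF_ALG; a minimal sketch, with error handling omitted:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_alg.h>

#ifndef SOL_ALG
#define SOL_ALG 279
#endif

int main(void)
{
	struct sockaddr_alg sa = {
		.salg_family = AF_ALG,
		.salg_type   = "skcipher",
		.salg_name   = "ecb(serpent)",
	};
	unsigned char key[32] = { 0 };		/* SERPENT_MAX_KEY_SIZE */
	unsigned char buf[16] = "0123456789abcde";	/* one block */
	char cbuf[CMSG_SPACE(sizeof(unsigned int))] = { 0 };
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;
	int tfmfd, opfd, i;

	tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
	bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa));
	setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, key, sizeof(key));
	opfd = accept(tfmfd, NULL, 0);

	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_ALG;
	cmsg->cmsg_type = ALG_SET_OP;
	cmsg->cmsg_len = CMSG_LEN(sizeof(unsigned int));
	*(unsigned int *)CMSG_DATA(cmsg) = ALG_OP_ENCRYPT;

	sendmsg(opfd, &msg, 0);
	read(opfd, buf, sizeof(buf));	/* ciphertext back */

	for (i = 0; i < 16; i++)
		printf("%02x", buf[i]);
	printf("\n");

	close(opfd);
	close(tfmfd);
	return 0;
}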
607
608static int __init serpent_init(void)
609{
610 u64 xcr0;
611
612 if (!cpu_has_avx || !cpu_has_osxsave) {
613 printk(KERN_INFO "AVX instructions are not detected.\n");
614 return -ENODEV;
615 }
616
617 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
618 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
619 printk(KERN_INFO "AVX detected but unusable.\n");
620 return -ENODEV;
621 }
622
623 return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
624}
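
The same probe can be done from userspace, for illustration: CPUID.1:ECX bit 27 (OSXSAVE) and bit 28 (AVX), then XGETBV(0) to verify the OS context-switches both SSE and YMM state (XSTATE_SSE | XSTATE_YMM = 0x6). A sketch using inline assembly; bit positions are from the Intel SDM, not from this patch:

#include <stdint.h>
#include <stdio.h>

static int avx_usable(void)
{
	uint32_t eax, ebx, ecx, edx, xcr0_lo, xcr0_hi;

	__asm__ volatile("cpuid"
			 : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
			 : "a"(1), "c"(0));
	if (!(ecx & (1u << 27)) || !(ecx & (1u << 28)))
		return 0;	/* no OSXSAVE or no AVX */

	__asm__ volatile("xgetbv" : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
	return (xcr0_lo & 0x6) == 0x6;	/* SSE and YMM state enabled */
}

int main(void)
{
	printf("AVX usable: %s\n", avx_usable() ? "yes" : "no");
	return 0;
}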
625
626static void __exit serpent_exit(void)
627{
628 crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
629}
630
631module_init(serpent_init);
632module_exit(serpent_exit);
633
634MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized");
635MODULE_LICENSE("GPL");
636MODULE_ALIAS("serpent");
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 4b21be85e0a..d679c8675f4 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -41,358 +41,145 @@
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
-#include <asm/i387.h>
-#include <asm/serpent.h>
-#include <crypto/scatterwalk.h>
-#include <linux/workqueue.h>
-#include <linux/spinlock.h>
-
-struct async_serpent_ctx {
-	struct cryptd_ablkcipher *cryptd_tfm;
-};
-
-static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
-{
-	if (fpu_enabled)
-		return true;
-
-	/* SSE2 is only used when chunk to be processed is large enough, so
-	 * do not enable FPU until it is necessary.
-	 */
-	if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
-		return false;
-
-	kernel_fpu_begin();
-	return true;
-}
-
-static inline void serpent_fpu_end(bool fpu_enabled)
-{
-	if (fpu_enabled)
-		kernel_fpu_end();
-}
-
-static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
-		     bool enc)
-{
-	bool fpu_enabled = false;
-	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = SERPENT_BLOCK_SIZE;
-	unsigned int nbytes;
-	int err;
-
-	err = blkcipher_walk_virt(desc, walk);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk->nbytes)) {
-		u8 *wsrc = walk->src.virt.addr;
-		u8 *wdst = walk->dst.virt.addr;
-
-		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
-
-		/* Process multi-block batch */
-		if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
-			do {
-				if (enc)
-					serpent_enc_blk_xway(ctx, wdst, wsrc);
-				else
-					serpent_dec_blk_xway(ctx, wdst, wsrc);
-
-				wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
-				wdst += bsize * SERPENT_PARALLEL_BLOCKS;
-				nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
-			} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Handle leftovers */
-		do {
-			if (enc)
-				__serpent_encrypt(ctx, wdst, wsrc);
-			else
-				__serpent_decrypt(ctx, wdst, wsrc);
-
-			wsrc += bsize;
-			wdst += bsize;
-			nbytes -= bsize;
-		} while (nbytes >= bsize);
-
-done:
-		err = blkcipher_walk_done(desc, walk, nbytes);
-	}
-
-	serpent_fpu_end(fpu_enabled);
-	return err;
-}
-
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, true);
-}
-
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, false);
-}
-
-static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = SERPENT_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 *iv = (u128 *)walk->iv;
-
-	do {
-		u128_xor(dst, src, iv);
-		__serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
-		iv = dst;
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
-	return nbytes;
-}
-
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
-
-	while ((nbytes = walk.nbytes)) {
-		nbytes = __cbc_encrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	return err;
-}
-
-static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = SERPENT_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
-	u128 last_iv;
-	int i;
-
-	/* Start of the last block. */
-	src += nbytes / bsize - 1;
-	dst += nbytes / bsize - 1;
-
-	last_iv = *src;
-
-	/* Process multi-block batch */
-	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
-		do {
-			nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
-			src -= SERPENT_PARALLEL_BLOCKS - 1;
-			dst -= SERPENT_PARALLEL_BLOCKS - 1;
-
-			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
-				ivs[i] = src[i];
-
-			serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
-				u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
-
-			u128_xor(dst, dst, src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	for (;;) {
-		__serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
-
-		nbytes -= bsize;
-		if (nbytes < bsize)
-			break;
-
-		u128_xor(dst, dst, src - 1);
-		src -= 1;
-		dst -= 1;
-	}
-
-done:
-	u128_xor(dst, dst, (u128 *)walk->iv);
-	*(u128 *)walk->iv = last_iv;
-
-	return nbytes;
-}
-
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	bool fpu_enabled = false;
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk.nbytes)) {
-		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
-		nbytes = __cbc_decrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	serpent_fpu_end(fpu_enabled);
-	return err;
-}
-
-static inline void u128_to_be128(be128 *dst, const u128 *src)
-{
-	dst->a = cpu_to_be64(src->a);
-	dst->b = cpu_to_be64(src->b);
-}
-
-static inline void be128_to_u128(u128 *dst, const be128 *src)
-{
-	dst->a = be64_to_cpu(src->a);
-	dst->b = be64_to_cpu(src->b);
-}
-
-static inline void u128_inc(u128 *i)
-{
-	i->b++;
-	if (!i->b)
-		i->a++;
-}
-
-static void ctr_crypt_final(struct blkcipher_desc *desc,
-			    struct blkcipher_walk *walk)
-{
-	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	u8 *ctrblk = walk->iv;
-	u8 keystream[SERPENT_BLOCK_SIZE];
-	u8 *src = walk->src.virt.addr;
-	u8 *dst = walk->dst.virt.addr;
-	unsigned int nbytes = walk->nbytes;
-
-	__serpent_encrypt(ctx, keystream, ctrblk);
-	crypto_xor(keystream, src, nbytes);
-	memcpy(dst, keystream, nbytes);
-
-	crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
-				struct blkcipher_walk *walk)
-{
-	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = SERPENT_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 ctrblk;
-	be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
-	int i;
-
-	be128_to_u128(&ctrblk, (be128 *)walk->iv);
-
-	/* Process multi-block batch */
-	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
-		do {
-			/* create ctrblks for parallel encrypt */
-			for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
-				if (dst != src)
-					dst[i] = src[i];
-
-				u128_to_be128(&ctrblocks[i], &ctrblk);
-				u128_inc(&ctrblk);
-			}
-
-			serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
-						 (u8 *)ctrblocks);
-
-			src += SERPENT_PARALLEL_BLOCKS;
-			dst += SERPENT_PARALLEL_BLOCKS;
-			nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
-		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	do {
-		if (dst != src)
-			*dst = *src;
-
-		u128_to_be128(&ctrblocks[0], &ctrblk);
-		u128_inc(&ctrblk);
-
-		__serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
-		u128_xor(dst, dst, (u128 *)ctrblocks);
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-done:
-	u128_to_be128((be128 *)walk->iv, &ctrblk);
-	return nbytes;
-}
-
-static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		     struct scatterlist *src, unsigned int nbytes)
-{
-	bool fpu_enabled = false;
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
-		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
-		nbytes = __ctr_crypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	serpent_fpu_end(fpu_enabled);
-
-	if (walk.nbytes) {
-		ctr_crypt_final(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, 0);
-	}
-
-	return err;
-}
+#include <asm/crypto/serpent-sse2.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
+{
+	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
+	unsigned int j;
+
+	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
+		ivs[j] = src[j];
+
+	serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
+		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
+}
+
+static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+{
+	be128 ctrblk;
+
+	u128_to_be128(&ctrblk, iv);
+	u128_inc(iv);
+
+	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+	u128_xor(dst, src, (u128 *)&ctrblk);
+}
+
+static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
+				   u128 *iv)
+{
+	be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
+	unsigned int i;
+
+	for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
+		if (dst != src)
+			dst[i] = src[i];
+
+		u128_to_be128(&ctrblks[i], iv);
+		u128_inc(iv);
+	}
+
+	serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
+}
+
+static const struct common_glue_ctx serpent_enc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = SERPENT_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = SERPENT_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = SERPENT_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = SERPENT_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS,
+			      NULL, fpu_enabled, nbytes);
+}
+
+static inline void serpent_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
 }
 
 struct crypt_priv {
@@ -596,106 +383,6 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ret;
 }
 
-static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
-			unsigned int key_len)
-{
-	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
-	int err;
-
-	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
-	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
-				    & CRYPTO_TFM_REQ_MASK);
-	err = crypto_ablkcipher_setkey(child, key, key_len);
-	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
-				    & CRYPTO_TFM_RES_MASK);
-	return err;
-}
-
-static int __ablk_encrypt(struct ablkcipher_request *req)
-{
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
-	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-	struct blkcipher_desc desc;
-
-	desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
-	desc.info = req->info;
-	desc.flags = 0;
-
-	return crypto_blkcipher_crt(desc.tfm)->encrypt(
-		&desc, req->dst, req->src, req->nbytes);
-}
-
-static int ablk_encrypt(struct ablkcipher_request *req)
-{
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
-	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-
-	if (!irq_fpu_usable()) {
-		struct ablkcipher_request *cryptd_req =
-			ablkcipher_request_ctx(req);
-
-		memcpy(cryptd_req, req, sizeof(*req));
-		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-
-		return crypto_ablkcipher_encrypt(cryptd_req);
-	} else {
-		return __ablk_encrypt(req);
-	}
-}
-
-static int ablk_decrypt(struct ablkcipher_request *req)
-{
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
-	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-
-	if (!irq_fpu_usable()) {
-		struct ablkcipher_request *cryptd_req =
-			ablkcipher_request_ctx(req);
-
-		memcpy(cryptd_req, req, sizeof(*req));
-		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-
-		return crypto_ablkcipher_decrypt(cryptd_req);
-	} else {
-		struct blkcipher_desc desc;
-
-		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
-		desc.info = req->info;
-		desc.flags = 0;
-
-		return crypto_blkcipher_crt(desc.tfm)->decrypt(
-			&desc, req->dst, req->src, req->nbytes);
-	}
-}
-
-static void ablk_exit(struct crypto_tfm *tfm)
-{
-	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
-
-	cryptd_free_ablkcipher(ctx->cryptd_tfm);
-}
-
-static int ablk_init(struct crypto_tfm *tfm)
-{
-	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct cryptd_ablkcipher *cryptd_tfm;
-	char drv_name[CRYPTO_MAX_ALG_NAME];
-
-	snprintf(drv_name, sizeof(drv_name), "__driver-%s",
-		 crypto_tfm_alg_driver_name(tfm));
-
-	cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-
-	ctx->cryptd_tfm = cryptd_tfm;
-	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
-		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
-
-	return 0;
-}
-
 static struct crypto_alg serpent_algs[10] = { {
 	.cra_name = "__ecb-serpent-sse2",
 	.cra_driver_name = "__driver-ecb-serpent-sse2",
@@ -808,7 +495,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_priority = 400,
 	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize = SERPENT_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct async_serpent_ctx),
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
 	.cra_alignmask = 0,
 	.cra_type = &crypto_ablkcipher_type,
 	.cra_module = THIS_MODULE,
@@ -830,7 +517,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_priority = 400,
 	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize = SERPENT_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct async_serpent_ctx),
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
 	.cra_alignmask = 0,
 	.cra_type = &crypto_ablkcipher_type,
 	.cra_module = THIS_MODULE,
@@ -853,7 +540,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_priority = 400,
 	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize = 1,
-	.cra_ctxsize = sizeof(struct async_serpent_ctx),
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
 	.cra_alignmask = 0,
 	.cra_type = &crypto_ablkcipher_type,
 	.cra_module = THIS_MODULE,
@@ -877,7 +564,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_priority = 400,
 	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize = SERPENT_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct async_serpent_ctx),
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
 	.cra_alignmask = 0,
 	.cra_type = &crypto_ablkcipher_type,
 	.cra_module = THIS_MODULE,
@@ -902,7 +589,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_priority = 400,
 	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize = SERPENT_BLOCK_SIZE,
-	.cra_ctxsize = sizeof(struct async_serpent_ctx),
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
 	.cra_alignmask = 0,
 	.cra_type = &crypto_ablkcipher_type,
 	.cra_module = THIS_MODULE,
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index b2c2f57d70e..49d6987a73d 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -468,7 +468,7 @@ W_PRECALC_SSSE3
  */
 SHA1_VECTOR_ASM sha1_transform_ssse3
 
-#ifdef SHA1_ENABLE_AVX_SUPPORT
+#ifdef CONFIG_AS_AVX
 
 .macro W_PRECALC_AVX
 
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index f916499d0ab..4a11a9d7245 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -35,7 +35,7 @@
 
 asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
 				     unsigned int rounds);
-#ifdef SHA1_ENABLE_AVX_SUPPORT
+#ifdef CONFIG_AS_AVX
 asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
 				   unsigned int rounds);
 #endif
@@ -184,7 +184,7 @@ static struct shash_alg alg = {
 	}
 };
 
-#ifdef SHA1_ENABLE_AVX_SUPPORT
+#ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
 	u64 xcr0;
@@ -209,7 +209,7 @@ static int __init sha1_ssse3_mod_init(void)
 	if (cpu_has_ssse3)
 		sha1_transform_asm = sha1_transform_ssse3;
 
-#ifdef SHA1_ENABLE_AVX_SUPPORT
+#ifdef CONFIG_AS_AVX
 	/* allow AVX to override SSSE3, it's a little faster */
 	if (avx_usable())
 		sha1_transform_asm = sha1_transform_avx;
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
new file mode 100644
index 00000000000..35f45574390
--- /dev/null
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -0,0 +1,300 @@
1/*
2 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * USA
21 *
22 */
23
24.file "twofish-avx-x86_64-asm_64.S"
25.text
26
27/* structure of crypto context */
28#define s0 0
29#define s1 1024
30#define s2 2048
31#define s3 3072
32#define w 4096
33#define k 4128
34
35/**********************************************************************
36 8-way AVX twofish
37 **********************************************************************/
38#define CTX %rdi
39
40#define RA1 %xmm0
41#define RB1 %xmm1
42#define RC1 %xmm2
43#define RD1 %xmm3
44
45#define RA2 %xmm4
46#define RB2 %xmm5
47#define RC2 %xmm6
48#define RD2 %xmm7
49
50#define RX %xmm8
51#define RY %xmm9
52
53#define RK1 %xmm10
54#define RK2 %xmm11
55
56#define RID1 %rax
57#define RID1b %al
58#define RID2 %rbx
59#define RID2b %bl
60
61#define RGI1 %rdx
62#define RGI1bl %dl
63#define RGI1bh %dh
64#define RGI2 %rcx
65#define RGI2bl %cl
66#define RGI2bh %ch
67
68#define RGS1 %r8
69#define RGS1d %r8d
70#define RGS2 %r9
71#define RGS2d %r9d
72#define RGS3 %r10
73#define RGS3d %r10d
74
75
76#define lookup_32bit(t0, t1, t2, t3, src, dst) \
77 movb src ## bl, RID1b; \
78 movb src ## bh, RID2b; \
79 movl t0(CTX, RID1, 4), dst ## d; \
80 xorl t1(CTX, RID2, 4), dst ## d; \
81 shrq $16, src; \
82 movb src ## bl, RID1b; \
83 movb src ## bh, RID2b; \
84 xorl t2(CTX, RID1, 4), dst ## d; \
85 xorl t3(CTX, RID2, 4), dst ## d;
86
87#define G(a, x, t0, t1, t2, t3) \
88 vmovq a, RGI1; \
89 vpsrldq $8, a, x; \
90 vmovq x, RGI2; \
91 \
92 lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
93 shrq $16, RGI1; \
94 lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
95 shlq $32, RGS2; \
96 orq RGS1, RGS2; \
97 \
98 lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
99 shrq $16, RGI2; \
100 lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
101 shlq $32, RGS3; \
102 orq RGS1, RGS3; \
103 \
104 vmovq RGS2, x; \
105 vpinsrq $1, RGS3, x, x;
106
107#define encround(a, b, c, d, x, y) \
108 G(a, x, s0, s1, s2, s3); \
109 G(b, y, s1, s2, s3, s0); \
110 vpaddd x, y, x; \
111 vpaddd y, x, y; \
112 vpaddd x, RK1, x; \
113 vpaddd y, RK2, y; \
114 vpxor x, c, c; \
115 vpsrld $1, c, x; \
116 vpslld $(32 - 1), c, c; \
117 vpor c, x, c; \
118 vpslld $1, d, x; \
119 vpsrld $(32 - 1), d, d; \
120 vpor d, x, d; \
121 vpxor d, y, d;
122
123#define decround(a, b, c, d, x, y) \
124 G(a, x, s0, s1, s2, s3); \
125 G(b, y, s1, s2, s3, s0); \
126 vpaddd x, y, x; \
127 vpaddd y, x, y; \
128 vpaddd y, RK2, y; \
129 vpxor d, y, d; \
130 vpsrld $1, d, y; \
131 vpslld $(32 - 1), d, d; \
132 vpor d, y, d; \
133 vpslld $1, c, y; \
134 vpsrld $(32 - 1), c, c; \
135 vpor c, y, c; \
136 vpaddd x, RK1, x; \
137 vpxor x, c, c;
138
139#define encrypt_round(n, a, b, c, d) \
140 vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
141 vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
142 encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
143 encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
144
145#define decrypt_round(n, a, b, c, d) \
146 vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
147 vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
148 decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
149 decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
150
151#define encrypt_cycle(n) \
152 encrypt_round((2*n), RA, RB, RC, RD); \
153 encrypt_round(((2*n) + 1), RC, RD, RA, RB);
154
155#define decrypt_cycle(n) \
156 decrypt_round(((2*n) + 1), RC, RD, RA, RB); \
157 decrypt_round((2*n), RA, RB, RC, RD);
158
159
160#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
161 vpunpckldq x1, x0, t0; \
162 vpunpckhdq x1, x0, t2; \
163 vpunpckldq x3, x2, t1; \
164 vpunpckhdq x3, x2, x3; \
165 \
166 vpunpcklqdq t1, t0, x0; \
167 vpunpckhqdq t1, t0, x1; \
168 vpunpcklqdq x3, t2, x2; \
169 vpunpckhqdq x3, t2, x3;
170
171#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
172 vpxor (0*4*4)(in), wkey, x0; \
173 vpxor (1*4*4)(in), wkey, x1; \
174 vpxor (2*4*4)(in), wkey, x2; \
175 vpxor (3*4*4)(in), wkey, x3; \
176 \
177 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
178
179#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
180 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
181 \
182 vpxor x0, wkey, x0; \
183 vmovdqu x0, (0*4*4)(out); \
184 vpxor x1, wkey, x1; \
185 vmovdqu x1, (1*4*4)(out); \
186 vpxor x2, wkey, x2; \
187 vmovdqu x2, (2*4*4)(out); \
188 vpxor x3, wkey, x3; \
189 vmovdqu x3, (3*4*4)(out);
190
191#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
192 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
193 \
194 vpxor x0, wkey, x0; \
195 vpxor (0*4*4)(out), x0, x0; \
196 vmovdqu x0, (0*4*4)(out); \
197 vpxor x1, wkey, x1; \
198 vpxor (1*4*4)(out), x1, x1; \
199 vmovdqu x1, (1*4*4)(out); \
200 vpxor x2, wkey, x2; \
201 vpxor (2*4*4)(out), x2, x2; \
202 vmovdqu x2, (2*4*4)(out); \
203 vpxor x3, wkey, x3; \
204 vpxor (3*4*4)(out), x3, x3; \
205 vmovdqu x3, (3*4*4)(out);
206
207.align 8
208.global __twofish_enc_blk_8way
209.type __twofish_enc_blk_8way,@function;
210
211__twofish_enc_blk_8way:
212 /* input:
213 * %rdi: ctx, CTX
214 * %rsi: dst
215 * %rdx: src
216 * %rcx: bool, if true: xor output
217 */
218
219 pushq %rbx;
220 pushq %rcx;
221
222 vmovdqu w(CTX), RK1;
223
224 leaq (4*4*4)(%rdx), %rax;
225 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
226 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
227
228 xorq RID1, RID1;
229 xorq RID2, RID2;
230
231 encrypt_cycle(0);
232 encrypt_cycle(1);
233 encrypt_cycle(2);
234 encrypt_cycle(3);
235 encrypt_cycle(4);
236 encrypt_cycle(5);
237 encrypt_cycle(6);
238 encrypt_cycle(7);
239
240 vmovdqu (w+4*4)(CTX), RK1;
241
242 popq %rcx;
243 popq %rbx;
244
245 leaq (4*4*4)(%rsi), %rax;
246
247 testb %cl, %cl;
248 jnz __enc_xor8;
249
250 outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
251 outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
252
253 ret;
254
255__enc_xor8:
256 outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
257 outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
258
259 ret;
260
261.align 8
262.global twofish_dec_blk_8way
263.type twofish_dec_blk_8way,@function;
264
265twofish_dec_blk_8way:
266 /* input:
267 * %rdi: ctx, CTX
268 * %rsi: dst
269 * %rdx: src
270 */
271
272 pushq %rbx;
273
274 vmovdqu (w+4*4)(CTX), RK1;
275
276 leaq (4*4*4)(%rdx), %rax;
277 inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
278 inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
279
280 xorq RID1, RID1;
281 xorq RID2, RID2;
282
283 decrypt_cycle(7);
284 decrypt_cycle(6);
285 decrypt_cycle(5);
286 decrypt_cycle(4);
287 decrypt_cycle(3);
288 decrypt_cycle(2);
289 decrypt_cycle(1);
290 decrypt_cycle(0);
291
292 vmovdqu (w)(CTX), RK1;
293
294 popq %rbx;
295
296 leaq (4*4*4)(%rsi), %rax;
297 outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
298 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
299
300 ret;
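
For readers not fluent in AT&T assembly, the lookup_32bit macro above is Twofish's key-dependent g-function lookup: the four bytes of a 32-bit word index four 1 KB tables from the crypto context (s0..s3 at the offsets defined at the top of the file, with the MDS matrix already folded in at key-schedule time), and the four results are XORed together. An equivalent C sketch:

#include <stdint.h>

static uint32_t lookup_32bit(const uint32_t s0[256], const uint32_t s1[256],
			     const uint32_t s2[256], const uint32_t s3[256],
			     uint32_t x)
{
	/* bl, bh, then the two bytes exposed after the shrq $16 */
	return s0[x & 0xff] ^
	       s1[(x >> 8) & 0xff] ^
	       s2[(x >> 16) & 0xff] ^
	       s3[(x >> 24) & 0xff];
}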
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
new file mode 100644
index 00000000000..782b67ddaf6
--- /dev/null
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -0,0 +1,624 @@
1/*
2 * Glue Code for AVX assembler version of Twofish Cipher
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * USA
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/hardirq.h>
26#include <linux/types.h>
27#include <linux/crypto.h>
28#include <linux/err.h>
29#include <crypto/algapi.h>
30#include <crypto/twofish.h>
31#include <crypto/cryptd.h>
32#include <crypto/b128ops.h>
33#include <crypto/ctr.h>
34#include <crypto/lrw.h>
35#include <crypto/xts.h>
36#include <asm/i387.h>
37#include <asm/xcr.h>
38#include <asm/xsave.h>
39#include <asm/crypto/twofish.h>
40#include <asm/crypto/ablk_helper.h>
41#include <asm/crypto/glue_helper.h>
42#include <crypto/scatterwalk.h>
43#include <linux/workqueue.h>
44#include <linux/spinlock.h>
45
46#define TWOFISH_PARALLEL_BLOCKS 8
47
48static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
49 const u8 *src)
50{
51 __twofish_enc_blk_3way(ctx, dst, src, false);
52}
53
54/* 8-way parallel cipher functions */
55asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst,
56 const u8 *src, bool xor);
57asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst,
58 const u8 *src);
59
60static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst,
61 const u8 *src)
62{
63 __twofish_enc_blk_8way(ctx, dst, src, false);
64}
65
66static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
67 const u8 *src)
68{
69 __twofish_enc_blk_8way(ctx, dst, src, true);
70}
71
72static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst,
73 const u8 *src)
74{
75 twofish_dec_blk_8way(ctx, dst, src);
76}
77
78static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
79{
80 u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
81 unsigned int j;
82
83 for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
84 ivs[j] = src[j];
85
86 twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
87
88 for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
89 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
90}
91
92static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
93 u128 *iv)
94{
95 be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
96 unsigned int i;
97
98 for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
99 if (dst != src)
100 dst[i] = src[i];
101
102 u128_to_be128(&ctrblks[i], iv);
103 u128_inc(iv);
104 }
105
106 twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
107}
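
The xway CTR helper leans on the _xor variant of the 8-way assembly: the plaintext is first copied into dst (when dst and src differ), the big-endian counter blocks are generated, and the assembly then XORs E_k(ctr_i) into dst in place, producing the CTR ciphertext in a single pass. A standalone model of that composition, where enc_xor() is a hypothetical stand-in for the assembly routine:

#include <stdint.h>
#include <string.h>
#include <stddef.h>

/* 128-bit counter split into two host-endian 64-bit halves, matching
 * the u128 helpers shown in the serpent_sse2_glue.c diff above. */
struct u128_ctr {
	uint64_t a;	/* high half */
	uint64_t b;	/* low half */
};

static void u128_ctr_inc(struct u128_ctr *i)
{
	i->b++;
	if (!i->b)
		i->a++;	/* carry into the high half */
}

static void ctr_blocks_model(void *ctx,
			     void (*enc_xor)(void *ctx, uint8_t *dst,
					     const uint8_t *ctrblks),
			     uint8_t *dst, const uint8_t *src,
			     size_t nblocks, struct u128_ctr *iv,
			     uint8_t *ctrblks /* 16 * nblocks bytes */)
{
	size_t i;
	int j;

	if (dst != src)
		memcpy(dst, src, nblocks * 16);

	for (i = 0; i < nblocks; i++) {
		/* big-endian serialization, as u128_to_be128() does */
		for (j = 0; j < 8; j++) {
			ctrblks[16 * i + j] =
				(uint8_t)(iv->a >> (56 - 8 * j));
			ctrblks[16 * i + 8 + j] =
				(uint8_t)(iv->b >> (56 - 8 * j));
		}
		u128_ctr_inc(iv);
	}

	enc_xor(ctx, dst, ctrblks);	/* dst ^= E_k(ctrblks) */
}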
108
109static const struct common_glue_ctx twofish_enc = {
110 .num_funcs = 3,
111 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
112
113 .funcs = { {
114 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
115 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) }
116 }, {
117 .num_blocks = 3,
118 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
119 }, {
120 .num_blocks = 1,
121 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
122 } }
123};
124
125static const struct common_glue_ctx twofish_ctr = {
126 .num_funcs = 3,
127 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
128
129 .funcs = { {
130 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
131 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) }
132 }, {
133 .num_blocks = 3,
134 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
135 }, {
136 .num_blocks = 1,
137 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
138 } }
139};
140
141static const struct common_glue_ctx twofish_dec = {
142 .num_funcs = 3,
143 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
144
145 .funcs = { {
146 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
147 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) }
148 }, {
149 .num_blocks = 3,
150 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
151 }, {
152 .num_blocks = 1,
153 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
154 } }
155};
156
157static const struct common_glue_ctx twofish_dec_cbc = {
158 .num_funcs = 3,
159 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
160
161 .funcs = { {
162 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
163 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) }
164 }, {
165 .num_blocks = 3,
166 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
167 }, {
168 .num_blocks = 1,
169 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
170 } }
171};
172
173static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
174 struct scatterlist *src, unsigned int nbytes)
175{
176 return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
177}
178
179static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
180 struct scatterlist *src, unsigned int nbytes)
181{
182 return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
183}
184
185static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
186 struct scatterlist *src, unsigned int nbytes)
187{
188 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
189 dst, src, nbytes);
190}
191
192static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
193 struct scatterlist *src, unsigned int nbytes)
194{
195 return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
196 nbytes);
197}
198
199static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
200 struct scatterlist *src, unsigned int nbytes)
201{
202 return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
203}
204
205static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
206{
207 return glue_fpu_begin(TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS, NULL,
208 fpu_enabled, nbytes);
209}
210
211static inline void twofish_fpu_end(bool fpu_enabled)
212{
213 glue_fpu_end(fpu_enabled);
214}
215
216struct crypt_priv {
217 struct twofish_ctx *ctx;
218 bool fpu_enabled;
219};
220
221static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
222{
223 const unsigned int bsize = TF_BLOCK_SIZE;
224 struct crypt_priv *ctx = priv;
225 int i;
226
227 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
228
229 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
230 twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst);
231 return;
232 }
233
234 for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
235 twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
236
237 nbytes %= bsize * 3;
238
239 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
240 twofish_enc_blk(ctx->ctx, srcdst, srcdst);
241}
242
243static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
244{
245 const unsigned int bsize = TF_BLOCK_SIZE;
246 struct crypt_priv *ctx = priv;
247 int i;
248
249 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
250
251 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
252 twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst);
253 return;
254 }
255
256 for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
257 twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
258
259 nbytes %= bsize * 3;
260
261 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
262 twofish_dec_blk(ctx->ctx, srcdst, srcdst);
263}
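
Unlike the serpent callbacks earlier, these consume the tail in 3-block chunks via the twofish-x86_64-3way routines before falling back to single blocks. The control flow reduces to the following sketch, where blk8/blk3/blk1 stand in for the real twofish block helpers:

static void tail_model(unsigned int nbytes,
		       void (*blk8)(void), void (*blk3)(void),
		       void (*blk1)(void))
{
	const unsigned int bsize = 16;	/* TF_BLOCK_SIZE */
	unsigned int i;

	if (nbytes == bsize * 8) {
		blk8();			/* full AVX batch */
		return;
	}

	for (i = 0; i < nbytes / (bsize * 3); i++)
		blk3();			/* 3-way chunks */
	nbytes %= bsize * 3;

	for (i = 0; i < nbytes / bsize; i++)
		blk1();			/* remaining single blocks */
}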
264
265static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
266 struct scatterlist *src, unsigned int nbytes)
267{
268 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
269 be128 buf[TWOFISH_PARALLEL_BLOCKS];
270 struct crypt_priv crypt_ctx = {
271 .ctx = &ctx->twofish_ctx,
272 .fpu_enabled = false,
273 };
274 struct lrw_crypt_req req = {
275 .tbuf = buf,
276 .tbuflen = sizeof(buf),
277
278 .table_ctx = &ctx->lrw_table,
279 .crypt_ctx = &crypt_ctx,
280 .crypt_fn = encrypt_callback,
281 };
282 int ret;
283
284 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
285 ret = lrw_crypt(desc, dst, src, nbytes, &req);
286 twofish_fpu_end(crypt_ctx.fpu_enabled);
287
288 return ret;
289}
290
291static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
292 struct scatterlist *src, unsigned int nbytes)
293{
294 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
295 be128 buf[TWOFISH_PARALLEL_BLOCKS];
296 struct crypt_priv crypt_ctx = {
297 .ctx = &ctx->twofish_ctx,
298 .fpu_enabled = false,
299 };
300 struct lrw_crypt_req req = {
301 .tbuf = buf,
302 .tbuflen = sizeof(buf),
303
304 .table_ctx = &ctx->lrw_table,
305 .crypt_ctx = &crypt_ctx,
306 .crypt_fn = decrypt_callback,
307 };
308 int ret;
309
310 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
311 ret = lrw_crypt(desc, dst, src, nbytes, &req);
312 twofish_fpu_end(crypt_ctx.fpu_enabled);
313
314 return ret;
315}
316
317static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
318 struct scatterlist *src, unsigned int nbytes)
319{
320 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
321 be128 buf[TWOFISH_PARALLEL_BLOCKS];
322 struct crypt_priv crypt_ctx = {
323 .ctx = &ctx->crypt_ctx,
324 .fpu_enabled = false,
325 };
326 struct xts_crypt_req req = {
327 .tbuf = buf,
328 .tbuflen = sizeof(buf),
329
330 .tweak_ctx = &ctx->tweak_ctx,
331 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
332 .crypt_ctx = &crypt_ctx,
333 .crypt_fn = encrypt_callback,
334 };
335 int ret;
336
337 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
338 ret = xts_crypt(desc, dst, src, nbytes, &req);
339 twofish_fpu_end(crypt_ctx.fpu_enabled);
340
341 return ret;
342}
343
344static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
345 struct scatterlist *src, unsigned int nbytes)
346{
347 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
348 be128 buf[TWOFISH_PARALLEL_BLOCKS];
349 struct crypt_priv crypt_ctx = {
350 .ctx = &ctx->crypt_ctx,
351 .fpu_enabled = false,
352 };
353 struct xts_crypt_req req = {
354 .tbuf = buf,
355 .tbuflen = sizeof(buf),
356
357 .tweak_ctx = &ctx->tweak_ctx,
358 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
359 .crypt_ctx = &crypt_ctx,
360 .crypt_fn = decrypt_callback,
361 };
362 int ret;
363
364 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
365 ret = xts_crypt(desc, dst, src, nbytes, &req);
366 twofish_fpu_end(crypt_ctx.fpu_enabled);
367
368 return ret;
369}
370
371static struct crypto_alg twofish_algs[10] = { {
372 .cra_name = "__ecb-twofish-avx",
+	.cra_driver_name	= "__driver-ecb-twofish-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[0].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-twofish-avx",
+	.cra_driver_name	= "__driver-cbc-twofish-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[1].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-twofish-avx",
+	.cra_driver_name	= "__driver-ctr-twofish-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[2].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-twofish-avx",
+	.cra_driver_name	= "__driver-lrw-twofish-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[3].cra_list),
+	.cra_exit		= lrw_twofish_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= lrw_twofish_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-twofish-avx",
+	.cra_driver_name	= "__driver-xts-twofish-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[4].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE * 2,
+			.max_keysize	= TF_MAX_KEY_SIZE * 2,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= xts_twofish_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(twofish)",
+	.cra_driver_name	= "ecb-twofish-avx",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[5].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(twofish)",
+	.cra_driver_name	= "cbc-twofish-avx",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[6].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(twofish)",
+	.cra_driver_name	= "ctr-twofish-avx",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[7].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(twofish)",
+	.cra_driver_name	= "lrw-twofish-avx",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[8].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(twofish)",
+	.cra_driver_name	= "xts-twofish-avx",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[9].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE * 2,
+			.max_keysize	= TF_MAX_KEY_SIZE * 2,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init twofish_init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_osxsave) {
+		printk(KERN_INFO "AVX instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		printk(KERN_INFO "AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(twofish_algs, ARRAY_SIZE(twofish_algs));
+}
+
+static void __exit twofish_exit(void)
+{
+	crypto_unregister_algs(twofish_algs, ARRAY_SIZE(twofish_algs));
+}
+
+module_init(twofish_init);
+module_exit(twofish_exit);
+
+MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("twofish");
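[Editor's note: a minimal sketch of how kernel code might drive the "ctr(twofish)" instance registered above. The function and buffer names are illustrative only; an asynchronous provider may also return -EINPROGRESS, which a real caller must wait on via a completion callback.]

	#include <linux/crypto.h>
	#include <linux/scatterlist.h>
	#include <linux/err.h>
	#include <crypto/twofish.h>

	/* Sketch only: "data"/"key"/"iv" are hypothetical caller buffers. */
	static int twofish_ctr_example(void *data, unsigned int len,
				       const u8 *key, unsigned int keylen,
				       u8 iv[TF_BLOCK_SIZE])
	{
		struct crypto_ablkcipher *tfm;
		struct ablkcipher_request *req;
		struct scatterlist sg;
		int err;

		/* Resolves to the highest-priority "ctr(twofish)" provider;
		 * with AVX usable that is ctr-twofish-avx (cra_priority 400). */
		tfm = crypto_alloc_ablkcipher("ctr(twofish)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_ablkcipher_setkey(tfm, key, keylen);
		if (err)
			goto out_free_tfm;

		req = ablkcipher_request_alloc(tfm, GFP_KERNEL);
		if (!req) {
			err = -ENOMEM;
			goto out_free_tfm;
		}

		sg_init_one(&sg, data, len);
		ablkcipher_request_set_crypt(req, &sg, &sg, len, iv);

		/* In-place CTR encryption; CTR decryption is the same call. */
		err = crypto_ablkcipher_encrypt(req);

		ablkcipher_request_free(req);
	out_free_tfm:
		crypto_free_ablkcipher(tfm);
		return err;
	}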
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 922ab24cce3..15f9347316c 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -3,11 +3,6 @@
  *
  * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  *
- * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
- *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -33,20 +28,13 @@
 #include <crypto/algapi.h>
 #include <crypto/twofish.h>
 #include <crypto/b128ops.h>
+#include <asm/crypto/twofish.h>
+#include <asm/crypto/glue_helper.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
 
-/* regular block cipher functions from twofish_x86_64 module */
-asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
-				const u8 *src);
-asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
-				const u8 *src);
-
-/* 3-way parallel cipher functions */
-asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
-				       const u8 *src, bool xor);
-asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src);
+EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way);
+EXPORT_SYMBOL_GPL(twofish_dec_blk_3way);
 
 static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 					const u8 *src)
@@ -60,311 +48,139 @@ static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst,
 	__twofish_enc_blk_3way(ctx, dst, src, true);
 }
 
-static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
-		     void (*fn)(struct twofish_ctx *, u8 *, const u8 *),
-		     void (*fn_3way)(struct twofish_ctx *, u8 *, const u8 *))
-{
-	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = TF_BLOCK_SIZE;
-	unsigned int nbytes;
-	int err;
-
-	err = blkcipher_walk_virt(desc, walk);
-
-	while ((nbytes = walk->nbytes)) {
-		u8 *wsrc = walk->src.virt.addr;
-		u8 *wdst = walk->dst.virt.addr;
-
-		/* Process three block batch */
-		if (nbytes >= bsize * 3) {
-			do {
-				fn_3way(ctx, wdst, wsrc);
-
-				wsrc += bsize * 3;
-				wdst += bsize * 3;
-				nbytes -= bsize * 3;
-			} while (nbytes >= bsize * 3);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Handle leftovers */
-		do {
-			fn(ctx, wdst, wsrc);
-
-			wsrc += bsize;
-			wdst += bsize;
-			nbytes -= bsize;
-		} while (nbytes >= bsize);
-
-done:
-		err = blkcipher_walk_done(desc, walk, nbytes);
-	}
-
-	return err;
-}
-
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
+void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
 {
-	struct blkcipher_walk walk;
+	u128 ivs[2];
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, twofish_enc_blk, twofish_enc_blk_3way);
-}
+	ivs[0] = src[0];
+	ivs[1] = src[1];
 
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
+	twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, twofish_dec_blk, twofish_dec_blk_3way);
-}
+	u128_xor(&dst[1], &dst[1], &ivs[0]);
+	u128_xor(&dst[2], &dst[2], &ivs[1]);
 }
+EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
 
-static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = TF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 *iv = (u128 *)walk->iv;
-
-	do {
-		u128_xor(dst, src, iv);
-		twofish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
-		iv = dst;
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
-	return nbytes;
-}
-
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
+void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
 {
-	struct blkcipher_walk walk;
-	int err;
+	be128 ctrblk;
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
+	if (dst != src)
+		*dst = *src;
 
-	while ((nbytes = walk.nbytes)) {
-		nbytes = __cbc_encrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
+	u128_to_be128(&ctrblk, iv);
+	u128_inc(iv);
 
-	return err;
+	twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+	u128_xor(dst, dst, (u128 *)&ctrblk);
 }
+EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
 
-static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
+void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
+			      u128 *iv)
 {
-	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = TF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 ivs[3 - 1];
-	u128 last_iv;
-
-	/* Start of the last block. */
-	src += nbytes / bsize - 1;
-	dst += nbytes / bsize - 1;
-
-	last_iv = *src;
-
-	/* Process three block batch */
-	if (nbytes >= bsize * 3) {
-		do {
-			nbytes -= bsize * (3 - 1);
-			src -= 3 - 1;
-			dst -= 3 - 1;
-
-			ivs[0] = src[0];
-			ivs[1] = src[1];
-
-			twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
-
-			u128_xor(dst + 1, dst + 1, ivs + 0);
-			u128_xor(dst + 2, dst + 2, ivs + 1);
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
-
-			u128_xor(dst, dst, src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * 3);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	for (;;) {
-		twofish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
-
-		nbytes -= bsize;
-		if (nbytes < bsize)
-			break;
+	be128 ctrblks[3];
 
-		u128_xor(dst, dst, src - 1);
-		src -= 1;
-		dst -= 1;
+	if (dst != src) {
+		dst[0] = src[0];
+		dst[1] = src[1];
+		dst[2] = src[2];
 	}
 
-done:
-	u128_xor(dst, dst, (u128 *)walk->iv);
-	*(u128 *)walk->iv = last_iv;
+	u128_to_be128(&ctrblks[0], iv);
+	u128_inc(iv);
+	u128_to_be128(&ctrblks[1], iv);
+	u128_inc(iv);
+	u128_to_be128(&ctrblks[2], iv);
+	u128_inc(iv);
 
-	return nbytes;
+	twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
 }
+EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way);
+
+static const struct common_glue_ctx twofish_enc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = -1,
+
+	.funcs = { {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
+	} }
+};
 
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
+static const struct common_glue_ctx twofish_ctr = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = -1,
+
+	.funcs = { {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr) }
+	} }
+};
 
-	while ((nbytes = walk.nbytes)) {
-		nbytes = __cbc_decrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
+static const struct common_glue_ctx twofish_dec = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = -1,
+
+	.funcs = { {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
+	} }
+};
 
-	return err;
-}
+static const struct common_glue_ctx twofish_dec_cbc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = -1,
+
+	.funcs = { {
+		.num_blocks = 3,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
+	} }
+};
 
-static inline void u128_to_be128(be128 *dst, const u128 *src)
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	dst->a = cpu_to_be64(src->a);
-	dst->b = cpu_to_be64(src->b);
+	return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
 }
 
-static inline void be128_to_u128(u128 *dst, const be128 *src)
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	dst->a = be64_to_cpu(src->a);
-	dst->b = be64_to_cpu(src->b);
+	return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
 }
 
-static inline void u128_inc(u128 *i)
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	i->b++;
-	if (!i->b)
-		i->a++;
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
+				       dst, src, nbytes);
 }
 
-static void ctr_crypt_final(struct blkcipher_desc *desc,
-			    struct blkcipher_walk *walk)
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	u8 *ctrblk = walk->iv;
-	u8 keystream[TF_BLOCK_SIZE];
-	u8 *src = walk->src.virt.addr;
-	u8 *dst = walk->dst.virt.addr;
-	unsigned int nbytes = walk->nbytes;
-
-	twofish_enc_blk(ctx, keystream, ctrblk);
-	crypto_xor(keystream, src, nbytes);
-	memcpy(dst, keystream, nbytes);
-
-	crypto_inc(ctrblk, TF_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
-				struct blkcipher_walk *walk)
-{
-	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = TF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 ctrblk;
-	be128 ctrblocks[3];
-
-	be128_to_u128(&ctrblk, (be128 *)walk->iv);
-
-	/* Process three block batch */
-	if (nbytes >= bsize * 3) {
-		do {
-			if (dst != src) {
-				dst[0] = src[0];
-				dst[1] = src[1];
-				dst[2] = src[2];
-			}
-
-			/* create ctrblks for parallel encrypt */
-			u128_to_be128(&ctrblocks[0], &ctrblk);
-			u128_inc(&ctrblk);
-			u128_to_be128(&ctrblocks[1], &ctrblk);
-			u128_inc(&ctrblk);
-			u128_to_be128(&ctrblocks[2], &ctrblk);
-			u128_inc(&ctrblk);
-
-			twofish_enc_blk_xor_3way(ctx, (u8 *)dst,
-						 (u8 *)ctrblocks);
-
-			src += 3;
-			dst += 3;
-			nbytes -= bsize * 3;
-		} while (nbytes >= bsize * 3);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	do {
-		if (dst != src)
-			*dst = *src;
-
-		u128_to_be128(&ctrblocks[0], &ctrblk);
-		u128_inc(&ctrblk);
-
-		twofish_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
-		u128_xor(dst, dst, (u128 *)ctrblocks);
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-done:
-	u128_to_be128((be128 *)walk->iv, &ctrblk);
-	return nbytes;
+	return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
+				       nbytes);
 }
 
 static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		     struct scatterlist *src, unsigned int nbytes)
 {
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, TF_BLOCK_SIZE);
-
-	while ((nbytes = walk.nbytes) >= TF_BLOCK_SIZE) {
-		nbytes = __ctr_crypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	if (walk.nbytes) {
-		ctr_crypt_final(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, 0);
-	}
-
-	return err;
+	return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
 }
 
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
@@ -397,13 +213,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	twofish_dec_blk(ctx, srcdst, srcdst);
 }
 
-struct twofish_lrw_ctx {
-	struct lrw_table_ctx lrw_table;
-	struct twofish_ctx twofish_ctx;
-};
-
-static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
-			      unsigned int keylen)
+int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+		       unsigned int keylen)
 {
 	struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 	int err;
@@ -415,6 +226,7 @@ static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
 
 	return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE);
 }
+EXPORT_SYMBOL_GPL(lrw_twofish_setkey);
 
 static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
@@ -450,20 +262,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return lrw_crypt(desc, dst, src, nbytes, &req);
 }
 
-static void lrw_exit_tfm(struct crypto_tfm *tfm)
+void lrw_twofish_exit_tfm(struct crypto_tfm *tfm)
 {
 	struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	lrw_free_table(&ctx->lrw_table);
 }
+EXPORT_SYMBOL_GPL(lrw_twofish_exit_tfm);
 
-struct twofish_xts_ctx {
-	struct twofish_ctx tweak_ctx;
-	struct twofish_ctx crypt_ctx;
-};
-
-static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
-			      unsigned int keylen)
+int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+		       unsigned int keylen)
 {
 	struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm);
 	u32 *flags = &tfm->crt_flags;
@@ -486,6 +294,7 @@ static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
 	return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
 				flags);
 }
+EXPORT_SYMBOL_GPL(xts_twofish_setkey);
 
 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
@@ -596,7 +405,7 @@ static struct crypto_alg tf_algs[5] = { {
 	.cra_type		= &crypto_blkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_list		= LIST_HEAD_INIT(tf_algs[3].cra_list),
-	.cra_exit		= lrw_exit_tfm,
+	.cra_exit		= lrw_twofish_exit_tfm,
 	.cra_u = {
 		.blkcipher = {
 			.min_keysize	= TF_MIN_KEY_SIZE + TF_BLOCK_SIZE,
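[Editor's note: the new CTR helpers above treat the IV as a 128-bit big-endian counter held in host-endian 64-bit halves, converted per block with u128_to_be128() and stepped with u128_inc(). A small standalone sketch of the carry behaviour the increment relies on:]

	#include <stdint.h>
	#include <stdio.h>

	/* Mirrors the kernel's u128: two host-endian halves, a = high. */
	struct u128 { uint64_t a, b; };

	static void u128_inc(struct u128 *i)
	{
		i->b++;
		if (!i->b)	/* low half wrapped, carry into high half */
			i->a++;
	}

	int main(void)
	{
		struct u128 ctr = { .a = 0, .b = 0xffffffffffffffffULL };

		u128_inc(&ctr);	/* carry propagates: a == 1, b == 0 */
		printf("%016llx%016llx\n",
		       (unsigned long long)ctr.a, (unsigned long long)ctr.b);
		return 0;
	}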
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 98bd70faccc..673ac9b63d6 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -38,7 +38,7 @@
 int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
 {
 	int err = 0;
-	bool ia32 = is_ia32_task();
+	bool ia32 = test_thread_flag(TIF_IA32);
 
 	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
 		return -EFAULT;
@@ -273,7 +273,6 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs)
 		      sizeof(frame->extramask))))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
@@ -299,7 +298,6 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 610001d385d..0c44630d178 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -29,7 +29,7 @@
 #include <asm/processor.h>
 #include <asm/mmu.h>
 #include <asm/mpspec.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
 
 #define COMPILER_DEPENDENT_INT64   long long
 #define COMPILER_DEPENDENT_UINT64  unsigned long long
@@ -117,11 +117,8 @@ static inline void acpi_disable_pci(void)
 /* Low-level suspend routine. */
 extern int acpi_suspend_lowlevel(void);
 
-extern const unsigned char acpi_wakeup_code[];
-#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code)))
-
-/* early initialization routine */
-extern void acpi_reserve_wakeup_memory(void);
+/* Physical address to resume after wakeup */
+#define acpi_wakeup_address ((unsigned long)(real_mode_header->wakeup_start))
 
 /*
  * Check if the CPU can handle C2 and deeper
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 49331bedc15..70780689599 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -75,23 +75,54 @@ static inline int alternatives_text_reserved(void *start, void *end)
 }
 #endif	/* CONFIG_SMP */
 
+#define OLDINSTR(oldinstr)	"661:\n\t" oldinstr "\n662:\n"
+
+#define b_replacement(number)	"663"#number
+#define e_replacement(number)	"664"#number
+
+#define alt_slen "662b-661b"
+#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
+
+#define ALTINSTR_ENTRY(feature, number)					      \
+	" .long 661b - .\n"				/* label           */ \
+	" .long " b_replacement(number)"f - .\n"	/* new instruction */ \
+	" .word " __stringify(feature) "\n"		/* feature bit     */ \
+	" .byte " alt_slen "\n"				/* source len      */ \
+	" .byte " alt_rlen(number) "\n"			/* replacement len */
+
+#define DISCARD_ENTRY(number)				/* rlen <= slen */    \
+	" .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
+
+#define ALTINSTR_REPLACEMENT(newinstr, feature, number)	/* replacement */     \
+	b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
+
 /* alternative assembly primitive: */
 #define ALTERNATIVE(oldinstr, newinstr, feature)			\
-									\
-	"661:\n\t" oldinstr "\n662:\n"					\
-	".section .altinstructions,\"a\"\n"				\
-	" .long 661b - .\n"			/* label           */	\
-	" .long 663f - .\n"			/* new instruction */	\
-	" .word " __stringify(feature) "\n"	/* feature bit     */	\
-	" .byte 662b-661b\n"			/* sourcelen       */	\
-	" .byte 664f-663f\n"			/* replacementlen  */	\
-	".previous\n"							\
-	".section .discard,\"aw\",@progbits\n"				\
-	" .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */	\
-	".previous\n"							\
-	".section .altinstr_replacement, \"ax\"\n"			\
-	"663:\n\t" newinstr "\n664:\n"		/* replacement     */	\
-	".previous"
+	OLDINSTR(oldinstr)						\
+	".section .altinstructions,\"a\"\n"				\
+	ALTINSTR_ENTRY(feature, 1)					\
+	".previous\n"							\
+	".section .discard,\"aw\",@progbits\n"				\
+	DISCARD_ENTRY(1)						\
+	".previous\n"							\
+	".section .altinstr_replacement, \"ax\"\n"			\
+	ALTINSTR_REPLACEMENT(newinstr, feature, 1)			\
+	".previous"
+
+#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
+	OLDINSTR(oldinstr)						\
+	".section .altinstructions,\"a\"\n"				\
+	ALTINSTR_ENTRY(feature1, 1)					\
+	ALTINSTR_ENTRY(feature2, 2)					\
+	".previous\n"							\
+	".section .discard,\"aw\",@progbits\n"				\
+	DISCARD_ENTRY(1)						\
+	DISCARD_ENTRY(2)						\
+	".previous\n"							\
+	".section .altinstr_replacement, \"ax\"\n"			\
+	ALTINSTR_REPLACEMENT(newinstr1, feature1, 1)			\
+	ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)			\
+	".previous"
 
 /*
  * This must be included *after* the definition of ALTERNATIVE due to
@@ -140,6 +171,19 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	: output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
 
 /*
+ * Like alternative_call, but there are two features and respective functions.
+ * If CPU has feature2, function2 is used.
+ * Otherwise, if CPU has feature1, function1 is used.
+ * Otherwise, old function is used.
+ */
+#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2,   \
+			   output, input...)				      \
+	asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
+		"call %P[new2]", feature2)				      \
+		: output : [old] "i" (oldfunc), [new1] "i" (newfunc1),	      \
+		[new2] "i" (newfunc2), ## input)
+
+/*
  * use this macro(s) if you need more than one output parameter
  * in alternative_io
  */
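[Editor's note: a usage sketch for the new alternative_call_2(). The three memop_* routines are hypothetical; the constraint pattern mirrors the kernel's copy_user_generic(), which selects between an ERMS routine, a REP_GOOD routine, and an unrolled fallback.]

	#include <asm/alternative.h>
	#include <asm/cpufeature.h>

	extern long memop_unrolled(void *to, const void *from, unsigned len);
	extern long memop_string(void *to, const void *from, unsigned len);
	extern long memop_erms(void *to, const void *from, unsigned len);

	static inline long memop(void *to, const void *from, unsigned len)
	{
		long ret;

		/* Patched once at boot: the ERMS variant wins if the CPU has
		 * it, else the REP_GOOD variant, else the unrolled default. */
		alternative_call_2(memop_unrolled,
				   memop_string, X86_FEATURE_REP_GOOD,
				   memop_erms, X86_FEATURE_ERMS,
				   ASM_OUTPUT2("=a" (ret), "=D" (to),
					       "=S" (from), "=d" (len)),
				   "1" (to), "2" (from), "3" (len)
				   : "memory", "rcx", "r8", "r9", "r10", "r11");
		return ret;
	}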
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 49ad773f4b9..b3341e9cd8f 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -26,10 +26,31 @@ struct amd_l3_cache {
 	u8	subcaches[4];
 };
 
+struct threshold_block {
+	unsigned int		block;
+	unsigned int		bank;
+	unsigned int		cpu;
+	u32			address;
+	u16			interrupt_enable;
+	bool			interrupt_capable;
+	u16			threshold_limit;
+	struct kobject		kobj;
+	struct list_head	miscj;
+};
+
+struct threshold_bank {
+	struct kobject		*kobj;
+	struct threshold_block	*blocks;
+
+	/* initialized to the number of CPUs on the node sharing this bank */
+	atomic_t		cpus;
+};
+
 struct amd_northbridge {
 	struct pci_dev *misc;
 	struct pci_dev *link;
 	struct amd_l3_cache l3_cache;
+	struct threshold_bank *bank4;
 };
 
 struct amd_northbridge_info {
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index eaff4790ed9..3ea51a84a0e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -306,7 +306,8 @@ struct apic {
 	unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
 	unsigned long (*check_apicid_present)(int apicid);
 
-	void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
+	void (*vector_allocation_domain)(int cpu, struct cpumask *retmask,
+					 const struct cpumask *mask);
 	void (*init_apic_ldr)(void);
 
 	void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
@@ -331,9 +332,9 @@ struct apic {
 	unsigned long (*set_apic_id)(unsigned int id);
 	unsigned long apic_id_mask;
 
-	unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
-	unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
-					       const struct cpumask *andmask);
+	int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
+				      const struct cpumask *andmask,
+				      unsigned int *apicid);
 
 	/* ipi */
 	void (*send_IPI_mask)(const struct cpumask *mask, int vector);
@@ -464,6 +465,8 @@ static inline u32 safe_apic_wait_icr_idle(void)
 	return apic->safe_wait_icr_idle();
 }
 
+extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v));
+
 #else /* CONFIG_X86_LOCAL_APIC */
 
 static inline u32 apic_read(u32 reg) { return 0; }
@@ -473,6 +476,7 @@ static inline u64 apic_icr_read(void) { return 0; }
 static inline void apic_icr_write(u32 low, u32 high) { }
 static inline void apic_wait_icr_idle(void) { }
 static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
+static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
@@ -537,6 +541,11 @@ static inline const struct cpumask *default_target_cpus(void)
 #endif
 }
 
+static inline const struct cpumask *online_target_cpus(void)
+{
+	return cpu_online_mask;
+}
+
 DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
 
 
@@ -586,21 +595,50 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
 
 #endif
 
-static inline unsigned int
-default_cpu_mask_to_apicid(const struct cpumask *cpumask)
+static inline int
+flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+			    const struct cpumask *andmask,
+			    unsigned int *apicid)
 {
-	return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
+	unsigned long cpu_mask = cpumask_bits(cpumask)[0] &
+				 cpumask_bits(andmask)[0] &
+				 cpumask_bits(cpu_online_mask)[0] &
+				 APIC_ALL_CPUS;
+
+	if (likely(cpu_mask)) {
+		*apicid = (unsigned int)cpu_mask;
+		return 0;
+	} else {
+		return -EINVAL;
+	}
 }
 
-static inline unsigned int
+extern int
 default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
-			       const struct cpumask *andmask)
+			       const struct cpumask *andmask,
+			       unsigned int *apicid);
+
+static inline void
+flat_vector_allocation_domain(int cpu, struct cpumask *retmask,
+			      const struct cpumask *mask)
 {
-	unsigned long mask1 = cpumask_bits(cpumask)[0];
-	unsigned long mask2 = cpumask_bits(andmask)[0];
-	unsigned long mask3 = cpumask_bits(cpu_online_mask)[0];
+	/* Careful. Some cpus do not strictly honor the set of cpus
+	 * specified in the interrupt destination when using lowest
+	 * priority interrupt delivery mode.
+	 *
+	 * In particular there was a hyperthreading cpu observed to
+	 * deliver interrupts to the wrong hyperthread when only one
+	 * hyperthread was specified in the interrupt destination.
+	 */
+	cpumask_clear(retmask);
+	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
+}
 
-	return (unsigned int)(mask1 & mask2 & mask3);
+static inline void
+default_vector_allocation_domain(int cpu, struct cpumask *retmask,
+				 const struct cpumask *mask)
+{
+	cpumask_copy(retmask, cpumask_of(cpu));
 }
 
 static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
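[Editor's note: the reworked flat_cpu_mask_to_apicid_and() reports "no destination" as an errno instead of folding failure into the returned APIC ID. A standalone sketch of the same intersection logic on single-word masks:]

	#include <errno.h>
	#include <stdio.h>

	#define APIC_ALL_CPUS 0xFFu

	/* Mirrors flat_cpu_mask_to_apicid_and() for one-word cpumasks. */
	static int flat_mask_to_apicid_and(unsigned long cpumask,
					   unsigned long andmask,
					   unsigned long online,
					   unsigned int *apicid)
	{
		unsigned long cpu_mask = cpumask & andmask & online &
					 APIC_ALL_CPUS;

		if (cpu_mask) {
			*apicid = (unsigned int)cpu_mask;
			return 0;
		}
		return -EINVAL;
	}

	int main(void)
	{
		unsigned int apicid;

		/* CPUs 0-3 requested, affinity allows 2-3, only CPU 1 online:
		 * the intersection is empty, so the call fails cleanly. */
		if (flat_mask_to_apicid_and(0x0f, 0x0c, 0x02, &apicid))
			printf("no target: -EINVAL, not a bogus APIC ID\n");
		return 0;
	}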
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index b97596e2b68..72f5009deb5 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -15,6 +15,8 @@
 #include <linux/compiler.h>
 #include <asm/alternative.h>
 
+#define BIT_64(n)			(U64_C(1) << (n))
+
 /*
  * These have to be done with inline assembly: that way the bit-setting
  * is guaranteed to be atomic. All bit operations return 0 if the bit
@@ -262,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
  * This operation is non-atomic and can be reordered.
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
+ *
+ * Note: the operation is performed atomically with respect to
+ * the local CPU, but not other CPUs. Portable code should not
+ * rely on this behaviour.
+ * KVM relies on this behaviour on x86 for modifying memory that is also
+ * accessed from a hypervisor on the same CPU if running in a VM: don't change
+ * this without also updating arch/x86/kernel/kvm.c
  */
 static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
 {
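[Editor's note: BIT_64() exists because a plain (1 << n) is computed at int width and overflows for bit positions >= 31, which matters for 64-bit registers such as MSRs. A standalone illustration; U64_C is redefined locally here, while in the kernel it comes from <linux/const.h>:]

	#include <stdint.h>
	#include <stdio.h>

	#define U64_C(x)	x##ULL
	#define BIT_64(n)	(U64_C(1) << (n))

	int main(void)
	{
		/* (1 << 62) would be undefined with a 32-bit int; BIT_64
		 * forces the shift into 64 bits.  Bit 62 is illustrative
		 * (e.g. an MCi_STATUS flag in a 64-bit MSR). */
		uint64_t status = BIT_64(62) | BIT_64(1);

		if (status & BIT_64(62))
			printf("bit 62 set: %#llx\n",
			       (unsigned long long)status);
		return 0;
	}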
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 340ee49961a..6b7ee5ff682 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -176,7 +176,7 @@
 #define X86_FEATURE_XSAVEOPT	(7*32+ 4) /* Optimized Xsave */
 #define X86_FEATURE_PLN		(7*32+ 5) /* Intel Power Limit Notification */
 #define X86_FEATURE_PTS		(7*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_DTS		(7*32+ 7) /* Digital Thermal Sensor */
+#define X86_FEATURE_DTHERM	(7*32+ 7) /* Digital Thermal Sensor */
 #define X86_FEATURE_HW_PSTATE	(7*32+ 8) /* AMD HW-PState */
 
 /* Virtualization flags: Linux defined, word 8 */
@@ -207,6 +207,8 @@
 #define X86_FEATURE_ERMS	(9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 #define X86_FEATURE_INVPCID	(9*32+10) /* Invalidate Processor Context ID */
 #define X86_FEATURE_RTM		(9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_RDSEED	(9*32+18) /* The RDSEED instruction */
+#define X86_FEATURE_ADX		(9*32+19) /* The ADCX and ADOX instructions */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
diff --git a/arch/x86/include/asm/crypto/ablk_helper.h b/arch/x86/include/asm/crypto/ablk_helper.h
new file mode 100644
index 00000000000..4f93df50c23
--- /dev/null
+++ b/arch/x86/include/asm/crypto/ablk_helper.h
@@ -0,0 +1,31 @@
+/*
+ * Shared async block cipher helpers
+ */
+
+#ifndef _CRYPTO_ABLK_HELPER_H
+#define _CRYPTO_ABLK_HELPER_H
+
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <crypto/cryptd.h>
+
+struct async_helper_ctx {
+	struct cryptd_ablkcipher *cryptd_tfm;
+};
+
+extern int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
+			unsigned int key_len);
+
+extern int __ablk_encrypt(struct ablkcipher_request *req);
+
+extern int ablk_encrypt(struct ablkcipher_request *req);
+
+extern int ablk_decrypt(struct ablkcipher_request *req);
+
+extern void ablk_exit(struct crypto_tfm *tfm);
+
+extern int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name);
+
+extern int ablk_init(struct crypto_tfm *tfm);
+
+#endif /* _CRYPTO_ABLK_HELPER_H */
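[Editor's note: conceptually, the helper module behind this header runs the inner "__driver-" cipher directly when the FPU is usable in the current context and otherwise bounces the request to the cryptd async daemon. A paraphrased sketch, not a verbatim quote of the helper implementation:]

	#include <linux/crypto.h>
	#include <linux/string.h>
	#include <crypto/cryptd.h>
	#include <asm/i387.h>
	#include <asm/crypto/ablk_helper.h>

	static int ablk_encrypt_sketch(struct ablkcipher_request *req)
	{
		struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
		struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);

		if (!irq_fpu_usable()) {
			/* Re-target a copy of the request at the cryptd
			 * queue, which will run it in process context. */
			struct ablkcipher_request *cryptd_req =
				ablkcipher_request_ctx(req);

			memcpy(cryptd_req, req, sizeof(*req));
			ablkcipher_request_set_tfm(cryptd_req,
						   &ctx->cryptd_tfm->base);

			return crypto_ablkcipher_encrypt(cryptd_req);
		}

		/* FPU available: run the SIMD cipher synchronously. */
		return __ablk_encrypt(req);
	}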
diff --git a/arch/x86/include/asm/aes.h b/arch/x86/include/asm/crypto/aes.h
index 80545a1cbe3..80545a1cbe3 100644
--- a/arch/x86/include/asm/aes.h
+++ b/arch/x86/include/asm/crypto/aes.h
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
new file mode 100644
index 00000000000..3e408bddc96
--- /dev/null
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -0,0 +1,115 @@
+/*
+ * Shared glue code for 128bit block ciphers
+ */
+
+#ifndef _CRYPTO_GLUE_HELPER_H
+#define _CRYPTO_GLUE_HELPER_H
+
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+#include <asm/i387.h>
+#include <crypto/b128ops.h>
+
+typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
+typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
+typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
+				       u128 *iv);
+
+#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
+#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
+#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn))
+
+struct common_glue_func_entry {
+	unsigned int num_blocks; /* number of blocks that @fn will process */
+	union {
+		common_glue_func_t ecb;
+		common_glue_cbc_func_t cbc;
+		common_glue_ctr_func_t ctr;
+	} fn_u;
+};
+
+struct common_glue_ctx {
+	unsigned int num_funcs;
+	int fpu_blocks_limit; /* -1 means fpu not needed at all */
+
+	/*
+	 * First funcs entry must have largest num_blocks and last funcs entry
+	 * must have num_blocks == 1!
+	 */
+	struct common_glue_func_entry funcs[];
+};
+
+static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit,
+				  struct blkcipher_desc *desc,
+				  bool fpu_enabled, unsigned int nbytes)
+{
+	if (likely(fpu_blocks_limit < 0))
+		return false;
+
+	if (fpu_enabled)
+		return true;
+
+	/*
+	 * Vector-registers are only used when chunk to be processed is large
+	 * enough, so do not enable FPU until it is necessary.
+	 */
+	if (nbytes < bsize * (unsigned int)fpu_blocks_limit)
+		return false;
+
+	if (desc) {
+		/* prevent sleeping if FPU is in use */
+		desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	}
+
+	kernel_fpu_begin();
+	return true;
+}
+
+static inline void glue_fpu_end(bool fpu_enabled)
+{
+	if (fpu_enabled)
+		kernel_fpu_end();
+}
+
+static inline void u128_to_be128(be128 *dst, const u128 *src)
+{
+	dst->a = cpu_to_be64(src->a);
+	dst->b = cpu_to_be64(src->b);
+}
+
+static inline void be128_to_u128(u128 *dst, const be128 *src)
+{
+	dst->a = be64_to_cpu(src->a);
+	dst->b = be64_to_cpu(src->b);
+}
+
+static inline void u128_inc(u128 *i)
+{
+	i->b++;
+	if (!i->b)
+		i->a++;
+}
+
+extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
+				 struct blkcipher_desc *desc,
+				 struct scatterlist *dst,
+				 struct scatterlist *src, unsigned int nbytes);
+
+extern int glue_cbc_encrypt_128bit(const common_glue_func_t fn,
+				   struct blkcipher_desc *desc,
+				   struct scatterlist *dst,
+				   struct scatterlist *src,
+				   unsigned int nbytes);
+
+extern int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
+				   struct blkcipher_desc *desc,
+				   struct scatterlist *dst,
+				   struct scatterlist *src,
+				   unsigned int nbytes);
+
+extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
+				 struct blkcipher_desc *desc,
+				 struct scatterlist *dst,
+				 struct scatterlist *src, unsigned int nbytes);
+
+#endif /* _CRYPTO_GLUE_HELPER_H */
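[Editor's note: how the glue layer consumes these tables — a simplified sketch of the per-chunk dispatch that glue_ecb_crypt_128bit() performs; the real code also drives blkcipher_walk and propagates errors. The function name is hypothetical.]

	#include <asm/crypto/glue_helper.h>

	static void glue_ecb_chunk_sketch(const struct common_glue_ctx *gctx,
					  void *ctx, u8 *wdst, const u8 *wsrc,
					  unsigned int nbytes,
					  unsigned int bsize)
	{
		bool fpu_enabled;
		unsigned int i;

		/* Only takes the FPU if this chunk reaches the
		 * fpu_blocks_limit threshold (or never, for -1). */
		fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
					     NULL, false, nbytes);

		/* funcs[] is ordered widest first, so each chunk is handled
		 * by the most parallel variant that still fits. */
		for (i = 0; i < gctx->num_funcs; i++) {
			unsigned int func_bytes =
				bsize * gctx->funcs[i].num_blocks;

			while (nbytes >= func_bytes) {
				gctx->funcs[i].fn_u.ecb(ctx, wdst, wsrc);
				wsrc   += func_bytes;
				wdst   += func_bytes;
				nbytes -= func_bytes;
			}
		}

		glue_fpu_end(fpu_enabled);
	}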
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
new file mode 100644
index 00000000000..432deedd294
--- /dev/null
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -0,0 +1,32 @@
+#ifndef ASM_X86_SERPENT_AVX_H
+#define ASM_X86_SERPENT_AVX_H
+
+#include <linux/crypto.h>
+#include <crypto/serpent.h>
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					   const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__serpent_enc_blk_8way_avx(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+					    const u8 *src)
+{
+	__serpent_enc_blk_8way_avx(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	serpent_dec_blk_8way_avx(ctx, dst, src);
+}
+
+#endif
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/crypto/serpent-sse2.h
index d3ef63fe0c8..e6e77dffbda 100644
--- a/arch/x86/include/asm/serpent.h
+++ b/arch/x86/include/asm/crypto/serpent-sse2.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86_SERPENT_H
-#define ASM_X86_SERPENT_H
+#ifndef ASM_X86_SERPENT_SSE2_H
+#define ASM_X86_SERPENT_SSE2_H
 
 #include <linux/crypto.h>
 #include <crypto/serpent.h>
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h
new file mode 100644
index 00000000000..9d2c514bd5f
--- /dev/null
+++ b/arch/x86/include/asm/crypto/twofish.h
@@ -0,0 +1,46 @@
+#ifndef ASM_X86_TWOFISH_H
+#define ASM_X86_TWOFISH_H
+
+#include <linux/crypto.h>
+#include <crypto/twofish.h>
+#include <crypto/lrw.h>
+#include <crypto/b128ops.h>
+
+struct twofish_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct twofish_ctx twofish_ctx;
+};
+
+struct twofish_xts_ctx {
+	struct twofish_ctx tweak_ctx;
+	struct twofish_ctx crypt_ctx;
+};
+
+/* regular block cipher functions from twofish_x86_64 module */
+asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
+				const u8 *src);
+asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
+				const u8 *src);
+
+/* 3-way parallel cipher functions */
+asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+				       const u8 *src, bool xor);
+asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+
+/* helpers from twofish_x86_64-3way module */
+extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
+extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
+				u128 *iv);
+extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
+				     u128 *iv);
+
+extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen);
+
+extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm);
+
+extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen);
+
+#endif /* ASM_X86_TWOFISH_H */
diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h
new file mode 100644
index 00000000000..c0924165997
--- /dev/null
+++ b/arch/x86/include/asm/dma-contiguous.h
@@ -0,0 +1,13 @@
+#ifndef ASMX86_DMA_CONTIGUOUS_H
+#define ASMX86_DMA_CONTIGUOUS_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <asm-generic/dma-contiguous.h>
+
+static inline void
+dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { }
+
+#endif
+#endif
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 61c0bd25845..f7b4c7903e7 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -13,6 +13,7 @@
 #include <asm/io.h>
 #include <asm/swiotlb.h>
 #include <asm-generic/dma-coherent.h>
+#include <linux/dma-contiguous.h>
 
 #ifdef CONFIG_ISA
 # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
@@ -62,6 +63,10 @@ extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 					dma_addr_t *dma_addr, gfp_t flag,
 					struct dma_attrs *attrs);
 
+extern void dma_generic_free_coherent(struct device *dev, size_t size,
+				      void *vaddr, dma_addr_t dma_addr,
+				      struct dma_attrs *attrs);
+
 #ifdef CONFIG_X86_DMA_REMAP /* Platform code defines bridge-specific code */
 extern bool dma_capable(struct device *dev, dma_addr_t addr, size_t size);
 extern dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr);
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
index cc70c1c78ca..75ce3f47d20 100644
--- a/arch/x86/include/asm/emergency-restart.h
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -4,9 +4,7 @@
 enum reboot_type {
 	BOOT_TRIPLE = 't',
 	BOOT_KBD = 'k',
-#ifdef CONFIG_X86_32
 	BOOT_BIOS = 'b',
-#endif
 	BOOT_ACPI = 'a',
 	BOOT_EFI = 'e',
 	BOOT_CF9 = 'p',
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
index dbe82a5c5ea..d3d74698dce 100644
--- a/arch/x86/include/asm/floppy.h
+++ b/arch/x86/include/asm/floppy.h
@@ -99,7 +99,7 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id)
 	virtual_dma_residue += virtual_dma_count;
 	virtual_dma_count = 0;
 #ifdef TRACE_FLPY_INT
-	printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
+	printk(KERN_DEBUG "count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
 	       virtual_dma_count, virtual_dma_residue, calls, bytes,
 	       dma_wait);
 	calls = 0;
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 18d9005d9e4..b0767bc0874 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -34,7 +34,7 @@
 
 #ifndef __ASSEMBLY__
 extern void mcount(void);
-extern int modifying_ftrace_code;
+extern atomic_t modifying_ftrace_code;
 
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
index 91d915a6525..b3799d88ffc 100644
--- a/arch/x86/include/asm/gpio.h
+++ b/arch/x86/include/asm/gpio.h
@@ -1,53 +1,4 @@
-/*
- * Generic GPIO API implementation for x86.
- *
- * Derived from the generic GPIO API for powerpc:
- *
- * Copyright (c) 2007-2008 MontaVista Software, Inc.
- *
- * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef _ASM_X86_GPIO_H
-#define _ASM_X86_GPIO_H
-
-#include <asm-generic/gpio.h>
-
-#ifdef CONFIG_GPIOLIB
-
-/*
- * Just call gpiolib.
- */
-static inline int gpio_get_value(unsigned int gpio)
-{
-	return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned int gpio, int value)
-{
-	__gpio_set_value(gpio, value);
-}
-
-static inline int gpio_cansleep(unsigned int gpio)
-{
-	return __gpio_cansleep(gpio);
-}
-
-static inline int gpio_to_irq(unsigned int gpio)
-{
-	return __gpio_to_irq(gpio);
-}
-
-static inline int irq_to_gpio(unsigned int irq)
-{
-	return -EINVAL;
-}
-
-#endif /* CONFIG_GPIOLIB */
-
-#endif /* _ASM_X86_GPIO_H */
+#ifndef __LINUX_GPIO_H
+#warning Include linux/gpio.h instead of asm/gpio.h
+#include <linux/gpio.h>
+#endif
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 7a15153c675..b518c750993 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper;
 extern const struct hypervisor_x86 x86_hyper_vmware;
 extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
 extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+extern const struct hypervisor_x86 x86_hyper_kvm;
 
 static inline bool hypervisor_x2apic_available(void)
 {
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index dffc38ee625..345c99cef15 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -5,7 +5,6 @@ extern struct dma_map_ops nommu_dma_ops;
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
 extern int iommu_pass_through;
-extern int iommu_group_mf;
 
 /* 10 seconds */
 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index e7d1c194d27..246617efd67 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -12,6 +12,7 @@
 /* Select x86 specific features in <linux/kvm.h> */
 #define __KVM_HAVE_PIT
 #define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_IRQ_LINE
 #define __KVM_HAVE_DEVICE_ASSIGNMENT
 #define __KVM_HAVE_MSI
 #define __KVM_HAVE_USER_NMI
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index c222e1a1b12..c764f43b71c 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -192,15 +192,15 @@ struct x86_emulate_ops {
 			      struct x86_instruction_info *info,
 			      enum x86_intercept_stage stage);
 
-	bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
-			  u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+	void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
+			  u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
 };
 
 typedef u32 __attribute__((vector_size(16))) sse128_t;
 
 /* Type, address-of, and value of an instruction's operand. */
 struct operand {
-	enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type;
+	enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
 	unsigned int bytes;
 	union {
 		unsigned long orig_val;
@@ -213,12 +213,14 @@ struct operand {
 			unsigned seg;
 		} mem;
 		unsigned xmm;
+		unsigned mm;
 	} addr;
 	union {
 		unsigned long val;
 		u64 val64;
 		char valptr[sizeof(unsigned long) + 2];
 		sse128_t vec_val;
+		u64 mm_val;
 	};
 };
 
@@ -278,9 +280,9 @@ struct x86_emulate_ctxt {
 	u8 modrm_seg;
 	bool rip_relative;
 	unsigned long _eip;
+	struct operand memop;
 	/* Fields above regs are cleared together. */
 	unsigned long regs[NR_VCPU_REGS];
-	struct operand memop;
 	struct operand *memopp;
 	struct fetch_cache fetch;
 	struct read_cache io_read;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e5b97be12d2..09155d64cf7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,12 +48,13 @@
 
 #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
 #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
+#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL
 #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS |	\
 				  0xFFFFFF0000000000ULL)
 #define CR4_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
-			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
+			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
 			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
@@ -173,6 +174,16 @@ enum {
 #define DR7_FIXED_1	0x00000400
 #define DR7_VOLATILE	0xffff23ff
 
+/* apic attention bits */
+#define KVM_APIC_CHECK_VAPIC	0
+/*
+ * The following bit is set with PV-EOI, unset on EOI.
+ * We detect PV-EOI changes by guest by comparing
+ * this bit with PV-EOI in guest memory.
+ * See the implementation in apic_update_pv_eoi.
+ */
+#define KVM_APIC_PV_EOI_PENDING	1
+
 /*
  * We don't want allocation failures within the mmu code, so we preallocate
  * enough memory for a single page fault in a cache.
@@ -238,8 +249,6 @@ struct kvm_mmu_page {
 #endif
 
 	int write_flooding_count;
-
-	struct rcu_head rcu;
 };
 
 struct kvm_pio_request {
@@ -312,8 +321,8 @@ struct kvm_pmu {
 	u64 counter_bitmask[2];
 	u64 global_ctrl_mask;
 	u8 version;
-	struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC];
-	struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED];
+	struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
+	struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
 	struct irq_work irq_work;
 	u64 reprogram_pmi;
 };
@@ -338,6 +347,7 @@ struct kvm_vcpu_arch {
 	u64 efer;
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
+	unsigned long apic_attention;
 	int32_t apic_arb_prio;
 	int mp_state;
 	int sipi_vector;
@@ -482,6 +492,11 @@ struct kvm_vcpu_arch {
 		u64 length;
 		u64 status;
 	} osvw;
+
+	struct {
+		u64 msr_val;
+		struct gfn_to_hva_cache data;
+	} pv_eoi;
 };
 
 struct kvm_lpage_info {
@@ -537,8 +552,6 @@ struct kvm_arch {
 	u64 hv_guest_os_id;
 	u64 hv_hypercall;
 
-	atomic_t reader_counter;
-
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
 	#endif
@@ -661,6 +674,7 @@ struct kvm_x86_ops {
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
+	bool (*invpcid_supported)(void);
 	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
 
 	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -713,8 +727,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
-			       struct kvm_memory_slot *slot);
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+				     struct kvm_memory_slot *slot,
+				     gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
@@ -801,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
-int kvm_pic_set_irq(void *opaque, int irq, int level);
+static inline int __kvm_irq_line_state(unsigned long *irq_state,
820 int irq_source_id, int level)
821{
 822 /* Logical OR for level-triggered interrupts */
823 if (level)
824 __set_bit(irq_source_id, irq_state);
825 else
826 __clear_bit(irq_source_id, irq_state);
827
828 return !!(*irq_state);
829}
830
831int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
832void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
805 833
806void kvm_inject_nmi(struct kvm_vcpu *vcpu); 834void kvm_inject_nmi(struct kvm_vcpu *vcpu);
807 835
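
A hedged sketch of how the reworked PIC interface is meant to be used: with an irq_source_id in the signature, each source tracks its own level and the pin stays asserted while any source drives it high (only kvm_pic_set_irq and kvm_pic_clear_all come from the diff; the caller is illustrative):

/* A level-triggered source raising then lowering a pin. Two sources can
 * hold the same line without clobbering each other's state. */
static void pulse_line(struct kvm_pic *pic, int irq, int my_source_id)
{
	kvm_pic_set_irq(pic, irq, my_source_id, 1);	/* assert */
	kvm_pic_set_irq(pic, irq, my_source_id, 0);	/* deassert */
	/* on teardown, drop every pin this source was driving */
	kvm_pic_clear_all(pic, my_source_id);
}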
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 183922e13de..2f7712e08b1 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -22,6 +22,7 @@
22#define KVM_FEATURE_CLOCKSOURCE2 3 22#define KVM_FEATURE_CLOCKSOURCE2 3
23#define KVM_FEATURE_ASYNC_PF 4 23#define KVM_FEATURE_ASYNC_PF 4
24#define KVM_FEATURE_STEAL_TIME 5 24#define KVM_FEATURE_STEAL_TIME 5
25#define KVM_FEATURE_PV_EOI 6
25 26
26/* The last 8 bits are used to indicate how to interpret the flags field 27/* The last 8 bits are used to indicate how to interpret the flags field
27 * in pvclock structure. If no bits are set, all flags are ignored. 28 * in pvclock structure. If no bits are set, all flags are ignored.
@@ -37,6 +38,7 @@
37#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 38#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
38#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 39#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
39#define MSR_KVM_STEAL_TIME 0x4b564d03 40#define MSR_KVM_STEAL_TIME 0x4b564d03
41#define MSR_KVM_PV_EOI_EN 0x4b564d04
40 42
41struct kvm_steal_time { 43struct kvm_steal_time {
42 __u64 steal; 44 __u64 steal;
@@ -89,12 +91,25 @@ struct kvm_vcpu_pv_apf_data {
89 __u32 enabled; 91 __u32 enabled;
90}; 92};
91 93
94#define KVM_PV_EOI_BIT 0
95#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT)
96#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
97#define KVM_PV_EOI_DISABLED 0x0
98
92#ifdef __KERNEL__ 99#ifdef __KERNEL__
93#include <asm/processor.h> 100#include <asm/processor.h>
94 101
95extern void kvmclock_init(void); 102extern void kvmclock_init(void);
96extern int kvm_register_clock(char *txt); 103extern int kvm_register_clock(char *txt);
97 104
105#ifdef CONFIG_KVM_CLOCK
106bool kvm_check_and_clear_guest_paused(void);
107#else
108static inline bool kvm_check_and_clear_guest_paused(void)
109{
110 return false;
111}
 112#endif /* CONFIG_KVM_CLOCK */
98 113
99/* This instruction is vmcall. On non-VT architectures, it will generate a 114/* This instruction is vmcall. On non-VT architectures, it will generate a
100 * trap that we will then rewrite to the appropriate instruction. 115 * trap that we will then rewrite to the appropriate instruction.
@@ -173,14 +188,16 @@ static inline int kvm_para_available(void)
173 if (boot_cpu_data.cpuid_level < 0) 188 if (boot_cpu_data.cpuid_level < 0)
174 return 0; /* So we don't blow up on old processors */ 189 return 0; /* So we don't blow up on old processors */
175 190
176 cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); 191 if (cpu_has_hypervisor) {
177 memcpy(signature + 0, &ebx, 4); 192 cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
178 memcpy(signature + 4, &ecx, 4); 193 memcpy(signature + 0, &ebx, 4);
179 memcpy(signature + 8, &edx, 4); 194 memcpy(signature + 4, &ecx, 4);
180 signature[12] = 0; 195 memcpy(signature + 8, &edx, 4);
196 signature[12] = 0;
181 197
182 if (strcmp(signature, "KVMKVMKVM") == 0) 198 if (strcmp(signature, "KVMKVMKVM") == 0)
183 return 1; 199 return 1;
200 }
184 201
185 return 0; 202 return 0;
186} 203}
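
Taken together, the additions imply a guest-side enable sequence roughly like the following (a hedged sketch: the per-CPU flag name is illustrative, while the feature, MSR, and mask macros are the ones added above):

/* One PV-EOI flag word per CPU; the hypervisor reads and clears it
 * through the guest-physical address registered in the MSR. */
static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi);

static void kvm_pv_eoi_enable(void)
{
	if (!kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		return;
	/* tell the host where the flag lives and switch PV-EOI on */
	wrmsrl(MSR_KVM_PV_EOI_EN,
	       __pa(this_cpu_ptr(&kvm_apic_eoi)) | KVM_PV_EOI_ENABLED);
}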
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 084ef95274c..813ed103f45 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -115,8 +115,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
115 115
116extern unsigned long long native_read_tsc(void); 116extern unsigned long long native_read_tsc(void);
117 117
118extern int native_rdmsr_safe_regs(u32 regs[8]); 118extern int rdmsr_safe_regs(u32 regs[8]);
119extern int native_wrmsr_safe_regs(u32 regs[8]); 119extern int wrmsr_safe_regs(u32 regs[8]);
120 120
121static __always_inline unsigned long long __native_read_tsc(void) 121static __always_inline unsigned long long __native_read_tsc(void)
122{ 122{
@@ -187,43 +187,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
187 return err; 187 return err;
188} 188}
189 189
190static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
191{
192 u32 gprs[8] = { 0 };
193 int err;
194
195 gprs[1] = msr;
196 gprs[7] = 0x9c5a203a;
197
198 err = native_rdmsr_safe_regs(gprs);
199
200 *p = gprs[0] | ((u64)gprs[2] << 32);
201
202 return err;
203}
204
205static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
206{
207 u32 gprs[8] = { 0 };
208
209 gprs[0] = (u32)val;
210 gprs[1] = msr;
211 gprs[2] = val >> 32;
212 gprs[7] = 0x9c5a203a;
213
214 return native_wrmsr_safe_regs(gprs);
215}
216
217static inline int rdmsr_safe_regs(u32 regs[8])
218{
219 return native_rdmsr_safe_regs(regs);
220}
221
222static inline int wrmsr_safe_regs(u32 regs[8])
223{
224 return native_wrmsr_safe_regs(regs);
225}
226
227#define rdtscl(low) \ 190#define rdtscl(low) \
228 ((low) = (u32)__native_read_tsc()) 191 ((low) = (u32)__native_read_tsc())
229 192
@@ -237,6 +200,8 @@ do { \
237 (high) = (u32)(_l >> 32); \ 200 (high) = (u32)(_l >> 32); \
238} while (0) 201} while (0)
239 202
203#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
204
240#define rdtscp(low, high, aux) \ 205#define rdtscp(low, high, aux) \
241do { \ 206do { \
242 unsigned long long _val = native_read_tscp(&(aux)); \ 207 unsigned long long _val = native_read_tscp(&(aux)); \
@@ -248,8 +213,7 @@ do { \
248 213
249#endif /* !CONFIG_PARAVIRT */ 214#endif /* !CONFIG_PARAVIRT */
250 215
251 216#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \
252#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \
253 (u32)((val) >> 32)) 217 (u32)((val) >> 32))
254 218
255#define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2)) 219#define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2))
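
With checking_wrmsrl renamed to wrmsrl_safe and rdpmcl added to both the native and paravirt paths, a typical caller looks roughly like this (a hedged sketch; the msr and counter values are placeholders):

static int write_then_sample(unsigned int msr, u64 value, int counter)
{
	u64 pmc;

	/* returns an error instead of faulting if the MSR doesn't exist */
	if (wrmsrl_safe(msr, value))
		return -EIO;

	/* expands to native_read_pmc() or paravirt_read_pmc() */
	rdpmcl(counter, pmc);
	return pmc != 0;
}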
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 0e3793b821e..c0fa356e90d 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -44,14 +44,14 @@ struct nmiaction {
44 const char *name; 44 const char *name;
45}; 45};
46 46
47#define register_nmi_handler(t, fn, fg, n) \ 47#define register_nmi_handler(t, fn, fg, n, init...) \
48({ \ 48({ \
49 static struct nmiaction fn##_na = { \ 49 static struct nmiaction init fn##_na = { \
50 .handler = (fn), \ 50 .handler = (fn), \
51 .name = (n), \ 51 .name = (n), \
52 .flags = (fg), \ 52 .flags = (fg), \
53 }; \ 53 }; \
54 __register_nmi_handler((t), &fn##_na); \ 54 __register_nmi_handler((t), &fn##_na); \
55}) 55})
56 56
57int __register_nmi_handler(unsigned int, struct nmiaction *); 57int __register_nmi_handler(unsigned int, struct nmiaction *);
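
The new trailing init... parameter is spliced in as a qualifier on the static nmiaction, so boot-only handlers can place their action in init data. A hedged usage sketch:

static int my_nmi_handler(unsigned int type, struct pt_regs *regs)
{
	return NMI_HANDLED;	/* claim the NMI, stop the handler chain */
}

static void __init my_nmi_setup(void)
{
	/* the extra arguments land between 'struct nmiaction' and the
	 * variable name, e.g. pass __initdata for a boot-only action */
	register_nmi_handler(NMI_LOCAL, my_nmi_handler, 0, "my_nmi");
}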
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 6cbbabf5270..0b47ddb6f00 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -128,21 +128,11 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err)
128 return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); 128 return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
129} 129}
130 130
131static inline int paravirt_rdmsr_regs(u32 *regs)
132{
133 return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs);
134}
135
136static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) 131static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
137{ 132{
138 return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); 133 return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
139} 134}
140 135
141static inline int paravirt_wrmsr_regs(u32 *regs)
142{
143 return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs);
144}
145
146/* These should all do BUG_ON(_err), but our headers are too tangled. */ 136/* These should all do BUG_ON(_err), but our headers are too tangled. */
147#define rdmsr(msr, val1, val2) \ 137#define rdmsr(msr, val1, val2) \
148do { \ 138do { \
@@ -176,9 +166,6 @@ do { \
176 _err; \ 166 _err; \
177}) 167})
178 168
179#define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs)
180#define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs)
181
182static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) 169static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
183{ 170{
184 int err; 171 int err;
@@ -186,32 +173,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
186 *p = paravirt_read_msr(msr, &err); 173 *p = paravirt_read_msr(msr, &err);
187 return err; 174 return err;
188} 175}
189static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
190{
191 u32 gprs[8] = { 0 };
192 int err;
193
194 gprs[1] = msr;
195 gprs[7] = 0x9c5a203a;
196
197 err = paravirt_rdmsr_regs(gprs);
198
199 *p = gprs[0] | ((u64)gprs[2] << 32);
200
201 return err;
202}
203
204static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
205{
206 u32 gprs[8] = { 0 };
207
208 gprs[0] = (u32)val;
209 gprs[1] = msr;
210 gprs[2] = val >> 32;
211 gprs[7] = 0x9c5a203a;
212
213 return paravirt_wrmsr_regs(gprs);
214}
215 176
216static inline u64 paravirt_read_tsc(void) 177static inline u64 paravirt_read_tsc(void)
217{ 178{
@@ -252,6 +213,8 @@ do { \
252 high = _l >> 32; \ 213 high = _l >> 32; \
253} while (0) 214} while (0)
254 215
216#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
217
255static inline unsigned long long paravirt_rdtscp(unsigned int *aux) 218static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
256{ 219{
257 return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); 220 return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 8e8b9a4987e..8613cbb7ba4 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -153,9 +153,7 @@ struct pv_cpu_ops {
153 /* MSR, PMC and TSR operations. 153 /* MSR, PMC and TSR operations.
154 err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ 154 err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
155 u64 (*read_msr)(unsigned int msr, int *err); 155 u64 (*read_msr)(unsigned int msr, int *err);
156 int (*rdmsr_regs)(u32 *regs);
157 int (*write_msr)(unsigned int msr, unsigned low, unsigned high); 156 int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
158 int (*wrmsr_regs)(u32 *regs);
159 157
160 u64 (*read_tsc)(void); 158 u64 (*read_tsc)(void);
161 u64 (*read_pmc)(int counter); 159 u64 (*read_pmc)(int counter);
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b3a53174602..73e8eeff22e 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -7,9 +7,13 @@
7#undef DEBUG 7#undef DEBUG
8 8
9#ifdef DEBUG 9#ifdef DEBUG
10#define DBG(x...) printk(x) 10#define DBG(fmt, ...) printk(fmt, ##__VA_ARGS__)
11#else 11#else
12#define DBG(x...) 12#define DBG(fmt, ...) \
13do { \
14 if (0) \
15 printk(fmt, ##__VA_ARGS__); \
16} while (0)
13#endif 17#endif
14 18
15#define PCI_PROBE_BIOS 0x0001 19#define PCI_PROBE_BIOS 0x0001
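
The rewritten no-op branch is the standard if (0) trick: unlike an empty macro, the arguments stay visible to the compiler, so format strings are type-checked even with DEBUG off, while the dead branch generates no code. For example:

static void show_dev(int bus, int devfn)
{
	/* still type-checked in non-DEBUG builds: gcc would warn here if
	 * the arguments didn't match the format, yet emits no code */
	DBG("PCI: %02x:%02x.%d\n", bus, devfn >> 3, devfn & 7);
}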
@@ -100,6 +104,7 @@ struct pci_raw_ops {
100extern const struct pci_raw_ops *raw_pci_ops; 104extern const struct pci_raw_ops *raw_pci_ops;
101extern const struct pci_raw_ops *raw_pci_ext_ops; 105extern const struct pci_raw_ops *raw_pci_ext_ops;
102 106
107extern const struct pci_raw_ops pci_mmcfg;
103extern const struct pci_raw_ops pci_direct_conf1; 108extern const struct pci_raw_ops pci_direct_conf1;
104extern bool port_cf9_safe; 109extern bool port_cf9_safe;
105 110
@@ -135,6 +140,12 @@ struct pci_mmcfg_region {
135 140
136extern int __init pci_mmcfg_arch_init(void); 141extern int __init pci_mmcfg_arch_init(void);
137extern void __init pci_mmcfg_arch_free(void); 142extern void __init pci_mmcfg_arch_free(void);
143extern int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg);
144extern void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg);
145extern int __devinit pci_mmconfig_insert(struct device *dev,
146 u16 seg, u8 start,
147 u8 end, phys_addr_t addr);
148extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
138extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus); 149extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
139 150
140extern struct list_head pci_mmcfg_list; 151extern struct list_head pci_mmcfg_list;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 588f52ea810..c78f14a0df0 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -5,11 +5,10 @@
5 * Performance event hw details: 5 * Performance event hw details:
6 */ 6 */
7 7
8#define X86_PMC_MAX_GENERIC 32 8#define INTEL_PMC_MAX_GENERIC 32
9#define X86_PMC_MAX_FIXED 3 9#define INTEL_PMC_MAX_FIXED 3
10#define INTEL_PMC_IDX_FIXED 32
10 11
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64 12#define X86_PMC_IDX_MAX 64
14 13
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 14#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
@@ -48,8 +47,7 @@
48 (X86_RAW_EVENT_MASK | \ 47 (X86_RAW_EVENT_MASK | \
49 AMD64_EVENTSEL_EVENT) 48 AMD64_EVENTSEL_EVENT)
50#define AMD64_NUM_COUNTERS 4 49#define AMD64_NUM_COUNTERS 4
51#define AMD64_NUM_COUNTERS_F15H 6 50#define AMD64_NUM_COUNTERS_CORE 6
52#define AMD64_NUM_COUNTERS_MAX AMD64_NUM_COUNTERS_F15H
53 51
54#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c 52#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
55#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) 53#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
@@ -121,16 +119,16 @@ struct x86_pmu_capability {
121 119
122/* Instr_Retired.Any: */ 120/* Instr_Retired.Any: */
123#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 121#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
124#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) 122#define INTEL_PMC_IDX_FIXED_INSTRUCTIONS (INTEL_PMC_IDX_FIXED + 0)
125 123
126/* CPU_CLK_Unhalted.Core: */ 124/* CPU_CLK_Unhalted.Core: */
127#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a 125#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
128#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) 126#define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1)
129 127
130/* CPU_CLK_Unhalted.Ref: */ 128/* CPU_CLK_Unhalted.Ref: */
131#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b 129#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
132#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2) 130#define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2)
133#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES) 131#define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES)
134 132
135/* 133/*
136 * We model BTS tracing as another fixed-mode PMC. 134 * We model BTS tracing as another fixed-mode PMC.
@@ -139,7 +137,7 @@ struct x86_pmu_capability {
139 * values are used by actual fixed events and higher values are used 137 * values are used by actual fixed events and higher values are used
140 * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. 138 * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
141 */ 139 */
142#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) 140#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16)
143 141
144/* 142/*
145 * IBS cpuid feature detection 143 * IBS cpuid feature detection
@@ -234,6 +232,7 @@ struct perf_guest_switch_msr {
234 232
235extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); 233extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
236extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); 234extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
235extern void perf_check_microcode(void);
237#else 236#else
238static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) 237static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
239{ 238{
@@ -247,6 +246,7 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
247} 246}
248 247
249static inline void perf_events_lapic_init(void) { } 248static inline void perf_events_lapic_init(void) { }
249static inline void perf_check_microcode(void) { }
250#endif 250#endif
251 251
252#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) 252#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 98391db840c..f2b489cf160 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -2,9 +2,9 @@
2#define _ASM_X86_PGTABLE_2LEVEL_H 2#define _ASM_X86_PGTABLE_2LEVEL_H
3 3
4#define pte_ERROR(e) \ 4#define pte_ERROR(e) \
5 printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) 5 pr_err("%s:%d: bad pte %08lx\n", __FILE__, __LINE__, (e).pte_low)
6#define pgd_ERROR(e) \ 6#define pgd_ERROR(e) \
7 printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) 7 pr_err("%s:%d: bad pgd %08lx\n", __FILE__, __LINE__, pgd_val(e))
8 8
9/* 9/*
10 * Certain architectures need to do special things when PTEs 10 * Certain architectures need to do special things when PTEs
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index effff47a3c8..4cc9f2b7cdc 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -9,13 +9,13 @@
9 */ 9 */
10 10
11#define pte_ERROR(e) \ 11#define pte_ERROR(e) \
12 printk("%s:%d: bad pte %p(%08lx%08lx).\n", \ 12 pr_err("%s:%d: bad pte %p(%08lx%08lx)\n", \
13 __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low) 13 __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
14#define pmd_ERROR(e) \ 14#define pmd_ERROR(e) \
15 printk("%s:%d: bad pmd %p(%016Lx).\n", \ 15 pr_err("%s:%d: bad pmd %p(%016Lx)\n", \
16 __FILE__, __LINE__, &(e), pmd_val(e)) 16 __FILE__, __LINE__, &(e), pmd_val(e))
17#define pgd_ERROR(e) \ 17#define pgd_ERROR(e) \
18 printk("%s:%d: bad pgd %p(%016Lx).\n", \ 18 pr_err("%s:%d: bad pgd %p(%016Lx)\n", \
19 __FILE__, __LINE__, &(e), pgd_val(e)) 19 __FILE__, __LINE__, &(e), pgd_val(e))
20 20
21/* Rules for using set_pte: the pte being assigned *must* be 21/* Rules for using set_pte: the pte being assigned *must* be
@@ -31,6 +31,60 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
31 ptep->pte_low = pte.pte_low; 31 ptep->pte_low = pte.pte_low;
32} 32}
33 33
34#define pmd_read_atomic pmd_read_atomic
35/*
36 * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
37 * a "*pmdp" dereference done by gcc. Problem is, in certain places
38 * where pte_offset_map_lock is called, concurrent page faults are
 39 * allowed, if the mmap_sem is held for reading. An example is mincore
 40 * vs page faults vs MADV_DONTNEED. On the page fault side
 41 * pmd_populate rightfully does a set_64bit, but if we're reading the
 42 * pmd_t with a "*pmdp" on the mincore side, an SMP race can happen
 43 * because gcc will not read the 64bit of the pmd atomically. To fix
 44 * this, all places running pte_offset_map_lock() while holding the
 45 * mmap_sem in read mode shall read the pmdp pointer using this
 46 * function to know if the pmd is null or not, and in turn to know if
 47 * they can run pte_offset_map_lock or pmd_trans_huge or other pmd
 48 * operations.
 49 *
 50 * Without THP, if the mmap_sem is held for reading, the pmd can only
 51 * transition from null to not null while pmd_read_atomic runs. So
 52 * we can always return atomic pmd values with this function.
 53 *
 54 * With THP, if the mmap_sem is held for reading, the pmd can become
 55 * trans_huge or none or point to a pte (and in turn become "stable")
 56 * at any time under pmd_read_atomic. We could read it really
 57 * atomically here with an atomic64_read for the THP enabled case (and
 58 * it would be a whole lot simpler), but to avoid using cmpxchg8b we
 59 * only return an atomic pmdval if the low part of the pmdval is later
 60 * found stable (i.e. pointing to a pte). And we're returning a none
 61 * pmdval if the low part of the pmd is none. In some cases the high
 62 * and low parts of the pmdval returned may not be consistent if THP is
 63 * enabled (the low part may point to a previously mapped hugepage,
 64 * while the high part may point to a more recently mapped hugepage),
 65 * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part
 66 * of the pmd to be read atomically to decide if the pmd is unstable
 67 * or not, with the only exception of when the low part of the pmd is
 68 * zero, in which case we return a none pmd.
69 */
70static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
71{
72 pmdval_t ret;
73 u32 *tmp = (u32 *)pmdp;
74
75 ret = (pmdval_t) (*tmp);
76 if (ret) {
77 /*
78 * If the low part is null, we must not read the high part
79 * or we can end up with a partial pmd.
80 */
81 smp_rmb();
82 ret |= ((pmdval_t)*(tmp + 1)) << 32;
83 }
84
85 return (pmd_t) { ret };
86}
87
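
The caller-side pattern this enables, sketched under the assumptions spelled out in the comment above (the pmd predicates are the usual ones; the walker itself is illustrative):

/* mincore-style check with mmap_sem held for reading */
static int pmd_is_stable(pmd_t *pmdp)
{
	pmd_t pmd = pmd_read_atomic(pmdp);	/* no torn 64-bit read */

	barrier();	/* keep gcc from re-dereferencing pmdp */
	if (pmd_none(pmd) || pmd_trans_huge(pmd))
		return 0;	/* none or unstable: skip the pte level */
	return 1;		/* low part points to a pte page: safe */
}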
34static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) 88static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
35{ 89{
36 set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); 90 set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 975f709e09a..8251be02301 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -26,16 +26,16 @@ extern pgd_t init_level4_pgt[];
26extern void paging_init(void); 26extern void paging_init(void);
27 27
28#define pte_ERROR(e) \ 28#define pte_ERROR(e) \
29 printk("%s:%d: bad pte %p(%016lx).\n", \ 29 pr_err("%s:%d: bad pte %p(%016lx)\n", \
30 __FILE__, __LINE__, &(e), pte_val(e)) 30 __FILE__, __LINE__, &(e), pte_val(e))
31#define pmd_ERROR(e) \ 31#define pmd_ERROR(e) \
32 printk("%s:%d: bad pmd %p(%016lx).\n", \ 32 pr_err("%s:%d: bad pmd %p(%016lx)\n", \
33 __FILE__, __LINE__, &(e), pmd_val(e)) 33 __FILE__, __LINE__, &(e), pmd_val(e))
34#define pud_ERROR(e) \ 34#define pud_ERROR(e) \
35 printk("%s:%d: bad pud %p(%016lx).\n", \ 35 pr_err("%s:%d: bad pud %p(%016lx)\n", \
36 __FILE__, __LINE__, &(e), pud_val(e)) 36 __FILE__, __LINE__, &(e), pud_val(e))
37#define pgd_ERROR(e) \ 37#define pgd_ERROR(e) \
38 printk("%s:%d: bad pgd %p(%016lx).\n", \ 38 pr_err("%s:%d: bad pgd %p(%016lx)\n", \
39 __FILE__, __LINE__, &(e), pgd_val(e)) 39 __FILE__, __LINE__, &(e), pgd_val(e))
40 40
41struct mm_struct; 41struct mm_struct;
diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h
index 99f262e04b9..8e525059e7d 100644
--- a/arch/x86/include/asm/posix_types_32.h
+++ b/arch/x86/include/asm/posix_types_32.h
@@ -10,9 +10,6 @@
10typedef unsigned short __kernel_mode_t; 10typedef unsigned short __kernel_mode_t;
11#define __kernel_mode_t __kernel_mode_t 11#define __kernel_mode_t __kernel_mode_t
12 12
13typedef unsigned short __kernel_nlink_t;
14#define __kernel_nlink_t __kernel_nlink_t
15
16typedef unsigned short __kernel_ipc_pid_t; 13typedef unsigned short __kernel_ipc_pid_t;
17#define __kernel_ipc_pid_t __kernel_ipc_pid_t 14#define __kernel_ipc_pid_t __kernel_ipc_pid_t
18 15
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index f8ab3eaad12..aea1d1d848c 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -44,6 +44,7 @@
44 */ 44 */
45#define X86_CR3_PWT 0x00000008 /* Page Write Through */ 45#define X86_CR3_PWT 0x00000008 /* Page Write Through */
46#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ 46#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
47#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */
47 48
48/* 49/*
49 * Intel CPU features in CR4 50 * Intel CPU features in CR4
@@ -61,6 +62,7 @@
61#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ 62#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
62#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ 63#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
63#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ 64#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
65#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */
64#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ 66#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
65#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ 67#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
66 68
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7745b257f03..39bc5777211 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -544,13 +544,16 @@ static inline void load_sp0(struct tss_struct *tss,
 544 * enable), so that any CPUs that boot up 544 * enable), so that any CPUs that boot up
545 * after us can get the correct flags. 545 * after us can get the correct flags.
546 */ 546 */
547extern unsigned long mmu_cr4_features; 547extern unsigned long mmu_cr4_features;
548extern u32 *trampoline_cr4_features;
548 549
549static inline void set_in_cr4(unsigned long mask) 550static inline void set_in_cr4(unsigned long mask)
550{ 551{
551 unsigned long cr4; 552 unsigned long cr4;
552 553
553 mmu_cr4_features |= mask; 554 mmu_cr4_features |= mask;
555 if (trampoline_cr4_features)
556 *trampoline_cr4_features = mmu_cr4_features;
554 cr4 = read_cr4(); 557 cr4 = read_cr4();
555 cr4 |= mask; 558 cr4 |= mask;
556 write_cr4(cr4); 559 write_cr4(cr4);
@@ -561,6 +564,8 @@ static inline void clear_in_cr4(unsigned long mask)
561 unsigned long cr4; 564 unsigned long cr4;
562 565
563 mmu_cr4_features &= ~mask; 566 mmu_cr4_features &= ~mask;
567 if (trampoline_cr4_features)
568 *trampoline_cr4_features = mmu_cr4_features;
564 cr4 = read_cr4(); 569 cr4 = read_cr4();
565 cr4 &= ~mask; 570 cr4 &= ~mask;
566 write_cr4(cr4); 571 write_cr4(cr4);
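
trampoline_cr4_features is expected to point into the real-mode trampoline header (see the cr4 field of struct trampoline_header added below in realmode.h), so CR4 changes made after boot are replayed by CPUs started later. A hedged sketch of the wiring done once at setup time:

/* 64-bit case: after the real-mode blob has been copied to low memory,
 * point the mirror at the trampoline's cr4 slot. */
static void __init wire_trampoline_cr4(struct trampoline_header *th)
{
	trampoline_cr4_features = &th->cr4;
	*trampoline_cr4_features = mmu_cr4_features;	/* seed current flags */
}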
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 35f2d1948ad..6167fd79818 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -40,5 +40,6 @@ struct pvclock_wall_clock {
40} __attribute__((__packed__)); 40} __attribute__((__packed__));
41 41
42#define PVCLOCK_TSC_STABLE_BIT (1 << 0) 42#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
43#define PVCLOCK_GUEST_STOPPED (1 << 1)
43#endif /* __ASSEMBLY__ */ 44#endif /* __ASSEMBLY__ */
44#endif /* _ASM_X86_PVCLOCK_ABI_H */ 45#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
new file mode 100644
index 00000000000..fe1ec5bcd84
--- /dev/null
+++ b/arch/x86/include/asm/realmode.h
@@ -0,0 +1,63 @@
1#ifndef _ARCH_X86_REALMODE_H
2#define _ARCH_X86_REALMODE_H
3
4#include <linux/types.h>
5#include <asm/io.h>
6
7/* This must match data at realmode.S */
8struct real_mode_header {
9 u32 text_start;
10 u32 ro_end;
11 /* SMP trampoline */
12 u32 trampoline_start;
13 u32 trampoline_status;
14 u32 trampoline_header;
15#ifdef CONFIG_X86_64
16 u32 trampoline_pgd;
17#endif
18 /* ACPI S3 wakeup */
19#ifdef CONFIG_ACPI_SLEEP
20 u32 wakeup_start;
21 u32 wakeup_header;
22#endif
23 /* APM/BIOS reboot */
24 u32 machine_real_restart_asm;
25#ifdef CONFIG_X86_64
26 u32 machine_real_restart_seg;
27#endif
28};
29
30/* This must match data at trampoline_32/64.S */
31struct trampoline_header {
32#ifdef CONFIG_X86_32
33 u32 start;
34 u16 gdt_pad;
35 u16 gdt_limit;
36 u32 gdt_base;
37#else
38 u64 start;
39 u64 efer;
40 u32 cr4;
41#endif
42};
43
44extern struct real_mode_header *real_mode_header;
45extern unsigned char real_mode_blob_end[];
46
47extern unsigned long init_rsp;
48extern unsigned long initial_code;
49extern unsigned long initial_gs;
50
51extern unsigned char real_mode_blob[];
52extern unsigned char real_mode_relocs[];
53
54#ifdef CONFIG_X86_32
55extern unsigned char startup_32_smp[];
56extern unsigned char boot_gdt[];
57#else
58extern unsigned char secondary_startup_64[];
59#endif
60
61extern void __init setup_real_mode(void);
62
63#endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 92f297069e8..a82c4f1b4d8 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -18,8 +18,8 @@ extern struct machine_ops machine_ops;
18 18
19void native_machine_crash_shutdown(struct pt_regs *regs); 19void native_machine_crash_shutdown(struct pt_regs *regs);
20void native_machine_shutdown(void); 20void native_machine_shutdown(void);
21void machine_real_restart(unsigned int type); 21void __noreturn machine_real_restart(unsigned int type);
 22/* These must match dispatch_table in reboot_32.S */ 22/* These must match dispatch in arch/x86/realmode/rm/reboot.S */
23#define MRR_BIOS 0 23#define MRR_BIOS 0
24#define MRR_APM 1 24#define MRR_APM 1
25 25
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index ada93b3b8c6..beff97f7df3 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -7,8 +7,6 @@
7 7
8#include <asm/processor-flags.h> 8#include <asm/processor-flags.h>
9 9
10#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
11
12#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ 10#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
13 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ 11 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
14 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ 12 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index f48394513c3..2ffa95dc233 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -169,11 +169,6 @@ void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);
169void smp_store_cpu_info(int id); 169void smp_store_cpu_info(int id);
170#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) 170#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
171 171
172/* We don't mark CPUs online until __cpu_up(), so we need another measure */
173static inline int num_booting_cpus(void)
174{
175 return cpumask_weight(cpu_callout_mask);
176}
177#else /* !CONFIG_SMP */ 172#else /* !CONFIG_SMP */
178#define wbinvd_on_cpu(cpu) wbinvd() 173#define wbinvd_on_cpu(cpu) wbinvd()
179static inline int wbinvd_on_all_cpus(void) 174static inline int wbinvd_on_all_cpus(void)
diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h
new file mode 100644
index 00000000000..e9d32df89cc
--- /dev/null
+++ b/arch/x86/include/asm/sta2x11.h
@@ -0,0 +1,12 @@
1/*
2 * Header file for STMicroelectronics ConneXt (STA2X11) IOHub
3 */
4#ifndef __ASM_STA2X11_H
5#define __ASM_STA2X11_H
6
7#include <linux/pci.h>
8
9/* This needs to be called from the MFD to configure its sub-devices */
10struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev);
11
12#endif /* __ASM_STA2X11_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 3c9aebc00d3..89f794f007e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -85,6 +85,7 @@ struct thread_info {
85#define TIF_SECCOMP 8 /* secure computing */ 85#define TIF_SECCOMP 8 /* secure computing */
86#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 86#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
87#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ 87#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
88#define TIF_UPROBE 12 /* breakpointed or singlestepping */
88#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 89#define TIF_NOTSC 16 /* TSC is not accessible in userland */
89#define TIF_IA32 17 /* IA32 compatibility process */ 90#define TIF_IA32 17 /* IA32 compatibility process */
90#define TIF_FORK 18 /* ret_from_fork */ 91#define TIF_FORK 18 /* ret_from_fork */
@@ -109,6 +110,7 @@ struct thread_info {
109#define _TIF_SECCOMP (1 << TIF_SECCOMP) 110#define _TIF_SECCOMP (1 << TIF_SECCOMP)
110#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) 111#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
111#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) 112#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
113#define _TIF_UPROBE (1 << TIF_UPROBE)
112#define _TIF_NOTSC (1 << TIF_NOTSC) 114#define _TIF_NOTSC (1 << TIF_NOTSC)
113#define _TIF_IA32 (1 << TIF_IA32) 115#define _TIF_IA32 (1 << TIF_IA32)
114#define _TIF_FORK (1 << TIF_FORK) 116#define _TIF_FORK (1 << TIF_FORK)
@@ -246,7 +248,23 @@ static inline void set_restore_sigmask(void)
246{ 248{
247 struct thread_info *ti = current_thread_info(); 249 struct thread_info *ti = current_thread_info();
248 ti->status |= TS_RESTORE_SIGMASK; 250 ti->status |= TS_RESTORE_SIGMASK;
249 set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags); 251 WARN_ON(!test_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags));
252}
253static inline void clear_restore_sigmask(void)
254{
255 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
256}
257static inline bool test_restore_sigmask(void)
258{
259 return current_thread_info()->status & TS_RESTORE_SIGMASK;
260}
261static inline bool test_and_clear_restore_sigmask(void)
262{
263 struct thread_info *ti = current_thread_info();
264 if (!(ti->status & TS_RESTORE_SIGMASK))
265 return false;
266 ti->status &= ~TS_RESTORE_SIGMASK;
267 return true;
250} 268}
251 269
252static inline bool is_ia32_task(void) 270static inline bool is_ia32_task(void)
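
These helpers replace open-coded TS_RESTORE_SIGMASK manipulation; the expected consumer on the return-to-user path looks roughly like this (a hedged sketch):

/* No atomics needed: ti->status is only ever touched by current. */
static void restore_saved_sigmask(void)
{
	if (test_and_clear_restore_sigmask())
		set_current_blocked(&current->saved_sigmask);
}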
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
deleted file mode 100644
index feca3118a73..00000000000
--- a/arch/x86/include/asm/trampoline.h
+++ /dev/null
@@ -1,39 +0,0 @@
1#ifndef _ASM_X86_TRAMPOLINE_H
2#define _ASM_X86_TRAMPOLINE_H
3
4#ifndef __ASSEMBLY__
5
6#include <linux/types.h>
7#include <asm/io.h>
8
9/*
10 * Trampoline 80x86 program as an array. These are in the init rodata
11 * segment, but that's okay, because we only care about the relative
12 * addresses of the symbols.
13 */
14extern const unsigned char x86_trampoline_start [];
15extern const unsigned char x86_trampoline_end [];
16extern unsigned char *x86_trampoline_base;
17
18extern unsigned long init_rsp;
19extern unsigned long initial_code;
20extern unsigned long initial_gs;
21
22extern void __init setup_trampolines(void);
23
24extern const unsigned char trampoline_data[];
25extern const unsigned char trampoline_status[];
26
27#define TRAMPOLINE_SYM(x) \
28 ((void *)(x86_trampoline_base + \
29 ((const unsigned char *)(x) - x86_trampoline_start)))
30
31/* Address of the SMP trampoline */
32static inline unsigned long trampoline_address(void)
33{
34 return virt_to_phys(TRAMPOLINE_SYM(trampoline_data));
35}
36
37#endif /* __ASSEMBLY__ */
38
39#endif /* _ASM_X86_TRAMPOLINE_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 851fe0dc13b..e1f3a17034f 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -32,9 +32,9 @@
32 32
33#define segment_eq(a, b) ((a).seg == (b).seg) 33#define segment_eq(a, b) ((a).seg == (b).seg)
34 34
35#define __addr_ok(addr) \ 35#define user_addr_max() (current_thread_info()->addr_limit.seg)
36 ((unsigned long __force)(addr) < \ 36#define __addr_ok(addr) \
37 (current_thread_info()->addr_limit.seg)) 37 ((unsigned long __force)(addr) < user_addr_max())
38 38
39/* 39/*
40 * Test whether a block of memory is a valid user space address. 40 * Test whether a block of memory is a valid user space address.
@@ -46,14 +46,14 @@
46 * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... 46 * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
47 */ 47 */
48 48
49#define __range_not_ok(addr, size) \ 49#define __range_not_ok(addr, size, limit) \
50({ \ 50({ \
51 unsigned long flag, roksum; \ 51 unsigned long flag, roksum; \
52 __chk_user_ptr(addr); \ 52 __chk_user_ptr(addr); \
53 asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \ 53 asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \
54 : "=&r" (flag), "=r" (roksum) \ 54 : "=&r" (flag), "=r" (roksum) \
55 : "1" (addr), "g" ((long)(size)), \ 55 : "1" (addr), "g" ((long)(size)), \
56 "rm" (current_thread_info()->addr_limit.seg)); \ 56 "rm" (limit)); \
57 flag; \ 57 flag; \
58}) 58})
59 59
@@ -76,7 +76,8 @@
76 * checks that the pointer is in the user space range - after calling 76 * checks that the pointer is in the user space range - after calling
77 * this function, memory access functions may still return -EFAULT. 77 * this function, memory access functions may still return -EFAULT.
78 */ 78 */
79#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0)) 79#define access_ok(type, addr, size) \
80 (likely(__range_not_ok(addr, size, user_addr_max()) == 0))
80 81
81/* 82/*
82 * The exception table consists of pairs of addresses relative to the 83 * The exception table consists of pairs of addresses relative to the
@@ -565,6 +566,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
565extern __must_check long 566extern __must_check long
566strncpy_from_user(char *dst, const char __user *src, long count); 567strncpy_from_user(char *dst, const char __user *src, long count);
567 568
569extern __must_check long strlen_user(const char __user *str);
570extern __must_check long strnlen_user(const char __user *str, long n);
571
568/* 572/*
569 * movsl can be slow when source and dest are not both 8-byte aligned 573 * movsl can be slow when source and dest are not both 8-byte aligned
570 */ 574 */
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 8084bc73b18..576e39bca6a 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -213,23 +213,6 @@ static inline unsigned long __must_check copy_from_user(void *to,
213 return n; 213 return n;
214} 214}
215 215
216/**
217 * strlen_user: - Get the size of a string in user space.
218 * @str: The string to measure.
219 *
220 * Context: User context only. This function may sleep.
221 *
222 * Get the size of a NUL-terminated string in user space.
223 *
224 * Returns the size of the string INCLUDING the terminating NUL.
225 * On exception, returns 0.
226 *
227 * If there is a limit on the length of a valid string, you may wish to
228 * consider using strnlen_user() instead.
229 */
230#define strlen_user(str) strnlen_user(str, LONG_MAX)
231
232long strnlen_user(const char __user *str, long n);
233unsigned long __must_check clear_user(void __user *mem, unsigned long len); 216unsigned long __must_check clear_user(void __user *mem, unsigned long len);
234unsigned long __must_check __clear_user(void __user *mem, unsigned long len); 217unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
235 218
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index fcd4b6f3ef0..d8def8b3dba 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -17,6 +17,8 @@
17 17
18/* Handles exceptions in both to and from, but doesn't do access_ok */ 18/* Handles exceptions in both to and from, but doesn't do access_ok */
19__must_check unsigned long 19__must_check unsigned long
20copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
21__must_check unsigned long
20copy_user_generic_string(void *to, const void *from, unsigned len); 22copy_user_generic_string(void *to, const void *from, unsigned len);
21__must_check unsigned long 23__must_check unsigned long
22copy_user_generic_unrolled(void *to, const void *from, unsigned len); 24copy_user_generic_unrolled(void *to, const void *from, unsigned len);
@@ -26,9 +28,16 @@ copy_user_generic(void *to, const void *from, unsigned len)
26{ 28{
27 unsigned ret; 29 unsigned ret;
28 30
29 alternative_call(copy_user_generic_unrolled, 31 /*
32 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
33 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
34 * Otherwise, use copy_user_generic_unrolled.
35 */
36 alternative_call_2(copy_user_generic_unrolled,
30 copy_user_generic_string, 37 copy_user_generic_string,
31 X86_FEATURE_REP_GOOD, 38 X86_FEATURE_REP_GOOD,
39 copy_user_enhanced_fast_string,
40 X86_FEATURE_ERMS,
32 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), 41 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
33 "=d" (len)), 42 "=d" (len)),
34 "1" (to), "2" (from), "3" (len) 43 "1" (to), "2" (from), "3" (len)
@@ -208,9 +217,6 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
208 } 217 }
209} 218}
210 219
211__must_check long strnlen_user(const char __user *str, long n);
212__must_check long __strnlen_user(const char __user *str, long n);
213__must_check long strlen_user(const char __user *str);
214__must_check unsigned long clear_user(void __user *mem, unsigned long len); 220__must_check unsigned long clear_user(void __user *mem, unsigned long len);
215__must_check unsigned long __clear_user(void __user *mem, unsigned long len); 221__must_check unsigned long __clear_user(void __user *mem, unsigned long len);
216 222
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
new file mode 100644
index 00000000000..f3971bbcd1d
--- /dev/null
+++ b/arch/x86/include/asm/uprobes.h
@@ -0,0 +1,57 @@
1#ifndef _ASM_UPROBES_H
2#define _ASM_UPROBES_H
3/*
4 * User-space Probes (UProbes) for x86
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright (C) IBM Corporation, 2008-2011
21 * Authors:
22 * Srikar Dronamraju
23 * Jim Keniston
24 */
25
26#include <linux/notifier.h>
27
28typedef u8 uprobe_opcode_t;
29
30#define MAX_UINSN_BYTES 16
31#define UPROBE_XOL_SLOT_BYTES 128 /* to keep it cache aligned */
32
33#define UPROBE_SWBP_INSN 0xcc
34#define UPROBE_SWBP_INSN_SIZE 1
35
36struct arch_uprobe {
37 u16 fixups;
38 u8 insn[MAX_UINSN_BYTES];
39#ifdef CONFIG_X86_64
40 unsigned long rip_rela_target_address;
41#endif
42};
43
44struct arch_uprobe_task {
45 unsigned long saved_trap_nr;
46#ifdef CONFIG_X86_64
47 unsigned long saved_scratch_register;
48#endif
49};
50
51extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr);
52extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
53extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
54extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
55extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data);
56extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs);
57#endif /* _ASM_UPROBES_H */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index becf47b8173..a06983cdc12 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -140,6 +140,9 @@
140#define IPI_RESET_LIMIT 1 140#define IPI_RESET_LIMIT 1
141/* after this # consecutive successes, bump up the throttle if it was lowered */ 141/* after this # consecutive successes, bump up the throttle if it was lowered */
142#define COMPLETE_THRESHOLD 5 142#define COMPLETE_THRESHOLD 5
 143/* after this # of giveups (fall back to kernel IPIs) disable the use of
144 the BAU for a period of time */
145#define GIVEUP_LIMIT 100
143 146
144#define UV_LB_SUBNODEID 0x10 147#define UV_LB_SUBNODEID 0x10
145 148
@@ -149,7 +152,6 @@
149/* 4 bits of software ack period */ 152/* 4 bits of software ack period */
150#define UV2_ACK_MASK 0x7UL 153#define UV2_ACK_MASK 0x7UL
151#define UV2_ACK_UNITS_SHFT 3 154#define UV2_ACK_UNITS_SHFT 3
152#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT
153#define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 155#define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
154 156
155/* 157/*
@@ -167,7 +169,6 @@
167#define FLUSH_RETRY_TIMEOUT 2 169#define FLUSH_RETRY_TIMEOUT 2
168#define FLUSH_GIVEUP 3 170#define FLUSH_GIVEUP 3
169#define FLUSH_COMPLETE 4 171#define FLUSH_COMPLETE 4
170#define FLUSH_RETRY_BUSYBUG 5
171 172
172/* 173/*
173 * tuning the action when the numalink network is extremely delayed 174 * tuning the action when the numalink network is extremely delayed
@@ -176,7 +177,7 @@
176 microseconds */ 177 microseconds */
177#define CONGESTED_REPS 10 /* long delays averaged over 178#define CONGESTED_REPS 10 /* long delays averaged over
178 this many broadcasts */ 179 this many broadcasts */
179#define CONGESTED_PERIOD 30 /* time for the bau to be 180#define DISABLED_PERIOD 10 /* time for the bau to be
180 disabled, in seconds */ 181 disabled, in seconds */
181/* see msg_type: */ 182/* see msg_type: */
182#define MSG_NOOP 0 183#define MSG_NOOP 0
@@ -521,6 +522,12 @@ struct ptc_stats {
521 unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ 522 unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */
522 unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ 523 unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */
523 unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ 524 unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */
525 unsigned long s_overipilimit; /* over the ipi reset limit */
526 unsigned long s_giveuplimit; /* disables, over giveup limit*/
527 unsigned long s_enters; /* entries to the driver */
528 unsigned long s_ipifordisabled; /* fall back to IPI; disabled */
529 unsigned long s_plugged; /* plugged by h/w bug*/
530 unsigned long s_congested; /* giveup on long wait */
524 /* destination statistics */ 531 /* destination statistics */
525 unsigned long d_alltlb; /* times all tlb's on this 532 unsigned long d_alltlb; /* times all tlb's on this
526 cpu were flushed */ 533 cpu were flushed */
@@ -587,8 +594,8 @@ struct bau_control {
587 int timeout_tries; 594 int timeout_tries;
588 int ipi_attempts; 595 int ipi_attempts;
589 int conseccompletes; 596 int conseccompletes;
590 int baudisabled; 597 short nobau;
591 int set_bau_off; 598 short baudisabled;
592 short cpu; 599 short cpu;
593 short osnode; 600 short osnode;
594 short uvhub_cpu; 601 short uvhub_cpu;
@@ -597,14 +604,16 @@ struct bau_control {
597 short cpus_in_socket; 604 short cpus_in_socket;
598 short cpus_in_uvhub; 605 short cpus_in_uvhub;
599 short partition_base_pnode; 606 short partition_base_pnode;
600 short using_desc; /* an index, like uvhub_cpu */ 607 short busy; /* all were busy (war) */
601 unsigned int inuse_map;
602 unsigned short message_number; 608 unsigned short message_number;
603 unsigned short uvhub_quiesce; 609 unsigned short uvhub_quiesce;
604 short socket_acknowledge_count[DEST_Q_SIZE]; 610 short socket_acknowledge_count[DEST_Q_SIZE];
605 cycles_t send_message; 611 cycles_t send_message;
612 cycles_t period_end;
613 cycles_t period_time;
606 spinlock_t uvhub_lock; 614 spinlock_t uvhub_lock;
607 spinlock_t queue_lock; 615 spinlock_t queue_lock;
616 spinlock_t disable_lock;
608 /* tunables */ 617 /* tunables */
609 int max_concurr; 618 int max_concurr;
610 int max_concurr_const; 619 int max_concurr_const;
@@ -615,9 +624,9 @@ struct bau_control {
615 int complete_threshold; 624 int complete_threshold;
616 int cong_response_us; 625 int cong_response_us;
617 int cong_reps; 626 int cong_reps;
618 int cong_period; 627 cycles_t disabled_period;
619 unsigned long clocks_per_100_usec; 628 int period_giveups;
620 cycles_t period_time; 629 int giveup_limit;
621 long period_requests; 630 long period_requests;
622 struct hub_and_pnode *thp; 631 struct hub_and_pnode *thp;
623}; 632};
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
index c4b9dc2f67c..44282fbf7bf 100644
--- a/arch/x86/include/asm/vga.h
+++ b/arch/x86/include/asm/vga.h
@@ -17,4 +17,10 @@
17#define vga_readb(x) (*(x)) 17#define vga_readb(x) (*(x))
18#define vga_writeb(x, y) (*(y) = (x)) 18#define vga_writeb(x, y) (*(y) = (x))
19 19
20#ifdef CONFIG_FB_EFI
21#define __ARCH_HAS_VGA_DEFAULT_DEVICE
22extern struct pci_dev *vga_default_device(void);
23extern void vga_set_default_device(struct pci_dev *pdev);
24#endif
25
20#endif /* _ASM_X86_VGA_H */ 26#endif /* _ASM_X86_VGA_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 31f180c21ce..74fcb963595 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -60,6 +60,7 @@
60#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 60#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
61#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 61#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
62#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 62#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
63#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
63 64
64 65
65#define PIN_BASED_EXT_INTR_MASK 0x00000001 66#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -281,6 +282,7 @@ enum vmcs_field {
281#define EXIT_REASON_EPT_MISCONFIG 49 282#define EXIT_REASON_EPT_MISCONFIG 49
282#define EXIT_REASON_WBINVD 54 283#define EXIT_REASON_WBINVD 54
283#define EXIT_REASON_XSETBV 55 284#define EXIT_REASON_XSETBV 55
285#define EXIT_REASON_INVPCID 58
284 286
285/* 287/*
286 * Interruption-information format 288 * Interruption-information format
@@ -404,6 +406,7 @@ enum vmcs_field {
404#define VMX_EPTP_WB_BIT (1ull << 14) 406#define VMX_EPTP_WB_BIT (1ull << 14)
405#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 407#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
406#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) 408#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
409#define VMX_EPT_AD_BIT (1ull << 21)
407#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 410#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
408#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 411#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
409#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 412#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -415,11 +418,14 @@ enum vmcs_field {
415#define VMX_EPT_MAX_GAW 0x4 418#define VMX_EPT_MAX_GAW 0x4
416#define VMX_EPT_MT_EPTE_SHIFT 3 419#define VMX_EPT_MT_EPTE_SHIFT 3
417#define VMX_EPT_GAW_EPTP_SHIFT 3 420#define VMX_EPT_GAW_EPTP_SHIFT 3
421#define VMX_EPT_AD_ENABLE_BIT (1ull << 6)
418#define VMX_EPT_DEFAULT_MT 0x6ull 422#define VMX_EPT_DEFAULT_MT 0x6ull
419#define VMX_EPT_READABLE_MASK 0x1ull 423#define VMX_EPT_READABLE_MASK 0x1ull
420#define VMX_EPT_WRITABLE_MASK 0x2ull 424#define VMX_EPT_WRITABLE_MASK 0x2ull
421#define VMX_EPT_EXECUTABLE_MASK 0x4ull 425#define VMX_EPT_EXECUTABLE_MASK 0x4ull
422#define VMX_EPT_IPAT_BIT (1ull << 6) 426#define VMX_EPT_IPAT_BIT (1ull << 6)
427#define VMX_EPT_ACCESS_BIT (1ull << 8)
428#define VMX_EPT_DIRTY_BIT (1ull << 9)
423 429
424#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 430#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
425 431
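
The A/D additions pair up: VMX_EPT_AD_BIT reports support in the EPT/VPID capability MSR, while VMX_EPT_AD_ENABLE_BIT is set in the EPTP to switch the tracking on. A hedged sketch of EPTP construction (vmx_ept_cap stands in for the cached capability MSR value):

static u64 construct_eptp(u64 root_hpa, u64 vmx_ept_cap)
{
	u64 eptp = VMX_EPT_DEFAULT_MT |
		   (VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT);

	if (vmx_ept_cap & VMX_EPT_AD_BIT)
		eptp |= VMX_EPT_AD_ENABLE_BIT;	/* hw sets accessed/dirty bits */

	return eptp | (root_hpa & PAGE_MASK);
}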
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
index e58f03b206c..5b238981542 100644
--- a/arch/x86/include/asm/word-at-a-time.h
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_WORD_AT_A_TIME_H 1#ifndef _ASM_WORD_AT_A_TIME_H
2#define _ASM_WORD_AT_A_TIME_H 2#define _ASM_WORD_AT_A_TIME_H
3 3
4#include <linux/kernel.h>
5
4/* 6/*
5 * This is largely generic for little-endian machines, but the 7 * This is largely generic for little-endian machines, but the
6 * optimal byte mask counting is probably going to be something 8 * optimal byte mask counting is probably going to be something
@@ -8,6 +10,11 @@
8 * bit count instruction, that might be better than the multiply 10 * bit count instruction, that might be better than the multiply
9 * and shift, for example. 11 * and shift, for example.
10 */ 12 */
13struct word_at_a_time {
14 const unsigned long one_bits, high_bits;
15};
16
17#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
11 18
12#ifdef CONFIG_64BIT 19#ifdef CONFIG_64BIT
13 20
@@ -35,12 +42,31 @@ static inline long count_masked_bytes(long mask)
35 42
36#endif 43#endif
37 44
38#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) 45/* Return nonzero if it has a zero */
46static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
47{
48 unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
49 *bits = mask;
50 return mask;
51}
52
53static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
54{
55 return bits;
56}
57
58static inline unsigned long create_zero_mask(unsigned long bits)
59{
60 bits = (bits - 1) & ~bits;
61 return bits >> 7;
62}
63
64/* The mask we created is directly usable as a bytemask */
65#define zero_bytemask(mask) (mask)
39 66
40/* Return the high bit set in the first byte that is a zero */ 67static inline unsigned long find_zero(unsigned long mask)
41static inline unsigned long has_zero(unsigned long a)
42{ 68{
43 return ((a - REPEAT_BYTE(0x01)) & ~a) & REPEAT_BYTE(0x80); 69 return count_masked_bytes(mask);
44} 70}
45 71
46/* 72/*
diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
index 92e54abf89e..f90f0a587c6 100644
--- a/arch/x86/include/asm/x2apic.h
+++ b/arch/x86/include/asm/x2apic.h
@@ -9,15 +9,6 @@
9#include <asm/ipi.h> 9#include <asm/ipi.h>
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11 11
12/*
13 * Need to use more than cpu 0, because we need more vectors
14 * when MSI-X is used.
15 */
16static const struct cpumask *x2apic_target_cpus(void)
17{
18 return cpu_online_mask;
19}
20
21static int x2apic_apic_id_valid(int apicid) 12static int x2apic_apic_id_valid(int apicid)
22{ 13{
23 return 1; 14 return 1;
@@ -28,15 +19,6 @@ static int x2apic_apic_id_registered(void)
28 return 1; 19 return 1;
29} 20}
30 21
31/*
32 * For now each logical cpu is in its own vector allocation domain.
33 */
34static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
35{
36 cpumask_clear(retmask);
37 cpumask_set_cpu(cpu, retmask);
38}
39
40static void 22static void
41__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) 23__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
42{ 24{
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 42d2ae18dab..38155f66714 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -163,6 +163,7 @@ struct x86_cpuinit_ops {
163 * @i8042_detect pre-detect if i8042 controller exists 163 * @i8042_detect pre-detect if i8042 controller exists
164 * @save_sched_clock_state: save state for sched_clock() on suspend 164 * @save_sched_clock_state: save state for sched_clock() on suspend
165 * @restore_sched_clock_state: restore state for sched_clock() on resume 165 * @restore_sched_clock_state: restore state for sched_clock() on resume
166 * @apic_post_init: adjust apic if needed
166 */ 167 */
167struct x86_platform_ops { 168struct x86_platform_ops {
168 unsigned long (*calibrate_tsc)(void); 169 unsigned long (*calibrate_tsc)(void);
@@ -175,6 +176,7 @@ struct x86_platform_ops {
175 int (*i8042_detect)(void); 176 int (*i8042_detect)(void);
176 void (*save_sched_clock_state)(void); 177 void (*save_sched_clock_state)(void);
177 void (*restore_sched_clock_state)(void); 178 void (*restore_sched_clock_state)(void);
179 void (*apic_post_init)(void);
178}; 180};
179 181
180struct pci_dev; 182struct pci_dev;
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index 1df35417c41..cc146d51449 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -6,6 +6,7 @@ enum ipi_vector {
6 XEN_CALL_FUNCTION_VECTOR, 6 XEN_CALL_FUNCTION_VECTOR,
7 XEN_CALL_FUNCTION_SINGLE_VECTOR, 7 XEN_CALL_FUNCTION_SINGLE_VECTOR,
8 XEN_SPIN_UNLOCK_VECTOR, 8 XEN_SPIN_UNLOCK_VECTOR,
9 XEN_IRQ_WORK_VECTOR,
9 10
10 XEN_NR_IPIS, 11 XEN_NR_IPIS,
11}; 12};
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 5728852fb90..59c226d120c 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -48,6 +48,7 @@
48#include <xen/interface/sched.h> 48#include <xen/interface/sched.h>
49#include <xen/interface/physdev.h> 49#include <xen/interface/physdev.h>
50#include <xen/interface/platform.h> 50#include <xen/interface/platform.h>
51#include <xen/interface/xen-mca.h>
51 52
52/* 53/*
53 * The hypercall asms have to meet several constraints: 54 * The hypercall asms have to meet several constraints:
@@ -302,6 +303,13 @@ HYPERVISOR_set_timer_op(u64 timeout)
302} 303}
303 304
304static inline int 305static inline int
306HYPERVISOR_mca(struct xen_mc *mc_op)
307{
308 mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
309 return _hypercall1(int, mca, mc_op);
310}
311
312static inline int
305HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) 313HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
306{ 314{
307 platform_op->interface_version = XENPF_INTERFACE_VERSION; 315 platform_op->interface_version = XENPF_INTERFACE_VERSION;
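HYPERVISOR_mca() repeats the idiom visible in HYPERVISOR_dom0_op() just below it: the wrapper stamps the expected ABI revision into the argument before issuing the hypercall, so no call site can forget it or pass a stale value. A hedged stand-alone sketch of that pattern; the struct layout, the version number, and do_hypercall() are placeholders for the real Xen definitions, not the actual interface:

#include <stdint.h>

#define XEN_MCA_INTERFACE_VERSION 0x01ec0003u	/* illustrative value only */

struct xen_mc {
	uint32_t cmd;
	uint32_t interface_version;	/* must match the hypervisor's ABI */
	/* command-specific payload follows in the real interface */
};

/* placeholder for _hypercall1(int, mca, mc_op) */
static int do_hypercall(struct xen_mc *op)
{
	return op->interface_version == XEN_MCA_INTERFACE_VERSION ? 0 : -38; /* -ENOSYS */
}

static int hypervisor_mca(struct xen_mc *mc_op)
{
	/* stamping here means no caller can get the version wrong */
	mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
	return do_hypercall(mc_op);
}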
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c34f96c2f7a..93971e841dd 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -44,6 +44,7 @@ extern unsigned long machine_to_phys_nr;
44 44
45extern unsigned long get_phys_to_machine(unsigned long pfn); 45extern unsigned long get_phys_to_machine(unsigned long pfn);
46extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 46extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
47extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn);
47extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); 48extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
48extern unsigned long set_phys_range_identity(unsigned long pfn_s, 49extern unsigned long set_phys_range_identity(unsigned long pfn_s,
49 unsigned long pfn_e); 50 unsigned long pfn_e);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index bb8529275aa..8215e5652d9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -35,7 +35,6 @@ obj-y += tsc.o io_delay.o rtc.o
35obj-y += pci-iommu_table.o 35obj-y += pci-iommu_table.o
36obj-y += resource.o 36obj-y += resource.o
37 37
38obj-y += trampoline.o trampoline_$(BITS).o
39obj-y += process.o 38obj-y += process.o
40obj-y += i387.o xsave.o 39obj-y += i387.o xsave.o
41obj-y += ptrace.o 40obj-y += ptrace.o
@@ -48,7 +47,6 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o
48obj-y += cpu/ 47obj-y += cpu/
49obj-y += acpi/ 48obj-y += acpi/
50obj-y += reboot.o 49obj-y += reboot.o
51obj-$(CONFIG_X86_32) += reboot_32.o
52obj-$(CONFIG_X86_MSR) += msr.o 50obj-$(CONFIG_X86_MSR) += msr.o
53obj-$(CONFIG_X86_CPUID) += cpuid.o 51obj-$(CONFIG_X86_CPUID) += cpuid.o
54obj-$(CONFIG_PCI) += early-quirks.o 52obj-$(CONFIG_PCI) += early-quirks.o
@@ -100,6 +98,7 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
100 98
101obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 99obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
102obj-$(CONFIG_OF) += devicetree.o 100obj-$(CONFIG_OF) += devicetree.o
101obj-$(CONFIG_UPROBES) += uprobes.o
103 102
104### 103###
105# 64 bit specific files 104# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 6f35260bb3e..163b2258147 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,14 +1,7 @@
1subdir- := realmode
2
3obj-$(CONFIG_ACPI) += boot.o 1obj-$(CONFIG_ACPI) += boot.o
4obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o 2obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
5 3
6ifneq ($(CONFIG_ACPI_PROCESSOR),) 4ifneq ($(CONFIG_ACPI_PROCESSOR),)
7obj-y += cstate.o 5obj-y += cstate.o
8endif 6endif
9 7
10$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
11
12$(obj)/realmode/wakeup.bin: FORCE
13 $(Q)$(MAKE) $(build)=$(obj)/realmode
14
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8afb6931981..b2297e58c6e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -422,12 +422,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
422 return 0; 422 return 0;
423 } 423 }
424 424
425 if (intsrc->source_irq == 0 && intsrc->global_irq == 2) { 425 if (intsrc->source_irq == 0) {
426 if (acpi_skip_timer_override) { 426 if (acpi_skip_timer_override) {
427 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); 427 printk(PREFIX "BIOS IRQ0 override ignored.\n");
428 return 0; 428 return 0;
429 } 429 }
430 if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { 430
431 if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
432 && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
431 intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; 433 intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
432 printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); 434 printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
433 } 435 }
@@ -1334,17 +1336,12 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
1334} 1336}
1335 1337
1336/* 1338/*
1337 * Force ignoring BIOS IRQ0 pin2 override 1339 * Force ignoring BIOS IRQ0 override
1338 */ 1340 */
1339static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) 1341static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
1340{ 1342{
1341 /*
1342 * The ati_ixp4x0_rev() early PCI quirk should have set
1343 * the acpi_skip_timer_override flag already:
1344 */
1345 if (!acpi_skip_timer_override) { 1343 if (!acpi_skip_timer_override) {
1346 WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); 1344 pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
1347 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
1348 d->ident); 1345 d->ident);
1349 acpi_skip_timer_override = 1; 1346 acpi_skip_timer_override = 1;
1350 } 1347 }
@@ -1438,7 +1435,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
1438 * is enabled. This input is incorrectly designated the 1435 * is enabled. This input is incorrectly designated the
1439 * ISA IRQ 0 via an interrupt source override even though 1436 * ISA IRQ 0 via an interrupt source override even though
1440 * it is wired to the output of the master 8259A and INTIN0 1437 * it is wired to the output of the master 8259A and INTIN0
1441 * is not connected at all. Force ignoring BIOS IRQ0 pin2 1438 * is not connected at all. Force ignoring BIOS IRQ0
1442 * override in such cases. 1439 * override in such cases.
1443 */ 1440 */
1444 { 1441 {
@@ -1473,6 +1470,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
1473 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), 1470 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
1474 }, 1471 },
1475 }, 1472 },
1473 {
1474 .callback = dmi_ignore_irq0_timer_override,
1475 .ident = "FUJITSU SIEMENS",
1476 .matches = {
1477 DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
1478 DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
1479 },
1480 },
1476 {} 1481 {}
1477}; 1482};
1478 1483
diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore
deleted file mode 100644
index 58f1f48a58f..00000000000
--- a/arch/x86/kernel/acpi/realmode/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
1wakeup.bin
2wakeup.elf
3wakeup.lds
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
deleted file mode 100644
index 6a564ac67ef..00000000000
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
1#
2# arch/x86/kernel/acpi/realmode/Makefile
3#
4# This file is subject to the terms and conditions of the GNU General Public
5# License. See the file "COPYING" in the main directory of this archive
6# for more details.
7#
8
9always := wakeup.bin
10targets := wakeup.elf wakeup.lds
11
12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
13
14# The link order of the video-*.o modules can matter. In particular,
15# video-vga.o *must* be listed first, followed by video-vesa.o.
16# Hardware-specific drivers should follow in the order they should be
17# probed, and video-bios.o should typically be last.
18wakeup-y += video-vga.o
19wakeup-y += video-vesa.o
20wakeup-y += video-bios.o
21
22targets += $(wakeup-y)
23
24bootsrc := $(src)/../../../boot
25
26# ---------------------------------------------------------------------------
27
28# How to compile the 16-bit code. Note we always compile for -march=i386,
29# that way we can complain to the user if the CPU is insufficient.
30# Compile with _SETUP since this is similar to the boot-time setup code.
31KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
32 -I$(srctree)/$(bootsrc) \
33 $(cflags-y) \
34 -Wall -Wstrict-prototypes \
35 -march=i386 -mregparm=3 \
36 -include $(srctree)/$(bootsrc)/code16gcc.h \
37 -fno-strict-aliasing -fomit-frame-pointer \
38 $(call cc-option, -ffreestanding) \
39 $(call cc-option, -fno-toplevel-reorder,\
40 $(call cc-option, -fno-unit-at-a-time)) \
41 $(call cc-option, -fno-stack-protector) \
42 $(call cc-option, -mpreferred-stack-boundary=2)
43KBUILD_CFLAGS += $(call cc-option, -m32)
44KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
45GCOV_PROFILE := n
46
47WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
48
49LDFLAGS_wakeup.elf := -T
50
51CPPFLAGS_wakeup.lds += -P -C
52
53$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE
54 $(call if_changed,ld)
55
56OBJCOPYFLAGS_wakeup.bin := -O binary
57
58$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE
59 $(call if_changed,objcopy)
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
deleted file mode 100644
index f51eb0bb56c..00000000000
--- a/arch/x86/kernel/acpi/realmode/bioscall.S
+++ /dev/null
@@ -1 +0,0 @@
1#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S
deleted file mode 100644
index dc59ebee69d..00000000000
--- a/arch/x86/kernel/acpi/realmode/copy.S
+++ /dev/null
@@ -1 +0,0 @@
1#include "../../../boot/copy.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
deleted file mode 100644
index 6206033ba20..00000000000
--- a/arch/x86/kernel/acpi/realmode/regs.c
+++ /dev/null
@@ -1 +0,0 @@
1#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c
deleted file mode 100644
index 7deabc144a2..00000000000
--- a/arch/x86/kernel/acpi/realmode/video-bios.c
+++ /dev/null
@@ -1 +0,0 @@
1#include "../../../boot/video-bios.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c
deleted file mode 100644
index 328ad209f11..00000000000
--- a/arch/x86/kernel/acpi/realmode/video-mode.c
+++ /dev/null
@@ -1 +0,0 @@
1#include "../../../boot/video-mode.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c
deleted file mode 100644
index 9dbb9672226..00000000000
--- a/arch/x86/kernel/acpi/realmode/video-vesa.c
+++ /dev/null
@@ -1 +0,0 @@
1#include "../../../boot/video-vesa.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c
deleted file mode 100644
index bcc81255f37..00000000000
--- a/arch/x86/kernel/acpi/realmode/video-vga.c
+++ /dev/null
@@ -1 +0,0 @@
1#include "../../../boot/video-vga.c"
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
deleted file mode 100644
index d4f8010a5b1..00000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ /dev/null
@@ -1,62 +0,0 @@
1/*
2 * wakeup.lds.S
3 *
4 * Linker script for the real-mode wakeup code
5 */
6#undef i386
7#include "wakeup.h"
8
9OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
10OUTPUT_ARCH(i386)
11ENTRY(_start)
12
13SECTIONS
14{
15 . = 0;
16 .jump : {
17 *(.jump)
18 } = 0x90909090
19
20 . = WAKEUP_HEADER_OFFSET;
21 .header : {
22 *(.header)
23 }
24
25 . = ALIGN(16);
26 .text : {
27 *(.text*)
28 } = 0x90909090
29
30 . = ALIGN(16);
31 .rodata : {
32 *(.rodata*)
33 }
34
35 .videocards : {
36 video_cards = .;
37 *(.videocards)
38 video_cards_end = .;
39 }
40
41 . = ALIGN(16);
42 .data : {
43 *(.data*)
44 }
45
46 . = ALIGN(16);
47 .bss : {
48 __bss_start = .;
49 *(.bss)
50 __bss_end = .;
51 }
52
53 .signature : {
54 *(.signature)
55 }
56
57 _end = .;
58
59 /DISCARD/ : {
60 *(.note*)
61 }
62}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 146a49c763a..95bf99de905 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -14,8 +14,9 @@
14#include <asm/desc.h> 14#include <asm/desc.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include <asm/cacheflush.h> 16#include <asm/cacheflush.h>
17#include <asm/realmode.h>
17 18
18#include "realmode/wakeup.h" 19#include "../../realmode/rm/wakeup.h"
19#include "sleep.h" 20#include "sleep.h"
20 21
21unsigned long acpi_realmode_flags; 22unsigned long acpi_realmode_flags;
@@ -36,13 +37,9 @@ asmlinkage void acpi_enter_s3(void)
36 */ 37 */
37int acpi_suspend_lowlevel(void) 38int acpi_suspend_lowlevel(void)
38{ 39{
39 struct wakeup_header *header; 40 struct wakeup_header *header =
40 /* address in low memory of the wakeup routine. */ 41 (struct wakeup_header *) __va(real_mode_header->wakeup_header);
41 char *acpi_realmode;
42 42
43 acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
44
45 header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
46 if (header->signature != WAKEUP_HEADER_SIGNATURE) { 43 if (header->signature != WAKEUP_HEADER_SIGNATURE) {
47 printk(KERN_ERR "wakeup header does not match\n"); 44 printk(KERN_ERR "wakeup header does not match\n");
48 return -EINVAL; 45 return -EINVAL;
@@ -50,27 +47,6 @@ int acpi_suspend_lowlevel(void)
50 47
51 header->video_mode = saved_video_mode; 48 header->video_mode = saved_video_mode;
52 49
53 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
54
55 /*
56 * Set up the wakeup GDT. We set these up as Big Real Mode,
57 * that is, with limits set to 4 GB. At least the Lenovo
58 * Thinkpad X61 is known to need this for the video BIOS
59 * initialization quirk to work; this is likely to also
60 * be the case for other laptops or integrated video devices.
61 */
62
63 /* GDT[0]: GDT self-pointer */
64 header->wakeup_gdt[0] =
65 (u64)(sizeof(header->wakeup_gdt) - 1) +
66 ((u64)__pa(&header->wakeup_gdt) << 16);
67 /* GDT[1]: big real mode-like code segment */
68 header->wakeup_gdt[1] =
69 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
70 /* GDT[2]: big real mode-like data segment */
71 header->wakeup_gdt[2] =
72 GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
73
74#ifndef CONFIG_64BIT 50#ifndef CONFIG_64BIT
75 store_gdt((struct desc_ptr *)&header->pmode_gdt); 51 store_gdt((struct desc_ptr *)&header->pmode_gdt);
76 52
@@ -95,7 +71,6 @@ int acpi_suspend_lowlevel(void)
95 header->pmode_cr3 = (u32)__pa(&initial_page_table); 71 header->pmode_cr3 = (u32)__pa(&initial_page_table);
96 saved_magic = 0x12345678; 72 saved_magic = 0x12345678;
97#else /* CONFIG_64BIT */ 73#else /* CONFIG_64BIT */
98 header->trampoline_segment = trampoline_address() >> 4;
99#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
100 stack_start = (unsigned long)temp_stack + sizeof(temp_stack); 75 stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
101 early_gdt_descr.address = 76 early_gdt_descr.address =
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index d68677a2a01..5653a5791ec 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -2,8 +2,8 @@
2 * Variables and functions used by the code in sleep.c 2 * Variables and functions used by the code in sleep.c
3 */ 3 */
4 4
5#include <asm/trampoline.h>
6#include <linux/linkage.h> 5#include <linux/linkage.h>
6#include <asm/realmode.h>
7 7
8extern unsigned long saved_video_mode; 8extern unsigned long saved_video_mode;
9extern long saved_magic; 9extern long saved_magic;
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
deleted file mode 100644
index 63b8ab524f2..00000000000
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ /dev/null
@@ -1,12 +0,0 @@
1/*
2 * Wrapper script for the realmode binary as a transport object
3 * before copying to low memory.
4 */
5#include <asm/page_types.h>
6
7 .section ".x86_trampoline","a"
8 .balign PAGE_SIZE
9 .globl acpi_wakeup_code
10acpi_wakeup_code:
11 .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
12 .size acpi_wakeup_code, .-acpi_wakeup_code
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1f84794f075..931280ff829 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) "SMP alternatives: " fmt
2
1#include <linux/module.h> 3#include <linux/module.h>
2#include <linux/sched.h> 4#include <linux/sched.h>
3#include <linux/mutex.h> 5#include <linux/mutex.h>
@@ -63,8 +65,11 @@ static int __init setup_noreplace_paravirt(char *str)
63__setup("noreplace-paravirt", setup_noreplace_paravirt); 65__setup("noreplace-paravirt", setup_noreplace_paravirt);
64#endif 66#endif
65 67
66#define DPRINTK(fmt, args...) if (debug_alternative) \ 68#define DPRINTK(fmt, ...) \
67 printk(KERN_DEBUG fmt, args) 69do { \
70 if (debug_alternative) \
71 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
72} while (0)
68 73
69/* 74/*
70 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes 75 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
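The reworked DPRINTK() fixes two latent problems at once: wrapping the conditional in do { } while (0) makes the macro a single statement in every context, and ##__VA_ARGS__ deletes the trailing comma when the format string has no arguments (the old printk(KERN_DEBUG fmt, args) form could not even compile such a call). A small sketch of both points, with printf() standing in for printk() and the call sites invented for illustration:

#include <stdio.h>

static int debug_alternative = 1;

#define DPRINTK(fmt, ...)				\
do {							\
	if (debug_alternative)				\
		printf(fmt, ##__VA_ARGS__);		\
} while (0)

int main(void)
{
	int found = 0;

	if (found)
		DPRINTK("found %d\n", found);
	else	/* with the old bare-if macro, this else would silently
		 * bind to the hidden inner if instead of the outer one */
		DPRINTK("nothing found\n");	/* zero args: needs ## */
	return 0;
}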
@@ -428,7 +433,7 @@ void alternatives_smp_switch(int smp)
428 * If this still occurs then you should see a hang 433 * If this still occurs then you should see a hang
429 * or crash shortly after this line: 434 * or crash shortly after this line:
430 */ 435 */
431 printk("lockdep: fixing up alternatives.\n"); 436 pr_info("lockdep: fixing up alternatives\n");
432#endif 437#endif
433 438
434 if (noreplace_smp || smp_alt_once || skip_smp_alternatives) 439 if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
@@ -444,14 +449,14 @@ void alternatives_smp_switch(int smp)
444 if (smp == smp_mode) { 449 if (smp == smp_mode) {
445 /* nothing */ 450 /* nothing */
446 } else if (smp) { 451 } else if (smp) {
447 printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); 452 pr_info("switching to SMP code\n");
448 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 453 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
449 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 454 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
450 list_for_each_entry(mod, &smp_alt_modules, next) 455 list_for_each_entry(mod, &smp_alt_modules, next)
451 alternatives_smp_lock(mod->locks, mod->locks_end, 456 alternatives_smp_lock(mod->locks, mod->locks_end,
452 mod->text, mod->text_end); 457 mod->text, mod->text_end);
453 } else { 458 } else {
454 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 459 pr_info("switching to UP code\n");
455 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 460 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
456 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 461 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
457 list_for_each_entry(mod, &smp_alt_modules, next) 462 list_for_each_entry(mod, &smp_alt_modules, next)
@@ -546,7 +551,7 @@ void __init alternative_instructions(void)
546#ifdef CONFIG_SMP 551#ifdef CONFIG_SMP
547 if (smp_alt_once) { 552 if (smp_alt_once) {
548 if (1 == num_possible_cpus()) { 553 if (1 == num_possible_cpus()) {
549 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 554 pr_info("switching to UP code\n");
550 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 555 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
551 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 556 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
552 557
@@ -664,7 +669,7 @@ static int __kprobes stop_machine_text_poke(void *data)
664 struct text_poke_param *p; 669 struct text_poke_param *p;
665 int i; 670 int i;
666 671
667 if (atomic_dec_and_test(&stop_machine_first)) { 672 if (atomic_xchg(&stop_machine_first, 0)) {
668 for (i = 0; i < tpp->nparams; i++) { 673 for (i = 0; i < tpp->nparams; i++) {
669 p = &tpp->params[i]; 674 p = &tpp->params[i];
670 text_poke(p->addr, p->opcode, p->len); 675 text_poke(p->addr, p->opcode, p->len);
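Switching stop_machine_text_poke() from atomic_dec_and_test() to atomic_xchg() turns the "who patches the text" election into a simple first-arrival claim: exactly one CPU reads the initial 1 no matter how many race through, a guarantee the decrement-based test cannot make once the counter has already passed zero on a repeated invocation. A user-space sketch of the idiom in C11; worker() and the serial loop are illustrative, since the kernel runs this concurrently under stop_machine():

#include <stdatomic.h>
#include <stdio.h>

static atomic_int stop_machine_first = 1;

static void worker(int id)
{
	/* true for exactly the one caller that observes the initial 1 */
	if (atomic_exchange(&stop_machine_first, 0))
		printf("cpu %d pokes the text\n", id);
	else
		printf("cpu %d waits\n", id);	/* spins on a flag in the kernel */
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		worker(i);	/* the guarantee also holds under real races */
	return 0;
}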
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index be16854591c..aadf3359e2a 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -2,6 +2,9 @@
2 * Shared support code for AMD K8 northbridges and derivatives. 2 * Shared support code for AMD K8 northbridges and derivatives.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */ 4 */
5
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
5#include <linux/types.h> 8#include <linux/types.h>
6#include <linux/slab.h> 9#include <linux/slab.h>
7#include <linux/init.h> 10#include <linux/init.h>
@@ -16,6 +19,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, 19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, 21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
22 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
19 {} 23 {}
20}; 24};
21EXPORT_SYMBOL(amd_nb_misc_ids); 25EXPORT_SYMBOL(amd_nb_misc_ids);
@@ -258,7 +262,7 @@ void amd_flush_garts(void)
258 } 262 }
259 spin_unlock_irqrestore(&gart_lock, flags); 263 spin_unlock_irqrestore(&gart_lock, flags);
260 if (!flushed) 264 if (!flushed)
261 printk("nothing to flush?\n"); 265 pr_notice("nothing to flush?\n");
262} 266}
263EXPORT_SYMBOL_GPL(amd_flush_garts); 267EXPORT_SYMBOL_GPL(amd_flush_garts);
264 268
@@ -269,11 +273,10 @@ static __init int init_amd_nbs(void)
269 err = amd_cache_northbridges(); 273 err = amd_cache_northbridges();
270 274
271 if (err < 0) 275 if (err < 0)
272 printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); 276 pr_notice("Cannot enumerate AMD northbridges\n");
273 277
274 if (amd_cache_gart() < 0) 278 if (amd_cache_gart() < 0)
275 printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " 279 pr_notice("Cannot initialize GART flush words, GART support disabled\n");
276 "GART support disabled.\n");
277 280
278 return err; 281 return err;
279} 282}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 6e76c191a83..d5fd66f0d4c 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -20,7 +20,6 @@
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/ioport.h> 21#include <linux/ioport.h>
22#include <linux/suspend.h> 22#include <linux/suspend.h>
23#include <linux/kmemleak.h>
24#include <asm/e820.h> 23#include <asm/e820.h>
25#include <asm/io.h> 24#include <asm/io.h>
26#include <asm/iommu.h> 25#include <asm/iommu.h>
@@ -95,11 +94,6 @@ static u32 __init allocate_aperture(void)
95 return 0; 94 return 0;
96 } 95 }
97 memblock_reserve(addr, aper_size); 96 memblock_reserve(addr, aper_size);
98 /*
99 * Kmemleak should not scan this block as it may not be mapped via the
100 * kernel direct mapping.
101 */
102 kmemleak_ignore(phys_to_virt(addr));
103 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", 97 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
104 aper_size >> 10, addr); 98 aper_size >> 10, addr);
105 insert_aperture_resource((u32)addr, aper_size); 99 insert_aperture_resource((u32)addr, aper_size);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 39a222e094a..98e24131ff3 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2123,6 +2123,42 @@ void default_init_apic_ldr(void)
2123 apic_write(APIC_LDR, val); 2123 apic_write(APIC_LDR, val);
2124} 2124}
2125 2125
2126int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
2127 const struct cpumask *andmask,
2128 unsigned int *apicid)
2129{
2130 unsigned int cpu;
2131
2132 for_each_cpu_and(cpu, cpumask, andmask) {
2133 if (cpumask_test_cpu(cpu, cpu_online_mask))
2134 break;
2135 }
2136
2137 if (likely(cpu < nr_cpu_ids)) {
2138 *apicid = per_cpu(x86_cpu_to_apicid, cpu);
2139 return 0;
2140 }
2141
2142 return -EINVAL;
2143}
2144
2145/*
2146 * Override the generic EOI implementation with an optimized version.
2147 * Only called during early boot when only one CPU is active and with
2148 * interrupts disabled, so we know this does not race with actual APIC driver
2149 * use.
2150 */
2151void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
2152{
2153 struct apic **drv;
2154
2155 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
2156 /* Should happen once for each apic */
2157 WARN_ON((*drv)->eoi_write == eoi_write);
2158 (*drv)->eoi_write = eoi_write;
2159 }
2160}
2161
2126/* 2162/*
2127 * Power management 2163 * Power management
2128 */ 2164 */
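default_cpu_mask_to_apicid_and() is the generic replacement for the per-driver copies deleted elsewhere in this series: intersect the two masks, pick the first CPU that is also online, translate it to an APIC ID through x86_cpu_to_apicid, and report -EINVAL instead of BAD_APICID when the intersection is empty. A sketch of the same scan with plain 64-bit words standing in for struct cpumask; the APIC ID table is invented, and __builtin_ctzll() is a GCC/Clang builtin assumed here:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 64

static const uint8_t cpu_to_apicid[NR_CPUS] = { 0, 2, 4, 6 };	/* invented */

static int mask_to_apicid_and(uint64_t cpumask, uint64_t andmask,
			      uint64_t online, unsigned int *apicid)
{
	uint64_t m = cpumask & andmask & online;

	if (!m)
		return -22;	/* -EINVAL: no usable CPU in the intersection */
	*apicid = cpu_to_apicid[__builtin_ctzll(m)];	/* first set bit */
	return 0;
}

int main(void)
{
	unsigned int id;

	/* cpus {0..3} & {1,2} & online {0,2,3} leaves cpu 2 -> apicid 4 */
	if (!mask_to_apicid_and(0xf, 0x6, 0xd, &id))
		printf("apicid %u\n", id);
	return 0;
}

Returning an error code rather than BAD_APICID lets callers (see the setup_ioapic_irq() hunk further down) fail the IRQ setup cleanly instead of programming a bogus destination.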
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 0e881c46e8c..00c77cf78e9 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -36,25 +36,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
36 return 1; 36 return 1;
37} 37}
38 38
39static const struct cpumask *flat_target_cpus(void)
40{
41 return cpu_online_mask;
42}
43
44static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
45{
46 /* Careful. Some cpus do not strictly honor the set of cpus
47 * specified in the interrupt destination when using lowest
48 * priority interrupt delivery mode.
49 *
50 * In particular there was a hyperthreading cpu observed to
51 * deliver interrupts to the wrong hyperthread when only one
52 hyperthread was specified in the interrupt destination.
53 */
54 cpumask_clear(retmask);
55 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
56}
57
58/* 39/*
59 * Set up the logical destination ID. 40 * Set up the logical destination ID.
60 * 41 *
@@ -92,7 +73,7 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
92} 73}
93 74
94static void 75static void
95 flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) 76flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
96{ 77{
97 unsigned long mask = cpumask_bits(cpumask)[0]; 78 unsigned long mask = cpumask_bits(cpumask)[0];
98 int cpu = smp_processor_id(); 79 int cpu = smp_processor_id();
@@ -186,7 +167,7 @@ static struct apic apic_flat = {
186 .irq_delivery_mode = dest_LowestPrio, 167 .irq_delivery_mode = dest_LowestPrio,
187 .irq_dest_mode = 1, /* logical */ 168 .irq_dest_mode = 1, /* logical */
188 169
189 .target_cpus = flat_target_cpus, 170 .target_cpus = online_target_cpus,
190 .disable_esr = 0, 171 .disable_esr = 0,
191 .dest_logical = APIC_DEST_LOGICAL, 172 .dest_logical = APIC_DEST_LOGICAL,
192 .check_apicid_used = NULL, 173 .check_apicid_used = NULL,
@@ -210,8 +191,7 @@ static struct apic apic_flat = {
210 .set_apic_id = set_apic_id, 191 .set_apic_id = set_apic_id,
211 .apic_id_mask = 0xFFu << 24, 192 .apic_id_mask = 0xFFu << 24,
212 193
213 .cpu_mask_to_apicid = default_cpu_mask_to_apicid, 194 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
214 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
215 195
216 .send_IPI_mask = flat_send_IPI_mask, 196 .send_IPI_mask = flat_send_IPI_mask,
217 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, 197 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
@@ -262,17 +242,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
262 return 0; 242 return 0;
263} 243}
264 244
265static const struct cpumask *physflat_target_cpus(void)
266{
267 return cpu_online_mask;
268}
269
270static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
271{
272 cpumask_clear(retmask);
273 cpumask_set_cpu(cpu, retmask);
274}
275
276static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) 245static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
277{ 246{
278 default_send_IPI_mask_sequence_phys(cpumask, vector); 247 default_send_IPI_mask_sequence_phys(cpumask, vector);
@@ -294,38 +263,6 @@ static void physflat_send_IPI_all(int vector)
294 physflat_send_IPI_mask(cpu_online_mask, vector); 263 physflat_send_IPI_mask(cpu_online_mask, vector);
295} 264}
296 265
297static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
298{
299 int cpu;
300
301 /*
302 * We're using fixed IRQ delivery, can only return one phys APIC ID.
303 * May as well be the first.
304 */
305 cpu = cpumask_first(cpumask);
306 if ((unsigned)cpu < nr_cpu_ids)
307 return per_cpu(x86_cpu_to_apicid, cpu);
308 else
309 return BAD_APICID;
310}
311
312static unsigned int
313physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
314 const struct cpumask *andmask)
315{
316 int cpu;
317
318 /*
319 * We're using fixed IRQ delivery, can only return one phys APIC ID.
320 * May as well be the first.
321 */
322 for_each_cpu_and(cpu, cpumask, andmask) {
323 if (cpumask_test_cpu(cpu, cpu_online_mask))
324 break;
325 }
326 return per_cpu(x86_cpu_to_apicid, cpu);
327}
328
329static int physflat_probe(void) 266static int physflat_probe(void)
330{ 267{
331 if (apic == &apic_physflat || num_possible_cpus() > 8) 268 if (apic == &apic_physflat || num_possible_cpus() > 8)
@@ -345,13 +282,13 @@ static struct apic apic_physflat = {
345 .irq_delivery_mode = dest_Fixed, 282 .irq_delivery_mode = dest_Fixed,
346 .irq_dest_mode = 0, /* physical */ 283 .irq_dest_mode = 0, /* physical */
347 284
348 .target_cpus = physflat_target_cpus, 285 .target_cpus = online_target_cpus,
349 .disable_esr = 0, 286 .disable_esr = 0,
350 .dest_logical = 0, 287 .dest_logical = 0,
351 .check_apicid_used = NULL, 288 .check_apicid_used = NULL,
352 .check_apicid_present = NULL, 289 .check_apicid_present = NULL,
353 290
354 .vector_allocation_domain = physflat_vector_allocation_domain, 291 .vector_allocation_domain = default_vector_allocation_domain,
355 /* not needed, but shouldn't hurt: */ 292 /* not needed, but shouldn't hurt: */
356 .init_apic_ldr = flat_init_apic_ldr, 293 .init_apic_ldr = flat_init_apic_ldr,
357 294
@@ -370,8 +307,7 @@ static struct apic apic_physflat = {
370 .set_apic_id = set_apic_id, 307 .set_apic_id = set_apic_id,
371 .apic_id_mask = 0xFFu << 24, 308 .apic_id_mask = 0xFFu << 24,
372 309
373 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 310 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
374 .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
375 311
376 .send_IPI_mask = physflat_send_IPI_mask, 312 .send_IPI_mask = physflat_send_IPI_mask,
377 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, 313 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index a6e4c6e06c0..e145f28b409 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -100,12 +100,12 @@ static unsigned long noop_check_apicid_present(int bit)
100 return physid_isset(bit, phys_cpu_present_map); 100 return physid_isset(bit, phys_cpu_present_map);
101} 101}
102 102
103static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) 103static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask,
104 const struct cpumask *mask)
104{ 105{
105 if (cpu != 0) 106 if (cpu != 0)
106 pr_warning("APIC: Vector allocated for non-BSP cpu\n"); 107 pr_warning("APIC: Vector allocated for non-BSP cpu\n");
107 cpumask_clear(retmask); 108 cpumask_copy(retmask, cpumask_of(cpu));
108 cpumask_set_cpu(cpu, retmask);
109} 109}
110 110
111static u32 noop_apic_read(u32 reg) 111static u32 noop_apic_read(u32 reg)
@@ -159,8 +159,7 @@ struct apic apic_noop = {
159 .set_apic_id = NULL, 159 .set_apic_id = NULL,
160 .apic_id_mask = 0x0F << 24, 160 .apic_id_mask = 0x0F << 24,
161 161
162 .cpu_mask_to_apicid = default_cpu_mask_to_apicid, 162 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
163 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
164 163
165 .send_IPI_mask = noop_send_IPI_mask, 164 .send_IPI_mask = noop_send_IPI_mask,
166 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, 165 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 6ec6d5d297c..bc552cff257 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -72,17 +72,6 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
72 return initial_apic_id >> index_msb; 72 return initial_apic_id >> index_msb;
73} 73}
74 74
75static const struct cpumask *numachip_target_cpus(void)
76{
77 return cpu_online_mask;
78}
79
80static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask)
81{
82 cpumask_clear(retmask);
83 cpumask_set_cpu(cpu, retmask);
84}
85
86static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) 75static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
87{ 76{
88 union numachip_csr_g3_ext_irq_gen int_gen; 77 union numachip_csr_g3_ext_irq_gen int_gen;
@@ -157,38 +146,6 @@ static void numachip_send_IPI_self(int vector)
157 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); 146 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
158} 147}
159 148
160static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask)
161{
162 int cpu;
163
164 /*
165 * We're using fixed IRQ delivery, can only return one phys APIC ID.
166 * May as well be the first.
167 */
168 cpu = cpumask_first(cpumask);
169 if (likely((unsigned)cpu < nr_cpu_ids))
170 return per_cpu(x86_cpu_to_apicid, cpu);
171
172 return BAD_APICID;
173}
174
175static unsigned int
176numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
177 const struct cpumask *andmask)
178{
179 int cpu;
180
181 /*
182 * We're using fixed IRQ delivery, can only return one phys APIC ID.
183 * May as well be the first.
184 */
185 for_each_cpu_and(cpu, cpumask, andmask) {
186 if (cpumask_test_cpu(cpu, cpu_online_mask))
187 break;
188 }
189 return per_cpu(x86_cpu_to_apicid, cpu);
190}
191
192static int __init numachip_probe(void) 149static int __init numachip_probe(void)
193{ 150{
194 return apic == &apic_numachip; 151 return apic == &apic_numachip;
@@ -253,13 +210,13 @@ static struct apic apic_numachip __refconst = {
253 .irq_delivery_mode = dest_Fixed, 210 .irq_delivery_mode = dest_Fixed,
254 .irq_dest_mode = 0, /* physical */ 211 .irq_dest_mode = 0, /* physical */
255 212
256 .target_cpus = numachip_target_cpus, 213 .target_cpus = online_target_cpus,
257 .disable_esr = 0, 214 .disable_esr = 0,
258 .dest_logical = 0, 215 .dest_logical = 0,
259 .check_apicid_used = NULL, 216 .check_apicid_used = NULL,
260 .check_apicid_present = NULL, 217 .check_apicid_present = NULL,
261 218
262 .vector_allocation_domain = numachip_vector_allocation_domain, 219 .vector_allocation_domain = default_vector_allocation_domain,
263 .init_apic_ldr = flat_init_apic_ldr, 220 .init_apic_ldr = flat_init_apic_ldr,
264 221
265 .ioapic_phys_id_map = NULL, 222 .ioapic_phys_id_map = NULL,
@@ -277,8 +234,7 @@ static struct apic apic_numachip __refconst = {
277 .set_apic_id = set_apic_id, 234 .set_apic_id = set_apic_id,
278 .apic_id_mask = 0xffU << 24, 235 .apic_id_mask = 0xffU << 24,
279 236
280 .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid, 237 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
281 .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and,
282 238
283 .send_IPI_mask = numachip_send_IPI_mask, 239 .send_IPI_mask = numachip_send_IPI_mask,
284 .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, 240 .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 31fbdbfbf96..d50e3640d5a 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -26,15 +26,6 @@ static int bigsmp_apic_id_registered(void)
26 return 1; 26 return 1;
27} 27}
28 28
29static const struct cpumask *bigsmp_target_cpus(void)
30{
31#ifdef CONFIG_SMP
32 return cpu_online_mask;
33#else
34 return cpumask_of(0);
35#endif
36}
37
38static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) 29static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
39{ 30{
40 return 0; 31 return 0;
@@ -105,32 +96,6 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
105 return 1; 96 return 1;
106} 97}
107 98
108/* As we are using single CPU as destination, pick only one CPU here */
109static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
110{
111 int cpu = cpumask_first(cpumask);
112
113 if (cpu < nr_cpu_ids)
114 return cpu_physical_id(cpu);
115 return BAD_APICID;
116}
117
118static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
119 const struct cpumask *andmask)
120{
121 int cpu;
122
123 /*
124 * We're using fixed IRQ delivery, can only return one phys APIC ID.
125 * May as well be the first.
126 */
127 for_each_cpu_and(cpu, cpumask, andmask) {
128 if (cpumask_test_cpu(cpu, cpu_online_mask))
129 return cpu_physical_id(cpu);
130 }
131 return BAD_APICID;
132}
133
134static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 99static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
135{ 100{
136 return cpuid_apic >> index_msb; 101 return cpuid_apic >> index_msb;
@@ -177,12 +142,6 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
177 { } /* NULL entry stops DMI scanning */ 142 { } /* NULL entry stops DMI scanning */
178}; 143};
179 144
180static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask)
181{
182 cpumask_clear(retmask);
183 cpumask_set_cpu(cpu, retmask);
184}
185
186static int probe_bigsmp(void) 145static int probe_bigsmp(void)
187{ 146{
188 if (def_to_bigsmp) 147 if (def_to_bigsmp)
@@ -205,13 +164,13 @@ static struct apic apic_bigsmp = {
205 /* phys delivery to target CPU: */ 164 /* phys delivery to target CPU: */
206 .irq_dest_mode = 0, 165 .irq_dest_mode = 0,
207 166
208 .target_cpus = bigsmp_target_cpus, 167 .target_cpus = default_target_cpus,
209 .disable_esr = 1, 168 .disable_esr = 1,
210 .dest_logical = 0, 169 .dest_logical = 0,
211 .check_apicid_used = bigsmp_check_apicid_used, 170 .check_apicid_used = bigsmp_check_apicid_used,
212 .check_apicid_present = bigsmp_check_apicid_present, 171 .check_apicid_present = bigsmp_check_apicid_present,
213 172
214 .vector_allocation_domain = bigsmp_vector_allocation_domain, 173 .vector_allocation_domain = default_vector_allocation_domain,
215 .init_apic_ldr = bigsmp_init_apic_ldr, 174 .init_apic_ldr = bigsmp_init_apic_ldr,
216 175
217 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, 176 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
@@ -229,8 +188,7 @@ static struct apic apic_bigsmp = {
229 .set_apic_id = NULL, 188 .set_apic_id = NULL,
230 .apic_id_mask = 0xFF << 24, 189 .apic_id_mask = 0xFF << 24,
231 190
232 .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid, 191 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
233 .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
234 192
235 .send_IPI_mask = bigsmp_send_IPI_mask, 193 .send_IPI_mask = bigsmp_send_IPI_mask,
236 .send_IPI_mask_allbutself = NULL, 194 .send_IPI_mask_allbutself = NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index db4ab1be3c7..0874799a98c 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -394,21 +394,6 @@ static void es7000_enable_apic_mode(void)
394 WARN(1, "Command failed, status = %x\n", mip_status); 394 WARN(1, "Command failed, status = %x\n", mip_status);
395} 395}
396 396
397static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask)
398{
399 /* Careful. Some cpus do not strictly honor the set of cpus
400 * specified in the interrupt destination when using lowest
401 * priority interrupt delivery mode.
402 *
403 * In particular there was a hyperthreading cpu observed to
404 * deliver interrupts to the wrong hyperthread when only one
405 * hyperthread was specified in the interrupt destination.
406 */
407 cpumask_clear(retmask);
408 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
409}
410
411
412static void es7000_wait_for_init_deassert(atomic_t *deassert) 397static void es7000_wait_for_init_deassert(atomic_t *deassert)
413{ 398{
414 while (!atomic_read(deassert)) 399 while (!atomic_read(deassert))
@@ -540,45 +525,49 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
540 return 1; 525 return 1;
541} 526}
542 527
543static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) 528static inline int
529es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
544{ 530{
545 unsigned int round = 0; 531 unsigned int round = 0;
546 int cpu, uninitialized_var(apicid); 532 unsigned int cpu, uninitialized_var(apicid);
547 533
548 /* 534 /*
549 * The cpus in the mask must all be on the apic cluster. 535 * The cpus in the mask must all be on the apic cluster.
550 */ 536 */
551 for_each_cpu(cpu, cpumask) { 537 for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
552 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); 538 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
553 539
554 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 540 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
555 WARN(1, "Not a valid mask!"); 541 WARN(1, "Not a valid mask!");
556 542
557 return BAD_APICID; 543 return -EINVAL;
558 } 544 }
559 apicid = new_apicid; 545 apicid |= new_apicid;
560 round++; 546 round++;
561 } 547 }
562 return apicid; 548 if (!round)
549 return -EINVAL;
550 *dest_id = apicid;
551 return 0;
563} 552}
564 553
565static unsigned int 554static int
566es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, 555es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
567 const struct cpumask *andmask) 556 const struct cpumask *andmask,
557 unsigned int *apicid)
568{ 558{
569 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
570 cpumask_var_t cpumask; 559 cpumask_var_t cpumask;
560 *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
571 561
572 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 562 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
573 return apicid; 563 return 0;
574 564
575 cpumask_and(cpumask, inmask, andmask); 565 cpumask_and(cpumask, inmask, andmask);
576 cpumask_and(cpumask, cpumask, cpu_online_mask); 566 es7000_cpu_mask_to_apicid(cpumask, apicid);
577 apicid = es7000_cpu_mask_to_apicid(cpumask);
578 567
579 free_cpumask_var(cpumask); 568 free_cpumask_var(cpumask);
580 569
581 return apicid; 570 return 0;
582} 571}
583 572
584static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) 573static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -638,7 +627,7 @@ static struct apic __refdata apic_es7000_cluster = {
638 .check_apicid_used = es7000_check_apicid_used, 627 .check_apicid_used = es7000_check_apicid_used,
639 .check_apicid_present = es7000_check_apicid_present, 628 .check_apicid_present = es7000_check_apicid_present,
640 629
641 .vector_allocation_domain = es7000_vector_allocation_domain, 630 .vector_allocation_domain = flat_vector_allocation_domain,
642 .init_apic_ldr = es7000_init_apic_ldr_cluster, 631 .init_apic_ldr = es7000_init_apic_ldr_cluster,
643 632
644 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 633 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
@@ -656,7 +645,6 @@ static struct apic __refdata apic_es7000_cluster = {
656 .set_apic_id = NULL, 645 .set_apic_id = NULL,
657 .apic_id_mask = 0xFF << 24, 646 .apic_id_mask = 0xFF << 24,
658 647
659 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
660 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, 648 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
661 649
662 .send_IPI_mask = es7000_send_IPI_mask, 650 .send_IPI_mask = es7000_send_IPI_mask,
@@ -705,7 +693,7 @@ static struct apic __refdata apic_es7000 = {
705 .check_apicid_used = es7000_check_apicid_used, 693 .check_apicid_used = es7000_check_apicid_used,
706 .check_apicid_present = es7000_check_apicid_present, 694 .check_apicid_present = es7000_check_apicid_present,
707 695
708 .vector_allocation_domain = es7000_vector_allocation_domain, 696 .vector_allocation_domain = flat_vector_allocation_domain,
709 .init_apic_ldr = es7000_init_apic_ldr, 697 .init_apic_ldr = es7000_init_apic_ldr,
710 698
711 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 699 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
@@ -723,7 +711,6 @@ static struct apic __refdata apic_es7000 = {
723 .set_apic_id = NULL, 711 .set_apic_id = NULL,
724 .apic_id_mask = 0xFF << 24, 712 .apic_id_mask = 0xFF << 24,
725 713
726 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
727 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, 714 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
728 715
729 .send_IPI_mask = es7000_send_IPI_mask, 716 .send_IPI_mask = es7000_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index bce2001b264..406eee78468 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -448,8 +448,8 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
448 448
449 entry = alloc_irq_pin_list(node); 449 entry = alloc_irq_pin_list(node);
450 if (!entry) { 450 if (!entry) {
451 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", 451 pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
452 node, apic, pin); 452 node, apic, pin);
453 return -ENOMEM; 453 return -ENOMEM;
454 } 454 }
455 entry->apic = apic; 455 entry->apic = apic;
@@ -661,7 +661,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
661 ioapic_mask_entry(apic, pin); 661 ioapic_mask_entry(apic, pin);
662 entry = ioapic_read_entry(apic, pin); 662 entry = ioapic_read_entry(apic, pin);
663 if (entry.irr) 663 if (entry.irr)
664 printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n", 664 pr_err("Unable to reset IRR for apic: %d, pin :%d\n",
665 mpc_ioapic_id(apic), pin); 665 mpc_ioapic_id(apic), pin);
666} 666}
667 667
@@ -895,7 +895,7 @@ static int irq_polarity(int idx)
895 } 895 }
896 case 2: /* reserved */ 896 case 2: /* reserved */
897 { 897 {
898 printk(KERN_WARNING "broken BIOS!!\n"); 898 pr_warn("broken BIOS!!\n");
899 polarity = 1; 899 polarity = 1;
900 break; 900 break;
901 } 901 }
@@ -906,7 +906,7 @@ static int irq_polarity(int idx)
906 } 906 }
907 default: /* invalid */ 907 default: /* invalid */
908 { 908 {
909 printk(KERN_WARNING "broken BIOS!!\n"); 909 pr_warn("broken BIOS!!\n");
910 polarity = 1; 910 polarity = 1;
911 break; 911 break;
912 } 912 }
@@ -948,7 +948,7 @@ static int irq_trigger(int idx)
948 } 948 }
949 default: 949 default:
950 { 950 {
951 printk(KERN_WARNING "broken BIOS!!\n"); 951 pr_warn("broken BIOS!!\n");
952 trigger = 1; 952 trigger = 1;
953 break; 953 break;
954 } 954 }
@@ -962,7 +962,7 @@ static int irq_trigger(int idx)
962 } 962 }
963 case 2: /* reserved */ 963 case 2: /* reserved */
964 { 964 {
965 printk(KERN_WARNING "broken BIOS!!\n"); 965 pr_warn("broken BIOS!!\n");
966 trigger = 1; 966 trigger = 1;
967 break; 967 break;
968 } 968 }
@@ -973,7 +973,7 @@ static int irq_trigger(int idx)
973 } 973 }
974 default: /* invalid */ 974 default: /* invalid */
975 { 975 {
976 printk(KERN_WARNING "broken BIOS!!\n"); 976 pr_warn("broken BIOS!!\n");
977 trigger = 0; 977 trigger = 0;
978 break; 978 break;
979 } 979 }
@@ -991,7 +991,7 @@ static int pin_2_irq(int idx, int apic, int pin)
991 * Debugging check, we are in big trouble if this message pops up! 991 * Debugging check, we are in big trouble if this message pops up!
992 */ 992 */
993 if (mp_irqs[idx].dstirq != pin) 993 if (mp_irqs[idx].dstirq != pin)
994 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 994 pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
995 995
996 if (test_bit(bus, mp_bus_not_pci)) { 996 if (test_bit(bus, mp_bus_not_pci)) {
997 irq = mp_irqs[idx].srcbusirq; 997 irq = mp_irqs[idx].srcbusirq;
@@ -1112,8 +1112,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1112 * 0x80, because int 0x80 is hm, kind of importantish. ;) 1112 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1113 */ 1113 */
1114 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; 1114 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
1115 static int current_offset = VECTOR_OFFSET_START % 8; 1115 static int current_offset = VECTOR_OFFSET_START % 16;
1116 unsigned int old_vector;
1117 int cpu, err; 1116 int cpu, err;
1118 cpumask_var_t tmp_mask; 1117 cpumask_var_t tmp_mask;
1119 1118
@@ -1123,35 +1122,45 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1123 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) 1122 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
1124 return -ENOMEM; 1123 return -ENOMEM;
1125 1124
1126 old_vector = cfg->vector;
1127 if (old_vector) {
1128 cpumask_and(tmp_mask, mask, cpu_online_mask);
1129 cpumask_and(tmp_mask, cfg->domain, tmp_mask);
1130 if (!cpumask_empty(tmp_mask)) {
1131 free_cpumask_var(tmp_mask);
1132 return 0;
1133 }
1134 }
1135
1136 /* Only try and allocate irqs on cpus that are present */ 1125 /* Only try and allocate irqs on cpus that are present */
1137 err = -ENOSPC; 1126 err = -ENOSPC;
1138 for_each_cpu_and(cpu, mask, cpu_online_mask) { 1127 cpumask_clear(cfg->old_domain);
1139 int new_cpu; 1128 cpu = cpumask_first_and(mask, cpu_online_mask);
1140 int vector, offset; 1129 while (cpu < nr_cpu_ids) {
1130 int new_cpu, vector, offset;
1141 1131
1142 apic->vector_allocation_domain(cpu, tmp_mask); 1132 apic->vector_allocation_domain(cpu, tmp_mask, mask);
1133
1134 if (cpumask_subset(tmp_mask, cfg->domain)) {
1135 err = 0;
1136 if (cpumask_equal(tmp_mask, cfg->domain))
1137 break;
1138 /*
1139 * New cpumask using the vector is a proper subset of
1140 * the current in use mask. So cleanup the vector
1141 * allocation for the members that are not used anymore.
1142 */
1143 cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
1144 cfg->move_in_progress = 1;
1145 cpumask_and(cfg->domain, cfg->domain, tmp_mask);
1146 break;
1147 }
1143 1148
1144 vector = current_vector; 1149 vector = current_vector;
1145 offset = current_offset; 1150 offset = current_offset;
1146next: 1151next:
1147 vector += 8; 1152 vector += 16;
1148 if (vector >= first_system_vector) { 1153 if (vector >= first_system_vector) {
1149 /* If out of vectors on large boxen, must share them. */ 1154 offset = (offset + 1) % 16;
1150 offset = (offset + 1) % 8;
1151 vector = FIRST_EXTERNAL_VECTOR + offset; 1155 vector = FIRST_EXTERNAL_VECTOR + offset;
1152 } 1156 }
1153 if (unlikely(current_vector == vector)) 1157
1158 if (unlikely(current_vector == vector)) {
1159 cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
1160 cpumask_andnot(tmp_mask, mask, cfg->old_domain);
1161 cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
1154 continue; 1162 continue;
1163 }
1155 1164
1156 if (test_bit(vector, used_vectors)) 1165 if (test_bit(vector, used_vectors))
1157 goto next; 1166 goto next;
@@ -1162,7 +1171,7 @@ next:
1162 /* Found one! */ 1171 /* Found one! */
1163 current_vector = vector; 1172 current_vector = vector;
1164 current_offset = offset; 1173 current_offset = offset;
1165 if (old_vector) { 1174 if (cfg->vector) {
1166 cfg->move_in_progress = 1; 1175 cfg->move_in_progress = 1;
1167 cpumask_copy(cfg->old_domain, cfg->domain); 1176 cpumask_copy(cfg->old_domain, cfg->domain);
1168 } 1177 }
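Two things changed in this allocator: the domain reuse check moved inside the loop (if the new destination set is a subset of the current domain the vector is kept, shrinking the domain and cleaning up the members that fell out), and the probe stride grew from 8 to 16 with the wrap offset taken mod 16. The local APIC treats vector >> 4 as the interrupt priority class, so a stride of 16 means successive allocations land in distinct priority classes. A sketch of just the enumeration order, assuming the x86 FIRST_EXTERNAL_VECTOR of 0x20 and an illustrative cutoff for the system vectors:

#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR	0x20
#define FIRST_SYSTEM_VECTOR	0xef	/* illustrative cutoff */

int main(void)
{
	int seen[256] = { 0 };
	int vector = FIRST_EXTERNAL_VECTOR, offset = 0;
	int steps = 0, distinct = 0;

	do {
		vector += 16;
		if (vector >= FIRST_SYSTEM_VECTOR) {
			offset = (offset + 1) % 16;
			vector = FIRST_EXTERNAL_VECTOR + offset;
		}
		if (seen[vector]++ == 0)
			distinct++;
		steps++;
	} while (!(vector == FIRST_EXTERNAL_VECTOR && offset == 0) &&
		 steps < 4096);

	/* every external vector is visited exactly once before giving up */
	printf("%d steps, %d distinct vectors\n", steps, distinct);
	return 0;
}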
@@ -1195,7 +1204,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1195 BUG_ON(!cfg->vector); 1204 BUG_ON(!cfg->vector);
1196 1205
1197 vector = cfg->vector; 1206 vector = cfg->vector;
1198 for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) 1207 for_each_cpu(cpu, cfg->domain)
1199 per_cpu(vector_irq, cpu)[vector] = -1; 1208 per_cpu(vector_irq, cpu)[vector] = -1;
1200 1209
1201 cfg->vector = 0; 1210 cfg->vector = 0;
@@ -1203,7 +1212,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1203 1212
1204 if (likely(!cfg->move_in_progress)) 1213 if (likely(!cfg->move_in_progress))
1205 return; 1214 return;
1206 for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { 1215 for_each_cpu(cpu, cfg->old_domain) {
1207 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; 1216 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
1208 vector++) { 1217 vector++) {
1209 if (per_cpu(vector_irq, cpu)[vector] != irq) 1218 if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -1346,18 +1355,18 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
1346 1355
1347 if (!IO_APIC_IRQ(irq)) 1356 if (!IO_APIC_IRQ(irq))
1348 return; 1357 return;
1349 /*
1350 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1351 * controllers like 8259. Now that IO-APIC can handle this irq, update
1352 * the cfg->domain.
1353 */
1354 if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
1355 apic->vector_allocation_domain(0, cfg->domain);
1356 1358
1357 if (assign_irq_vector(irq, cfg, apic->target_cpus())) 1359 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1358 return; 1360 return;
1359 1361
1360 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 1362 if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(),
1363 &dest)) {
1364 pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
1365 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
1366 __clear_irq_vector(irq, cfg);
1367
1368 return;
1369 }
1361 1370
1362 	apic_printk(APIC_VERBOSE, KERN_DEBUG 	1371 	apic_printk(APIC_VERBOSE, KERN_DEBUG
1363 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1372 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@ -1366,7 +1375,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
1366 cfg->vector, irq, attr->trigger, attr->polarity, dest); 1375 cfg->vector, irq, attr->trigger, attr->polarity, dest);
1367 1376
1368 if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { 1377 if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
1369 pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1378 pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1370 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); 1379 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
1371 __clear_irq_vector(irq, cfg); 1380 __clear_irq_vector(irq, cfg);
1372 1381
@@ -1469,9 +1478,10 @@ void setup_IO_APIC_irq_extra(u32 gsi)
1469 * Set up the timer pin, possibly with the 8259A-master behind. 1478 * Set up the timer pin, possibly with the 8259A-master behind.
1470 */ 1479 */
1471static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, 1480static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
1472 unsigned int pin, int vector) 1481 unsigned int pin, int vector)
1473{ 1482{
1474 struct IO_APIC_route_entry entry; 1483 struct IO_APIC_route_entry entry;
1484 unsigned int dest;
1475 1485
1476 if (irq_remapping_enabled) 1486 if (irq_remapping_enabled)
1477 return; 1487 return;
@@ -1482,9 +1492,13 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
1482 * We use logical delivery to get the timer IRQ 1492 * We use logical delivery to get the timer IRQ
1483 * to the first CPU. 1493 * to the first CPU.
1484 */ 1494 */
1495 if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
1496 apic->target_cpus(), &dest)))
1497 dest = BAD_APICID;
1498
1485 entry.dest_mode = apic->irq_dest_mode; 1499 entry.dest_mode = apic->irq_dest_mode;
1486 entry.mask = 0; /* don't mask IRQ for edge */ 1500 entry.mask = 0; /* don't mask IRQ for edge */
1487 entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); 1501 entry.dest = dest;
1488 entry.delivery_mode = apic->irq_delivery_mode; 1502 entry.delivery_mode = apic->irq_delivery_mode;
1489 entry.polarity = 0; 1503 entry.polarity = 0;
1490 entry.trigger = 0; 1504 entry.trigger = 0;
@@ -1521,7 +1535,6 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1521 reg_03.raw = io_apic_read(ioapic_idx, 3); 1535 reg_03.raw = io_apic_read(ioapic_idx, 3);
1522 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1536 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1523 1537
1524 printk("\n");
1525 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); 1538 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
1526 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1539 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1527 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1540 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
@@ -1578,7 +1591,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1578 i, 1591 i,
1579 ir_entry->index 1592 ir_entry->index
1580 ); 1593 );
1581 printk("%1d %1d %1d %1d %1d " 1594 pr_cont("%1d %1d %1d %1d %1d "
1582 "%1d %1d %X %02X\n", 1595 "%1d %1d %X %02X\n",
1583 ir_entry->format, 1596 ir_entry->format,
1584 ir_entry->mask, 1597 ir_entry->mask,
@@ -1598,7 +1611,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1598 i, 1611 i,
1599 entry.dest 1612 entry.dest
1600 ); 1613 );
1601 printk("%1d %1d %1d %1d %1d " 1614 pr_cont("%1d %1d %1d %1d %1d "
1602 "%1d %1d %02X\n", 1615 "%1d %1d %02X\n",
1603 entry.mask, 1616 entry.mask,
1604 entry.trigger, 1617 entry.trigger,
@@ -1651,8 +1664,8 @@ __apicdebuginit(void) print_IO_APICs(void)
1651 continue; 1664 continue;
1652 printk(KERN_DEBUG "IRQ%d ", irq); 1665 printk(KERN_DEBUG "IRQ%d ", irq);
1653 for_each_irq_pin(entry, cfg->irq_2_pin) 1666 for_each_irq_pin(entry, cfg->irq_2_pin)
1654 printk("-> %d:%d", entry->apic, entry->pin); 1667 pr_cont("-> %d:%d", entry->apic, entry->pin);
1655 printk("\n"); 1668 pr_cont("\n");
1656 } 1669 }
1657 1670
1658 printk(KERN_INFO ".................................... done.\n"); 1671 printk(KERN_INFO ".................................... done.\n");
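
The printk()-to-pr_cont() conversions here and in the following hunks matter because a bare printk() without a level does not reliably continue the current line; it starts a new record at the default log level. A rough userspace model of the two calls, not the kernel implementation:

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

static int at_line_start = 1;

static void klog(const char *level, const char *fmt, ...)
{
        va_list ap;

        if (at_line_start && level)
                printf("<%s> ", level);         /* new record gets a level */
        va_start(ap, fmt);
        vprintf(fmt, ap);
        va_end(ap);
        at_line_start = fmt[0] && fmt[strlen(fmt) - 1] == '\n';
}

#define pr_debug(fmt, ...) klog("7", fmt, ##__VA_ARGS__)
#define pr_cont(fmt, ...)  klog(NULL, fmt, ##__VA_ARGS__)

int main(void)
{
        pr_debug("IRQ%d ", 5);
        pr_cont("-> %d:%d", 2, 11);             /* appends to the line */
        pr_cont("\n");
        return 0;
}
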
@@ -1665,9 +1678,9 @@ __apicdebuginit(void) print_APIC_field(int base)
1665 printk(KERN_DEBUG); 1678 printk(KERN_DEBUG);
1666 1679
1667 for (i = 0; i < 8; i++) 1680 for (i = 0; i < 8; i++)
1668 printk(KERN_CONT "%08x", apic_read(base + i*0x10)); 1681 pr_cont("%08x", apic_read(base + i*0x10));
1669 1682
1670 printk(KERN_CONT "\n"); 1683 pr_cont("\n");
1671} 1684}
1672 1685
1673__apicdebuginit(void) print_local_APIC(void *dummy) 1686__apicdebuginit(void) print_local_APIC(void *dummy)
@@ -1769,7 +1782,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1769 printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); 1782 printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
1770 } 1783 }
1771 } 1784 }
1772 printk("\n"); 1785 pr_cont("\n");
1773} 1786}
1774 1787
1775__apicdebuginit(void) print_local_APICs(int maxcpu) 1788__apicdebuginit(void) print_local_APICs(int maxcpu)
@@ -2065,7 +2078,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
2065 reg_00.raw = io_apic_read(ioapic_idx, 0); 2078 reg_00.raw = io_apic_read(ioapic_idx, 0);
2066 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2079 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2067 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) 2080 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
2068 printk("could not set ID!\n"); 2081 pr_cont("could not set ID!\n");
2069 else 2082 else
2070 apic_printk(APIC_VERBOSE, " ok.\n"); 2083 apic_printk(APIC_VERBOSE, " ok.\n");
2071 } 2084 }
@@ -2210,72 +2223,6 @@ void send_cleanup_vector(struct irq_cfg *cfg)
2210 cfg->move_in_progress = 0; 2223 cfg->move_in_progress = 0;
2211} 2224}
2212 2225
2213static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
2214{
2215 int apic, pin;
2216 struct irq_pin_list *entry;
2217 u8 vector = cfg->vector;
2218
2219 for_each_irq_pin(entry, cfg->irq_2_pin) {
2220 unsigned int reg;
2221
2222 apic = entry->apic;
2223 pin = entry->pin;
2224 /*
2225 * With interrupt-remapping, destination information comes
2226 * from interrupt-remapping table entry.
2227 */
2228 if (!irq_remapped(cfg))
2229 io_apic_write(apic, 0x11 + pin*2, dest);
2230 reg = io_apic_read(apic, 0x10 + pin*2);
2231 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2232 reg |= vector;
2233 io_apic_modify(apic, 0x10 + pin*2, reg);
2234 }
2235}
2236
2237/*
2238 * Either sets data->affinity to a valid value, and returns
2239 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2240 * leaves data->affinity untouched.
2241 */
2242int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2243 unsigned int *dest_id)
2244{
2245 struct irq_cfg *cfg = data->chip_data;
2246
2247 if (!cpumask_intersects(mask, cpu_online_mask))
2248 return -1;
2249
2250 if (assign_irq_vector(data->irq, data->chip_data, mask))
2251 return -1;
2252
2253 cpumask_copy(data->affinity, mask);
2254
2255 *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
2256 return 0;
2257}
2258
2259static int
2260ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2261 bool force)
2262{
2263 unsigned int dest, irq = data->irq;
2264 unsigned long flags;
2265 int ret;
2266
2267 raw_spin_lock_irqsave(&ioapic_lock, flags);
2268 ret = __ioapic_set_affinity(data, mask, &dest);
2269 if (!ret) {
2270 /* Only the high 8 bits are valid. */
2271 dest = SET_APIC_LOGICAL_ID(dest);
2272 __target_IO_APIC_irq(irq, dest, data->chip_data);
2273 ret = IRQ_SET_MASK_OK_NOCOPY;
2274 }
2275 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2276 return ret;
2277}
2278
2279asmlinkage void smp_irq_move_cleanup_interrupt(void) 2226asmlinkage void smp_irq_move_cleanup_interrupt(void)
2280{ 2227{
2281 unsigned vector, me; 2228 unsigned vector, me;
@@ -2363,6 +2310,87 @@ void irq_force_complete_move(int irq)
2363static inline void irq_complete_move(struct irq_cfg *cfg) { } 2310static inline void irq_complete_move(struct irq_cfg *cfg) { }
2364#endif 2311#endif
2365 2312
2313static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
2314{
2315 int apic, pin;
2316 struct irq_pin_list *entry;
2317 u8 vector = cfg->vector;
2318
2319 for_each_irq_pin(entry, cfg->irq_2_pin) {
2320 unsigned int reg;
2321
2322 apic = entry->apic;
2323 pin = entry->pin;
2324 /*
2325 * With interrupt-remapping, destination information comes
2326 * from interrupt-remapping table entry.
2327 */
2328 if (!irq_remapped(cfg))
2329 io_apic_write(apic, 0x11 + pin*2, dest);
2330 reg = io_apic_read(apic, 0x10 + pin*2);
2331 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2332 reg |= vector;
2333 io_apic_modify(apic, 0x10 + pin*2, reg);
2334 }
2335}
2336
2337/*
2338 * Either sets data->affinity to a valid value, and returns
2339 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2340 * leaves data->affinity untouched.
2341 */
2342int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2343 unsigned int *dest_id)
2344{
2345 struct irq_cfg *cfg = data->chip_data;
2346 unsigned int irq = data->irq;
2347 int err;
2348
2349 if (!config_enabled(CONFIG_SMP))
2350 return -1;
2351
2352 if (!cpumask_intersects(mask, cpu_online_mask))
2353 return -EINVAL;
2354
2355 err = assign_irq_vector(irq, cfg, mask);
2356 if (err)
2357 return err;
2358
2359 err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
2360 if (err) {
2361 if (assign_irq_vector(irq, cfg, data->affinity))
2362 pr_err("Failed to recover vector for irq %d\n", irq);
2363 return err;
2364 }
2365
2366 cpumask_copy(data->affinity, mask);
2367
2368 return 0;
2369}
2370
2371static int
2372ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2373 bool force)
2374{
2375 unsigned int dest, irq = data->irq;
2376 unsigned long flags;
2377 int ret;
2378
2379 if (!config_enabled(CONFIG_SMP))
2380 return -1;
2381
2382 raw_spin_lock_irqsave(&ioapic_lock, flags);
2383 ret = __ioapic_set_affinity(data, mask, &dest);
2384 if (!ret) {
2385 /* Only the high 8 bits are valid. */
2386 dest = SET_APIC_LOGICAL_ID(dest);
2387 __target_IO_APIC_irq(irq, dest, data->chip_data);
2388 ret = IRQ_SET_MASK_OK_NOCOPY;
2389 }
2390 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2391 return ret;
2392}
2393
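
A pattern repeated throughout this patch: cpu_mask_to_apicid_and() no longer returns the BAD_APICID sentinel but an error code, handing the destination back through an out-parameter, which is what lets __ioapic_set_affinity() above restore the previous affinity's vector on failure. A sketch of the calling convention with made-up mask handling:

#include <errno.h>
#include <stdio.h>

/* The result travels via *apicid; the return value carries only
 * success or failure, so no id value is overloaded as an error. */
static int mask_to_apicid(unsigned long mask, unsigned int *apicid)
{
        if (!mask)
                return -EINVAL;         /* no valid target cpu */
        *apicid = (unsigned int)__builtin_ctzl(mask);
        return 0;
}

int main(void)
{
        unsigned int id;
        int err = mask_to_apicid(0xf0, &id);

        if (err)
                fprintf(stderr, "no target: %d\n", err);
        else
                printf("apicid %u\n", id);      /* prints 4 */
        return 0;
}
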
2366static void ack_apic_edge(struct irq_data *data) 2394static void ack_apic_edge(struct irq_data *data)
2367{ 2395{
2368 irq_complete_move(data->chip_data); 2396 irq_complete_move(data->chip_data);
@@ -2542,9 +2570,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
2542 chip->irq_ack = ir_ack_apic_edge; 2570 chip->irq_ack = ir_ack_apic_edge;
2543 chip->irq_eoi = ir_ack_apic_level; 2571 chip->irq_eoi = ir_ack_apic_level;
2544 2572
2545#ifdef CONFIG_SMP
2546 chip->irq_set_affinity = set_remapped_irq_affinity; 2573 chip->irq_set_affinity = set_remapped_irq_affinity;
2547#endif
2548} 2574}
2549#endif /* CONFIG_IRQ_REMAP */ 2575#endif /* CONFIG_IRQ_REMAP */
2550 2576
@@ -2555,9 +2581,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
2555 .irq_unmask = unmask_ioapic_irq, 2581 .irq_unmask = unmask_ioapic_irq,
2556 .irq_ack = ack_apic_edge, 2582 .irq_ack = ack_apic_edge,
2557 .irq_eoi = ack_apic_level, 2583 .irq_eoi = ack_apic_level,
2558#ifdef CONFIG_SMP
2559 .irq_set_affinity = ioapic_set_affinity, 2584 .irq_set_affinity = ioapic_set_affinity,
2560#endif
2561 .irq_retrigger = ioapic_retrigger_irq, 2585 .irq_retrigger = ioapic_retrigger_irq,
2562}; 2586};
2563 2587
@@ -3039,7 +3063,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3039 if (err) 3063 if (err)
3040 return err; 3064 return err;
3041 3065
3042 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3066 err = apic->cpu_mask_to_apicid_and(cfg->domain,
3067 apic->target_cpus(), &dest);
3068 if (err)
3069 return err;
3043 3070
3044 if (irq_remapped(cfg)) { 3071 if (irq_remapped(cfg)) {
3045 compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); 3072 compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
@@ -3073,7 +3100,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3073 return err; 3100 return err;
3074} 3101}
3075 3102
3076#ifdef CONFIG_SMP
3077static int 3103static int
3078msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) 3104msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3079{ 3105{
@@ -3095,7 +3121,6 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3095 3121
3096 return IRQ_SET_MASK_OK_NOCOPY; 3122 return IRQ_SET_MASK_OK_NOCOPY;
3097} 3123}
3098#endif /* CONFIG_SMP */
3099 3124
3100/* 3125/*
3101 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, 3126 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
@@ -3106,9 +3131,7 @@ static struct irq_chip msi_chip = {
3106 .irq_unmask = unmask_msi_irq, 3131 .irq_unmask = unmask_msi_irq,
3107 .irq_mask = mask_msi_irq, 3132 .irq_mask = mask_msi_irq,
3108 .irq_ack = ack_apic_edge, 3133 .irq_ack = ack_apic_edge,
3109#ifdef CONFIG_SMP
3110 .irq_set_affinity = msi_set_affinity, 3134 .irq_set_affinity = msi_set_affinity,
3111#endif
3112 .irq_retrigger = ioapic_retrigger_irq, 3135 .irq_retrigger = ioapic_retrigger_irq,
3113}; 3136};
3114 3137
@@ -3193,7 +3216,6 @@ void native_teardown_msi_irq(unsigned int irq)
3193} 3216}
3194 3217
3195#ifdef CONFIG_DMAR_TABLE 3218#ifdef CONFIG_DMAR_TABLE
3196#ifdef CONFIG_SMP
3197static int 3219static int
3198dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, 3220dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3199 bool force) 3221 bool force)
@@ -3218,16 +3240,12 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3218 return IRQ_SET_MASK_OK_NOCOPY; 3240 return IRQ_SET_MASK_OK_NOCOPY;
3219} 3241}
3220 3242
3221#endif /* CONFIG_SMP */
3222
3223static struct irq_chip dmar_msi_type = { 3243static struct irq_chip dmar_msi_type = {
3224 .name = "DMAR_MSI", 3244 .name = "DMAR_MSI",
3225 .irq_unmask = dmar_msi_unmask, 3245 .irq_unmask = dmar_msi_unmask,
3226 .irq_mask = dmar_msi_mask, 3246 .irq_mask = dmar_msi_mask,
3227 .irq_ack = ack_apic_edge, 3247 .irq_ack = ack_apic_edge,
3228#ifdef CONFIG_SMP
3229 .irq_set_affinity = dmar_msi_set_affinity, 3248 .irq_set_affinity = dmar_msi_set_affinity,
3230#endif
3231 .irq_retrigger = ioapic_retrigger_irq, 3249 .irq_retrigger = ioapic_retrigger_irq,
3232}; 3250};
3233 3251
@@ -3248,7 +3266,6 @@ int arch_setup_dmar_msi(unsigned int irq)
3248 3266
3249#ifdef CONFIG_HPET_TIMER 3267#ifdef CONFIG_HPET_TIMER
3250 3268
3251#ifdef CONFIG_SMP
3252static int hpet_msi_set_affinity(struct irq_data *data, 3269static int hpet_msi_set_affinity(struct irq_data *data,
3253 const struct cpumask *mask, bool force) 3270 const struct cpumask *mask, bool force)
3254{ 3271{
@@ -3271,16 +3288,12 @@ static int hpet_msi_set_affinity(struct irq_data *data,
3271 return IRQ_SET_MASK_OK_NOCOPY; 3288 return IRQ_SET_MASK_OK_NOCOPY;
3272} 3289}
3273 3290
3274#endif /* CONFIG_SMP */
3275
3276static struct irq_chip hpet_msi_type = { 3291static struct irq_chip hpet_msi_type = {
3277 .name = "HPET_MSI", 3292 .name = "HPET_MSI",
3278 .irq_unmask = hpet_msi_unmask, 3293 .irq_unmask = hpet_msi_unmask,
3279 .irq_mask = hpet_msi_mask, 3294 .irq_mask = hpet_msi_mask,
3280 .irq_ack = ack_apic_edge, 3295 .irq_ack = ack_apic_edge,
3281#ifdef CONFIG_SMP
3282 .irq_set_affinity = hpet_msi_set_affinity, 3296 .irq_set_affinity = hpet_msi_set_affinity,
3283#endif
3284 .irq_retrigger = ioapic_retrigger_irq, 3297 .irq_retrigger = ioapic_retrigger_irq,
3285}; 3298};
3286 3299
@@ -3315,8 +3328,6 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3315 */ 3328 */
3316#ifdef CONFIG_HT_IRQ 3329#ifdef CONFIG_HT_IRQ
3317 3330
3318#ifdef CONFIG_SMP
3319
3320static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) 3331static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3321{ 3332{
3322 struct ht_irq_msg msg; 3333 struct ht_irq_msg msg;
@@ -3344,22 +3355,20 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3344 return IRQ_SET_MASK_OK_NOCOPY; 3355 return IRQ_SET_MASK_OK_NOCOPY;
3345} 3356}
3346 3357
3347#endif
3348
3349static struct irq_chip ht_irq_chip = { 3358static struct irq_chip ht_irq_chip = {
3350 .name = "PCI-HT", 3359 .name = "PCI-HT",
3351 .irq_mask = mask_ht_irq, 3360 .irq_mask = mask_ht_irq,
3352 .irq_unmask = unmask_ht_irq, 3361 .irq_unmask = unmask_ht_irq,
3353 .irq_ack = ack_apic_edge, 3362 .irq_ack = ack_apic_edge,
3354#ifdef CONFIG_SMP
3355 .irq_set_affinity = ht_set_affinity, 3363 .irq_set_affinity = ht_set_affinity,
3356#endif
3357 .irq_retrigger = ioapic_retrigger_irq, 3364 .irq_retrigger = ioapic_retrigger_irq,
3358}; 3365};
3359 3366
3360int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) 3367int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3361{ 3368{
3362 struct irq_cfg *cfg; 3369 struct irq_cfg *cfg;
3370 struct ht_irq_msg msg;
3371 unsigned dest;
3363 int err; 3372 int err;
3364 3373
3365 if (disable_apic) 3374 if (disable_apic)
@@ -3367,36 +3376,37 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3367 3376
3368 cfg = irq_cfg(irq); 3377 cfg = irq_cfg(irq);
3369 err = assign_irq_vector(irq, cfg, apic->target_cpus()); 3378 err = assign_irq_vector(irq, cfg, apic->target_cpus());
3370 if (!err) { 3379 if (err)
3371 struct ht_irq_msg msg; 3380 return err;
3372 unsigned dest; 3381
3382 err = apic->cpu_mask_to_apicid_and(cfg->domain,
3383 apic->target_cpus(), &dest);
3384 if (err)
3385 return err;
3373 3386
3374 dest = apic->cpu_mask_to_apicid_and(cfg->domain, 3387 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
3375 apic->target_cpus());
3376 3388
3377 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); 3389 msg.address_lo =
3390 HT_IRQ_LOW_BASE |
3391 HT_IRQ_LOW_DEST_ID(dest) |
3392 HT_IRQ_LOW_VECTOR(cfg->vector) |
3393 ((apic->irq_dest_mode == 0) ?
3394 HT_IRQ_LOW_DM_PHYSICAL :
3395 HT_IRQ_LOW_DM_LOGICAL) |
3396 HT_IRQ_LOW_RQEOI_EDGE |
3397 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3398 HT_IRQ_LOW_MT_FIXED :
3399 HT_IRQ_LOW_MT_ARBITRATED) |
3400 HT_IRQ_LOW_IRQ_MASKED;
3378 3401
3379 msg.address_lo = 3402 write_ht_irq_msg(irq, &msg);
3380 HT_IRQ_LOW_BASE |
3381 HT_IRQ_LOW_DEST_ID(dest) |
3382 HT_IRQ_LOW_VECTOR(cfg->vector) |
3383 ((apic->irq_dest_mode == 0) ?
3384 HT_IRQ_LOW_DM_PHYSICAL :
3385 HT_IRQ_LOW_DM_LOGICAL) |
3386 HT_IRQ_LOW_RQEOI_EDGE |
3387 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3388 HT_IRQ_LOW_MT_FIXED :
3389 HT_IRQ_LOW_MT_ARBITRATED) |
3390 HT_IRQ_LOW_IRQ_MASKED;
3391 3403
3392 write_ht_irq_msg(irq, &msg); 3404 irq_set_chip_and_handler_name(irq, &ht_irq_chip,
3405 handle_edge_irq, "edge");
3393 3406
3394 irq_set_chip_and_handler_name(irq, &ht_irq_chip, 3407 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
3395 handle_edge_irq, "edge");
3396 3408
3397 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); 3409 return 0;
3398 }
3399 return err;
3400} 3410}
3401#endif /* CONFIG_HT_IRQ */ 3411#endif /* CONFIG_HT_IRQ */
3402 3412
@@ -3564,7 +3574,8 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id)
3564 3574
3565 /* Sanity check */ 3575 /* Sanity check */
3566 if (reg_00.bits.ID != apic_id) { 3576 if (reg_00.bits.ID != apic_id) {
3567 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); 3577 pr_err("IOAPIC[%d]: Unable to change apic_id!\n",
3578 ioapic);
3568 return -1; 3579 return -1;
3569 } 3580 }
3570 } 3581 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index f00a68cca37..d661ee95cab 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -406,16 +406,13 @@ static inline int numaq_check_phys_apicid_present(int phys_apicid)
406 * We use physical apicids here, not logical, so just return the default 406 * We use physical apicids here, not logical, so just return the default
407 * physical broadcast to stop people from breaking us 407 * physical broadcast to stop people from breaking us
408 */ 408 */
409static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask) 409static int
410{
411 return 0x0F;
412}
413
414static inline unsigned int
415numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 410numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
416 const struct cpumask *andmask) 411 const struct cpumask *andmask,
412 unsigned int *apicid)
417{ 413{
418 return 0x0F; 414 *apicid = 0x0F;
415 return 0;
419} 416}
420 417
421/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ 418/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
@@ -441,20 +438,6 @@ static int probe_numaq(void)
441 return found_numaq; 438 return found_numaq;
442} 439}
443 440
444static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask)
445{
446 /* Careful. Some cpus do not strictly honor the set of cpus
447 * specified in the interrupt destination when using lowest
448 * priority interrupt delivery mode.
449 *
450 * In particular there was a hyperthreading cpu observed to
451 * deliver interrupts to the wrong hyperthread when only one
452	 * hyperthread was specified in the interrupt destination.
453 */
454 cpumask_clear(retmask);
455 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
456}
457
458static void numaq_setup_portio_remap(void) 441static void numaq_setup_portio_remap(void)
459{ 442{
460 int num_quads = num_online_nodes(); 443 int num_quads = num_online_nodes();
@@ -491,7 +474,7 @@ static struct apic __refdata apic_numaq = {
491 .check_apicid_used = numaq_check_apicid_used, 474 .check_apicid_used = numaq_check_apicid_used,
492 .check_apicid_present = numaq_check_apicid_present, 475 .check_apicid_present = numaq_check_apicid_present,
493 476
494 .vector_allocation_domain = numaq_vector_allocation_domain, 477 .vector_allocation_domain = flat_vector_allocation_domain,
495 .init_apic_ldr = numaq_init_apic_ldr, 478 .init_apic_ldr = numaq_init_apic_ldr,
496 479
497 .ioapic_phys_id_map = numaq_ioapic_phys_id_map, 480 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
@@ -509,7 +492,6 @@ static struct apic __refdata apic_numaq = {
509 .set_apic_id = NULL, 492 .set_apic_id = NULL,
510 .apic_id_mask = 0x0F << 24, 493 .apic_id_mask = 0x0F << 24,
511 494
512 .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
513 .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and, 495 .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
514 496
515 .send_IPI_mask = numaq_send_IPI_mask, 497 .send_IPI_mask = numaq_send_IPI_mask,
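
The numaq_vector_allocation_domain() removed above was byte-for-byte identical to the copies deleted from probe_32.c and summit_32.c below; all three drivers now share one flat_vector_allocation_domain(). A simplified sketch of the consolidation, with a plain unsigned mask standing in for struct cpumask:

#include <stdio.h>

#define APIC_ALL_CPUS 0xffu

struct apic_ops {
        void (*vector_allocation_domain)(int cpu, unsigned int *retmask);
};

/* One shared copy: the whole mask forms the domain, because some
 * lowest-priority-delivery parts misroute when a single hyperthread
 * is named as the destination. */
static void flat_vector_allocation_domain(int cpu, unsigned int *retmask)
{
        (void)cpu;
        *retmask = APIC_ALL_CPUS;
}

static struct apic_ops apic_numaq_like  = { flat_vector_allocation_domain };
static struct apic_ops apic_summit_like = { flat_vector_allocation_domain };

int main(void)
{
        unsigned int mask;

        apic_numaq_like.vector_allocation_domain(0, &mask);
        printf("domain mask %#x\n", mask);
        return 0;
}
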
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 1b291da09e6..eb35ef9ee63 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -66,21 +66,6 @@ static void setup_apic_flat_routing(void)
66#endif 66#endif
67} 67}
68 68
69static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
70{
71 /*
72 * Careful. Some cpus do not strictly honor the set of cpus
73 * specified in the interrupt destination when using lowest
74 * priority interrupt delivery mode.
75 *
76 * In particular there was a hyperthreading cpu observed to
77 * deliver interrupts to the wrong hyperthread when only one
78	 * hyperthread was specified in the interrupt destination.
79 */
80 cpumask_clear(retmask);
81 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
82}
83
84/* should be called last. */ 69/* should be called last. */
85static int probe_default(void) 70static int probe_default(void)
86{ 71{
@@ -105,7 +90,7 @@ static struct apic apic_default = {
105 .check_apicid_used = default_check_apicid_used, 90 .check_apicid_used = default_check_apicid_used,
106 .check_apicid_present = default_check_apicid_present, 91 .check_apicid_present = default_check_apicid_present,
107 92
108 .vector_allocation_domain = default_vector_allocation_domain, 93 .vector_allocation_domain = flat_vector_allocation_domain,
109 .init_apic_ldr = default_init_apic_ldr, 94 .init_apic_ldr = default_init_apic_ldr,
110 95
111 .ioapic_phys_id_map = default_ioapic_phys_id_map, 96 .ioapic_phys_id_map = default_ioapic_phys_id_map,
@@ -123,8 +108,7 @@ static struct apic apic_default = {
123 .set_apic_id = NULL, 108 .set_apic_id = NULL,
124 .apic_id_mask = 0x0F << 24, 109 .apic_id_mask = 0x0F << 24,
125 110
126 .cpu_mask_to_apicid = default_cpu_mask_to_apicid, 111 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
127 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
128 112
129 .send_IPI_mask = default_send_IPI_mask_logical, 113 .send_IPI_mask = default_send_IPI_mask_logical,
130 .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, 114 .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
@@ -208,6 +192,9 @@ void __init default_setup_apic_routing(void)
208 192
209 if (apic->setup_apic_routing) 193 if (apic->setup_apic_routing)
210 apic->setup_apic_routing(); 194 apic->setup_apic_routing();
195
196 if (x86_platform.apic_post_init)
197 x86_platform.apic_post_init();
211} 198}
212 199
213void __init generic_apic_probe(void) 200void __init generic_apic_probe(void)
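
default_setup_apic_routing() now finishes by invoking an optional x86_platform.apic_post_init() hook (here and in probe_64.c below), so platform quirks such as the vSMP phys_pkg_id override register a callback instead of being hard-coded into the probe path. A userspace sketch of the hook shape, with an invented quirk:

#include <stdio.h>

struct x86_platform_ops {
        void (*apic_post_init)(void);
};

static struct x86_platform_ops x86_platform;

static void vsmp_apic_post_init(void)
{
        printf("vsmp: updating phys_pkg_id\n"); /* the quirk lives here */
}

static void default_setup_apic_routing(void)
{
        /* ... choose the apic driver ... */
        if (x86_platform.apic_post_init)
                x86_platform.apic_post_init();
}

int main(void)
{
        x86_platform.apic_post_init = vsmp_apic_post_init;
        default_setup_apic_routing();
        return 0;
}
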
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 3fe98669892..1793dba7a74 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,11 +23,6 @@
23#include <asm/ipi.h> 23#include <asm/ipi.h>
24#include <asm/setup.h> 24#include <asm/setup.h>
25 25
26static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
27{
28 return hard_smp_processor_id() >> index_msb;
29}
30
31/* 26/*
32 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 27 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
33 */ 28 */
@@ -48,10 +43,8 @@ void __init default_setup_apic_routing(void)
48 } 43 }
49 } 44 }
50 45
51 if (is_vsmp_box()) { 46 if (x86_platform.apic_post_init)
52 /* need to update phys_pkg_id */ 47 x86_platform.apic_post_init();
53 apic->phys_pkg_id = apicid_phys_pkg_id;
54 }
55} 48}
56 49
57/* Same for both flat and physical. */ 50/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 659897c0075..77c95c0e1bf 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -26,6 +26,8 @@
26 * 26 *
27 */ 27 */
28 28
29#define pr_fmt(fmt) "summit: %s: " fmt, __func__
30
29#include <linux/mm.h> 31#include <linux/mm.h>
30#include <linux/init.h> 32#include <linux/init.h>
31#include <asm/io.h> 33#include <asm/io.h>
@@ -235,8 +237,8 @@ static int summit_apic_id_registered(void)
235 237
236static void summit_setup_apic_routing(void) 238static void summit_setup_apic_routing(void)
237{ 239{
238 printk("Enabling APIC mode: Summit. Using %d I/O APICs\n", 240 pr_info("Enabling APIC mode: Summit. Using %d I/O APICs\n",
239 nr_ioapics); 241 nr_ioapics);
240} 242}
241 243
242static int summit_cpu_present_to_apicid(int mps_cpu) 244static int summit_cpu_present_to_apicid(int mps_cpu)
@@ -263,43 +265,48 @@ static int summit_check_phys_apicid_present(int physical_apicid)
263 return 1; 265 return 1;
264} 266}
265 267
266static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) 268static inline int
269summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
267{ 270{
268 unsigned int round = 0; 271 unsigned int round = 0;
269 int cpu, apicid = 0; 272 unsigned int cpu, apicid = 0;
270 273
271 /* 274 /*
272 * The cpus in the mask must all be on the apic cluster. 275 * The cpus in the mask must all be on the apic cluster.
273 */ 276 */
274 for_each_cpu(cpu, cpumask) { 277 for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
275 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); 278 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
276 279
277 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 280 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
278 printk("%s: Not a valid mask!\n", __func__); 281 pr_err("Not a valid mask!\n");
279 return BAD_APICID; 282 return -EINVAL;
280 } 283 }
281 apicid |= new_apicid; 284 apicid |= new_apicid;
282 round++; 285 round++;
283 } 286 }
284 return apicid; 287 if (!round)
288 return -EINVAL;
289 *dest_id = apicid;
290 return 0;
285} 291}
286 292
287static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, 293static int
288 const struct cpumask *andmask) 294summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
295 const struct cpumask *andmask,
296 unsigned int *apicid)
289{ 297{
290 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
291 cpumask_var_t cpumask; 298 cpumask_var_t cpumask;
299 *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
292 300
293 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 301 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
294 return apicid; 302 return 0;
295 303
296 cpumask_and(cpumask, inmask, andmask); 304 cpumask_and(cpumask, inmask, andmask);
297 cpumask_and(cpumask, cpumask, cpu_online_mask); 305 summit_cpu_mask_to_apicid(cpumask, apicid);
298 apicid = summit_cpu_mask_to_apicid(cpumask);
299 306
300 free_cpumask_var(cpumask); 307 free_cpumask_var(cpumask);
301 308
302 return apicid; 309 return 0;
303} 310}
304 311
305/* 312/*
@@ -320,20 +327,6 @@ static int probe_summit(void)
320 return 0; 327 return 0;
321} 328}
322 329
323static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask)
324{
325 /* Careful. Some cpus do not strictly honor the set of cpus
326 * specified in the interrupt destination when using lowest
327 * priority interrupt delivery mode.
328 *
329 * In particular there was a hyperthreading cpu observed to
330 * deliver interrupts to the wrong hyperthread when only one
331	 * hyperthread was specified in the interrupt destination.
332 */
333 cpumask_clear(retmask);
334 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
335}
336
337#ifdef CONFIG_X86_SUMMIT_NUMA 330#ifdef CONFIG_X86_SUMMIT_NUMA
338static struct rio_table_hdr *rio_table_hdr; 331static struct rio_table_hdr *rio_table_hdr;
339static struct scal_detail *scal_devs[MAX_NUMNODES]; 332static struct scal_detail *scal_devs[MAX_NUMNODES];
@@ -355,7 +348,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
355 } 348 }
356 } 349 }
357 if (i == rio_table_hdr->num_rio_dev) { 350 if (i == rio_table_hdr->num_rio_dev) {
358 printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__); 351 pr_err("Couldn't find owner Cyclone for Winnipeg!\n");
359 return last_bus; 352 return last_bus;
360 } 353 }
361 354
@@ -366,7 +359,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
366 } 359 }
367 } 360 }
368 if (i == rio_table_hdr->num_scal_dev) { 361 if (i == rio_table_hdr->num_scal_dev) {
369 printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__); 362 pr_err("Couldn't find owner Twister for Cyclone!\n");
370 return last_bus; 363 return last_bus;
371 } 364 }
372 365
@@ -396,7 +389,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
396 num_buses = 9; 389 num_buses = 9;
397 break; 390 break;
398 default: 391 default:
399 printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__); 392 pr_info("Unsupported Winnipeg type!\n");
400 return last_bus; 393 return last_bus;
401 } 394 }
402 395
@@ -411,13 +404,15 @@ static int build_detail_arrays(void)
411 int i, scal_detail_size, rio_detail_size; 404 int i, scal_detail_size, rio_detail_size;
412 405
413 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { 406 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
414 printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); 407 pr_warn("MAX_NUMNODES too low! Defined as %d, but system has %d nodes\n",
408 MAX_NUMNODES, rio_table_hdr->num_scal_dev);
415 return 0; 409 return 0;
416 } 410 }
417 411
418 switch (rio_table_hdr->version) { 412 switch (rio_table_hdr->version) {
419 default: 413 default:
420 printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version); 414 pr_warn("Invalid Rio Grande Table Version: %d\n",
415 rio_table_hdr->version);
421 return 0; 416 return 0;
422 case 2: 417 case 2:
423 scal_detail_size = 11; 418 scal_detail_size = 11;
@@ -462,7 +457,7 @@ void setup_summit(void)
462 offset = *((unsigned short *)(ptr + offset)); 457 offset = *((unsigned short *)(ptr + offset));
463 } 458 }
464 if (!rio_table_hdr) { 459 if (!rio_table_hdr) {
465 printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__); 460 pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n");
466 return; 461 return;
467 } 462 }
468 463
@@ -509,7 +504,7 @@ static struct apic apic_summit = {
509 .check_apicid_used = summit_check_apicid_used, 504 .check_apicid_used = summit_check_apicid_used,
510 .check_apicid_present = summit_check_apicid_present, 505 .check_apicid_present = summit_check_apicid_present,
511 506
512 .vector_allocation_domain = summit_vector_allocation_domain, 507 .vector_allocation_domain = flat_vector_allocation_domain,
513 .init_apic_ldr = summit_init_apic_ldr, 508 .init_apic_ldr = summit_init_apic_ldr,
514 509
515 .ioapic_phys_id_map = summit_ioapic_phys_id_map, 510 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
@@ -527,7 +522,6 @@ static struct apic apic_summit = {
527 .set_apic_id = NULL, 522 .set_apic_id = NULL,
528 .apic_id_mask = 0xFF << 24, 523 .apic_id_mask = 0xFF << 24,
529 524
530 .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
531 .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and, 525 .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
532 526
533 .send_IPI_mask = summit_send_IPI_mask, 527 .send_IPI_mask = summit_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index ff35cff0e1a..c88baa4ff0e 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -81,7 +81,7 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
81} 81}
82 82
83static void 83static void
84 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 84x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
85{ 85{
86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); 86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
87} 87}
@@ -96,36 +96,37 @@ static void x2apic_send_IPI_all(int vector)
96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); 96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
97} 97}
98 98
99static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 99static int
100x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
101 const struct cpumask *andmask,
102 unsigned int *apicid)
100{ 103{
101 /* 104 u32 dest = 0;
102 * We're using fixed IRQ delivery, can only return one logical APIC ID. 105 u16 cluster;
103 * May as well be the first. 106 int i;
104 */
105 int cpu = cpumask_first(cpumask);
106 107
107 if ((unsigned)cpu < nr_cpu_ids) 108 for_each_cpu_and(i, cpumask, andmask) {
108 return per_cpu(x86_cpu_to_logical_apicid, cpu); 109 if (!cpumask_test_cpu(i, cpu_online_mask))
109 else 110 continue;
110 return BAD_APICID; 111 dest = per_cpu(x86_cpu_to_logical_apicid, i);
111} 112 cluster = x2apic_cluster(i);
113 break;
114 }
112 115
113static unsigned int 116 if (!dest)
114x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 117 return -EINVAL;
115 const struct cpumask *andmask)
116{
117 int cpu;
118 118
119 /* 119 for_each_cpu_and(i, cpumask, andmask) {
120 * We're using fixed IRQ delivery, can only return one logical APIC ID. 120 if (!cpumask_test_cpu(i, cpu_online_mask))
121 * May as well be the first. 121 continue;
122 */ 122 if (cluster != x2apic_cluster(i))
123 for_each_cpu_and(cpu, cpumask, andmask) { 123 continue;
124 if (cpumask_test_cpu(cpu, cpu_online_mask)) 124 dest |= per_cpu(x86_cpu_to_logical_apicid, i);
125 break;
126 } 125 }
127 126
128 return per_cpu(x86_cpu_to_logical_apicid, cpu); 127 *apicid = dest;
128
129 return 0;
129} 130}
130 131
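
The rewritten x2apic_cpu_mask_to_apicid_and() above picks the cluster of the first online cpu in the mask, then ORs together the logical ids of all mask members in that same cluster. A userspace sketch assuming every cpu is online, with made-up per-cpu tables:

#include <errno.h>
#include <stdio.h>

#define NCPUS 8

static unsigned int logical_id[NCPUS];
static unsigned int cluster_of[NCPUS];

static int mask_to_apicid(unsigned int mask, unsigned int *apicid)
{
        unsigned int dest = 0, cluster = 0;
        int i;

        for (i = 0; i < NCPUS; i++)             /* first cpu decides */
                if (mask & (1u << i)) {
                        dest = logical_id[i];
                        cluster = cluster_of[i];
                        break;
                }
        if (!dest)
                return -EINVAL;

        for (i = 0; i < NCPUS; i++)             /* same-cluster peers */
                if ((mask & (1u << i)) && cluster_of[i] == cluster)
                        dest |= logical_id[i];

        *apicid = dest;
        return 0;
}

int main(void)
{
        unsigned int id;
        int i;

        for (i = 0; i < NCPUS; i++) {           /* two 4-cpu clusters */
                logical_id[i] = 1u << (i % 4);
                cluster_of[i] = i / 4;
        }
        if (!mask_to_apicid(0x36, &id))         /* cpus 1,2,4,5 */
                printf("dest %#x\n", id);       /* 0x6: cluster 0 only */
        return 0;
}
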
131static void init_x2apic_ldr(void) 132static void init_x2apic_ldr(void)
@@ -208,6 +209,32 @@ static int x2apic_cluster_probe(void)
208 return 0; 209 return 0;
209} 210}
210 211
212static const struct cpumask *x2apic_cluster_target_cpus(void)
213{
214 return cpu_all_mask;
215}
216
217/*
218 * Each x2apic cluster is an allocation domain.
219 */
220static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
221 const struct cpumask *mask)
222{
223 /*
224 * To minimize vector pressure, default case of boot, device bringup
225 * etc will use a single cpu for the interrupt destination.
226 *
227 * On explicit migration requests coming from irqbalance etc,
228 * interrupts will be routed to the x2apic cluster (cluster-id
229 * derived from the first cpu in the mask) members specified
230 * in the mask.
231 */
232 if (mask == x2apic_cluster_target_cpus())
233 cpumask_copy(retmask, cpumask_of(cpu));
234 else
235 cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
236}
237
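
cluster_vector_allocation_domain() above encodes a two-mode policy: the default bringup case (mask == target_cpus()) pins the interrupt to one cpu to conserve vectors, while an explicit affinity request is intersected with the requesting cpu's cluster. A sketch with bitmask cpumasks and a hypothetical two-cluster layout:

#include <stdio.h>

#define NCPUS 8

static unsigned int cpus_in_cluster[NCPUS];     /* per-cpu cluster mask */
static const unsigned int all_cpus = (1u << NCPUS) - 1;

static unsigned int vector_domain(int cpu, unsigned int requested)
{
        if (requested == all_cpus)              /* default/boot case */
                return 1u << cpu;               /* single-cpu domain */
        return requested & cpus_in_cluster[cpu]; /* cluster members only */
}

int main(void)
{
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)       /* two 4-cpu clusters */
                cpus_in_cluster[cpu] = cpu < 4 ? 0x0fu : 0xf0u;

        printf("default: %#x\n", vector_domain(2, all_cpus));
        printf("explicit 0x3c: %#x\n", vector_domain(2, 0x3c));
        return 0;
}
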
211static struct apic apic_x2apic_cluster = { 238static struct apic apic_x2apic_cluster = {
212 239
213 .name = "cluster x2apic", 240 .name = "cluster x2apic",
@@ -219,13 +246,13 @@ static struct apic apic_x2apic_cluster = {
219 .irq_delivery_mode = dest_LowestPrio, 246 .irq_delivery_mode = dest_LowestPrio,
220 .irq_dest_mode = 1, /* logical */ 247 .irq_dest_mode = 1, /* logical */
221 248
222 .target_cpus = x2apic_target_cpus, 249 .target_cpus = x2apic_cluster_target_cpus,
223 .disable_esr = 0, 250 .disable_esr = 0,
224 .dest_logical = APIC_DEST_LOGICAL, 251 .dest_logical = APIC_DEST_LOGICAL,
225 .check_apicid_used = NULL, 252 .check_apicid_used = NULL,
226 .check_apicid_present = NULL, 253 .check_apicid_present = NULL,
227 254
228 .vector_allocation_domain = x2apic_vector_allocation_domain, 255 .vector_allocation_domain = cluster_vector_allocation_domain,
229 .init_apic_ldr = init_x2apic_ldr, 256 .init_apic_ldr = init_x2apic_ldr,
230 257
231 .ioapic_phys_id_map = NULL, 258 .ioapic_phys_id_map = NULL,
@@ -243,7 +270,6 @@ static struct apic apic_x2apic_cluster = {
243 .set_apic_id = x2apic_set_apic_id, 270 .set_apic_id = x2apic_set_apic_id,
244 .apic_id_mask = 0xFFFFFFFFu, 271 .apic_id_mask = 0xFFFFFFFFu,
245 272
246 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
247 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, 273 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
248 274
249 .send_IPI_mask = x2apic_send_IPI_mask, 275 .send_IPI_mask = x2apic_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index c17e982db27..e03a1e180e8 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -76,38 +76,6 @@ static void x2apic_send_IPI_all(int vector)
76 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); 76 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
77} 77}
78 78
79static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
80{
81 /*
82 * We're using fixed IRQ delivery, can only return one phys APIC ID.
83 * May as well be the first.
84 */
85 int cpu = cpumask_first(cpumask);
86
87 if ((unsigned)cpu < nr_cpu_ids)
88 return per_cpu(x86_cpu_to_apicid, cpu);
89 else
90 return BAD_APICID;
91}
92
93static unsigned int
94x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
95 const struct cpumask *andmask)
96{
97 int cpu;
98
99 /*
100 * We're using fixed IRQ delivery, can only return one phys APIC ID.
101 * May as well be the first.
102 */
103 for_each_cpu_and(cpu, cpumask, andmask) {
104 if (cpumask_test_cpu(cpu, cpu_online_mask))
105 break;
106 }
107
108 return per_cpu(x86_cpu_to_apicid, cpu);
109}
110
111static void init_x2apic_ldr(void) 79static void init_x2apic_ldr(void)
112{ 80{
113} 81}
@@ -131,13 +99,13 @@ static struct apic apic_x2apic_phys = {
131 .irq_delivery_mode = dest_Fixed, 99 .irq_delivery_mode = dest_Fixed,
132 .irq_dest_mode = 0, /* physical */ 100 .irq_dest_mode = 0, /* physical */
133 101
134 .target_cpus = x2apic_target_cpus, 102 .target_cpus = online_target_cpus,
135 .disable_esr = 0, 103 .disable_esr = 0,
136 .dest_logical = 0, 104 .dest_logical = 0,
137 .check_apicid_used = NULL, 105 .check_apicid_used = NULL,
138 .check_apicid_present = NULL, 106 .check_apicid_present = NULL,
139 107
140 .vector_allocation_domain = x2apic_vector_allocation_domain, 108 .vector_allocation_domain = default_vector_allocation_domain,
141 .init_apic_ldr = init_x2apic_ldr, 109 .init_apic_ldr = init_x2apic_ldr,
142 110
143 .ioapic_phys_id_map = NULL, 111 .ioapic_phys_id_map = NULL,
@@ -155,8 +123,7 @@ static struct apic apic_x2apic_phys = {
155 .set_apic_id = x2apic_set_apic_id, 123 .set_apic_id = x2apic_set_apic_id,
156 .apic_id_mask = 0xFFFFFFFFu, 124 .apic_id_mask = 0xFFFFFFFFu,
157 125
158 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 126 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
159 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
160 127
161 .send_IPI_mask = x2apic_send_IPI_mask, 128 .send_IPI_mask = x2apic_send_IPI_mask,
162 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, 129 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index c6d03f7a440..8cfade9510a 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -185,17 +185,6 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
185unsigned long sn_rtc_cycles_per_second; 185unsigned long sn_rtc_cycles_per_second;
186EXPORT_SYMBOL(sn_rtc_cycles_per_second); 186EXPORT_SYMBOL(sn_rtc_cycles_per_second);
187 187
188static const struct cpumask *uv_target_cpus(void)
189{
190 return cpu_online_mask;
191}
192
193static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
194{
195 cpumask_clear(retmask);
196 cpumask_set_cpu(cpu, retmask);
197}
198
199static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) 188static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
200{ 189{
201#ifdef CONFIG_SMP 190#ifdef CONFIG_SMP
@@ -280,25 +269,12 @@ static void uv_init_apic_ldr(void)
280{ 269{
281} 270}
282 271
283static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) 272static int
284{
285 /*
286 * We're using fixed IRQ delivery, can only return one phys APIC ID.
287 * May as well be the first.
288 */
289 int cpu = cpumask_first(cpumask);
290
291 if ((unsigned)cpu < nr_cpu_ids)
292 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
293 else
294 return BAD_APICID;
295}
296
297static unsigned int
298uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 273uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
299 const struct cpumask *andmask) 274 const struct cpumask *andmask,
275 unsigned int *apicid)
300{ 276{
301 	int cpu; 	277 	unsigned int cpu;
302 278
303 /* 279 /*
304 * We're using fixed IRQ delivery, can only return one phys APIC ID. 280 * We're using fixed IRQ delivery, can only return one phys APIC ID.
@@ -308,7 +284,13 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
308 if (cpumask_test_cpu(cpu, cpu_online_mask)) 284 if (cpumask_test_cpu(cpu, cpu_online_mask))
309 break; 285 break;
310 } 286 }
311 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; 287
288 if (likely(cpu < nr_cpu_ids)) {
289 *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
290 return 0;
291 }
292
293 return -EINVAL;
312} 294}
313 295
314static unsigned int x2apic_get_apic_id(unsigned long x) 296static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -362,13 +344,13 @@ static struct apic __refdata apic_x2apic_uv_x = {
362 .irq_delivery_mode = dest_Fixed, 344 .irq_delivery_mode = dest_Fixed,
363 .irq_dest_mode = 0, /* physical */ 345 .irq_dest_mode = 0, /* physical */
364 346
365 .target_cpus = uv_target_cpus, 347 .target_cpus = online_target_cpus,
366 .disable_esr = 0, 348 .disable_esr = 0,
367 .dest_logical = APIC_DEST_LOGICAL, 349 .dest_logical = APIC_DEST_LOGICAL,
368 .check_apicid_used = NULL, 350 .check_apicid_used = NULL,
369 .check_apicid_present = NULL, 351 .check_apicid_present = NULL,
370 352
371 .vector_allocation_domain = uv_vector_allocation_domain, 353 .vector_allocation_domain = default_vector_allocation_domain,
372 .init_apic_ldr = uv_init_apic_ldr, 354 .init_apic_ldr = uv_init_apic_ldr,
373 355
374 .ioapic_phys_id_map = NULL, 356 .ioapic_phys_id_map = NULL,
@@ -386,7 +368,6 @@ static struct apic __refdata apic_x2apic_uv_x = {
386 .set_apic_id = set_apic_id, 368 .set_apic_id = set_apic_id,
387 .apic_id_mask = 0xFFFFFFFFu, 369 .apic_id_mask = 0xFFFFFFFFu,
388 370
389 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
390 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, 371 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
391 372
392 .send_IPI_mask = uv_send_IPI_mask, 373 .send_IPI_mask = uv_send_IPI_mask,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 07b0c0db466..d65464e4350 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -201,6 +201,8 @@
201 * http://www.microsoft.com/whdc/archive/amp_12.mspx] 201 * http://www.microsoft.com/whdc/archive/amp_12.mspx]
202 */ 202 */
203 203
204#define pr_fmt(fmt) "apm: " fmt
205
204#include <linux/module.h> 206#include <linux/module.h>
205 207
206#include <linux/poll.h> 208#include <linux/poll.h>
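
The pr_fmt() definition added above is what lets the call sites in the hunks below drop their hand-written "apm: " prefixes: every pr_*() macro passes its format string through pr_fmt() first. A compressed userspace model of the mechanism:

#include <stdio.h>

#define pr_fmt(fmt) "apm: " fmt
#define pr_notice(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
        pr_notice("disabled on user request.\n");  /* "apm: disabled..." */
        return 0;
}
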
@@ -485,11 +487,11 @@ static void apm_error(char *str, int err)
485 if (error_table[i].key == err) 487 if (error_table[i].key == err)
486 break; 488 break;
487 if (i < ERROR_COUNT) 489 if (i < ERROR_COUNT)
488 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); 490 pr_notice("%s: %s\n", str, error_table[i].msg);
489 else if (err < 0) 491 else if (err < 0)
490 printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err); 492 pr_notice("%s: linux error code %i\n", str, err);
491 else 493 else
492 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", 494 pr_notice("%s: unknown error code %#2.2x\n",
493 str, err); 495 str, err);
494} 496}
495 497
@@ -1184,7 +1186,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
1184 static int notified; 1186 static int notified;
1185 1187
1186 if (notified++ == 0) 1188 if (notified++ == 0)
1187 printk(KERN_ERR "apm: an event queue overflowed\n"); 1189 pr_err("an event queue overflowed\n");
1188 if (++as->event_tail >= APM_MAX_EVENTS) 1190 if (++as->event_tail >= APM_MAX_EVENTS)
1189 as->event_tail = 0; 1191 as->event_tail = 0;
1190 } 1192 }
@@ -1447,7 +1449,7 @@ static void apm_mainloop(void)
1447static int check_apm_user(struct apm_user *as, const char *func) 1449static int check_apm_user(struct apm_user *as, const char *func)
1448{ 1450{
1449 if (as == NULL || as->magic != APM_BIOS_MAGIC) { 1451 if (as == NULL || as->magic != APM_BIOS_MAGIC) {
1450 printk(KERN_ERR "apm: %s passed bad filp\n", func); 1452 pr_err("%s passed bad filp\n", func);
1451 return 1; 1453 return 1;
1452 } 1454 }
1453 return 0; 1455 return 0;
@@ -1586,7 +1588,7 @@ static int do_release(struct inode *inode, struct file *filp)
1586 as1 = as1->next) 1588 as1 = as1->next)
1587 ; 1589 ;
1588 if (as1 == NULL) 1590 if (as1 == NULL)
1589 printk(KERN_ERR "apm: filp not in user list\n"); 1591 pr_err("filp not in user list\n");
1590 else 1592 else
1591 as1->next = as->next; 1593 as1->next = as->next;
1592 } 1594 }
@@ -1600,11 +1602,9 @@ static int do_open(struct inode *inode, struct file *filp)
1600 struct apm_user *as; 1602 struct apm_user *as;
1601 1603
1602 as = kmalloc(sizeof(*as), GFP_KERNEL); 1604 as = kmalloc(sizeof(*as), GFP_KERNEL);
1603 if (as == NULL) { 1605 if (as == NULL)
1604 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1605 sizeof(*as));
1606 return -ENOMEM; 1606 return -ENOMEM;
1607 } 1607
1608 as->magic = APM_BIOS_MAGIC; 1608 as->magic = APM_BIOS_MAGIC;
1609 as->event_tail = as->event_head = 0; 1609 as->event_tail = as->event_head = 0;
1610 as->suspends_pending = as->standbys_pending = 0; 1610 as->suspends_pending = as->standbys_pending = 0;
@@ -2313,16 +2313,16 @@ static int __init apm_init(void)
2313 } 2313 }
2314 2314
2315 if (apm_info.disabled) { 2315 if (apm_info.disabled) {
2316 printk(KERN_NOTICE "apm: disabled on user request.\n"); 2316 pr_notice("disabled on user request.\n");
2317 return -ENODEV; 2317 return -ENODEV;
2318 } 2318 }
2319 if ((num_online_cpus() > 1) && !power_off && !smp) { 2319 if ((num_online_cpus() > 1) && !power_off && !smp) {
2320 printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); 2320 pr_notice("disabled - APM is not SMP safe.\n");
2321 apm_info.disabled = 1; 2321 apm_info.disabled = 1;
2322 return -ENODEV; 2322 return -ENODEV;
2323 } 2323 }
2324 if (!acpi_disabled) { 2324 if (!acpi_disabled) {
2325 printk(KERN_NOTICE "apm: overridden by ACPI.\n"); 2325 pr_notice("overridden by ACPI.\n");
2326 apm_info.disabled = 1; 2326 apm_info.disabled = 1;
2327 return -ENODEV; 2327 return -ENODEV;
2328 } 2328 }
@@ -2356,8 +2356,7 @@ static int __init apm_init(void)
2356 2356
2357 kapmd_task = kthread_create(apm, NULL, "kapmd"); 2357 kapmd_task = kthread_create(apm, NULL, "kapmd");
2358 if (IS_ERR(kapmd_task)) { 2358 if (IS_ERR(kapmd_task)) {
2359 printk(KERN_ERR "apm: disabled - Unable to start kernel " 2359 pr_err("disabled - Unable to start kernel thread\n");
2360 "thread.\n");
2361 err = PTR_ERR(kapmd_task); 2360 err = PTR_ERR(kapmd_task);
2362 kapmd_task = NULL; 2361 kapmd_task = NULL;
2363 remove_proc_entry("apm", NULL); 2362 remove_proc_entry("apm", NULL);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 6ab6aa2fdfd..d30a6a9a012 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
14 14
15obj-y := intel_cacheinfo.o scattered.o topology.o 15obj-y := intel_cacheinfo.o scattered.o topology.o
16obj-y += proc.o capflags.o powerflags.o common.o 16obj-y += proc.o capflags.o powerflags.o common.o
17obj-y += vmware.o hypervisor.o sched.o mshyperv.o 17obj-y += vmware.o hypervisor.o mshyperv.o
18obj-y += rdrand.o 18obj-y += rdrand.o
19obj-y += match.o 19obj-y += match.o
20 20
@@ -32,7 +32,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
32 32
33ifdef CONFIG_PERF_EVENTS 33ifdef CONFIG_PERF_EVENTS
34obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o 34obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o
35obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o 35obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o
36obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
37obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o
36endif 38endif
37 39
38obj-$(CONFIG_X86_MCE) += mcheck/ 40obj-$(CONFIG_X86_MCE) += mcheck/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 146bb6218ee..9d92e19039f 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -19,6 +19,39 @@
19 19
20#include "cpu.h" 20#include "cpu.h"
21 21
22static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
23{
24 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
25 u32 gprs[8] = { 0 };
26 int err;
27
28 WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__);
29
30 gprs[1] = msr;
31 gprs[7] = 0x9c5a203a;
32
33 err = rdmsr_safe_regs(gprs);
34
35 *p = gprs[0] | ((u64)gprs[2] << 32);
36
37 return err;
38}
39
40static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
41{
42 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
43 u32 gprs[8] = { 0 };
44
45 WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__);
46
47 gprs[0] = (u32)val;
48 gprs[1] = msr;
49 gprs[2] = val >> 32;
50 gprs[7] = 0x9c5a203a;
51
52 return wrmsr_safe_regs(gprs);
53}
54
22#ifdef CONFIG_X86_32 55#ifdef CONFIG_X86_32
23/* 56/*
24 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 57 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
@@ -586,9 +619,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
586 !cpu_has(c, X86_FEATURE_TOPOEXT)) { 619 !cpu_has(c, X86_FEATURE_TOPOEXT)) {
587 u64 val; 620 u64 val;
588 621
589 if (!rdmsrl_amd_safe(0xc0011005, &val)) { 622 if (!rdmsrl_safe(0xc0011005, &val)) {
590 val |= 1ULL << 54; 623 val |= 1ULL << 54;
591 wrmsrl_amd_safe(0xc0011005, val); 624 wrmsrl_safe(0xc0011005, val);
592 rdmsrl(0xc0011005, val); 625 rdmsrl(0xc0011005, val);
593 if (val & (1ULL << 54)) { 626 if (val & (1ULL << 54)) {
594 set_cpu_cap(c, X86_FEATURE_TOPOEXT); 627 set_cpu_cap(c, X86_FEATURE_TOPOEXT);
@@ -679,7 +712,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
679 err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); 712 err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
680 if (err == 0) { 713 if (err == 0) {
681 mask |= (1 << 10); 714 mask |= (1 << 10);
682 checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); 715 wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);
683 } 716 }
684 } 717 }
685 718
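
rdmsrl_amd_safe()/wrmsrl_amd_safe() move into amd.c above and gain a WARN_ONCE() because the passworded GPR-based access they perform is only meant for K8 (family 0xf); generic callers switch to plain rdmsrl_safe()/wrmsrl_safe(). A userspace sketch of the warn-once guard, with the register access stubbed out:

#include <stdio.h>

static void warn_once(int cond, const char *func)
{
        static int warned;

        if (cond && !warned++)
                fprintf(stderr, "%s should only be used on K8!\n", func);
}

static int cpu_family = 0x15;   /* pretend: not a K8 part */

static int rdmsrl_amd_safe(unsigned int msr, unsigned long long *p)
{
        warn_once(cpu_family != 0xf, __func__);
        (void)msr;
        *p = 0;                 /* stand-in for the passworded read */
        return 0;
}

int main(void)
{
        unsigned long long v;

        rdmsrl_amd_safe(0xc0011005, &v);        /* warns once */
        rdmsrl_amd_safe(0xc0011005, &v);        /* silent */
        return 0;
}
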
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 46674fbb62b..c97bb7b5a9f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -55,8 +55,8 @@ static void __init check_fpu(void)
55 55
56 if (!boot_cpu_data.hard_math) { 56 if (!boot_cpu_data.hard_math) {
57#ifndef CONFIG_MATH_EMULATION 57#ifndef CONFIG_MATH_EMULATION
58 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); 58 pr_emerg("No coprocessor found and no math emulation present\n");
59 printk(KERN_EMERG "Giving up.\n"); 59 pr_emerg("Giving up\n");
60 for (;;) ; 60 for (;;) ;
61#endif 61#endif
62 return; 62 return;
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
86 86
87 boot_cpu_data.fdiv_bug = fdiv_bug; 87 boot_cpu_data.fdiv_bug = fdiv_bug;
88 if (boot_cpu_data.fdiv_bug) 88 if (boot_cpu_data.fdiv_bug)
89 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); 89 pr_warn("Hmm, FPU with FDIV bug\n");
90} 90}
91 91
92static void __init check_hlt(void) 92static void __init check_hlt(void)
@@ -94,16 +94,16 @@ static void __init check_hlt(void)
94 if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) 94 if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
95 return; 95 return;
96 96
97 printk(KERN_INFO "Checking 'hlt' instruction... "); 97 pr_info("Checking 'hlt' instruction... ");
98 if (!boot_cpu_data.hlt_works_ok) { 98 if (!boot_cpu_data.hlt_works_ok) {
99 printk("disabled\n"); 99 pr_cont("disabled\n");
100 return; 100 return;
101 } 101 }
102 halt(); 102 halt();
103 halt(); 103 halt();
104 halt(); 104 halt();
105 halt(); 105 halt();
106 printk(KERN_CONT "OK.\n"); 106 pr_cont("OK\n");
107} 107}
108 108
109/* 109/*
@@ -116,7 +116,7 @@ static void __init check_popad(void)
116#ifndef CONFIG_X86_POPAD_OK 116#ifndef CONFIG_X86_POPAD_OK
117 int res, inp = (int) &res; 117 int res, inp = (int) &res;
118 118
119 printk(KERN_INFO "Checking for popad bug... "); 119 pr_info("Checking for popad bug... ");
120 __asm__ __volatile__( 120 __asm__ __volatile__(
121 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " 121 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
122 : "=&a" (res) 122 : "=&a" (res)
@@ -127,9 +127,9 @@ static void __init check_popad(void)
127 * CPU hard. Too bad. 127 * CPU hard. Too bad.
128 */ 128 */
129 if (res != 12345678) 129 if (res != 12345678)
130 printk(KERN_CONT "Buggy.\n"); 130 pr_cont("Buggy\n");
131 else 131 else
132 printk(KERN_CONT "OK.\n"); 132 pr_cont("OK\n");
133#endif 133#endif
134} 134}
135 135
@@ -161,7 +161,7 @@ void __init check_bugs(void)
161{ 161{
162 identify_boot_cpu(); 162 identify_boot_cpu();
163#ifndef CONFIG_SMP 163#ifndef CONFIG_SMP
164 printk(KERN_INFO "CPU: "); 164 pr_info("CPU: ");
165 print_cpu_info(&boot_cpu_data); 165 print_cpu_info(&boot_cpu_data);
166#endif 166#endif
167 check_config(); 167 check_config();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 82f29e70d05..5bbc082c47a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -947,7 +947,7 @@ static void __cpuinit __print_cpu_msr(void)
947 index_max = msr_range_array[i].max; 947 index_max = msr_range_array[i].max;
948 948
949 for (index = index_min; index < index_max; index++) { 949 for (index = index_min; index < index_max; index++) {
950 if (rdmsrl_amd_safe(index, &val)) 950 if (rdmsrl_safe(index, &val))
951 continue; 951 continue;
952 printk(KERN_INFO " MSR%08x: %016llx\n", index, val); 952 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
953 } 953 }
@@ -1101,14 +1101,20 @@ int is_debug_stack(unsigned long addr)
1101 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); 1101 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
1102} 1102}
1103 1103
1104static DEFINE_PER_CPU(u32, debug_stack_use_ctr);
1105
1104void debug_stack_set_zero(void) 1106void debug_stack_set_zero(void)
1105{ 1107{
1108 this_cpu_inc(debug_stack_use_ctr);
1106 load_idt((const struct desc_ptr *)&nmi_idt_descr); 1109 load_idt((const struct desc_ptr *)&nmi_idt_descr);
1107} 1110}
1108 1111
1109void debug_stack_reset(void) 1112void debug_stack_reset(void)
1110{ 1113{
1111 load_idt((const struct desc_ptr *)&idt_descr); 1114 if (WARN_ON(!this_cpu_read(debug_stack_use_ctr)))
1115 return;
1116 if (this_cpu_dec_return(debug_stack_use_ctr) == 0)
1117 load_idt((const struct desc_ptr *)&idt_descr);
1112} 1118}
1113 1119
1114#else /* CONFIG_X86_64 */ 1120#else /* CONFIG_X86_64 */
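
The hunk above makes the debug-stack IDT switch nestable via a per-CPU counter. A minimal userspace sketch of the same counting idiom; the counter and the two load helpers below are illustrative stand-ins, not the kernel's per-CPU API:

#include <stdio.h>

static unsigned int use_ctr;            /* stands in for debug_stack_use_ctr */

static void load_special_idt(void) { puts("special IDT loaded"); }
static void load_default_idt(void) { puts("default IDT restored"); }

static void debug_stack_set(void)
{
	use_ctr++;                      /* every (possibly nested) entry counts */
	load_special_idt();             /* reloading is harmless if already set */
}

static void debug_stack_reset(void)
{
	if (use_ctr == 0)               /* the kernel WARN_ON()s here */
		return;
	if (--use_ctr == 0)             /* only the outermost exit restores */
		load_default_idt();
}

int main(void)
{
	debug_stack_set();
	debug_stack_set();              /* nested entry */
	debug_stack_reset();            /* no restore yet */
	debug_stack_reset();            /* outermost exit restores the IDT */
	return 0;
}
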
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 755f64fb074..a8f8fa9769d 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -37,6 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
37#endif 37#endif
38 &x86_hyper_vmware, 38 &x86_hyper_vmware,
39 &x86_hyper_ms_hyperv, 39 &x86_hyper_ms_hyperv,
40#ifdef CONFIG_KVM_GUEST
41 &x86_hyper_kvm,
42#endif
40}; 43};
41 44
42const struct hypervisor_x86 *x86_hyper; 45const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 507ea58688e..cd8b166a173 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -42,7 +42,8 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
42 struct mce m; 42 struct mce m;
43 43
44 /* Only corrected MC is reported */ 44 /* Only corrected MC is reported */
45 if (!corrected) 45 if (!corrected || !(mem_err->validation_bits &
46 CPER_MEM_VALID_PHYSICAL_ADDRESS))
46 return; 47 return;
47 48
48 mce_setup(&m); 49 mce_setup(&m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 0c82091b165..413c2ced887 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -126,6 +126,16 @@ static struct severity {
126 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), 126 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
127 USER 127 USER
128 ), 128 ),
129 MCESEV(
130 KEEP, "HT thread notices Action required: instruction fetch error",
131 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
132 MCGMASK(MCG_STATUS_EIPV, 0)
133 ),
134 MCESEV(
135 AR, "Action required: instruction fetch error",
136 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
137 USER
138 ),
129#endif 139#endif
130 MCESEV( 140 MCESEV(
131 PANIC, "Action required: unknown MCACOD", 141 PANIC, "Action required: unknown MCACOD",
@@ -165,15 +175,19 @@ static struct severity {
165}; 175};
166 176
167/* 177/*
168 * If the EIPV bit is set, it means the saved IP is the 178 * If mcgstatus indicated that ip/cs on the stack were
169 * instruction which caused the MCE. 179 * no good, then "m->cs" will be zero and we will have
180 * to assume the worst case (IN_KERNEL) as we actually
181 * have no idea what we were executing when the machine
182 * check hit.
183 * If we do have a good "m->cs" (or a faked one in the
184 * case we were executing in VM86 mode) we can use it to
 185 * distinguish an exception taken in user from one
186 * taken in the kernel.
170 */ 187 */
171static int error_context(struct mce *m) 188static int error_context(struct mce *m)
172{ 189{
173 if (m->mcgstatus & MCG_STATUS_EIPV) 190 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
174 return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
175 /* Unknown, assume kernel */
176 return IN_KERNEL;
177} 191}
178 192
179int mce_severity(struct mce *m, int tolerant, char **msg) 193int mce_severity(struct mce *m, int tolerant, char **msg)
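
The simplified error_context() above keys purely off the privilege level in the saved code segment. A standalone sketch of that test (plain C; the selector values are made up):

#include <stdio.h>

enum context { IN_KERNEL, IN_USER };

static enum context error_context(unsigned short cs)
{
	/* the low two selector bits are the CPL; CPL 3 means user mode */
	return ((cs & 3) == 3) ? IN_USER : IN_KERNEL;
}

int main(void)
{
	printf("cs=0x10 -> %s\n", error_context(0x10) == IN_USER ? "user" : "kernel");
	printf("cs=0x33 -> %s\n", error_context(0x33) == IN_USER ? "user" : "kernel");
	return 0;
}
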
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 2afcbd253e1..5e095f873e3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -7,6 +7,9 @@
7 * Copyright 2008 Intel Corporation 7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen 8 * Author: Andi Kleen
9 */ 9 */
10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
10#include <linux/thread_info.h> 13#include <linux/thread_info.h>
11#include <linux/capability.h> 14#include <linux/capability.h>
12#include <linux/miscdevice.h> 15#include <linux/miscdevice.h>
@@ -57,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
57 60
58int mce_disabled __read_mostly; 61int mce_disabled __read_mostly;
59 62
60#define MISC_MCELOG_MINOR 227
61
62#define SPINUNIT 100 /* 100ns */ 63#define SPINUNIT 100 /* 100ns */
63 64
64atomic_t mce_entry; 65atomic_t mce_entry;
@@ -210,7 +211,7 @@ static void drain_mcelog_buffer(void)
210 cpu_relax(); 211 cpu_relax();
211 212
212 if (!m->finished && retries >= 4) { 213 if (!m->finished && retries >= 4) {
213 pr_err("MCE: skipping error being logged currently!\n"); 214 pr_err("skipping error being logged currently!\n");
214 break; 215 break;
215 } 216 }
216 } 217 }
@@ -437,6 +438,14 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
437 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { 438 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
438 m->ip = regs->ip; 439 m->ip = regs->ip;
439 m->cs = regs->cs; 440 m->cs = regs->cs;
441
442 /*
443 * When in VM86 mode make the cs look like ring 3
444 * always. This is a lie, but it's better than passing
445 * the additional vm86 bit around everywhere.
446 */
447 if (v8086_mode(regs))
448 m->cs |= 3;
440 } 449 }
441 /* Use accurate RIP reporting if available. */ 450 /* Use accurate RIP reporting if available. */
442 if (rip_msr) 451 if (rip_msr)
@@ -641,16 +650,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
641 * Do a quick check if any of the events requires a panic. 650 * Do a quick check if any of the events requires a panic.
642 * This decides if we keep the events around or clear them. 651 * This decides if we keep the events around or clear them.
643 */ 652 */
644static int mce_no_way_out(struct mce *m, char **msg) 653static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
645{ 654{
646 int i; 655 int i, ret = 0;
647 656
648 for (i = 0; i < banks; i++) { 657 for (i = 0; i < banks; i++) {
649 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 658 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
659 if (m->status & MCI_STATUS_VAL)
660 __set_bit(i, validp);
650 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 661 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
651 return 1; 662 ret = 1;
652 } 663 }
653 return 0; 664 return ret;
654} 665}
655 666
656/* 667/*
@@ -1013,6 +1024,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1013 */ 1024 */
1014 int kill_it = 0; 1025 int kill_it = 0;
1015 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 1026 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1027 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1016 char *msg = "Unknown"; 1028 char *msg = "Unknown";
1017 1029
1018 atomic_inc(&mce_entry); 1030 atomic_inc(&mce_entry);
@@ -1027,7 +1039,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1027 final = &__get_cpu_var(mces_seen); 1039 final = &__get_cpu_var(mces_seen);
1028 *final = m; 1040 *final = m;
1029 1041
1030 no_way_out = mce_no_way_out(&m, &msg); 1042 memset(valid_banks, 0, sizeof(valid_banks));
1043 no_way_out = mce_no_way_out(&m, &msg, valid_banks);
1031 1044
1032 barrier(); 1045 barrier();
1033 1046
@@ -1047,6 +1060,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1047 order = mce_start(&no_way_out); 1060 order = mce_start(&no_way_out);
1048 for (i = 0; i < banks; i++) { 1061 for (i = 0; i < banks; i++) {
1049 __clear_bit(i, toclear); 1062 __clear_bit(i, toclear);
1063 if (!test_bit(i, valid_banks))
1064 continue;
1050 if (!mce_banks[i].ctl) 1065 if (!mce_banks[i].ctl)
1051 continue; 1066 continue;
1052 1067
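
The valid_banks bitmap threaded through the two hunks above lets the handler skip banks that never latched an error. A userspace model of the two-pass filter; the bank count and status layout are illustrative:

#include <stdint.h>
#include <stdio.h>

#define MAX_BANKS      32
#define MCI_STATUS_VAL (1ULL << 63)

int main(void)
{
	uint64_t status[MAX_BANKS] = { [2] = MCI_STATUS_VAL | 0x1 };
	uint32_t valid = 0;
	int i;

	/* first pass: note every bank whose STATUS register is valid */
	for (i = 0; i < MAX_BANKS; i++)
		if (status[i] & MCI_STATUS_VAL)
			valid |= 1u << i;               /* ~ __set_bit(i, validp) */

	/* second pass: only touch banks that actually latched something */
	for (i = 0; i < MAX_BANKS; i++) {
		if (!(valid & (1u << i)))               /* ~ !test_bit(i, valid_banks) */
			continue;
		printf("bank %d holds a valid error record\n", i);
	}
	return 0;
}
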
@@ -1153,8 +1168,9 @@ int memory_failure(unsigned long pfn, int vector, int flags)
1153{ 1168{
1154 /* mce_severity() should not hand us an ACTION_REQUIRED error */ 1169 /* mce_severity() should not hand us an ACTION_REQUIRED error */
1155 BUG_ON(flags & MF_ACTION_REQUIRED); 1170 BUG_ON(flags & MF_ACTION_REQUIRED);
1156 printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" 1171 pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1157 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); 1172 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1173 pfn);
1158 1174
1159 return 0; 1175 return 0;
1160} 1176}
@@ -1172,6 +1188,7 @@ void mce_notify_process(void)
1172{ 1188{
1173 unsigned long pfn; 1189 unsigned long pfn;
1174 struct mce_info *mi = mce_find_info(); 1190 struct mce_info *mi = mce_find_info();
1191 int flags = MF_ACTION_REQUIRED;
1175 1192
1176 if (!mi) 1193 if (!mi)
1177 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); 1194 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
@@ -1186,8 +1203,9 @@ void mce_notify_process(void)
1186 * doomed. We still need to mark the page as poisoned and alert any 1203 * doomed. We still need to mark the page as poisoned and alert any
1187 * other users of the page. 1204 * other users of the page.
1188 */ 1205 */
1189 if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || 1206 if (!mi->restartable)
1190 mi->restartable == 0) { 1207 flags |= MF_MUST_KILL;
1208 if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
1191 pr_err("Memory error not recovered"); 1209 pr_err("Memory error not recovered");
1192 force_sig(SIGBUS, current); 1210 force_sig(SIGBUS, current);
1193 } 1211 }
@@ -1237,15 +1255,15 @@ void mce_log_therm_throt_event(__u64 status)
1237 * poller finds an MCE, poll 2x faster. When the poller finds no more 1255 * poller finds an MCE, poll 2x faster. When the poller finds no more
1238 * errors, poll 2x slower (up to check_interval seconds). 1256 * errors, poll 2x slower (up to check_interval seconds).
1239 */ 1257 */
1240static int check_interval = 5 * 60; /* 5 minutes */ 1258static unsigned long check_interval = 5 * 60; /* 5 minutes */
1241 1259
1242static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1260static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1243static DEFINE_PER_CPU(struct timer_list, mce_timer); 1261static DEFINE_PER_CPU(struct timer_list, mce_timer);
1244 1262
1245static void mce_start_timer(unsigned long data) 1263static void mce_timer_fn(unsigned long data)
1246{ 1264{
1247 struct timer_list *t = &per_cpu(mce_timer, data); 1265 struct timer_list *t = &__get_cpu_var(mce_timer);
1248 int *n; 1266 unsigned long iv;
1249 1267
1250 WARN_ON(smp_processor_id() != data); 1268 WARN_ON(smp_processor_id() != data);
1251 1269
@@ -1258,13 +1276,14 @@ static void mce_start_timer(unsigned long data)
1258 * Alert userspace if needed. If we logged an MCE, reduce the 1276 * Alert userspace if needed. If we logged an MCE, reduce the
1259 * polling interval, otherwise increase the polling interval. 1277 * polling interval, otherwise increase the polling interval.
1260 */ 1278 */
1261 n = &__get_cpu_var(mce_next_interval); 1279 iv = __this_cpu_read(mce_next_interval);
1262 if (mce_notify_irq()) 1280 if (mce_notify_irq())
1263 *n = max(*n/2, HZ/100); 1281 iv = max(iv / 2, (unsigned long) HZ/100);
1264 else 1282 else
1265 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1283 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1284 __this_cpu_write(mce_next_interval, iv);
1266 1285
1267 t->expires = jiffies + *n; 1286 t->expires = jiffies + iv;
1268 add_timer_on(t, smp_processor_id()); 1287 add_timer_on(t, smp_processor_id());
1269} 1288}
1270 1289
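
mce_timer_fn()'s interval logic above is plain exponential adaptation: halve the poll period while errors keep arriving, double it while idle, clamped at both ends. The same logic in isolation (HZ and the bounds here are illustrative constants):

#include <stdio.h>

#define HZ 1000UL                       /* illustrative tick rate */

static unsigned long next_interval(unsigned long iv, int logged_mce)
{
	const unsigned long floor   = HZ / 100;         /* fastest: 10ms  */
	const unsigned long ceiling = 5 * 60 * HZ;      /* slowest: 5 min */

	if (logged_mce)
		iv = (iv / 2 > floor) ? iv / 2 : floor;
	else
		iv = (iv * 2 < ceiling) ? iv * 2 : ceiling;
	return iv;
}

int main(void)
{
	unsigned long iv = 5 * 60 * HZ;

	iv = next_interval(iv, 1);      /* an error arrived: poll 2x faster */
	iv = next_interval(iv, 0);      /* quiet again: back off 2x */
	printf("interval now %lu jiffies\n", iv);
	return 0;
}
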
@@ -1343,11 +1362,10 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
1343 1362
1344 b = cap & MCG_BANKCNT_MASK; 1363 b = cap & MCG_BANKCNT_MASK;
1345 if (!banks) 1364 if (!banks)
1346 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1365 pr_info("CPU supports %d MCE banks\n", b);
1347 1366
1348 if (b > MAX_NR_BANKS) { 1367 if (b > MAX_NR_BANKS) {
1349 printk(KERN_WARNING 1368 pr_warn("Using only %u machine check banks out of %u\n",
1350 "MCE: Using only %u machine check banks out of %u\n",
1351 MAX_NR_BANKS, b); 1369 MAX_NR_BANKS, b);
1352 b = MAX_NR_BANKS; 1370 b = MAX_NR_BANKS;
1353 } 1371 }
@@ -1404,7 +1422,7 @@ static void __mcheck_cpu_init_generic(void)
1404static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1422static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1405{ 1423{
1406 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1424 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1407 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1425 pr_info("unknown CPU type - not enabling MCE support\n");
1408 return -EOPNOTSUPP; 1426 return -EOPNOTSUPP;
1409 } 1427 }
1410 1428
@@ -1458,9 +1476,9 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1458 rdmsrl(msrs[i], val); 1476 rdmsrl(msrs[i], val);
1459 1477
1460 /* CntP bit set? */ 1478 /* CntP bit set? */
1461 if (val & BIT(62)) { 1479 if (val & BIT_64(62)) {
1462 val &= ~BIT(62); 1480 val &= ~BIT_64(62);
1463 wrmsrl(msrs[i], val); 1481 wrmsrl(msrs[i], val);
1464 } 1482 }
1465 } 1483 }
1466 1484
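
The BIT_64() switch above matters because on 32-bit kernels BIT() expands to 1UL << n, and shifting a 32-bit long by 62 is undefined. A sketch with a stand-in macro:

#include <stdint.h>
#include <stdio.h>

#define BIT_64(n) (1ULL << (n))         /* always 64-bit, unlike BIT() on i386 */

int main(void)
{
	uint64_t val = BIT_64(62) | 0xff;

	if (val & BIT_64(62)) {         /* CntP bit set? */
		val &= ~BIT_64(62);
		printf("cleared: %#llx\n", (unsigned long long)val);
	}
	return 0;
}
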
@@ -1542,24 +1560,24 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1542static void __mcheck_cpu_init_timer(void) 1560static void __mcheck_cpu_init_timer(void)
1543{ 1561{
1544 struct timer_list *t = &__get_cpu_var(mce_timer); 1562 struct timer_list *t = &__get_cpu_var(mce_timer);
1545 int *n = &__get_cpu_var(mce_next_interval); 1563 unsigned long iv = check_interval * HZ;
1546 1564
1547 setup_timer(t, mce_start_timer, smp_processor_id()); 1565 setup_timer(t, mce_timer_fn, smp_processor_id());
1548 1566
1549 if (mce_ignore_ce) 1567 if (mce_ignore_ce)
1550 return; 1568 return;
1551 1569
1552 *n = check_interval * HZ; 1570 __this_cpu_write(mce_next_interval, iv);
1553 if (!*n) 1571 if (!iv)
1554 return; 1572 return;
1555 t->expires = round_jiffies(jiffies + *n); 1573 t->expires = round_jiffies(jiffies + iv);
1556 add_timer_on(t, smp_processor_id()); 1574 add_timer_on(t, smp_processor_id());
1557} 1575}
1558 1576
1559/* Handle unconfigured int18 (should never happen) */ 1577/* Handle unconfigured int18 (should never happen) */
1560static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1578static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1561{ 1579{
1562 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 1580 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1563 smp_processor_id()); 1581 smp_processor_id());
1564} 1582}
1565 1583
@@ -1878,8 +1896,7 @@ static int __init mcheck_enable(char *str)
1878 get_option(&str, &monarch_timeout); 1896 get_option(&str, &monarch_timeout);
1879 } 1897 }
1880 } else { 1898 } else {
1881 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 1899 pr_info("mce argument %s ignored. Please use /sys\n", str);
1882 str);
1883 return 0; 1900 return 0;
1884 } 1901 }
1885 return 1; 1902 return 1;
@@ -2262,7 +2279,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2262 case CPU_DOWN_FAILED_FROZEN: 2279 case CPU_DOWN_FAILED_FROZEN:
2263 if (!mce_ignore_ce && check_interval) { 2280 if (!mce_ignore_ce && check_interval) {
2264 t->expires = round_jiffies(jiffies + 2281 t->expires = round_jiffies(jiffies +
2265 __get_cpu_var(mce_next_interval)); 2282 per_cpu(mce_next_interval, cpu));
2266 add_timer_on(t, cpu); 2283 add_timer_on(t, cpu);
2267 } 2284 }
2268 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2285 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
@@ -2327,7 +2344,7 @@ static __init int mcheck_init_device(void)
2327 2344
2328 return err; 2345 return err;
2329} 2346}
2330device_initcall(mcheck_init_device); 2347device_initcall_sync(mcheck_init_device);
2331 2348
2332/* 2349/*
2333 * Old style boot options parsing. Only for compatibility. 2350 * Old style boot options parsing. Only for compatibility.
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f4873a64f46..c4e916d7737 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,15 +1,17 @@
1/* 1/*
2 * (c) 2005, 2006 Advanced Micro Devices, Inc. 2 * (c) 2005-2012 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the 3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or 4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
6 * 6 *
7 * Written by Jacob Shin - AMD, Inc. 7 * Written by Jacob Shin - AMD, Inc.
8 * 8 *
9 * Support : jacob.shin@amd.com 9 * Support: borislav.petkov@amd.com
10 * 10 *
11 * April 2006 11 * April 2006
12 * - added support for AMD Family 0x10 processors 12 * - added support for AMD Family 0x10 processors
13 * May 2012
14 * - major scrubbing
13 * 15 *
14 * All MC4_MISCi registers are shared between multi-cores 16 * All MC4_MISCi registers are shared between multi-cores
15 */ 17 */
@@ -25,6 +27,7 @@
25#include <linux/cpu.h> 27#include <linux/cpu.h>
26#include <linux/smp.h> 28#include <linux/smp.h>
27 29
30#include <asm/amd_nb.h>
28#include <asm/apic.h> 31#include <asm/apic.h>
29#include <asm/idle.h> 32#include <asm/idle.h>
30#include <asm/mce.h> 33#include <asm/mce.h>
@@ -45,23 +48,15 @@
45#define MASK_BLKPTR_LO 0xFF000000 48#define MASK_BLKPTR_LO 0xFF000000
46#define MCG_XBLK_ADDR 0xC0000400 49#define MCG_XBLK_ADDR 0xC0000400
47 50
48struct threshold_block { 51static const char * const th_names[] = {
49 unsigned int block; 52 "load_store",
50 unsigned int bank; 53 "insn_fetch",
51 unsigned int cpu; 54 "combined_unit",
52 u32 address; 55 "",
53 u16 interrupt_enable; 56 "northbridge",
54 bool interrupt_capable; 57 "execution_unit",
55 u16 threshold_limit;
56 struct kobject kobj;
57 struct list_head miscj;
58}; 58};
59 59
60struct threshold_bank {
61 struct kobject *kobj;
62 struct threshold_block *blocks;
63 cpumask_var_t cpus;
64};
65static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); 60static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
66 61
67static unsigned char shared_bank[NR_BANKS] = { 62static unsigned char shared_bank[NR_BANKS] = {
@@ -84,6 +79,26 @@ struct thresh_restart {
84 u16 old_limit; 79 u16 old_limit;
85}; 80};
86 81
82static const char * const bank4_names(struct threshold_block *b)
83{
84 switch (b->address) {
85 /* MSR4_MISC0 */
86 case 0x00000413:
87 return "dram";
88
89 case 0xc0000408:
90 return "ht_links";
91
92 case 0xc0000409:
93 return "l3_cache";
94
95 default:
96 WARN(1, "Funny MSR: 0x%08x\n", b->address);
97 return "";
98 }
99};
100
101
87static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) 102static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
88{ 103{
89 /* 104 /*
@@ -224,8 +239,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
224 239
225 if (!block) 240 if (!block)
226 per_cpu(bank_map, cpu) |= (1 << bank); 241 per_cpu(bank_map, cpu) |= (1 << bank);
227 if (shared_bank[bank] && c->cpu_core_id)
228 break;
229 242
230 memset(&b, 0, sizeof(b)); 243 memset(&b, 0, sizeof(b));
231 b.cpu = cpu; 244 b.cpu = cpu;
@@ -326,7 +339,7 @@ struct threshold_attr {
326#define SHOW_FIELDS(name) \ 339#define SHOW_FIELDS(name) \
327static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ 340static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
328{ \ 341{ \
329 return sprintf(buf, "%lx\n", (unsigned long) b->name); \ 342 return sprintf(buf, "%lu\n", (unsigned long) b->name); \
330} 343}
331SHOW_FIELDS(interrupt_enable) 344SHOW_FIELDS(interrupt_enable)
332SHOW_FIELDS(threshold_limit) 345SHOW_FIELDS(threshold_limit)
@@ -377,38 +390,21 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
377 return size; 390 return size;
378} 391}
379 392
380struct threshold_block_cross_cpu {
381 struct threshold_block *tb;
382 long retval;
383};
384
385static void local_error_count_handler(void *_tbcc)
386{
387 struct threshold_block_cross_cpu *tbcc = _tbcc;
388 struct threshold_block *b = tbcc->tb;
389 u32 low, high;
390
391 rdmsr(b->address, low, high);
392 tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
393}
394
395static ssize_t show_error_count(struct threshold_block *b, char *buf) 393static ssize_t show_error_count(struct threshold_block *b, char *buf)
396{ 394{
397 struct threshold_block_cross_cpu tbcc = { .tb = b, }; 395 u32 lo, hi;
398 396
399 smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); 397 rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
400 return sprintf(buf, "%lx\n", tbcc.retval);
401}
402
403static ssize_t store_error_count(struct threshold_block *b,
404 const char *buf, size_t count)
405{
406 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
407 398
408 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 399 return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
409 return 1; 400 (THRESHOLD_MAX - b->threshold_limit)));
410} 401}
411 402
403static struct threshold_attr error_count = {
404 .attr = {.name = __stringify(error_count), .mode = 0444 },
405 .show = show_error_count,
406};
407
412#define RW_ATTR(val) \ 408#define RW_ATTR(val) \
413static struct threshold_attr val = { \ 409static struct threshold_attr val = { \
414 .attr = {.name = __stringify(val), .mode = 0644 }, \ 410 .attr = {.name = __stringify(val), .mode = 0644 }, \
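
The rewritten show_error_count() above reads the MSR once via rdmsr_on_cpu() and recovers the count arithmetically: the hardware counter starts at THRESHOLD_MAX - limit and counts up toward THRESHOLD_MAX. That arithmetic on its own, with invented values:

#include <stdio.h>

#define THRESHOLD_MAX 0xFFF

int main(void)
{
	unsigned int limit = 10;        /* b->threshold_limit */
	unsigned int hi    = 0xFF7;     /* counter bits from the MSR high word */
	unsigned int count = (hi & THRESHOLD_MAX) - (THRESHOLD_MAX - limit);

	printf("error_count = %u\n", count);    /* 2 errors so far */
	return 0;
}
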
@@ -418,7 +414,6 @@ static struct threshold_attr val = { \
418 414
419RW_ATTR(interrupt_enable); 415RW_ATTR(interrupt_enable);
420RW_ATTR(threshold_limit); 416RW_ATTR(threshold_limit);
421RW_ATTR(error_count);
422 417
423static struct attribute *default_attrs[] = { 418static struct attribute *default_attrs[] = {
424 &threshold_limit.attr, 419 &threshold_limit.attr,
@@ -517,7 +512,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
517 512
518 err = kobject_init_and_add(&b->kobj, &threshold_ktype, 513 err = kobject_init_and_add(&b->kobj, &threshold_ktype,
519 per_cpu(threshold_banks, cpu)[bank]->kobj, 514 per_cpu(threshold_banks, cpu)[bank]->kobj,
520 "misc%i", block); 515 (bank == 4 ? bank4_names(b) : th_names[bank]));
521 if (err) 516 if (err)
522 goto out_free; 517 goto out_free;
523recurse: 518recurse:
@@ -548,98 +543,91 @@ out_free:
548 return err; 543 return err;
549} 544}
550 545
551static __cpuinit long 546static __cpuinit int __threshold_add_blocks(struct threshold_bank *b)
552local_allocate_threshold_blocks(int cpu, unsigned int bank)
553{ 547{
554 return allocate_threshold_blocks(cpu, bank, 0, 548 struct list_head *head = &b->blocks->miscj;
555 MSR_IA32_MC0_MISC + bank * 4); 549 struct threshold_block *pos = NULL;
550 struct threshold_block *tmp = NULL;
551 int err = 0;
552
553 err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
554 if (err)
555 return err;
556
557 list_for_each_entry_safe(pos, tmp, head, miscj) {
558
559 err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
560 if (err) {
561 list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
562 kobject_del(&pos->kobj);
563
564 return err;
565 }
566 }
567 return err;
556} 568}
557 569
558/* symlinks sibling shared banks to first core. first core owns dir/files. */
559static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) 570static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
560{ 571{
561 int i, err = 0;
562 struct threshold_bank *b = NULL;
563 struct device *dev = per_cpu(mce_device, cpu); 572 struct device *dev = per_cpu(mce_device, cpu);
564 char name[32]; 573 struct amd_northbridge *nb = NULL;
574 struct threshold_bank *b = NULL;
575 const char *name = th_names[bank];
576 int err = 0;
565 577
566 sprintf(name, "threshold_bank%i", bank); 578 if (shared_bank[bank]) {
567 579
568#ifdef CONFIG_SMP 580 nb = node_to_amd_nb(amd_get_nb_id(cpu));
569 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 581 WARN_ON(!nb);
570 i = cpumask_first(cpu_llc_shared_mask(cpu));
571 582
572 /* first core not up yet */ 583 /* threshold descriptor already initialized on this node? */
573 if (cpu_data(i).cpu_core_id) 584 if (nb->bank4) {
574 goto out; 585 /* yes, use it */
586 b = nb->bank4;
587 err = kobject_add(b->kobj, &dev->kobj, name);
588 if (err)
589 goto out;
575 590
576 /* already linked */ 591 per_cpu(threshold_banks, cpu)[bank] = b;
577 if (per_cpu(threshold_banks, cpu)[bank]) 592 atomic_inc(&b->cpus);
578 goto out;
579 593
580 b = per_cpu(threshold_banks, i)[bank]; 594 err = __threshold_add_blocks(b);
581 595
582 if (!b)
583 goto out; 596 goto out;
584 597 }
585 err = sysfs_create_link(&dev->kobj, b->kobj, name);
586 if (err)
587 goto out;
588
589 cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
590 per_cpu(threshold_banks, cpu)[bank] = b;
591
592 goto out;
593 } 598 }
594#endif
595 599
596 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); 600 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
597 if (!b) { 601 if (!b) {
598 err = -ENOMEM; 602 err = -ENOMEM;
599 goto out; 603 goto out;
600 } 604 }
601 if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
602 kfree(b);
603 err = -ENOMEM;
604 goto out;
605 }
606 605
607 b->kobj = kobject_create_and_add(name, &dev->kobj); 606 b->kobj = kobject_create_and_add(name, &dev->kobj);
608 if (!b->kobj) 607 if (!b->kobj) {
608 err = -EINVAL;
609 goto out_free; 609 goto out_free;
610 610 }
611#ifndef CONFIG_SMP
612 cpumask_setall(b->cpus);
613#else
614 cpumask_set_cpu(cpu, b->cpus);
615#endif
616 611
617 per_cpu(threshold_banks, cpu)[bank] = b; 612 per_cpu(threshold_banks, cpu)[bank] = b;
618 613
619 err = local_allocate_threshold_blocks(cpu, bank); 614 if (shared_bank[bank]) {
620 if (err) 615 atomic_set(&b->cpus, 1);
621 goto out_free;
622
623 for_each_cpu(i, b->cpus) {
624 if (i == cpu)
625 continue;
626
627 dev = per_cpu(mce_device, i);
628 if (dev)
629 err = sysfs_create_link(&dev->kobj,b->kobj, name);
630 if (err)
631 goto out;
632 616
633 per_cpu(threshold_banks, i)[bank] = b; 617 /* nb is already initialized, see above */
618 WARN_ON(nb->bank4);
619 nb->bank4 = b;
634 } 620 }
635 621
636 goto out; 622 err = allocate_threshold_blocks(cpu, bank, 0,
623 MSR_IA32_MC0_MISC + bank * 4);
624 if (!err)
625 goto out;
637 626
638out_free: 627 out_free:
639 per_cpu(threshold_banks, cpu)[bank] = NULL;
640 free_cpumask_var(b->cpus);
641 kfree(b); 628 kfree(b);
642out: 629
630 out:
643 return err; 631 return err;
644} 632}
645 633
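
threshold_create_bank() above (and its removal counterpart further down) replaces the old symlink scheme with one real bank object per node plus a reference count: the first CPU on the node allocates, later CPUs take a reference, and the last one out frees. A minimal single-node model of that ownership pattern; names are hypothetical and locking/error handling are elided:

#include <stdio.h>
#include <stdlib.h>

struct bank { int cpus; };
static struct bank *node_bank;          /* stands in for nb->bank4 */

static struct bank *bank_get(void)
{
	if (node_bank) {
		node_bank->cpus++;      /* ~ atomic_inc(&b->cpus) */
		return node_bank;
	}
	node_bank = calloc(1, sizeof(*node_bank));
	node_bank->cpus = 1;            /* first CPU on the node owns it */
	return node_bank;
}

static void bank_put(void)
{
	if (--node_bank->cpus == 0) {   /* ~ atomic_dec_and_test(&b->cpus) */
		free(node_bank);
		node_bank = NULL;
	}
}

int main(void)
{
	bank_get();                     /* CPU 0 creates the bank */
	bank_get();                     /* CPU 1 shares it */
	bank_put();                     /* CPU 1 leaves */
	bank_put();                     /* CPU 0 leaves, bank freed */
	printf("bank4 %s\n", node_bank ? "alive" : "freed");
	return 0;
}
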
@@ -660,12 +648,6 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
660 return err; 648 return err;
661} 649}
662 650
663/*
664 * let's be hotplug friendly.
665 * in case of multiple core processors, the first core always takes ownership
666 * of shared sysfs dir/files, and rest of the cores will be symlinked to it.
667 */
668
669static void deallocate_threshold_block(unsigned int cpu, 651static void deallocate_threshold_block(unsigned int cpu,
670 unsigned int bank) 652 unsigned int bank)
671{ 653{
@@ -686,41 +668,42 @@ static void deallocate_threshold_block(unsigned int cpu,
686 per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; 668 per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
687} 669}
688 670
671static void __threshold_remove_blocks(struct threshold_bank *b)
672{
673 struct threshold_block *pos = NULL;
674 struct threshold_block *tmp = NULL;
675
676 kobject_del(b->kobj);
677
678 list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
679 kobject_del(&pos->kobj);
680}
681
689static void threshold_remove_bank(unsigned int cpu, int bank) 682static void threshold_remove_bank(unsigned int cpu, int bank)
690{ 683{
684 struct amd_northbridge *nb;
691 struct threshold_bank *b; 685 struct threshold_bank *b;
692 struct device *dev;
693 char name[32];
694 int i = 0;
695 686
696 b = per_cpu(threshold_banks, cpu)[bank]; 687 b = per_cpu(threshold_banks, cpu)[bank];
697 if (!b) 688 if (!b)
698 return; 689 return;
690
699 if (!b->blocks) 691 if (!b->blocks)
700 goto free_out; 692 goto free_out;
701 693
702 sprintf(name, "threshold_bank%i", bank); 694 if (shared_bank[bank]) {
703 695 if (!atomic_dec_and_test(&b->cpus)) {
704#ifdef CONFIG_SMP 696 __threshold_remove_blocks(b);
705 /* sibling symlink */ 697 per_cpu(threshold_banks, cpu)[bank] = NULL;
706 if (shared_bank[bank] && b->blocks->cpu != cpu) { 698 return;
707 dev = per_cpu(mce_device, cpu); 699 } else {
708 sysfs_remove_link(&dev->kobj, name); 700 /*
709 per_cpu(threshold_banks, cpu)[bank] = NULL; 701 * the last CPU on this node using the shared bank is
710 702 * going away, remove that bank now.
711 return; 703 */
712 } 704 nb = node_to_amd_nb(amd_get_nb_id(cpu));
713#endif 705 nb->bank4 = NULL;
714 706 }
715 /* remove all sibling symlinks before unregistering */
716 for_each_cpu(i, b->cpus) {
717 if (i == cpu)
718 continue;
719
720 dev = per_cpu(mce_device, i);
721 if (dev)
722 sysfs_remove_link(&dev->kobj, name);
723 per_cpu(threshold_banks, i)[bank] = NULL;
724 } 707 }
725 708
726 deallocate_threshold_block(cpu, bank); 709 deallocate_threshold_block(cpu, bank);
@@ -728,7 +711,6 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
728free_out: 711free_out:
729 kobject_del(b->kobj); 712 kobject_del(b->kobj);
730 kobject_put(b->kobj); 713 kobject_put(b->kobj);
731 free_cpumask_var(b->cpus);
732 kfree(b); 714 kfree(b);
733 per_cpu(threshold_banks, cpu)[bank] = NULL; 715 per_cpu(threshold_banks, cpu)[bank] = NULL;
734} 716}
@@ -777,4 +759,24 @@ static __init int threshold_init_device(void)
777 759
778 return 0; 760 return 0;
779} 761}
780device_initcall(threshold_init_device); 762/*
 763 * there are 3 funcs which need to be _initcalled in a logical sequence:
764 * 1. xen_late_init_mcelog
765 * 2. mcheck_init_device
766 * 3. threshold_init_device
767 *
768 * xen_late_init_mcelog must register xen_mce_chrdev_device before
769 * native mce_chrdev_device registration if running under xen platform;
770 *
771 * mcheck_init_device should be inited before threshold_init_device to
 772 * initialize mce_device, otherwise a NULL ptr dereference will cause a panic.
773 *
 774 * so we use the following _initcalls:
775 * 1. device_initcall(xen_late_init_mcelog);
776 * 2. device_initcall_sync(mcheck_init_device);
777 * 3. late_initcall(threshold_init_device);
778 *
779 * when running under xen, the initcall order is 1,2,3;
780 * on baremetal, we skip 1 and we do only 2 and 3.
781 */
782late_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
index dfea390e160..c7b3fe2d72e 100644
--- a/arch/x86/kernel/cpu/mkcapflags.pl
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -1,4 +1,4 @@
1#!/usr/bin/perl 1#!/usr/bin/perl -w
2# 2#
3# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h 3# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
4# 4#
@@ -11,22 +11,35 @@ open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
11print OUT "#include <asm/cpufeature.h>\n\n"; 11print OUT "#include <asm/cpufeature.h>\n\n";
12print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; 12print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
13 13
14%features = ();
15$err = 0;
16
14while (defined($line = <IN>)) { 17while (defined($line = <IN>)) {
15 if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { 18 if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
16 $macro = $1; 19 $macro = $1;
17 $feature = $2; 20 $feature = "\L$2";
18 $tail = $3; 21 $tail = $3;
19 if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { 22 if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
20 $feature = $1; 23 $feature = "\L$1";
21 } 24 }
22 25
23 if ($feature ne '') { 26 next if ($feature eq '');
24 printf OUT "\t%-32s = \"%s\",\n", 27
25 "[$macro]", "\L$feature"; 28 if ($features{$feature}++) {
29 print STDERR "$in: duplicate feature name: $feature\n";
30 $err++;
26 } 31 }
32 printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature;
27 } 33 }
28} 34}
29print OUT "};\n"; 35print OUT "};\n";
30 36
31close(IN); 37close(IN);
32close(OUT); 38close(OUT);
39
40if ($err) {
41 unlink($out);
42 exit(1);
43}
44
45exit(0);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ac140c7be39..35ffda5d072 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -258,15 +258,15 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
258 258
259 /* Compute the maximum size with which we can make a range: */ 259 /* Compute the maximum size with which we can make a range: */
260 if (range_startk) 260 if (range_startk)
261 max_align = ffs(range_startk) - 1; 261 max_align = __ffs(range_startk);
262 else 262 else
263 max_align = 32; 263 max_align = BITS_PER_LONG - 1;
264 264
265 align = fls(range_sizek) - 1; 265 align = __fls(range_sizek);
266 if (align > max_align) 266 if (align > max_align)
267 align = max_align; 267 align = max_align;
268 268
269 sizek = 1 << align; 269 sizek = 1UL << align;
270 if (debug_print) { 270 if (debug_print) {
271 char start_factor = 'K', size_factor = 'K'; 271 char start_factor = 'K', size_factor = 'K';
272 unsigned long start_base, size_base; 272 unsigned long start_base, size_base;
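
The cleanup above trades ffs()/fls() minus one for __ffs()/__fls(). The distinction: ffs() is 1-based and defined for a zero argument, while __ffs() is 0-based and requires a nonzero one. A userspace stand-in showing how max_align and sizek fall out:

#include <strings.h>
#include <stdio.h>

/* 0-based lowest set bit, valid only for x != 0 -- like the kernel's __ffs() */
static unsigned long ffs0(unsigned long x)
{
	return (unsigned long)(ffs((int)x) - 1);
}

int main(void)
{
	unsigned long range_startk = 0x400;     /* 1 MiB expressed in KiB */

	printf("max_align = %lu\n", ffs0(range_startk));        /* 10   */
	printf("sizek     = %lu\n", 1UL << ffs0(range_startk)); /* 1024 */
	return 0;
}
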
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 75772ae6c65..e9fe907cd24 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -361,11 +361,7 @@ static void __init print_mtrr_state(void)
361 } 361 }
362 pr_debug("MTRR variable ranges %sabled:\n", 362 pr_debug("MTRR variable ranges %sabled:\n",
363 mtrr_state.enabled & 2 ? "en" : "dis"); 363 mtrr_state.enabled & 2 ? "en" : "dis");
364 if (size_or_mask & 0xffffffffUL) 364 high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
365 high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
366 else
367 high_width = ffs(size_or_mask>>32) + 32 - 1;
368 high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
369 365
370 for (i = 0; i < num_var_ranges; ++i) { 366 for (i = 0; i < num_var_ranges; ++i) {
371 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) 367 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e049d6da018..29557aa06dd 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -35,17 +35,6 @@
35 35
36#include "perf_event.h" 36#include "perf_event.h"
37 37
38#if 0
39#undef wrmsrl
40#define wrmsrl(msr, val) \
41do { \
42 trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
43 (unsigned long)(val)); \
44 native_write_msr((msr), (u32)((u64)(val)), \
45 (u32)((u64)(val) >> 32)); \
46} while (0)
47#endif
48
49struct x86_pmu x86_pmu __read_mostly; 38struct x86_pmu x86_pmu __read_mostly;
50 39
51DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 40DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -74,7 +63,7 @@ u64 x86_perf_event_update(struct perf_event *event)
74 int idx = hwc->idx; 63 int idx = hwc->idx;
75 s64 delta; 64 s64 delta;
76 65
77 if (idx == X86_PMC_IDX_FIXED_BTS) 66 if (idx == INTEL_PMC_IDX_FIXED_BTS)
78 return 0; 67 return 0;
79 68
80 /* 69 /*
@@ -86,7 +75,7 @@ u64 x86_perf_event_update(struct perf_event *event)
86 */ 75 */
87again: 76again:
88 prev_raw_count = local64_read(&hwc->prev_count); 77 prev_raw_count = local64_read(&hwc->prev_count);
89 rdmsrl(hwc->event_base, new_raw_count); 78 rdpmcl(hwc->event_base_rdpmc, new_raw_count);
90 79
91 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 80 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
92 new_raw_count) != prev_raw_count) 81 new_raw_count) != prev_raw_count)
@@ -189,7 +178,7 @@ static void release_pmc_hardware(void) {}
189 178
190static bool check_hw_exists(void) 179static bool check_hw_exists(void)
191{ 180{
192 u64 val, val_new = 0; 181 u64 val, val_new = ~0;
193 int i, reg, ret = 0; 182 int i, reg, ret = 0;
194 183
195 /* 184 /*
@@ -222,8 +211,9 @@ static bool check_hw_exists(void)
222 * that don't trap on the MSR access and always return 0s. 211 * that don't trap on the MSR access and always return 0s.
223 */ 212 */
224 val = 0xabcdUL; 213 val = 0xabcdUL;
225 ret = checking_wrmsrl(x86_pmu_event_addr(0), val); 214 reg = x86_pmu_event_addr(0);
226 ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); 215 ret = wrmsrl_safe(reg, val);
216 ret |= rdmsrl_safe(reg, &val_new);
227 if (ret || val != val_new) 217 if (ret || val != val_new)
228 goto msr_fail; 218 goto msr_fail;
229 219
@@ -240,6 +230,7 @@ bios_fail:
240 230
241msr_fail: 231msr_fail:
242 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); 232 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
233 printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new);
243 234
244 return false; 235 return false;
245} 236}
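
check_hw_exists() above now probes a single register with the fault-safe accessor pair and reports which MSR misbehaved. The probe pattern in miniature; the accessors are stubbed here, and a faulting MSR would make them return nonzero:

#include <stdint.h>
#include <stdio.h>

static uint64_t fake_msr;       /* pretend hardware register */

/* stand-ins for the kernel's fault-safe accessors */
static int wrmsrl_safe(uint32_t reg, uint64_t val)  { (void)reg; fake_msr = val; return 0; }
static int rdmsrl_safe(uint32_t reg, uint64_t *val) { (void)reg; *val = fake_msr; return 0; }

int main(void)
{
	uint64_t val = 0xabcdULL, val_new = ~0ULL;
	uint32_t reg = 0xc1;            /* made-up perfctr address */
	int ret;

	ret  = wrmsrl_safe(reg, val);   /* write a sentinel... */
	ret |= rdmsrl_safe(reg, &val_new);      /* ...and read it back */

	if (ret || val != val_new)
		printf("Failed to access perfctr msr (MSR %x is %llx)\n",
		       reg, (unsigned long long)val_new);
	else
		printf("perfctr MSR read/write OK\n");
	return 0;
}
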
@@ -388,7 +379,7 @@ int x86_pmu_hw_config(struct perf_event *event)
388 int precise = 0; 379 int precise = 0;
389 380
390 /* Support for constant skid */ 381 /* Support for constant skid */
391 if (x86_pmu.pebs_active) { 382 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
392 precise++; 383 precise++;
393 384
394 /* Support for IP fixup */ 385 /* Support for IP fixup */
@@ -637,8 +628,8 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
637 c = sched->constraints[sched->state.event]; 628 c = sched->constraints[sched->state.event];
638 629
639 /* Prefer fixed purpose counters */ 630 /* Prefer fixed purpose counters */
640 if (x86_pmu.num_counters_fixed) { 631 if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
641 idx = X86_PMC_IDX_FIXED; 632 idx = INTEL_PMC_IDX_FIXED;
642 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { 633 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
643 if (!__test_and_set_bit(idx, sched->state.used)) 634 if (!__test_and_set_bit(idx, sched->state.used))
644 goto done; 635 goto done;
@@ -646,7 +637,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
646 } 637 }
647 /* Grab the first unused counter starting with idx */ 638 /* Grab the first unused counter starting with idx */
648 idx = sched->state.counter; 639 idx = sched->state.counter;
649 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { 640 for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
650 if (!__test_and_set_bit(idx, sched->state.used)) 641 if (!__test_and_set_bit(idx, sched->state.used))
651 goto done; 642 goto done;
652 } 643 }
@@ -704,8 +695,8 @@ static bool perf_sched_next_event(struct perf_sched *sched)
704/* 695/*
705 * Assign a counter for each event. 696 * Assign a counter for each event.
706 */ 697 */
707static int perf_assign_events(struct event_constraint **constraints, int n, 698int perf_assign_events(struct event_constraint **constraints, int n,
708 int wmin, int wmax, int *assign) 699 int wmin, int wmax, int *assign)
709{ 700{
710 struct perf_sched sched; 701 struct perf_sched sched;
711 702
@@ -824,15 +815,17 @@ static inline void x86_assign_hw_event(struct perf_event *event,
824 hwc->last_cpu = smp_processor_id(); 815 hwc->last_cpu = smp_processor_id();
825 hwc->last_tag = ++cpuc->tags[i]; 816 hwc->last_tag = ++cpuc->tags[i];
826 817
827 if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { 818 if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
828 hwc->config_base = 0; 819 hwc->config_base = 0;
829 hwc->event_base = 0; 820 hwc->event_base = 0;
830 } else if (hwc->idx >= X86_PMC_IDX_FIXED) { 821 } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
831 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 822 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
832 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); 823 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
824 hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
833 } else { 825 } else {
834 hwc->config_base = x86_pmu_config_addr(hwc->idx); 826 hwc->config_base = x86_pmu_config_addr(hwc->idx);
835 hwc->event_base = x86_pmu_event_addr(hwc->idx); 827 hwc->event_base = x86_pmu_event_addr(hwc->idx);
828 hwc->event_base_rdpmc = hwc->idx;
836 } 829 }
837} 830}
838 831
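
The new event_base_rdpmc values above follow the RDPMC convention: general counters are addressed by their raw index, fixed-function counters by their fixed index with bit 30 set in ECX. The encoding alone (INTEL_PMC_IDX_FIXED is 32 in the kernel headers; taken as an assumption here):

#include <stdio.h>

#define INTEL_PMC_IDX_FIXED 32

static unsigned int event_base_rdpmc(int idx)
{
	if (idx >= INTEL_PMC_IDX_FIXED)
		return (unsigned int)(idx - INTEL_PMC_IDX_FIXED) | (1u << 30);
	return (unsigned int)idx;       /* general-purpose counters: raw index */
}

int main(void)
{
	printf("general counter 1 -> ecx %#x\n", event_base_rdpmc(1));
	printf("fixed counter 0   -> ecx %#x\n", event_base_rdpmc(32));
	return 0;
}
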
@@ -930,7 +923,7 @@ int x86_perf_event_set_period(struct perf_event *event)
930 s64 period = hwc->sample_period; 923 s64 period = hwc->sample_period;
931 int ret = 0, idx = hwc->idx; 924 int ret = 0, idx = hwc->idx;
932 925
933 if (idx == X86_PMC_IDX_FIXED_BTS) 926 if (idx == INTEL_PMC_IDX_FIXED_BTS)
934 return 0; 927 return 0;
935 928
936 /* 929 /*
@@ -1316,7 +1309,6 @@ static struct attribute_group x86_pmu_format_group = {
1316static int __init init_hw_perf_events(void) 1309static int __init init_hw_perf_events(void)
1317{ 1310{
1318 struct x86_pmu_quirk *quirk; 1311 struct x86_pmu_quirk *quirk;
1319 struct event_constraint *c;
1320 int err; 1312 int err;
1321 1313
1322 pr_info("Performance Events: "); 1314 pr_info("Performance Events: ");
@@ -1347,21 +1339,8 @@ static int __init init_hw_perf_events(void)
1347 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) 1339 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
1348 quirk->func(); 1340 quirk->func();
1349 1341
1350 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 1342 if (!x86_pmu.intel_ctrl)
1351 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1343 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1352 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1353 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1354 }
1355 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1356
1357 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1358 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1359 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1360 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1361 }
1362
1363 x86_pmu.intel_ctrl |=
1364 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1365 1344
1366 perf_events_lapic_init(); 1345 perf_events_lapic_init();
1367 register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); 1346 register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
@@ -1370,22 +1349,6 @@ static int __init init_hw_perf_events(void)
1370 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 1349 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1371 0, x86_pmu.num_counters, 0); 1350 0, x86_pmu.num_counters, 0);
1372 1351
1373 if (x86_pmu.event_constraints) {
1374 /*
1375 * event on fixed counter2 (REF_CYCLES) only works on this
1376 * counter, so do not extend mask to generic counters
1377 */
1378 for_each_event_constraint(c, x86_pmu.event_constraints) {
1379 if (c->cmask != X86_RAW_EVENT_MASK
1380 || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
1381 continue;
1382 }
1383
1384 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1385 c->weight += x86_pmu.num_counters;
1386 }
1387 }
1388
1389 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ 1352 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
1390 x86_pmu_format_group.attrs = x86_pmu.format_attrs; 1353 x86_pmu_format_group.attrs = x86_pmu.format_attrs;
1391 1354
@@ -1496,6 +1459,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void)
1496 if (!cpuc->shared_regs) 1459 if (!cpuc->shared_regs)
1497 goto error; 1460 goto error;
1498 } 1461 }
1462 cpuc->is_fake = 1;
1499 return cpuc; 1463 return cpuc;
1500error: 1464error:
1501 free_fake_cpuc(cpuc); 1465 free_fake_cpuc(cpuc);
@@ -1619,8 +1583,8 @@ static int x86_pmu_event_idx(struct perf_event *event)
1619 if (!x86_pmu.attr_rdpmc) 1583 if (!x86_pmu.attr_rdpmc)
1620 return 0; 1584 return 0;
1621 1585
1622 if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { 1586 if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
1623 idx -= X86_PMC_IDX_FIXED; 1587 idx -= INTEL_PMC_IDX_FIXED;
1624 idx |= 1 << 30; 1588 idx |= 1 << 30;
1625 } 1589 }
1626 1590
@@ -1648,7 +1612,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
1648 struct device_attribute *attr, 1612 struct device_attribute *attr,
1649 const char *buf, size_t count) 1613 const char *buf, size_t count)
1650{ 1614{
1651 unsigned long val = simple_strtoul(buf, NULL, 0); 1615 unsigned long val;
1616 ssize_t ret;
1617
1618 ret = kstrtoul(buf, 0, &val);
1619 if (ret)
1620 return ret;
1652 1621
1653 if (!!val != !!x86_pmu.attr_rdpmc) { 1622 if (!!val != !!x86_pmu.attr_rdpmc) {
1654 x86_pmu.attr_rdpmc = !!val; 1623 x86_pmu.attr_rdpmc = !!val;
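
set_attr_rdpmc() above switches from simple_strtoul(), which silently parses a numeric prefix and ignores the rest, to kstrtoul(), which rejects malformed input outright. A userspace equivalent of the stricter parse:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_ul(const char *s, unsigned long *out)
{
	char *end;

	errno = 0;
	*out = strtoul(s, &end, 0);
	if (errno || end == s || *end != '\0')
		return -EINVAL;         /* kstrtoul() rejects these too */
	return 0;
}

int main(void)
{
	unsigned long v;

	printf("\"1\"  -> %d\n", parse_ul("1", &v));    /* 0: ok        */
	printf("\"1x\" -> %d\n", parse_ul("1x", &v));   /* -22: -EINVAL */
	return 0;
}
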
@@ -1681,13 +1650,20 @@ static void x86_pmu_flush_branch_stack(void)
1681 x86_pmu.flush_branch_stack(); 1650 x86_pmu.flush_branch_stack();
1682} 1651}
1683 1652
1653void perf_check_microcode(void)
1654{
1655 if (x86_pmu.check_microcode)
1656 x86_pmu.check_microcode();
1657}
1658EXPORT_SYMBOL_GPL(perf_check_microcode);
1659
1684static struct pmu pmu = { 1660static struct pmu pmu = {
1685 .pmu_enable = x86_pmu_enable, 1661 .pmu_enable = x86_pmu_enable,
1686 .pmu_disable = x86_pmu_disable, 1662 .pmu_disable = x86_pmu_disable,
1687 1663
1688 .attr_groups = x86_pmu_attr_groups, 1664 .attr_groups = x86_pmu_attr_groups,
1689 1665
1690 .event_init = x86_pmu_event_init, 1666 .event_init = x86_pmu_event_init,
1691 1667
1692 .add = x86_pmu_add, 1668 .add = x86_pmu_add,
1693 .del = x86_pmu_del, 1669 .del = x86_pmu_del,
@@ -1695,11 +1671,11 @@ static struct pmu pmu = {
1695 .stop = x86_pmu_stop, 1671 .stop = x86_pmu_stop,
1696 .read = x86_pmu_read, 1672 .read = x86_pmu_read,
1697 1673
1698 .start_txn = x86_pmu_start_txn, 1674 .start_txn = x86_pmu_start_txn,
1699 .cancel_txn = x86_pmu_cancel_txn, 1675 .cancel_txn = x86_pmu_cancel_txn,
1700 .commit_txn = x86_pmu_commit_txn, 1676 .commit_txn = x86_pmu_commit_txn,
1701 1677
1702 .event_idx = x86_pmu_event_idx, 1678 .event_idx = x86_pmu_event_idx,
1703 .flush_branch_stack = x86_pmu_flush_branch_stack, 1679 .flush_branch_stack = x86_pmu_flush_branch_stack,
1704}; 1680};
1705 1681
@@ -1756,6 +1732,12 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1756 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); 1732 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1757} 1733}
1758 1734
1735static inline int
1736valid_user_frame(const void __user *fp, unsigned long size)
1737{
1738 return (__range_not_ok(fp, size, TASK_SIZE) == 0);
1739}
1740
1759#ifdef CONFIG_COMPAT 1741#ifdef CONFIG_COMPAT
1760 1742
1761#include <asm/compat.h> 1743#include <asm/compat.h>
@@ -1780,7 +1762,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1780 if (bytes != sizeof(frame)) 1762 if (bytes != sizeof(frame))
1781 break; 1763 break;
1782 1764
1783 if (fp < compat_ptr(regs->sp)) 1765 if (!valid_user_frame(fp, sizeof(frame)))
1784 break; 1766 break;
1785 1767
1786 perf_callchain_store(entry, frame.return_address); 1768 perf_callchain_store(entry, frame.return_address);
@@ -1826,7 +1808,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1826 if (bytes != sizeof(frame)) 1808 if (bytes != sizeof(frame))
1827 break; 1809 break;
1828 1810
1829 if ((unsigned long)fp < regs->sp) 1811 if (!valid_user_frame(fp, sizeof(frame)))
1830 break; 1812 break;
1831 1813
1832 perf_callchain_store(entry, frame.return_address); 1814 perf_callchain_store(entry, frame.return_address);
@@ -1856,7 +1838,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
1856 else 1838 else
1857 misc |= PERF_RECORD_MISC_GUEST_KERNEL; 1839 misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1858 } else { 1840 } else {
1859 if (user_mode(regs)) 1841 if (!kernel_ip(regs->ip))
1860 misc |= PERF_RECORD_MISC_USER; 1842 misc |= PERF_RECORD_MISC_USER;
1861 else 1843 else
1862 misc |= PERF_RECORD_MISC_KERNEL; 1844 misc |= PERF_RECORD_MISC_KERNEL;
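
Both callchain walkers above now validate the saved frame pointer with a range check against the user address limit instead of comparing it with the stack pointer. A sketch of that check; TASK_SIZE is an invented 3 GiB split for illustration:

#include <stdint.h>
#include <stdio.h>

#define TASK_SIZE 0xC0000000UL

static int valid_user_frame(uintptr_t fp, unsigned long size)
{
	/* the whole frame must fit below the user address limit */
	return fp + size >= fp && fp + size <= TASK_SIZE;
}

int main(void)
{
	printf("%d\n", valid_user_frame(0xBFFF0000UL, 16));    /* 1: in range   */
	printf("%d\n", valid_user_frame(0xC0000000UL, 16));    /* 0: past limit */
	return 0;
}
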
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 6638aaf5449..a15df4be151 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -14,6 +14,18 @@
14 14
15#include <linux/perf_event.h> 15#include <linux/perf_event.h>
16 16
17#if 0
18#undef wrmsrl
19#define wrmsrl(msr, val) \
20do { \
21 unsigned int _msr = (msr); \
22 u64 _val = (val); \
23 trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \
24 (unsigned long long)(_val)); \
25 native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \
26} while (0)
27#endif
28
17/* 29/*
18 * | NHM/WSM | SNB | 30 * | NHM/WSM | SNB |
19 * register ------------------------------- 31 * register -------------------------------
@@ -57,7 +69,7 @@ struct amd_nb {
57}; 69};
58 70
59/* The maximal number of PEBS events: */ 71/* The maximal number of PEBS events: */
60#define MAX_PEBS_EVENTS 4 72#define MAX_PEBS_EVENTS 8
61 73
62/* 74/*
63 * A debug store configuration. 75 * A debug store configuration.
@@ -117,6 +129,7 @@ struct cpu_hw_events {
117 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ 129 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
118 130
119 unsigned int group_flag; 131 unsigned int group_flag;
132 int is_fake;
120 133
121 /* 134 /*
122 * Intel DebugStore bits 135 * Intel DebugStore bits
@@ -348,6 +361,8 @@ struct x86_pmu {
348 void (*cpu_starting)(int cpu); 361 void (*cpu_starting)(int cpu);
349 void (*cpu_dying)(int cpu); 362 void (*cpu_dying)(int cpu);
350 void (*cpu_dead)(int cpu); 363 void (*cpu_dead)(int cpu);
364
365 void (*check_microcode)(void);
351 void (*flush_branch_stack)(void); 366 void (*flush_branch_stack)(void);
352 367
353 /* 368 /*
@@ -359,11 +374,16 @@ struct x86_pmu {
359 /* 374 /*
360 * Intel DebugStore bits 375 * Intel DebugStore bits
361 */ 376 */
362 int bts, pebs; 377 int bts :1,
363 int bts_active, pebs_active; 378 bts_active :1,
379 pebs :1,
380 pebs_active :1,
381 pebs_broken :1;
364 int pebs_record_size; 382 int pebs_record_size;
365 void (*drain_pebs)(struct pt_regs *regs); 383 void (*drain_pebs)(struct pt_regs *regs);
366 struct event_constraint *pebs_constraints; 384 struct event_constraint *pebs_constraints;
385 void (*pebs_aliases)(struct perf_event *event);
386 int max_pebs_events;
367 387
368 /* 388 /*
369 * Intel LBR 389 * Intel LBR
@@ -466,6 +486,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
466 486
467void x86_pmu_enable_all(int added); 487void x86_pmu_enable_all(int added);
468 488
489int perf_assign_events(struct event_constraint **constraints, int n,
490 int wmin, int wmax, int *assign);
469int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); 491int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
470 492
471void x86_pmu_stop(struct perf_event *event, int flags); 493void x86_pmu_stop(struct perf_event *event, int flags);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 11a4eb9131d..4528ae7b6ec 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -366,7 +366,7 @@ static void amd_pmu_cpu_starting(int cpu)
366 366
367 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; 367 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
368 368
369 if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15) 369 if (boot_cpu_data.x86_max_cores < 2)
370 return; 370 return;
371 371
372 nb_id = amd_get_nb_id(cpu); 372 nb_id = amd_get_nb_id(cpu);
@@ -422,35 +422,6 @@ static struct attribute *amd_format_attr[] = {
422 NULL, 422 NULL,
423}; 423};
424 424
425static __initconst const struct x86_pmu amd_pmu = {
426 .name = "AMD",
427 .handle_irq = x86_pmu_handle_irq,
428 .disable_all = x86_pmu_disable_all,
429 .enable_all = x86_pmu_enable_all,
430 .enable = x86_pmu_enable_event,
431 .disable = x86_pmu_disable_event,
432 .hw_config = amd_pmu_hw_config,
433 .schedule_events = x86_schedule_events,
434 .eventsel = MSR_K7_EVNTSEL0,
435 .perfctr = MSR_K7_PERFCTR0,
436 .event_map = amd_pmu_event_map,
437 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
438 .num_counters = AMD64_NUM_COUNTERS,
439 .cntval_bits = 48,
440 .cntval_mask = (1ULL << 48) - 1,
441 .apic = 1,
442 /* use highest bit to detect overflow */
443 .max_period = (1ULL << 47) - 1,
444 .get_event_constraints = amd_get_event_constraints,
445 .put_event_constraints = amd_put_event_constraints,
446
447 .format_attrs = amd_format_attr,
448
449 .cpu_prepare = amd_pmu_cpu_prepare,
450 .cpu_starting = amd_pmu_cpu_starting,
451 .cpu_dead = amd_pmu_cpu_dead,
452};
453
454/* AMD Family 15h */ 425/* AMD Family 15h */
455 426
456#define AMD_EVENT_TYPE_MASK 0x000000F0ULL 427#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
@@ -597,8 +568,8 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
597 } 568 }
598} 569}
599 570
600static __initconst const struct x86_pmu amd_pmu_f15h = { 571static __initconst const struct x86_pmu amd_pmu = {
601 .name = "AMD Family 15h", 572 .name = "AMD",
602 .handle_irq = x86_pmu_handle_irq, 573 .handle_irq = x86_pmu_handle_irq,
603 .disable_all = x86_pmu_disable_all, 574 .disable_all = x86_pmu_disable_all,
604 .enable_all = x86_pmu_enable_all, 575 .enable_all = x86_pmu_enable_all,
@@ -606,50 +577,68 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
606 .disable = x86_pmu_disable_event, 577 .disable = x86_pmu_disable_event,
607 .hw_config = amd_pmu_hw_config, 578 .hw_config = amd_pmu_hw_config,
608 .schedule_events = x86_schedule_events, 579 .schedule_events = x86_schedule_events,
609 .eventsel = MSR_F15H_PERF_CTL, 580 .eventsel = MSR_K7_EVNTSEL0,
610 .perfctr = MSR_F15H_PERF_CTR, 581 .perfctr = MSR_K7_PERFCTR0,
611 .event_map = amd_pmu_event_map, 582 .event_map = amd_pmu_event_map,
612 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 583 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
613 .num_counters = AMD64_NUM_COUNTERS_F15H, 584 .num_counters = AMD64_NUM_COUNTERS,
614 .cntval_bits = 48, 585 .cntval_bits = 48,
615 .cntval_mask = (1ULL << 48) - 1, 586 .cntval_mask = (1ULL << 48) - 1,
616 .apic = 1, 587 .apic = 1,
617 /* use highest bit to detect overflow */ 588 /* use highest bit to detect overflow */
618 .max_period = (1ULL << 47) - 1, 589 .max_period = (1ULL << 47) - 1,
619 .get_event_constraints = amd_get_event_constraints_f15h, 590 .get_event_constraints = amd_get_event_constraints,
 620 /* northbridge counters not yet implemented: */
621#if 0
622 .put_event_constraints = amd_put_event_constraints, 591 .put_event_constraints = amd_put_event_constraints,
623 592
593 .format_attrs = amd_format_attr,
594
624 .cpu_prepare = amd_pmu_cpu_prepare, 595 .cpu_prepare = amd_pmu_cpu_prepare,
625 .cpu_dead = amd_pmu_cpu_dead,
626#endif
627 .cpu_starting = amd_pmu_cpu_starting, 596 .cpu_starting = amd_pmu_cpu_starting,
628 .format_attrs = amd_format_attr, 597 .cpu_dead = amd_pmu_cpu_dead,
629}; 598};
630 599
600static int setup_event_constraints(void)
601{
602 if (boot_cpu_data.x86 >= 0x15)
603 x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
604 return 0;
605}
606
607static int setup_perfctr_core(void)
608{
609 if (!cpu_has_perfctr_core) {
610 WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h,
611 KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!");
612 return -ENODEV;
613 }
614
615 WARN(x86_pmu.get_event_constraints == amd_get_event_constraints,
616 KERN_ERR "hw perf events core counters need constraints handler!");
617
618 /*
619 * If core performance counter extensions exists, we must use
620 * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
621 * x86_pmu_addr_offset().
622 */
623 x86_pmu.eventsel = MSR_F15H_PERF_CTL;
624 x86_pmu.perfctr = MSR_F15H_PERF_CTR;
625 x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
626
627 printk(KERN_INFO "perf: AMD core performance counters detected\n");
628
629 return 0;
630}
631
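A minimal sketch of the MSR layout this helper relies on, assuming the BKDG-documented addresses (the amd_ctl_addr() helper below is hypothetical, for illustration only; the kernel's real logic is x86_pmu_addr_offset()): legacy K7 event-select MSRs are consecutive, while the Family 15h core extension interleaves CTL/CTR pairs, hence an index << 1 stride.

#define MSR_K7_EVNTSEL0		0xc0010000	/* K7: EVNTSEL0..3 are consecutive */
#define MSR_F15H_PERF_CTL	0xc0010200	/* F15h: CTL/CTR pairs interleave */

static unsigned int amd_ctl_addr(int index, int core_ext)
{
	if (core_ext)
		return MSR_F15H_PERF_CTL + (index << 1);	/* 0xc0010200, 0xc0010202, ... */
	return MSR_K7_EVNTSEL0 + index;				/* 0xc0010000 .. 0xc0010003 */
}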
631__init int amd_pmu_init(void) 632__init int amd_pmu_init(void)
632{ 633{
633 /* Performance-monitoring supported from K7 and later: */ 634 /* Performance-monitoring supported from K7 and later: */
634 if (boot_cpu_data.x86 < 6) 635 if (boot_cpu_data.x86 < 6)
635 return -ENODEV; 636 return -ENODEV;
636 637
637 /* 638 x86_pmu = amd_pmu;
 638	 * If core performance counter extensions exist, it must be		 639
639 * family 15h, otherwise fail. See x86_pmu_addr_offset(). 640 setup_event_constraints();
640 */ 641 setup_perfctr_core();
641 switch (boot_cpu_data.x86) {
642 case 0x15:
643 if (!cpu_has_perfctr_core)
644 return -ENODEV;
645 x86_pmu = amd_pmu_f15h;
646 break;
647 default:
648 if (cpu_has_perfctr_core)
649 return -ENODEV;
650 x86_pmu = amd_pmu;
651 break;
652 }
653 642
654 /* Events are common for all AMDs */ 643 /* Events are common for all AMDs */
655 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 644 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 166546ec6ae..7a8b9d0abca 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -5,6 +5,8 @@
5 * among events on a single PMU. 5 * among events on a single PMU.
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
8#include <linux/stddef.h> 10#include <linux/stddef.h>
9#include <linux/types.h> 11#include <linux/types.h>
10#include <linux/init.h> 12#include <linux/init.h>
@@ -21,14 +23,14 @@
21 */ 23 */
22static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = 24static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
23{ 25{
24 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, 26 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
25 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 27 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
26 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, 28 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
27 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, 29 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
28 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, 30 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
29 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 31 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
30 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 32 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
31 [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ 33 [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
32}; 34};
33 35
34static struct event_constraint intel_core_event_constraints[] __read_mostly = 36static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -747,7 +749,7 @@ static void intel_pmu_disable_all(void)
747 749
748 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 750 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
749 751
750 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) 752 if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
751 intel_pmu_disable_bts(); 753 intel_pmu_disable_bts();
752 754
753 intel_pmu_pebs_disable_all(); 755 intel_pmu_pebs_disable_all();
@@ -763,9 +765,9 @@ static void intel_pmu_enable_all(int added)
763 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 765 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
764 x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); 766 x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
765 767
766 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 768 if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
767 struct perf_event *event = 769 struct perf_event *event =
768 cpuc->events[X86_PMC_IDX_FIXED_BTS]; 770 cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
769 771
770 if (WARN_ON_ONCE(!event)) 772 if (WARN_ON_ONCE(!event))
771 return; 773 return;
@@ -871,7 +873,7 @@ static inline void intel_pmu_ack_status(u64 ack)
871 873
872static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) 874static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
873{ 875{
874 int idx = hwc->idx - X86_PMC_IDX_FIXED; 876 int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
875 u64 ctrl_val, mask; 877 u64 ctrl_val, mask;
876 878
877 mask = 0xfULL << (idx * 4); 879 mask = 0xfULL << (idx * 4);
@@ -886,7 +888,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
886 struct hw_perf_event *hwc = &event->hw; 888 struct hw_perf_event *hwc = &event->hw;
887 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 889 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
888 890
889 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { 891 if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
890 intel_pmu_disable_bts(); 892 intel_pmu_disable_bts();
891 intel_pmu_drain_bts_buffer(); 893 intel_pmu_drain_bts_buffer();
892 return; 894 return;
@@ -915,7 +917,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
915 917
916static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) 918static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
917{ 919{
918 int idx = hwc->idx - X86_PMC_IDX_FIXED; 920 int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
919 u64 ctrl_val, bits, mask; 921 u64 ctrl_val, bits, mask;
920 922
921 /* 923 /*
@@ -949,7 +951,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
949 struct hw_perf_event *hwc = &event->hw; 951 struct hw_perf_event *hwc = &event->hw;
950 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 952 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
951 953
952 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { 954 if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
953 if (!__this_cpu_read(cpu_hw_events.enabled)) 955 if (!__this_cpu_read(cpu_hw_events.enabled))
954 return; 956 return;
955 957
@@ -1000,14 +1002,14 @@ static void intel_pmu_reset(void)
1000 1002
1001 local_irq_save(flags); 1003 local_irq_save(flags);
1002 1004
1003 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 1005 pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
1004 1006
1005 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1007 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1006 checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); 1008 wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
1007 checking_wrmsrl(x86_pmu_event_addr(idx), 0ull); 1009 wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
1008 } 1010 }
1009 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) 1011 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
1010 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 1012 wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1011 1013
1012 if (ds) 1014 if (ds)
1013 ds->bts_index = ds->bts_buffer_base; 1015 ds->bts_index = ds->bts_buffer_base;
@@ -1119,27 +1121,33 @@ intel_bts_constraints(struct perf_event *event)
1119 return NULL; 1121 return NULL;
1120} 1122}
1121 1123
1122static bool intel_try_alt_er(struct perf_event *event, int orig_idx) 1124static int intel_alt_er(int idx)
1123{ 1125{
1124 if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) 1126 if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
1125 return false; 1127 return idx;
1126 1128
1127 if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) { 1129 if (idx == EXTRA_REG_RSP_0)
1128 event->hw.config &= ~INTEL_ARCH_EVENT_MASK; 1130 return EXTRA_REG_RSP_1;
1129 event->hw.config |= 0x01bb; 1131
1130 event->hw.extra_reg.idx = EXTRA_REG_RSP_1; 1132 if (idx == EXTRA_REG_RSP_1)
1131 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; 1133 return EXTRA_REG_RSP_0;
1132 } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) { 1134
1135 return idx;
1136}
1137
1138static void intel_fixup_er(struct perf_event *event, int idx)
1139{
1140 event->hw.extra_reg.idx = idx;
1141
1142 if (idx == EXTRA_REG_RSP_0) {
1133 event->hw.config &= ~INTEL_ARCH_EVENT_MASK; 1143 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1134 event->hw.config |= 0x01b7; 1144 event->hw.config |= 0x01b7;
1135 event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
1136 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; 1145 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
1146 } else if (idx == EXTRA_REG_RSP_1) {
1147 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1148 event->hw.config |= 0x01bb;
1149 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
1137 } 1150 }
1138
1139 if (event->hw.extra_reg.idx == orig_idx)
1140 return false;
1141
1142 return true;
1143} 1151}
1144 1152
1145/* 1153/*
@@ -1157,14 +1165,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
1157 struct event_constraint *c = &emptyconstraint; 1165 struct event_constraint *c = &emptyconstraint;
1158 struct er_account *era; 1166 struct er_account *era;
1159 unsigned long flags; 1167 unsigned long flags;
1160 int orig_idx = reg->idx; 1168 int idx = reg->idx;
1161 1169
1162 /* already allocated shared msr */ 1170 /*
1163 if (reg->alloc) 1171 * reg->alloc can be set due to existing state, so for fake cpuc we
1172 * need to ignore this, otherwise we might fail to allocate proper fake
1173 * state for this extra reg constraint. Also see the comment below.
1174 */
1175 if (reg->alloc && !cpuc->is_fake)
1164 return NULL; /* call x86_get_event_constraint() */ 1176 return NULL; /* call x86_get_event_constraint() */
1165 1177
1166again: 1178again:
1167 era = &cpuc->shared_regs->regs[reg->idx]; 1179 era = &cpuc->shared_regs->regs[idx];
1168 /* 1180 /*
1169 * we use spin_lock_irqsave() to avoid lockdep issues when 1181 * we use spin_lock_irqsave() to avoid lockdep issues when
1170 * passing a fake cpuc 1182 * passing a fake cpuc
@@ -1173,6 +1185,29 @@ again:
1173 1185
1174 if (!atomic_read(&era->ref) || era->config == reg->config) { 1186 if (!atomic_read(&era->ref) || era->config == reg->config) {
1175 1187
1188 /*
1189 * If its a fake cpuc -- as per validate_{group,event}() we
1190 * shouldn't touch event state and we can avoid doing so
1191 * since both will only call get_event_constraints() once
1192 * on each event, this avoids the need for reg->alloc.
1193 *
1194 * Not doing the ER fixup will only result in era->reg being
1195 * wrong, but since we won't actually try and program hardware
1196 * this isn't a problem either.
1197 */
1198 if (!cpuc->is_fake) {
1199 if (idx != reg->idx)
1200 intel_fixup_er(event, idx);
1201
1202 /*
1203 * x86_schedule_events() can call get_event_constraints()
1204 * multiple times on events in the case of incremental
 1205			 * scheduling. reg->alloc ensures we only do the ER
1206 * allocation once.
1207 */
1208 reg->alloc = 1;
1209 }
1210
1176 /* lock in msr value */ 1211 /* lock in msr value */
1177 era->config = reg->config; 1212 era->config = reg->config;
1178 era->reg = reg->reg; 1213 era->reg = reg->reg;
@@ -1180,17 +1215,17 @@ again:
1180 /* one more user */ 1215 /* one more user */
1181 atomic_inc(&era->ref); 1216 atomic_inc(&era->ref);
1182 1217
1183 /* no need to reallocate during incremental event scheduling */
1184 reg->alloc = 1;
1185
1186 /* 1218 /*
1187 * need to call x86_get_event_constraint() 1219 * need to call x86_get_event_constraint()
1188 * to check if associated event has constraints 1220 * to check if associated event has constraints
1189 */ 1221 */
1190 c = NULL; 1222 c = NULL;
1191 } else if (intel_try_alt_er(event, orig_idx)) { 1223 } else {
1192 raw_spin_unlock_irqrestore(&era->lock, flags); 1224 idx = intel_alt_er(idx);
1193 goto again; 1225 if (idx != reg->idx) {
1226 raw_spin_unlock_irqrestore(&era->lock, flags);
1227 goto again;
1228 }
1194 } 1229 }
1195 raw_spin_unlock_irqrestore(&era->lock, flags); 1230 raw_spin_unlock_irqrestore(&era->lock, flags);
1196 1231
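Why cpuc->is_fake matters here: validate_event()/validate_group() probe schedulability on a throwaway cpu_hw_events, so the constraint code above must not leave persistent state (reg->alloc, rewritten configs) behind. A minimal sketch of that caller side, assuming the field names used above; illustration only, not the kernel's actual validate_group():

static int validate_group_sketch(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	int ret = -EINVAL;

	fake_cpuc = kzalloc(sizeof(*fake_cpuc), GFP_KERNEL);
	if (!fake_cpuc)
		return -ENOMEM;

	fake_cpuc->is_fake = 1;	/* tells the ER code above to skip state updates */
	/* ... collect the group into fake_cpuc, then try to schedule it: */
	/* ret = x86_schedule_events(fake_cpuc, n, NULL); */

	kfree(fake_cpuc);
	return ret;
}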
@@ -1204,11 +1239,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
1204 struct er_account *era; 1239 struct er_account *era;
1205 1240
1206 /* 1241 /*
1207 * only put constraint if extra reg was actually 1242 * Only put constraint if extra reg was actually allocated. Also takes
 1208	 * allocated. Also takes care of events which do		 1243	 * care of events which do not use an extra shared reg.
1209 * not use an extra shared reg 1244 *
1245 * Also, if this is a fake cpuc we shouldn't touch any event state
1246 * (reg->alloc) and we don't care about leaving inconsistent cpuc state
1247 * either since it'll be thrown out.
1210 */ 1248 */
1211 if (!reg->alloc) 1249 if (!reg->alloc || cpuc->is_fake)
1212 return; 1250 return;
1213 1251
1214 era = &cpuc->shared_regs->regs[reg->idx]; 1252 era = &cpuc->shared_regs->regs[reg->idx];
@@ -1300,15 +1338,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1300 intel_put_shared_regs_event_constraints(cpuc, event); 1338 intel_put_shared_regs_event_constraints(cpuc, event);
1301} 1339}
1302 1340
1303static int intel_pmu_hw_config(struct perf_event *event) 1341static void intel_pebs_aliases_core2(struct perf_event *event)
1304{ 1342{
1305 int ret = x86_pmu_hw_config(event); 1343 if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
1306
1307 if (ret)
1308 return ret;
1309
1310 if (event->attr.precise_ip &&
1311 (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
1312 /* 1344 /*
1313 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P 1345 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
1314 * (0x003c) so that we can use it with PEBS. 1346 * (0x003c) so that we can use it with PEBS.
@@ -1329,10 +1361,48 @@ static int intel_pmu_hw_config(struct perf_event *event)
1329 */ 1361 */
1330 u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); 1362 u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
1331 1363
1364 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
1365 event->hw.config = alt_config;
1366 }
1367}
1368
1369static void intel_pebs_aliases_snb(struct perf_event *event)
1370{
1371 if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
1372 /*
1373 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
1374 * (0x003c) so that we can use it with PEBS.
1375 *
1376 * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
1377 * PEBS capable. However we can use UOPS_RETIRED.ALL
1378 * (0x01c2), which is a PEBS capable event, to get the same
1379 * count.
1380 *
 1381		 * UOPS_RETIRED.ALL counts the number of cycles that retire
 1382		 * CNTMASK micro-ops. By setting CNTMASK to a value (16)
 1383		 * larger than the maximum number of micro-ops that can be
 1384		 * retired per cycle (4) and then inverting the condition, we
 1385		 * count all cycles that retire 16 or fewer micro-ops, which
1386 * is every cycle.
1387 *
1388 * Thereby we gain a PEBS capable cycle counter.
1389 */
1390 u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
1332 1391
1333 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); 1392 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
1334 event->hw.config = alt_config; 1393 event->hw.config = alt_config;
1335 } 1394 }
1395}
1396
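For reference, a worked expansion of that X86_CONFIG() invocation, assuming the architectural PERFEVTSEL layout (event select in bits 0-7, umask in bits 8-15, INV in bit 23, CMASK in bits 24-31); a standalone sketch, not kernel code:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t alt = 0xc2			/* UOPS_RETIRED */
		     | (0x01ULL <<  8)		/* .ALL umask */
		     | (1ULL    << 23)		/* invert the counter-mask compare */
		     | (16ULL   << 24);		/* cmask = 16 */

	printf("0x%llx\n", (unsigned long long)alt);	/* prints 0x108001c2 */
	return 0;
}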
1397static int intel_pmu_hw_config(struct perf_event *event)
1398{
1399 int ret = x86_pmu_hw_config(event);
1400
1401 if (ret)
1402 return ret;
1403
1404 if (event->attr.precise_ip && x86_pmu.pebs_aliases)
1405 x86_pmu.pebs_aliases(event);
1336 1406
1337 if (intel_pmu_needs_lbr_smpl(event)) { 1407 if (intel_pmu_needs_lbr_smpl(event)) {
1338 ret = intel_pmu_setup_lbr_filter(event); 1408 ret = intel_pmu_setup_lbr_filter(event);
@@ -1607,6 +1677,7 @@ static __initconst const struct x86_pmu intel_pmu = {
1607 .max_period = (1ULL << 31) - 1, 1677 .max_period = (1ULL << 31) - 1,
1608 .get_event_constraints = intel_get_event_constraints, 1678 .get_event_constraints = intel_get_event_constraints,
1609 .put_event_constraints = intel_put_event_constraints, 1679 .put_event_constraints = intel_put_event_constraints,
1680 .pebs_aliases = intel_pebs_aliases_core2,
1610 1681
1611 .format_attrs = intel_arch3_formats_attr, 1682 .format_attrs = intel_arch3_formats_attr,
1612 1683
@@ -1638,16 +1709,61 @@ static __init void intel_clovertown_quirk(void)
1638 * But taken together it might just make sense to not enable PEBS on 1709 * But taken together it might just make sense to not enable PEBS on
1639 * these chips. 1710 * these chips.
1640 */ 1711 */
1641 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); 1712 pr_warn("PEBS disabled due to CPU errata\n");
1642 x86_pmu.pebs = 0; 1713 x86_pmu.pebs = 0;
1643 x86_pmu.pebs_constraints = NULL; 1714 x86_pmu.pebs_constraints = NULL;
1644} 1715}
1645 1716
1717static int intel_snb_pebs_broken(int cpu)
1718{
1719 u32 rev = UINT_MAX; /* default to broken for unknown models */
1720
1721 switch (cpu_data(cpu).x86_model) {
1722 case 42: /* SNB */
1723 rev = 0x28;
1724 break;
1725
1726 case 45: /* SNB-EP */
1727 switch (cpu_data(cpu).x86_mask) {
1728 case 6: rev = 0x618; break;
1729 case 7: rev = 0x70c; break;
1730 }
1731 }
1732
1733 return (cpu_data(cpu).microcode < rev);
1734}
1735
1736static void intel_snb_check_microcode(void)
1737{
1738 int pebs_broken = 0;
1739 int cpu;
1740
1741 get_online_cpus();
1742 for_each_online_cpu(cpu) {
1743 if ((pebs_broken = intel_snb_pebs_broken(cpu)))
1744 break;
1745 }
1746 put_online_cpus();
1747
1748 if (pebs_broken == x86_pmu.pebs_broken)
1749 return;
1750
1751 /*
 1752	 * Serialized by the microcode lock.
1753 */
1754 if (x86_pmu.pebs_broken) {
1755 pr_info("PEBS enabled due to microcode update\n");
1756 x86_pmu.pebs_broken = 0;
1757 } else {
1758 pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
1759 x86_pmu.pebs_broken = 1;
1760 }
1761}
1762
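Condensed restatement of the revision table above; anything that falls through keeps rev == UINT_MAX and is treated as broken. The pebs_ok() helper is a hypothetical standalone sketch, not kernel code:

#include <stdbool.h>
#include <stdint.h>

static bool pebs_ok(int model, int stepping, uint32_t ucode)
{
	switch (model) {
	case 42:				/* SNB */
		return ucode >= 0x28;
	case 45:				/* SNB-EP */
		if (stepping == 6)
			return ucode >= 0x618;
		if (stepping == 7)
			return ucode >= 0x70c;
	}
	return false;				/* unknown: assume PEBS broken */
}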
1646static __init void intel_sandybridge_quirk(void) 1763static __init void intel_sandybridge_quirk(void)
1647{ 1764{
1648 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); 1765 x86_pmu.check_microcode = intel_snb_check_microcode;
1649 x86_pmu.pebs = 0; 1766 intel_snb_check_microcode();
1650 x86_pmu.pebs_constraints = NULL;
1651} 1767}
1652 1768
1653static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { 1769static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
@@ -1667,8 +1783,8 @@ static __init void intel_arch_events_quirk(void)
 1667	/* disable events that cpuid reports as not present */			 1783	/* disable events that cpuid reports as not present */
1668 for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { 1784 for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
1669 intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; 1785 intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
1670 printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", 1786 pr_warn("CPUID marked event: \'%s\' unavailable\n",
1671 intel_arch_events_map[bit].name); 1787 intel_arch_events_map[bit].name);
1672 } 1788 }
1673} 1789}
1674 1790
@@ -1687,7 +1803,7 @@ static __init void intel_nehalem_quirk(void)
1687 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; 1803 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1688 ebx.split.no_branch_misses_retired = 0; 1804 ebx.split.no_branch_misses_retired = 0;
1689 x86_pmu.events_maskl = ebx.full; 1805 x86_pmu.events_maskl = ebx.full;
1690 printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); 1806 pr_info("CPU erratum AAJ80 worked around\n");
1691 } 1807 }
1692} 1808}
1693 1809
@@ -1696,6 +1812,7 @@ __init int intel_pmu_init(void)
1696 union cpuid10_edx edx; 1812 union cpuid10_edx edx;
1697 union cpuid10_eax eax; 1813 union cpuid10_eax eax;
1698 union cpuid10_ebx ebx; 1814 union cpuid10_ebx ebx;
1815 struct event_constraint *c;
1699 unsigned int unused; 1816 unsigned int unused;
1700 int version; 1817 int version;
1701 1818
@@ -1731,6 +1848,8 @@ __init int intel_pmu_init(void)
1731 x86_pmu.events_maskl = ebx.full; 1848 x86_pmu.events_maskl = ebx.full;
1732 x86_pmu.events_mask_len = eax.split.mask_length; 1849 x86_pmu.events_mask_len = eax.split.mask_length;
1733 1850
1851 x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
1852
1734 /* 1853 /*
1735 * Quirk: v2 perfmon does not report fixed-purpose events, so 1854 * Quirk: v2 perfmon does not report fixed-purpose events, so
1736 * assume at least 3 events: 1855 * assume at least 3 events:
@@ -1840,8 +1959,9 @@ __init int intel_pmu_init(void)
1840 break; 1959 break;
1841 1960
1842 case 42: /* SandyBridge */ 1961 case 42: /* SandyBridge */
1843 x86_add_quirk(intel_sandybridge_quirk);
 1844	case 45: /* SandyBridge, "Romley-EP" */				 1962	case 45: /* SandyBridge, "Romley-EP" */
1963 x86_add_quirk(intel_sandybridge_quirk);
1964 case 58: /* IvyBridge */
1845 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 1965 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1846 sizeof(hw_cache_event_ids)); 1966 sizeof(hw_cache_event_ids));
1847 1967
@@ -1849,6 +1969,7 @@ __init int intel_pmu_init(void)
1849 1969
1850 x86_pmu.event_constraints = intel_snb_event_constraints; 1970 x86_pmu.event_constraints = intel_snb_event_constraints;
1851 x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; 1971 x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
1972 x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
1852 x86_pmu.extra_regs = intel_snb_extra_regs; 1973 x86_pmu.extra_regs = intel_snb_extra_regs;
1853 /* all extra regs are per-cpu when HT is on */ 1974 /* all extra regs are per-cpu when HT is on */
1854 x86_pmu.er_flags |= ERF_HAS_RSP_1; 1975 x86_pmu.er_flags |= ERF_HAS_RSP_1;
@@ -1880,5 +2001,37 @@ __init int intel_pmu_init(void)
1880 } 2001 }
1881 } 2002 }
1882 2003
2004 if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
2005 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
2006 x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
2007 x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
2008 }
2009 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
2010
2011 if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
2012 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
2013 x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
2014 x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
2015 }
2016
2017 x86_pmu.intel_ctrl |=
2018 ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
2019
2020 if (x86_pmu.event_constraints) {
2021 /*
2022 * event on fixed counter2 (REF_CYCLES) only works on this
2023 * counter, so do not extend mask to generic counters
2024 */
2025 for_each_event_constraint(c, x86_pmu.event_constraints) {
2026 if (c->cmask != X86_RAW_EVENT_MASK
2027 || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) {
2028 continue;
2029 }
2030
2031 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
2032 c->weight += x86_pmu.num_counters;
2033 }
2034 }
2035
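A worked example of what the loop above does, assuming INTEL_PMC_IDX_FIXED is 32 in this series: REF_CYCLES is pinned to fixed counter 2, so its constraint mask is 1ULL << 34 and the loop skips it; every other full-mask constraint gains the generic counters, e.g. with num_counters == 4:

	c->idxmsk64 |= (1ULL << 4) - 1;		/* add generic PMCs 0-3 */
	c->weight   += 4;			/* four more ways to schedule it */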
1883 return 0; 2036 return 0;
1884} 2037}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 5a3edc27f6e..629ae0b7ad9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -248,7 +248,7 @@ void reserve_ds_buffers(void)
248 */ 248 */
249 249
250struct event_constraint bts_constraint = 250struct event_constraint bts_constraint =
251 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); 251 EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
252 252
253void intel_pmu_enable_bts(u64 config) 253void intel_pmu_enable_bts(u64 config)
254{ 254{
@@ -295,7 +295,7 @@ int intel_pmu_drain_bts_buffer(void)
295 u64 to; 295 u64 to;
296 u64 flags; 296 u64 flags;
297 }; 297 };
298 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; 298 struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
299 struct bts_record *at, *top; 299 struct bts_record *at, *top;
300 struct perf_output_handle handle; 300 struct perf_output_handle handle;
301 struct perf_event_header header; 301 struct perf_event_header header;
@@ -400,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
400 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ 400 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
401 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ 401 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
402 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ 402 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
403 INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ 403 INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
404 INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
405 INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
406 INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
407 INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
408 INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
409 INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
410 INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
411 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ 404 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
412 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ 405 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
413 INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ 406 INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
@@ -627,7 +620,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
627 * Should not happen, we program the threshold at 1 and do not 620 * Should not happen, we program the threshold at 1 and do not
628 * set a reset value. 621 * set a reset value.
629 */ 622 */
630 WARN_ON_ONCE(n > 1); 623 WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
631 at += n - 1; 624 at += n - 1;
632 625
633 __intel_pmu_pebs_event(event, iregs, at); 626 __intel_pmu_pebs_event(event, iregs, at);
@@ -658,10 +651,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
658 * Should not happen, we program the threshold at 1 and do not 651 * Should not happen, we program the threshold at 1 and do not
659 * set a reset value. 652 * set a reset value.
660 */ 653 */
661 WARN_ON_ONCE(n > MAX_PEBS_EVENTS); 654 WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs records %d\n", n);
662 655
663 for ( ; at < top; at++) { 656 for ( ; at < top; at++) {
664 for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) { 657 for_each_set_bit(bit, (unsigned long *)&at->status, x86_pmu.max_pebs_events) {
665 event = cpuc->events[bit]; 658 event = cpuc->events[bit];
666 if (!test_bit(bit, cpuc->active_mask)) 659 if (!test_bit(bit, cpuc->active_mask))
667 continue; 660 continue;
@@ -677,7 +670,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
677 break; 670 break;
678 } 671 }
679 672
680 if (!event || bit >= MAX_PEBS_EVENTS) 673 if (!event || bit >= x86_pmu.max_pebs_events)
681 continue; 674 continue;
682 675
683 __intel_pmu_pebs_event(event, iregs, at); 676 __intel_pmu_pebs_event(event, iregs, at);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
new file mode 100644
index 00000000000..19faffc6088
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -0,0 +1,1850 @@
1#include "perf_event_intel_uncore.h"
2
3static struct intel_uncore_type *empty_uncore[] = { NULL, };
4static struct intel_uncore_type **msr_uncores = empty_uncore;
5static struct intel_uncore_type **pci_uncores = empty_uncore;
6/* pci bus to socket mapping */
7static int pcibus_to_physid[256] = { [0 ... 255] = -1, };
8
9static DEFINE_RAW_SPINLOCK(uncore_box_lock);
10
11/* mask of cpus that collect uncore events */
12static cpumask_t uncore_cpu_mask;
13
14/* constraint for the fixed counter */
15static struct event_constraint constraint_fixed =
16 EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
17static struct event_constraint constraint_empty =
18 EVENT_CONSTRAINT(0, 0, 0);
19
20DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
21DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
22DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
23DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
24DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
25DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
26DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
27DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
28DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
29DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
30DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
31DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
32DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
33DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
34DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
35DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
36DEFINE_UNCORE_FORMAT_ATTR(filter_brand0, filter_brand0, "config1:0-7");
37DEFINE_UNCORE_FORMAT_ATTR(filter_brand1, filter_brand1, "config1:8-15");
38DEFINE_UNCORE_FORMAT_ATTR(filter_brand2, filter_brand2, "config1:16-23");
39DEFINE_UNCORE_FORMAT_ATTR(filter_brand3, filter_brand3, "config1:24-31");
40
41/* Sandy Bridge-EP uncore support */
42static struct intel_uncore_type snbep_uncore_cbox;
43static struct intel_uncore_type snbep_uncore_pcu;
44
45static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
46{
47 struct pci_dev *pdev = box->pci_dev;
48 int box_ctl = uncore_pci_box_ctl(box);
49 u32 config;
50
51 pci_read_config_dword(pdev, box_ctl, &config);
52 config |= SNBEP_PMON_BOX_CTL_FRZ;
53 pci_write_config_dword(pdev, box_ctl, config);
54}
55
56static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
57{
58 struct pci_dev *pdev = box->pci_dev;
59 int box_ctl = uncore_pci_box_ctl(box);
60 u32 config;
61
62 pci_read_config_dword(pdev, box_ctl, &config);
63 config &= ~SNBEP_PMON_BOX_CTL_FRZ;
64 pci_write_config_dword(pdev, box_ctl, config);
65}
66
67static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box,
68 struct perf_event *event)
69{
70 struct pci_dev *pdev = box->pci_dev;
71 struct hw_perf_event *hwc = &event->hw;
72
73 pci_write_config_dword(pdev, hwc->config_base, hwc->config |
74 SNBEP_PMON_CTL_EN);
75}
76
77static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box,
78 struct perf_event *event)
79{
80 struct pci_dev *pdev = box->pci_dev;
81 struct hw_perf_event *hwc = &event->hw;
82
83 pci_write_config_dword(pdev, hwc->config_base, hwc->config);
84}
85
86static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box,
87 struct perf_event *event)
88{
89 struct pci_dev *pdev = box->pci_dev;
90 struct hw_perf_event *hwc = &event->hw;
91 u64 count;
92
93 pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
94 pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
95 return count;
96}
97
98static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
99{
100 struct pci_dev *pdev = box->pci_dev;
101 pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL,
102 SNBEP_PMON_BOX_CTL_INT);
103}
104
105static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
106{
107 u64 config;
108 unsigned msr;
109
110 msr = uncore_msr_box_ctl(box);
111 if (msr) {
112 rdmsrl(msr, config);
113 config |= SNBEP_PMON_BOX_CTL_FRZ;
114 wrmsrl(msr, config);
115 return;
116 }
117}
118
119static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
120{
121 u64 config;
122 unsigned msr;
123
124 msr = uncore_msr_box_ctl(box);
125 if (msr) {
126 rdmsrl(msr, config);
127 config &= ~SNBEP_PMON_BOX_CTL_FRZ;
128 wrmsrl(msr, config);
129 return;
130 }
131}
132
133static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box,
134 struct perf_event *event)
135{
136 struct hw_perf_event *hwc = &event->hw;
137 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
138
139 if (reg1->idx != EXTRA_REG_NONE)
140 wrmsrl(reg1->reg, reg1->config);
141
142 wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
143}
144
145static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
146 struct perf_event *event)
147{
148 struct hw_perf_event *hwc = &event->hw;
149
150 wrmsrl(hwc->config_base, hwc->config);
151}
152
153static u64 snbep_uncore_msr_read_counter(struct intel_uncore_box *box,
154 struct perf_event *event)
155{
156 struct hw_perf_event *hwc = &event->hw;
157 u64 count;
158
159 rdmsrl(hwc->event_base, count);
160 return count;
161}
162
163static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
164{
165 unsigned msr = uncore_msr_box_ctl(box);
166 if (msr)
167 wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
168}
169
170static struct event_constraint *
171snbep_uncore_get_constraint(struct intel_uncore_box *box,
172 struct perf_event *event)
173{
174 struct intel_uncore_extra_reg *er;
175 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
176 unsigned long flags;
177 bool ok = false;
178
179 if (reg1->idx == EXTRA_REG_NONE || (box->phys_id >= 0 && reg1->alloc))
180 return NULL;
181
182 er = &box->shared_regs[reg1->idx];
183 raw_spin_lock_irqsave(&er->lock, flags);
184 if (!atomic_read(&er->ref) || er->config1 == reg1->config) {
185 atomic_inc(&er->ref);
186 er->config1 = reg1->config;
187 ok = true;
188 }
189 raw_spin_unlock_irqrestore(&er->lock, flags);
190
191 if (ok) {
192 if (box->phys_id >= 0)
193 reg1->alloc = 1;
194 return NULL;
195 }
196 return &constraint_empty;
197}
198
199static void snbep_uncore_put_constraint(struct intel_uncore_box *box,
200 struct perf_event *event)
201{
202 struct intel_uncore_extra_reg *er;
203 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
204
205 if (box->phys_id < 0 || !reg1->alloc)
206 return;
207
208 er = &box->shared_regs[reg1->idx];
209 atomic_dec(&er->ref);
210 reg1->alloc = 0;
211}
212
213static int snbep_uncore_hw_config(struct intel_uncore_box *box,
214 struct perf_event *event)
215{
216 struct hw_perf_event *hwc = &event->hw;
217 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
218
219 if (box->pmu->type == &snbep_uncore_cbox) {
220 reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
221 SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
222 reg1->config = event->attr.config1 &
223 SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK;
224 } else if (box->pmu->type == &snbep_uncore_pcu) {
225 reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
226 reg1->config = event->attr.config1 &
227 SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK;
228 } else {
229 return 0;
230 }
231 reg1->idx = 0;
232 return 0;
233}
234
235static struct attribute *snbep_uncore_formats_attr[] = {
236 &format_attr_event.attr,
237 &format_attr_umask.attr,
238 &format_attr_edge.attr,
239 &format_attr_inv.attr,
240 &format_attr_thresh8.attr,
241 NULL,
242};
243
244static struct attribute *snbep_uncore_ubox_formats_attr[] = {
245 &format_attr_event.attr,
246 &format_attr_umask.attr,
247 &format_attr_edge.attr,
248 &format_attr_inv.attr,
249 &format_attr_thresh5.attr,
250 NULL,
251};
252
253static struct attribute *snbep_uncore_cbox_formats_attr[] = {
254 &format_attr_event.attr,
255 &format_attr_umask.attr,
256 &format_attr_edge.attr,
257 &format_attr_tid_en.attr,
258 &format_attr_inv.attr,
259 &format_attr_thresh8.attr,
260 &format_attr_filter_tid.attr,
261 &format_attr_filter_nid.attr,
262 &format_attr_filter_state.attr,
263 &format_attr_filter_opc.attr,
264 NULL,
265};
266
267static struct attribute *snbep_uncore_pcu_formats_attr[] = {
268 &format_attr_event.attr,
269 &format_attr_occ_sel.attr,
270 &format_attr_edge.attr,
271 &format_attr_inv.attr,
272 &format_attr_thresh5.attr,
273 &format_attr_occ_invert.attr,
274 &format_attr_occ_edge.attr,
275 &format_attr_filter_brand0.attr,
276 &format_attr_filter_brand1.attr,
277 &format_attr_filter_brand2.attr,
278 &format_attr_filter_brand3.attr,
279 NULL,
280};
281
282static struct uncore_event_desc snbep_uncore_imc_events[] = {
283 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
284 INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
285 INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
286 { /* end: all zeroes */ },
287};
288
289static struct uncore_event_desc snbep_uncore_qpi_events[] = {
290 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
291 INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
292 INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"),
293 INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"),
294 { /* end: all zeroes */ },
295};
296
297static struct attribute_group snbep_uncore_format_group = {
298 .name = "format",
299 .attrs = snbep_uncore_formats_attr,
300};
301
302static struct attribute_group snbep_uncore_ubox_format_group = {
303 .name = "format",
304 .attrs = snbep_uncore_ubox_formats_attr,
305};
306
307static struct attribute_group snbep_uncore_cbox_format_group = {
308 .name = "format",
309 .attrs = snbep_uncore_cbox_formats_attr,
310};
311
312static struct attribute_group snbep_uncore_pcu_format_group = {
313 .name = "format",
314 .attrs = snbep_uncore_pcu_formats_attr,
315};
316
317static struct intel_uncore_ops snbep_uncore_msr_ops = {
318 .init_box = snbep_uncore_msr_init_box,
319 .disable_box = snbep_uncore_msr_disable_box,
320 .enable_box = snbep_uncore_msr_enable_box,
321 .disable_event = snbep_uncore_msr_disable_event,
322 .enable_event = snbep_uncore_msr_enable_event,
323 .read_counter = snbep_uncore_msr_read_counter,
324 .get_constraint = snbep_uncore_get_constraint,
325 .put_constraint = snbep_uncore_put_constraint,
326 .hw_config = snbep_uncore_hw_config,
327};
328
329static struct intel_uncore_ops snbep_uncore_pci_ops = {
330 .init_box = snbep_uncore_pci_init_box,
331 .disable_box = snbep_uncore_pci_disable_box,
332 .enable_box = snbep_uncore_pci_enable_box,
333 .disable_event = snbep_uncore_pci_disable_event,
334 .enable_event = snbep_uncore_pci_enable_event,
335 .read_counter = snbep_uncore_pci_read_counter,
336};
337
338static struct event_constraint snbep_uncore_cbox_constraints[] = {
339 UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
340 UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
341 UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
342 UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
343 UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
344 UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
345 UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
346 UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
347 UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
348 UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
349 UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
350 UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
351 EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
352 UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
353 UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
354 UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
355 UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
356 UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
357 UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
358 UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
359 UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
360 UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
361 UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
362 UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
363 UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
364 EVENT_CONSTRAINT_END
365};
366
367static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
368 UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
369 UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
370 UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
371 UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
372 UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
373 UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
374 UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
375 UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
376 UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
377 UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
378 EVENT_CONSTRAINT_END
379};
380
381static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
382 UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
383 UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
384 UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
385 UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
386 UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
387 UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
388 UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
389 UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
390 UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
391 UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
392 UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
393 UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
394 UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
395 UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
396 UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
397 UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
398 UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
399 UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
400 EVENT_CONSTRAINT_END
401};
402
403static struct intel_uncore_type snbep_uncore_ubox = {
404 .name = "ubox",
405 .num_counters = 2,
406 .num_boxes = 1,
407 .perf_ctr_bits = 44,
408 .fixed_ctr_bits = 48,
409 .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
410 .event_ctl = SNBEP_U_MSR_PMON_CTL0,
411 .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
412 .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
413 .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
414 .ops = &snbep_uncore_msr_ops,
415 .format_group = &snbep_uncore_ubox_format_group,
416};
417
418static struct intel_uncore_type snbep_uncore_cbox = {
419 .name = "cbox",
420 .num_counters = 4,
421 .num_boxes = 8,
422 .perf_ctr_bits = 44,
423 .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
424 .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
425 .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
426 .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
427 .msr_offset = SNBEP_CBO_MSR_OFFSET,
428 .num_shared_regs = 1,
429 .constraints = snbep_uncore_cbox_constraints,
430 .ops = &snbep_uncore_msr_ops,
431 .format_group = &snbep_uncore_cbox_format_group,
432};
433
434static struct intel_uncore_type snbep_uncore_pcu = {
435 .name = "pcu",
436 .num_counters = 4,
437 .num_boxes = 1,
438 .perf_ctr_bits = 48,
439 .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
440 .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
441 .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
442 .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
443 .num_shared_regs = 1,
444 .ops = &snbep_uncore_msr_ops,
445 .format_group = &snbep_uncore_pcu_format_group,
446};
447
448static struct intel_uncore_type *snbep_msr_uncores[] = {
449 &snbep_uncore_ubox,
450 &snbep_uncore_cbox,
451 &snbep_uncore_pcu,
452 NULL,
453};
454
455#define SNBEP_UNCORE_PCI_COMMON_INIT() \
456 .perf_ctr = SNBEP_PCI_PMON_CTR0, \
457 .event_ctl = SNBEP_PCI_PMON_CTL0, \
458 .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
459 .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
460 .ops = &snbep_uncore_pci_ops, \
461 .format_group = &snbep_uncore_format_group
462
463static struct intel_uncore_type snbep_uncore_ha = {
464 .name = "ha",
465 .num_counters = 4,
466 .num_boxes = 1,
467 .perf_ctr_bits = 48,
468 SNBEP_UNCORE_PCI_COMMON_INIT(),
469};
470
471static struct intel_uncore_type snbep_uncore_imc = {
472 .name = "imc",
473 .num_counters = 4,
474 .num_boxes = 4,
475 .perf_ctr_bits = 48,
476 .fixed_ctr_bits = 48,
477 .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
478 .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
479 .event_descs = snbep_uncore_imc_events,
480 SNBEP_UNCORE_PCI_COMMON_INIT(),
481};
482
483static struct intel_uncore_type snbep_uncore_qpi = {
484 .name = "qpi",
485 .num_counters = 4,
486 .num_boxes = 2,
487 .perf_ctr_bits = 48,
488 .event_descs = snbep_uncore_qpi_events,
489 SNBEP_UNCORE_PCI_COMMON_INIT(),
490};
491
492
493static struct intel_uncore_type snbep_uncore_r2pcie = {
494 .name = "r2pcie",
495 .num_counters = 4,
496 .num_boxes = 1,
497 .perf_ctr_bits = 44,
498 .constraints = snbep_uncore_r2pcie_constraints,
499 SNBEP_UNCORE_PCI_COMMON_INIT(),
500};
501
502static struct intel_uncore_type snbep_uncore_r3qpi = {
503 .name = "r3qpi",
504 .num_counters = 3,
505 .num_boxes = 2,
506 .perf_ctr_bits = 44,
507 .constraints = snbep_uncore_r3qpi_constraints,
508 SNBEP_UNCORE_PCI_COMMON_INIT(),
509};
510
511static struct intel_uncore_type *snbep_pci_uncores[] = {
512 &snbep_uncore_ha,
513 &snbep_uncore_imc,
514 &snbep_uncore_qpi,
515 &snbep_uncore_r2pcie,
516 &snbep_uncore_r3qpi,
517 NULL,
518};
519
520static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = {
521 { /* Home Agent */
522 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
523 .driver_data = (unsigned long)&snbep_uncore_ha,
524 },
525 { /* MC Channel 0 */
526 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
527 .driver_data = (unsigned long)&snbep_uncore_imc,
528 },
529 { /* MC Channel 1 */
530 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
531 .driver_data = (unsigned long)&snbep_uncore_imc,
532 },
533 { /* MC Channel 2 */
534 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
535 .driver_data = (unsigned long)&snbep_uncore_imc,
536 },
537 { /* MC Channel 3 */
538 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
539 .driver_data = (unsigned long)&snbep_uncore_imc,
540 },
541 { /* QPI Port 0 */
542 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
543 .driver_data = (unsigned long)&snbep_uncore_qpi,
544 },
545 { /* QPI Port 1 */
546 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
547 .driver_data = (unsigned long)&snbep_uncore_qpi,
548 },
 549	{ /* R2PCIe */
550 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
551 .driver_data = (unsigned long)&snbep_uncore_r2pcie,
552 },
553 { /* R3QPI Link 0 */
554 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
555 .driver_data = (unsigned long)&snbep_uncore_r3qpi,
556 },
557 { /* R3QPI Link 1 */
558 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
559 .driver_data = (unsigned long)&snbep_uncore_r3qpi,
560 },
561 { /* end: all zeroes */ }
562};
563
564static struct pci_driver snbep_uncore_pci_driver = {
565 .name = "snbep_uncore",
566 .id_table = snbep_uncore_pci_ids,
567};
568
569/*
570 * build pci bus to socket mapping
571 */
572static void snbep_pci2phy_map_init(void)
573{
574 struct pci_dev *ubox_dev = NULL;
575 int i, bus, nodeid;
576 u32 config;
577
578 while (1) {
579 /* find the UBOX device */
580 ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL,
581 PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX,
582 ubox_dev);
583 if (!ubox_dev)
584 break;
585 bus = ubox_dev->bus->number;
586 /* get the Node ID of the local register */
587 pci_read_config_dword(ubox_dev, 0x40, &config);
588 nodeid = config;
589 /* get the Node ID mapping */
590 pci_read_config_dword(ubox_dev, 0x54, &config);
591 /*
 592		 * each three-bit field in the Node ID mapping register maps
 593		 * to a particular node.
594 */
595 for (i = 0; i < 8; i++) {
596 if (nodeid == ((config >> (3 * i)) & 0x7)) {
597 pcibus_to_physid[bus] = i;
598 break;
599 }
600 }
 601	}
602 return;
603}
604/* end of Sandy Bridge-EP uncore support */
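A hedged worked example of the decode above (decode_physid() is for illustration only): if the local node id read at offset 0x40 is 1 and the mapping register at 0x54 reads 0x8, field 0 holds node 0 and field 1 holds node 1, so the match at i == 1 records this ubox's bus as socket 1.

#include <stdint.h>

static int decode_physid(uint32_t nodeid, uint32_t mapping)
{
	int i;

	for (i = 0; i < 8; i++)			/* eight 3-bit fields */
		if ((nodeid & 0x7) == ((mapping >> (3 * i)) & 0x7))
			return i;		/* socket index for this bus */
	return -1;
}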
605
606
607/* Sandy Bridge uncore support */
608static void snb_uncore_msr_enable_event(struct intel_uncore_box *box,
609 struct perf_event *event)
610{
611 struct hw_perf_event *hwc = &event->hw;
612
613 if (hwc->idx < UNCORE_PMC_IDX_FIXED)
614 wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
615 else
616 wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
617}
618
619static void snb_uncore_msr_disable_event(struct intel_uncore_box *box,
620 struct perf_event *event)
621{
622 wrmsrl(event->hw.config_base, 0);
623}
624
625static u64 snb_uncore_msr_read_counter(struct intel_uncore_box *box,
626 struct perf_event *event)
627{
628 u64 count;
629 rdmsrl(event->hw.event_base, count);
630 return count;
631}
632
633static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
634{
635 if (box->pmu->pmu_idx == 0) {
636 wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
637 SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
638 }
639}
640
641static struct attribute *snb_uncore_formats_attr[] = {
642 &format_attr_event.attr,
643 &format_attr_umask.attr,
644 &format_attr_edge.attr,
645 &format_attr_inv.attr,
646 &format_attr_cmask5.attr,
647 NULL,
648};
649
650static struct attribute_group snb_uncore_format_group = {
651 .name = "format",
652 .attrs = snb_uncore_formats_attr,
653};
654
655static struct intel_uncore_ops snb_uncore_msr_ops = {
656 .init_box = snb_uncore_msr_init_box,
657 .disable_event = snb_uncore_msr_disable_event,
658 .enable_event = snb_uncore_msr_enable_event,
659 .read_counter = snb_uncore_msr_read_counter,
660};
661
662static struct event_constraint snb_uncore_cbox_constraints[] = {
663 UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
664 UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
665 EVENT_CONSTRAINT_END
666};
667
668static struct intel_uncore_type snb_uncore_cbox = {
669 .name = "cbox",
670 .num_counters = 2,
671 .num_boxes = 4,
672 .perf_ctr_bits = 44,
673 .fixed_ctr_bits = 48,
674 .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
675 .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
676 .fixed_ctr = SNB_UNC_FIXED_CTR,
677 .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
678 .single_fixed = 1,
679 .event_mask = SNB_UNC_RAW_EVENT_MASK,
680 .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
681 .constraints = snb_uncore_cbox_constraints,
682 .ops = &snb_uncore_msr_ops,
683 .format_group = &snb_uncore_format_group,
684};
685
686static struct intel_uncore_type *snb_msr_uncores[] = {
687 &snb_uncore_cbox,
688 NULL,
689};
690/* end of Sandy Bridge uncore support */
691
692/* Nehalem uncore support */
693static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
694{
695 wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
696}
697
698static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
699{
700 wrmsrl(NHM_UNC_PERF_GLOBAL_CTL,
701 NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
702}
703
704static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box,
705 struct perf_event *event)
706{
707 struct hw_perf_event *hwc = &event->hw;
708
709 if (hwc->idx < UNCORE_PMC_IDX_FIXED)
710 wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
711 else
712 wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
713}
714
715static struct attribute *nhm_uncore_formats_attr[] = {
716 &format_attr_event.attr,
717 &format_attr_umask.attr,
718 &format_attr_edge.attr,
719 &format_attr_inv.attr,
720 &format_attr_cmask8.attr,
721 NULL,
722};
723
724static struct attribute_group nhm_uncore_format_group = {
725 .name = "format",
726 .attrs = nhm_uncore_formats_attr,
727};
728
729static struct uncore_event_desc nhm_uncore_events[] = {
730 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
731 INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
732 INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
733 INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
734 INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
735 INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
736 INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
737 INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
738 INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
739 { /* end: all zeroes */ },
740};
741
742static struct intel_uncore_ops nhm_uncore_msr_ops = {
743 .disable_box = nhm_uncore_msr_disable_box,
744 .enable_box = nhm_uncore_msr_enable_box,
745 .disable_event = snb_uncore_msr_disable_event,
746 .enable_event = nhm_uncore_msr_enable_event,
747 .read_counter = snb_uncore_msr_read_counter,
748};
749
750static struct intel_uncore_type nhm_uncore = {
751 .name = "",
752 .num_counters = 8,
753 .num_boxes = 1,
754 .perf_ctr_bits = 48,
755 .fixed_ctr_bits = 48,
756 .event_ctl = NHM_UNC_PERFEVTSEL0,
757 .perf_ctr = NHM_UNC_UNCORE_PMC0,
758 .fixed_ctr = NHM_UNC_FIXED_CTR,
759 .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
760 .event_mask = NHM_UNC_RAW_EVENT_MASK,
761 .event_descs = nhm_uncore_events,
762 .ops = &nhm_uncore_msr_ops,
763 .format_group = &nhm_uncore_format_group,
764};
765
766static struct intel_uncore_type *nhm_msr_uncores[] = {
767 &nhm_uncore,
768 NULL,
769};
770/* end of Nehalem uncore support */
771
772static void uncore_assign_hw_event(struct intel_uncore_box *box,
773 struct perf_event *event, int idx)
774{
775 struct hw_perf_event *hwc = &event->hw;
776
777 hwc->idx = idx;
778 hwc->last_tag = ++box->tags[idx];
779
780 if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
781 hwc->event_base = uncore_fixed_ctr(box);
782 hwc->config_base = uncore_fixed_ctl(box);
783 return;
784 }
785
786 hwc->config_base = uncore_event_ctl(box, hwc->idx);
787 hwc->event_base = uncore_perf_ctr(box, hwc->idx);
788}
789
790static void uncore_perf_event_update(struct intel_uncore_box *box,
791 struct perf_event *event)
792{
793 u64 prev_count, new_count, delta;
794 int shift;
795
796 if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
797 shift = 64 - uncore_fixed_ctr_bits(box);
798 else
799 shift = 64 - uncore_perf_ctr_bits(box);
800
801 /* the hrtimer might modify the previous event value */
802again:
803 prev_count = local64_read(&event->hw.prev_count);
804 new_count = uncore_read_counter(box, event);
805 if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
806 goto again;
807
808 delta = (new_count << shift) - (prev_count << shift);
809 delta >>= shift;
810
811 local64_add(delta, &event->count);
812}
813
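A standalone sketch of the shift trick above: aligning the counter's top bit with bit 63 makes the u64 subtraction wrap exactly like the hardware counter, so the delta stays correct across a rollover. For a 44-bit counter, shift == 64 - 44 == 20:

#include <stdint.h>
#include <assert.h>

int main(void)
{
	int shift = 64 - 44;
	uint64_t prev = (1ULL << 44) - 1;	/* just before wraparound */
	uint64_t curr = 5;			/* sampled after the wrap */
	uint64_t delta = ((curr << shift) - (prev << shift)) >> shift;

	assert(delta == 6);			/* wraparound handled correctly */
	return 0;
}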
814/*
 815 * The overflow interrupt is unavailable for SandyBridge-EP and broken
 816 * for SandyBridge, so we use an hrtimer to poll the counters
 817 * periodically before they can overflow undetected.
818 */
819static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
820{
821 struct intel_uncore_box *box;
822 unsigned long flags;
823 int bit;
824
825 box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
826 if (!box->n_active || box->cpu != smp_processor_id())
827 return HRTIMER_NORESTART;
828 /*
 829	 * disable local interrupts to prevent uncore_pmu_event_start/stop
 830	 * from interrupting the update process
831 */
832 local_irq_save(flags);
833
834 for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
835 uncore_perf_event_update(box, box->events[bit]);
836
837 local_irq_restore(flags);
838
839 hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL));
840 return HRTIMER_RESTART;
841}
842
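Back-of-the-envelope check on the polling approach, assuming UNCORE_PMU_HRTIMER_INTERVAL is on the order of a minute: the narrowest counters here are 44 bits, and 2^44 ticks at ~3 GHz take roughly 5800 seconds (about 97 minutes) to wrap, so sampling each active counter once per interval always observes it well before a rollover could go unnoticed.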
843static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
844{
845 __hrtimer_start_range_ns(&box->hrtimer,
846 ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0,
847 HRTIMER_MODE_REL_PINNED, 0);
848}
849
850static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
851{
852 hrtimer_cancel(&box->hrtimer);
853}
854
855static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
856{
857 hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
858 box->hrtimer.function = uncore_pmu_hrtimer;
859}
860
861struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
862 int cpu)
863{
864 struct intel_uncore_box *box;
865 int i, size;
866
867 size = sizeof(*box) + type->num_shared_regs *
868 sizeof(struct intel_uncore_extra_reg);
869
870 box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu));
871 if (!box)
872 return NULL;
873
874 for (i = 0; i < type->num_shared_regs; i++)
875 raw_spin_lock_init(&box->shared_regs[i].lock);
876
877 uncore_pmu_init_hrtimer(box);
878 atomic_set(&box->refcnt, 1);
879 box->cpu = -1;
880 box->phys_id = -1;
881
882 return box;
883}
884
885static struct intel_uncore_box *
886uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
887{
 888	struct intel_uncore_box *box;
889
890 box = *per_cpu_ptr(pmu->box, cpu);
891 if (box)
892 return box;
893
894 raw_spin_lock(&uncore_box_lock);
895 list_for_each_entry(box, &pmu->box_list, list) {
896 if (box->phys_id == topology_physical_package_id(cpu)) {
897 atomic_inc(&box->refcnt);
898 *per_cpu_ptr(pmu->box, cpu) = box;
899 break;
900 }
901 }
902 raw_spin_unlock(&uncore_box_lock);
903
904 return *per_cpu_ptr(pmu->box, cpu);
905}
906
907static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
908{
909 return container_of(event->pmu, struct intel_uncore_pmu, pmu);
910}
911
912static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
913{
914 /*
 915	 * perf core schedules events on a per-cpu basis; uncore events are
 916	 * collected by one of the cpus inside a physical package.
917 */
918 return uncore_pmu_to_box(uncore_event_to_pmu(event),
919 smp_processor_id());
920}
921
922static int uncore_collect_events(struct intel_uncore_box *box,
923 struct perf_event *leader, bool dogrp)
924{
925 struct perf_event *event;
926 int n, max_count;
927
928 max_count = box->pmu->type->num_counters;
929 if (box->pmu->type->fixed_ctl)
930 max_count++;
931
932 if (box->n_events >= max_count)
933 return -EINVAL;
934
935 n = box->n_events;
936 box->event_list[n] = leader;
937 n++;
938 if (!dogrp)
939 return n;
940
941 list_for_each_entry(event, &leader->sibling_list, group_entry) {
942 if (event->state <= PERF_EVENT_STATE_OFF)
943 continue;
944
945 if (n >= max_count)
946 return -EINVAL;
947
948 box->event_list[n] = event;
949 n++;
950 }
951 return n;
952}
953
954static struct event_constraint *
955uncore_get_event_constraint(struct intel_uncore_box *box,
956 struct perf_event *event)
957{
958 struct intel_uncore_type *type = box->pmu->type;
959 struct event_constraint *c;
960
961 if (type->ops->get_constraint) {
962 c = type->ops->get_constraint(box, event);
963 if (c)
964 return c;
965 }
966
967 if (event->hw.config == ~0ULL)
968 return &constraint_fixed;
969
970 if (type->constraints) {
971 for_each_event_constraint(c, type->constraints) {
972 if ((event->hw.config & c->cmask) == c->code)
973 return c;
974 }
975 }
976
977 return &type->unconstrainted;
978}
979
980static void uncore_put_event_constraint(struct intel_uncore_box *box,
981 struct perf_event *event)
982{
983 if (box->pmu->type->ops->put_constraint)
984 box->pmu->type->ops->put_constraint(box, event);
985}
986
987static int uncore_assign_events(struct intel_uncore_box *box,
988 int assign[], int n)
989{
990 unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
991 struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX];
992 int i, wmin, wmax, ret = 0;
993 struct hw_perf_event *hwc;
994
995 bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
996
997 for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
998 c = uncore_get_event_constraint(box, box->event_list[i]);
999 constraints[i] = c;
1000 wmin = min(wmin, c->weight);
1001 wmax = max(wmax, c->weight);
1002 }
1003
1004 /* fastpath, try to reuse previous register */
1005 for (i = 0; i < n; i++) {
1006 hwc = &box->event_list[i]->hw;
1007 c = constraints[i];
1008
1009 /* never assigned */
1010 if (hwc->idx == -1)
1011 break;
1012
1013 /* constraint still honored */
1014 if (!test_bit(hwc->idx, c->idxmsk))
1015 break;
1016
1017 /* not already used */
1018 if (test_bit(hwc->idx, used_mask))
1019 break;
1020
1021 __set_bit(hwc->idx, used_mask);
1022 if (assign)
1023 assign[i] = hwc->idx;
1024 }
1025 /* slow path */
1026 if (i != n)
1027 ret = perf_assign_events(constraints, n, wmin, wmax, assign);
1028
1029 if (!assign || ret) {
1030 for (i = 0; i < n; i++)
1031 uncore_put_event_constraint(box, box->event_list[i]);
1032 }
1033 return ret ? -EINVAL : 0;
1034}
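
The slow path delegates to perf_assign_events(), whose body is outside this patch; the wmin/wmax bookkeeping above suggests a weight-ordered greedy search. An editorial, self-contained sketch of that idea (not the kernel implementation): place the most-constrained events first, giving each the lowest free counter its constraint mask allows.

	#include <stdbool.h>
	#include <stdint.h>

	#define NCTR 8	/* hypothetical number of counters */

	static bool assign_sketch(const uint32_t idxmsk[], const int weight[],
				  int n, int out[])
	{
		uint32_t used = 0;

		/* weight == number of usable counters; tightest constraints first */
		for (int w = 1; w <= NCTR; w++) {
			for (int i = 0; i < n; i++) {
				if (weight[i] != w)
					continue;

				uint32_t free = idxmsk[i] & ~used;
				if (!free)
					return false;	/* cannot schedule the group */

				int idx = __builtin_ctz(free);	/* lowest allowed counter */
				used |= 1u << idx;
				out[i] = idx;
			}
		}
		return true;
	}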
1035
1036static void uncore_pmu_event_start(struct perf_event *event, int flags)
1037{
1038 struct intel_uncore_box *box = uncore_event_to_box(event);
1039 int idx = event->hw.idx;
1040
1041 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1042 return;
1043
1044 if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
1045 return;
1046
1047 event->hw.state = 0;
1048 box->events[idx] = event;
1049 box->n_active++;
1050 __set_bit(idx, box->active_mask);
1051
1052 local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
1053 uncore_enable_event(box, event);
1054
1055 if (box->n_active == 1) {
1056 uncore_enable_box(box);
1057 uncore_pmu_start_hrtimer(box);
1058 }
1059}
1060
1061static void uncore_pmu_event_stop(struct perf_event *event, int flags)
1062{
1063 struct intel_uncore_box *box = uncore_event_to_box(event);
1064 struct hw_perf_event *hwc = &event->hw;
1065
1066 if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
1067 uncore_disable_event(box, event);
1068 box->n_active--;
1069 box->events[hwc->idx] = NULL;
1070 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1071 hwc->state |= PERF_HES_STOPPED;
1072
1073 if (box->n_active == 0) {
1074 uncore_disable_box(box);
1075 uncore_pmu_cancel_hrtimer(box);
1076 }
1077 }
1078
1079 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1080 /*
1081 * Drain the remaining delta count out of an event
1082 * that we are disabling:
1083 */
1084 uncore_perf_event_update(box, event);
1085 hwc->state |= PERF_HES_UPTODATE;
1086 }
1087}
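
A note on the hw.state flags threaded through start/stop, as this writer reads the perf core contract (worth keeping in mind for the ->add() path below):

	/*
	 * PERF_HES_STOPPED  - the hardware counter is not counting
	 * PERF_HES_UPTODATE - event->count already reflects the hardware value
	 * PERF_HES_ARCH     - arch-private; used here to mean "do not restart
	 *                     this event automatically when rescheduling"
	 *
	 * ->stop(PERF_EF_UPDATE) must leave both STOPPED and UPTODATE set,
	 * which is exactly what the function above does.
	 */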
1088
1089static int uncore_pmu_event_add(struct perf_event *event, int flags)
1090{
1091 struct intel_uncore_box *box = uncore_event_to_box(event);
1092 struct hw_perf_event *hwc = &event->hw;
1093 int assign[UNCORE_PMC_IDX_MAX];
1094 int i, n, ret;
1095
1096 if (!box)
1097 return -ENODEV;
1098
1099 ret = n = uncore_collect_events(box, event, false);
1100 if (ret < 0)
1101 return ret;
1102
1103 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1104 if (!(flags & PERF_EF_START))
1105 hwc->state |= PERF_HES_ARCH;
1106
1107 ret = uncore_assign_events(box, assign, n);
1108 if (ret)
1109 return ret;
1110
1111 /* save events moving to new counters */
1112 for (i = 0; i < box->n_events; i++) {
1113 event = box->event_list[i];
1114 hwc = &event->hw;
1115
1116 if (hwc->idx == assign[i] &&
1117 hwc->last_tag == box->tags[assign[i]])
1118 continue;
1119 /*
1120 * Ensure we don't accidentally enable a stopped
1121 * counter simply because we rescheduled.
1122 */
1123 if (hwc->state & PERF_HES_STOPPED)
1124 hwc->state |= PERF_HES_ARCH;
1125
1126 uncore_pmu_event_stop(event, PERF_EF_UPDATE);
1127 }
1128
1129 /* reprogram moved events into new counters */
1130 for (i = 0; i < n; i++) {
1131 event = box->event_list[i];
1132 hwc = &event->hw;
1133
1134 if (hwc->idx != assign[i] ||
1135 hwc->last_tag != box->tags[assign[i]])
1136 uncore_assign_hw_event(box, event, assign[i]);
1137 else if (i < box->n_events)
1138 continue;
1139
1140 if (hwc->state & PERF_HES_ARCH)
1141 continue;
1142
1143 uncore_pmu_event_start(event, 0);
1144 }
1145 box->n_events = n;
1146
1147 return 0;
1148}
1149
1150static void uncore_pmu_event_del(struct perf_event *event, int flags)
1151{
1152 struct intel_uncore_box *box = uncore_event_to_box(event);
1153 int i;
1154
1155 uncore_pmu_event_stop(event, PERF_EF_UPDATE);
1156
1157 for (i = 0; i < box->n_events; i++) {
1158 if (event == box->event_list[i]) {
1159 uncore_put_event_constraint(box, event);
1160
1161 while (++i < box->n_events)
1162 box->event_list[i - 1] = box->event_list[i];
1163
1164 --box->n_events;
1165 break;
1166 }
1167 }
1168
1169 event->hw.idx = -1;
1170 event->hw.last_tag = ~0ULL;
1171}
1172
1173static void uncore_pmu_event_read(struct perf_event *event)
1174{
1175 struct intel_uncore_box *box = uncore_event_to_box(event);
1176 uncore_perf_event_update(box, event);
1177}
1178
1179/*
1180 * validation ensures the group can be loaded onto the
1181 * PMU if it were the only group available.
1182 */
1183static int uncore_validate_group(struct intel_uncore_pmu *pmu,
1184 struct perf_event *event)
1185{
1186 struct perf_event *leader = event->group_leader;
1187 struct intel_uncore_box *fake_box;
1188 int ret = -EINVAL, n;
1189
1190 fake_box = uncore_alloc_box(pmu->type, smp_processor_id());
1191 if (!fake_box)
1192 return -ENOMEM;
1193
1194 fake_box->pmu = pmu;
1195 /*
1196 * The event is not yet connected with its
1197 * siblings; therefore we must first collect the
1198 * existing siblings, then add the new event
1199 * before we can simulate the scheduling.
1200 */
1201 n = uncore_collect_events(fake_box, leader, true);
1202 if (n < 0)
1203 goto out;
1204
1205 fake_box->n_events = n;
1206 n = uncore_collect_events(fake_box, event, false);
1207 if (n < 0)
1208 goto out;
1209
1210 fake_box->n_events = n;
1211
1212 ret = uncore_assign_events(fake_box, NULL, n);
1213out:
1214 kfree(fake_box);
1215 return ret;
1216}
1217
1218int uncore_pmu_event_init(struct perf_event *event)
1219{
1220 struct intel_uncore_pmu *pmu;
1221 struct intel_uncore_box *box;
1222 struct hw_perf_event *hwc = &event->hw;
1223 int ret;
1224
1225 if (event->attr.type != event->pmu->type)
1226 return -ENOENT;
1227
1228 pmu = uncore_event_to_pmu(event);
1229 /* no device found for this pmu */
1230 if (pmu->func_id < 0)
1231 return -ENOENT;
1232
1233 /*
1234 * The uncore PMU always measures at all privilege levels.
1235 * So it doesn't make sense to specify any exclude bits.
1236 */
1237 if (event->attr.exclude_user || event->attr.exclude_kernel ||
1238 event->attr.exclude_hv || event->attr.exclude_idle)
1239 return -EINVAL;
1240
1241 /* Sampling not supported yet */
1242 if (hwc->sample_period)
1243 return -EINVAL;
1244
1245 /*
1246 * Place all uncore events for a particular physical package
1247 * onto a single cpu
1248 */
1249 if (event->cpu < 0)
1250 return -EINVAL;
1251 box = uncore_pmu_to_box(pmu, event->cpu);
1252 if (!box || box->cpu < 0)
1253 return -EINVAL;
1254 event->cpu = box->cpu;
1255
1256 event->hw.idx = -1;
1257 event->hw.last_tag = ~0ULL;
1258 event->hw.extra_reg.idx = EXTRA_REG_NONE;
1259
1260 if (event->attr.config == UNCORE_FIXED_EVENT) {
1261 /* no fixed counter */
1262 if (!pmu->type->fixed_ctl)
1263 return -EINVAL;
1264 /*
1265 * if there is only one fixed counter, only the first pmu
1266 * can access the fixed counter
1267 */
1268 if (pmu->type->single_fixed && pmu->pmu_idx > 0)
1269 return -EINVAL;
1270 hwc->config = ~0ULL;
1271 } else {
1272 hwc->config = event->attr.config & pmu->type->event_mask;
1273 if (pmu->type->ops->hw_config) {
1274 ret = pmu->type->ops->hw_config(box, event);
1275 if (ret)
1276 return ret;
1277 }
1278 }
1279
1280 if (event->group_leader != event)
1281 ret = uncore_validate_group(pmu, event);
1282 else
1283 ret = 0;
1284
1285 return ret;
1286}
1287
1288static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
1289{
1290 int ret;
1291
1292 pmu->pmu = (struct pmu) {
1293 .attr_groups = pmu->type->attr_groups,
1294 .task_ctx_nr = perf_invalid_context,
1295 .event_init = uncore_pmu_event_init,
1296 .add = uncore_pmu_event_add,
1297 .del = uncore_pmu_event_del,
1298 .start = uncore_pmu_event_start,
1299 .stop = uncore_pmu_event_stop,
1300 .read = uncore_pmu_event_read,
1301 };
1302
1303 if (pmu->type->num_boxes == 1) {
1304 if (strlen(pmu->type->name) > 0)
1305 sprintf(pmu->name, "uncore_%s", pmu->type->name);
1306 else
1307 sprintf(pmu->name, "uncore");
1308 } else {
1309 sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
1310 pmu->pmu_idx);
1311 }
1312
1313 ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
1314 return ret;
1315}
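
Once perf_pmu_register() succeeds, the PMU is visible under /sys/bus/event_source/devices/<name>. A hedged user-space sketch of counting a raw uncore event; the type number and config value below are illustrative and must be read from sysfs on a real system:

	#include <linux/perf_event.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		struct perf_event_attr attr;
		uint64_t count;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = 6;		/* hypothetical: read the real value from
					 * /sys/bus/event_source/devices/uncore_.../type */
		attr.config = 0xff;	/* hypothetical raw event selector */

		/* the uncore context is per package: pid must be -1, cpu >= 0 */
		fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
		if (fd < 0) {
			perror("perf_event_open");
			return 1;
		}

		sleep(1);
		if (read(fd, &count, sizeof(count)) == sizeof(count))
			printf("count: %llu\n", (unsigned long long)count);
		return 0;
	}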
1316
1317static void __init uncore_type_exit(struct intel_uncore_type *type)
1318{
1319 int i;
1320
1321 for (i = 0; i < type->num_boxes; i++)
1322 free_percpu(type->pmus[i].box);
1323 kfree(type->pmus);
1324 type->pmus = NULL;
1325 kfree(type->attr_groups[1]);
1326 type->attr_groups[1] = NULL;
1327}
1328
1329static void uncore_types_exit(struct intel_uncore_type **types)
1330{
1331 int i;
1332 for (i = 0; types[i]; i++)
1333 uncore_type_exit(types[i]);
1334}
1335
1336static int __init uncore_type_init(struct intel_uncore_type *type)
1337{
1338 struct intel_uncore_pmu *pmus;
1339 struct attribute_group *events_group;
1340 struct attribute **attrs;
1341 int i, j;
1342
1343 pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
1344 if (!pmus)
1345 return -ENOMEM;
1346
1347 type->unconstrainted = (struct event_constraint)
1348 __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
1349 0, type->num_counters, 0);
1350
1351 for (i = 0; i < type->num_boxes; i++) {
1352 pmus[i].func_id = -1;
1353 pmus[i].pmu_idx = i;
1354 pmus[i].type = type;
1355 INIT_LIST_HEAD(&pmus[i].box_list);
1356 pmus[i].box = alloc_percpu(struct intel_uncore_box *);
1357 if (!pmus[i].box)
1358 goto fail;
1359 }
1360
1361 if (type->event_descs) {
1362 i = 0;
1363 while (type->event_descs[i].attr.attr.name)
1364 i++;
1365
1366 events_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
1367 sizeof(*events_group), GFP_KERNEL);
1368 if (!events_group)
1369 goto fail;
1370
1371 attrs = (struct attribute **)(events_group + 1);
1372 events_group->name = "events";
1373 events_group->attrs = attrs;
1374
1375 for (j = 0; j < i; j++)
1376 attrs[j] = &type->event_descs[j].attr.attr;
1377
1378 type->attr_groups[1] = events_group;
1379 }
1380
1381 type->pmus = pmus;
1382 return 0;
1383fail:
1384 uncore_type_exit(type);
1385 return -ENOMEM;
1386}
1387
1388static int __init uncore_types_init(struct intel_uncore_type **types)
1389{
1390 int i, ret;
1391
1392 for (i = 0; types[i]; i++) {
1393 ret = uncore_type_init(types[i]);
1394 if (ret)
1395 goto fail;
1396 }
1397 return 0;
1398fail:
1399 while (--i >= 0)
1400 uncore_type_exit(types[i]);
1401 return ret;
1402}
1403
1404static struct pci_driver *uncore_pci_driver;
1405static bool pcidrv_registered;
1406
1407/*
1408 * add a pci uncore device
1409 */
1410static int __devinit uncore_pci_add(struct intel_uncore_type *type,
1411 struct pci_dev *pdev)
1412{
1413 struct intel_uncore_pmu *pmu;
1414 struct intel_uncore_box *box;
1415 int i, phys_id;
1416
1417 phys_id = pcibus_to_physid[pdev->bus->number];
1418 if (phys_id < 0)
1419 return -ENODEV;
1420
1421 box = uncore_alloc_box(type, 0);
1422 if (!box)
1423 return -ENOMEM;
1424
1425 /*
1426 * For performance monitoring units with multiple boxes,
1427 * each box has a different function ID.
1428 */
1429 for (i = 0; i < type->num_boxes; i++) {
1430 pmu = &type->pmus[i];
1431 if (pmu->func_id == pdev->devfn)
1432 break;
1433 if (pmu->func_id < 0) {
1434 pmu->func_id = pdev->devfn;
1435 break;
1436 }
1437 pmu = NULL;
1438 }
1439
1440 if (!pmu) {
1441 kfree(box);
1442 return -EINVAL;
1443 }
1444
1445 box->phys_id = phys_id;
1446 box->pci_dev = pdev;
1447 box->pmu = pmu;
1448 uncore_box_init(box);
1449 pci_set_drvdata(pdev, box);
1450
1451 raw_spin_lock(&uncore_box_lock);
1452 list_add_tail(&box->list, &pmu->box_list);
1453 raw_spin_unlock(&uncore_box_lock);
1454
1455 return 0;
1456}
1457
1458static void uncore_pci_remove(struct pci_dev *pdev)
1459{
1460 struct intel_uncore_box *box = pci_get_drvdata(pdev);
1461 struct intel_uncore_pmu *pmu = box->pmu;
1462 int cpu, phys_id = pcibus_to_physid[pdev->bus->number];
1463
1464 if (WARN_ON_ONCE(phys_id != box->phys_id))
1465 return;
1466
1467 raw_spin_lock(&uncore_box_lock);
1468 list_del(&box->list);
1469 raw_spin_unlock(&uncore_box_lock);
1470
1471 for_each_possible_cpu(cpu) {
1472 if (*per_cpu_ptr(pmu->box, cpu) == box) {
1473 *per_cpu_ptr(pmu->box, cpu) = NULL;
1474 atomic_dec(&box->refcnt);
1475 }
1476 }
1477
1478 WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
1479 kfree(box);
1480}
1481
1482static int __devinit uncore_pci_probe(struct pci_dev *pdev,
1483 const struct pci_device_id *id)
1484{
1485 struct intel_uncore_type *type;
1486
1487 type = (struct intel_uncore_type *)id->driver_data;
1488 return uncore_pci_add(type, pdev);
1489}
1490
1491static int __init uncore_pci_init(void)
1492{
1493 int ret;
1494
1495 switch (boot_cpu_data.x86_model) {
1496 case 45: /* Sandy Bridge-EP */
1497 pci_uncores = snbep_pci_uncores;
1498 uncore_pci_driver = &snbep_uncore_pci_driver;
1499 snbep_pci2phy_map_init();
1500 break;
1501 default:
1502 return 0;
1503 }
1504
1505 ret = uncore_types_init(pci_uncores);
1506 if (ret)
1507 return ret;
1508
1509 uncore_pci_driver->probe = uncore_pci_probe;
1510 uncore_pci_driver->remove = uncore_pci_remove;
1511
1512 ret = pci_register_driver(uncore_pci_driver);
1513 if (ret == 0)
1514 pcidrv_registered = true;
1515 else
1516 uncore_types_exit(pci_uncores);
1517
1518 return ret;
1519}
1520
1521static void __init uncore_pci_exit(void)
1522{
1523 if (pcidrv_registered) {
1524 pcidrv_registered = false;
1525 pci_unregister_driver(uncore_pci_driver);
1526 uncore_types_exit(pci_uncores);
1527 }
1528}
1529
1530static void __cpuinit uncore_cpu_dying(int cpu)
1531{
1532 struct intel_uncore_type *type;
1533 struct intel_uncore_pmu *pmu;
1534 struct intel_uncore_box *box;
1535 int i, j;
1536
1537 for (i = 0; msr_uncores[i]; i++) {
1538 type = msr_uncores[i];
1539 for (j = 0; j < type->num_boxes; j++) {
1540 pmu = &type->pmus[j];
1541 box = *per_cpu_ptr(pmu->box, cpu);
1542 *per_cpu_ptr(pmu->box, cpu) = NULL;
1543 if (box && atomic_dec_and_test(&box->refcnt))
1544 kfree(box);
1545 }
1546 }
1547}
1548
1549static int __cpuinit uncore_cpu_starting(int cpu)
1550{
1551 struct intel_uncore_type *type;
1552 struct intel_uncore_pmu *pmu;
1553 struct intel_uncore_box *box, *exist;
1554 int i, j, k, phys_id;
1555
1556 phys_id = topology_physical_package_id(cpu);
1557
1558 for (i = 0; msr_uncores[i]; i++) {
1559 type = msr_uncores[i];
1560 for (j = 0; j < type->num_boxes; j++) {
1561 pmu = &type->pmus[j];
1562 box = *per_cpu_ptr(pmu->box, cpu);
1563 /* box already set up by the uncore_cpu_init() path? */
1564 if (box && box->phys_id >= 0) {
1565 uncore_box_init(box);
1566 continue;
1567 }
1568
1569 for_each_online_cpu(k) {
1570 exist = *per_cpu_ptr(pmu->box, k);
1571 if (exist && exist->phys_id == phys_id) {
1572 atomic_inc(&exist->refcnt);
1573 *per_cpu_ptr(pmu->box, cpu) = exist;
1574 kfree(box);
1575 box = NULL;
1576 break;
1577 }
1578 }
1579
1580 if (box) {
1581 box->phys_id = phys_id;
1582 uncore_box_init(box);
1583 }
1584 }
1585 }
1586 return 0;
1587}
1588
1589static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id)
1590{
1591 struct intel_uncore_type *type;
1592 struct intel_uncore_pmu *pmu;
1593 struct intel_uncore_box *box;
1594 int i, j;
1595
1596 for (i = 0; msr_uncores[i]; i++) {
1597 type = msr_uncores[i];
1598 for (j = 0; j < type->num_boxes; j++) {
1599 pmu = &type->pmus[j];
1600 if (pmu->func_id < 0)
1601 pmu->func_id = j;
1602
1603 box = uncore_alloc_box(type, cpu);
1604 if (!box)
1605 return -ENOMEM;
1606
1607 box->pmu = pmu;
1608 box->phys_id = phys_id;
1609 *per_cpu_ptr(pmu->box, cpu) = box;
1610 }
1611 }
1612 return 0;
1613}
1614
1615static void __cpuinit uncore_change_context(struct intel_uncore_type **uncores,
1616 int old_cpu, int new_cpu)
1617{
1618 struct intel_uncore_type *type;
1619 struct intel_uncore_pmu *pmu;
1620 struct intel_uncore_box *box;
1621 int i, j;
1622
1623 for (i = 0; uncores[i]; i++) {
1624 type = uncores[i];
1625 for (j = 0; j < type->num_boxes; j++) {
1626 pmu = &type->pmus[j];
1627 if (old_cpu < 0)
1628 box = uncore_pmu_to_box(pmu, new_cpu);
1629 else
1630 box = uncore_pmu_to_box(pmu, old_cpu);
1631 if (!box)
1632 continue;
1633
1634 if (old_cpu < 0) {
1635 WARN_ON_ONCE(box->cpu != -1);
1636 box->cpu = new_cpu;
1637 continue;
1638 }
1639
1640 WARN_ON_ONCE(box->cpu != old_cpu);
1641 if (new_cpu >= 0) {
1642 uncore_pmu_cancel_hrtimer(box);
1643 perf_pmu_migrate_context(&pmu->pmu,
1644 old_cpu, new_cpu);
1645 box->cpu = new_cpu;
1646 } else {
1647 box->cpu = -1;
1648 }
1649 }
1650 }
1651}
1652
1653static void __cpuinit uncore_event_exit_cpu(int cpu)
1654{
1655 int i, phys_id, target;
1656
1657 /* bail unless the exiting cpu was collecting uncore events */
1658 if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
1659 return;
1660
1661 /* find a new cpu to collect uncore events */
1662 phys_id = topology_physical_package_id(cpu);
1663 target = -1;
1664 for_each_online_cpu(i) {
1665 if (i == cpu)
1666 continue;
1667 if (phys_id == topology_physical_package_id(i)) {
1668 target = i;
1669 break;
1670 }
1671 }
1672
1673 /* migrate uncore events to the new cpu */
1674 if (target >= 0)
1675 cpumask_set_cpu(target, &uncore_cpu_mask);
1676
1677 uncore_change_context(msr_uncores, cpu, target);
1678 uncore_change_context(pci_uncores, cpu, target);
1679}
1680
1681static void __cpuinit uncore_event_init_cpu(int cpu)
1682{
1683 int i, phys_id;
1684
1685 phys_id = topology_physical_package_id(cpu);
1686 for_each_cpu(i, &uncore_cpu_mask) {
1687 if (phys_id == topology_physical_package_id(i))
1688 return;
1689 }
1690
1691 cpumask_set_cpu(cpu, &uncore_cpu_mask);
1692
1693 uncore_change_context(msr_uncores, -1, cpu);
1694 uncore_change_context(pci_uncores, -1, cpu);
1695}
1696
1697static int __cpuinit uncore_cpu_notifier(struct notifier_block *self,
1698 unsigned long action, void *hcpu)
1699{
1700 unsigned int cpu = (long)hcpu;
1701
1702 /* allocate/free data structure for uncore box */
1703 switch (action & ~CPU_TASKS_FROZEN) {
1704 case CPU_UP_PREPARE:
1705 uncore_cpu_prepare(cpu, -1);
1706 break;
1707 case CPU_STARTING:
1708 uncore_cpu_starting(cpu);
1709 break;
1710 case CPU_UP_CANCELED:
1711 case CPU_DYING:
1712 uncore_cpu_dying(cpu);
1713 break;
1714 default:
1715 break;
1716 }
1717
1718 /* select the cpu that collects uncore events */
1719 switch (action & ~CPU_TASKS_FROZEN) {
1720 case CPU_DOWN_FAILED:
1721 case CPU_STARTING:
1722 uncore_event_init_cpu(cpu);
1723 break;
1724 case CPU_DOWN_PREPARE:
1725 uncore_event_exit_cpu(cpu);
1726 break;
1727 default:
1728 break;
1729 }
1730
1731 return NOTIFY_OK;
1732}
1733
1734static struct notifier_block uncore_cpu_nb __cpuinitdata = {
1735 .notifier_call = uncore_cpu_notifier,
1736 /*
1737 * to migrate uncore events, our notifier must run
1738 * before the perf core's notifier.
1739 */
1740 .priority = CPU_PRI_PERF + 1,
1741};
1742
1743static void __init uncore_cpu_setup(void *dummy)
1744{
1745 uncore_cpu_starting(smp_processor_id());
1746}
1747
1748static int __init uncore_cpu_init(void)
1749{
1750 int ret, cpu, max_cores;
1751
1752 max_cores = boot_cpu_data.x86_max_cores;
1753 switch (boot_cpu_data.x86_model) {
1754 case 26: /* Nehalem */
1755 case 30:
1756 case 37: /* Westmere */
1757 case 44:
1758 msr_uncores = nhm_msr_uncores;
1759 break;
1760 case 42: /* Sandy Bridge */
1761 if (snb_uncore_cbox.num_boxes > max_cores)
1762 snb_uncore_cbox.num_boxes = max_cores;
1763 msr_uncores = snb_msr_uncores;
1764 break;
1765 case 45: /* Sandy Bridge-EP */
1766 if (snbep_uncore_cbox.num_boxes > max_cores)
1767 snbep_uncore_cbox.num_boxes = max_cores;
1768 msr_uncores = snbep_msr_uncores;
1769 break;
1770 default:
1771 return 0;
1772 }
1773
1774 ret = uncore_types_init(msr_uncores);
1775 if (ret)
1776 return ret;
1777
1778 get_online_cpus();
1779
1780 for_each_online_cpu(cpu) {
1781 int i, phys_id = topology_physical_package_id(cpu);
1782
1783 for_each_cpu(i, &uncore_cpu_mask) {
1784 if (phys_id == topology_physical_package_id(i)) {
1785 phys_id = -1;
1786 break;
1787 }
1788 }
1789 if (phys_id < 0)
1790 continue;
1791
1792 uncore_cpu_prepare(cpu, phys_id);
1793 uncore_event_init_cpu(cpu);
1794 }
1795 on_each_cpu(uncore_cpu_setup, NULL, 1);
1796
1797 register_cpu_notifier(&uncore_cpu_nb);
1798
1799 put_online_cpus();
1800
1801 return 0;
1802}
1803
1804static int __init uncore_pmus_register(void)
1805{
1806 struct intel_uncore_pmu *pmu;
1807 struct intel_uncore_type *type;
1808 int i, j;
1809
1810 for (i = 0; msr_uncores[i]; i++) {
1811 type = msr_uncores[i];
1812 for (j = 0; j < type->num_boxes; j++) {
1813 pmu = &type->pmus[j];
1814 uncore_pmu_register(pmu);
1815 }
1816 }
1817
1818 for (i = 0; pci_uncores[i]; i++) {
1819 type = pci_uncores[i];
1820 for (j = 0; j < type->num_boxes; j++) {
1821 pmu = &type->pmus[j];
1822 uncore_pmu_register(pmu);
1823 }
1824 }
1825
1826 return 0;
1827}
1828
1829static int __init intel_uncore_init(void)
1830{
1831 int ret;
1832
1833 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
1834 return -ENODEV;
1835
1836 ret = uncore_pci_init();
1837 if (ret)
1838 goto fail;
1839 ret = uncore_cpu_init();
1840 if (ret) {
1841 uncore_pci_exit();
1842 goto fail;
1843 }
1844
1845 uncore_pmus_register();
1846 return 0;
1847fail:
1848 return ret;
1849}
1850device_initcall(intel_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
new file mode 100644
index 00000000000..b13e9ea81de
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -0,0 +1,424 @@
1#include <linux/module.h>
2#include <linux/slab.h>
3#include <linux/pci.h>
4#include <linux/perf_event.h>
5#include "perf_event.h"
6
7#define UNCORE_PMU_NAME_LEN 32
8#define UNCORE_BOX_HASH_SIZE 8
9
10#define UNCORE_PMU_HRTIMER_INTERVAL (60 * NSEC_PER_SEC)
11
12#define UNCORE_FIXED_EVENT 0xff
13#define UNCORE_PMC_IDX_MAX_GENERIC 8
14#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
15#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1)
16
17#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
18
19/* SNB event control */
20#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
21#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
22#define SNB_UNC_CTL_EDGE_DET (1 << 18)
23#define SNB_UNC_CTL_EN (1 << 22)
24#define SNB_UNC_CTL_INVERT (1 << 23)
25#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
26#define NHM_UNC_CTL_CMASK_MASK 0xff000000
27#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
28
29#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
30 SNB_UNC_CTL_UMASK_MASK | \
31 SNB_UNC_CTL_EDGE_DET | \
32 SNB_UNC_CTL_INVERT | \
33 SNB_UNC_CTL_CMASK_MASK)
34
35#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
36 SNB_UNC_CTL_UMASK_MASK | \
37 SNB_UNC_CTL_EDGE_DET | \
38 SNB_UNC_CTL_INVERT | \
39 NHM_UNC_CTL_CMASK_MASK)
40
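
To make the field layout concrete, here is how a raw config value decomposes under these masks (editorial example with made-up event numbers):

	/*
	 * Example: event select 0x22, umask 0x01, edge detect on:
	 *
	 *	config = 0x22 | (0x01 << 8) | SNB_UNC_CTL_EDGE_DET;
	 *
	 * SNB_UNC_RAW_EVENT_MASK keeps exactly the user-controllable bits,
	 * so hw.config = attr.config & event_mask strips everything else
	 * (the enable bit, for instance, is managed by the driver).
	 */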
41/* SNB global control register */
42#define SNB_UNC_PERF_GLOBAL_CTL 0x391
43#define SNB_UNC_FIXED_CTR_CTRL 0x394
44#define SNB_UNC_FIXED_CTR 0x395
45
46/* SNB uncore global control */
47#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
48#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
49
50/* SNB Cbo register */
51#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
52#define SNB_UNC_CBO_0_PER_CTR0 0x706
53#define SNB_UNC_CBO_MSR_OFFSET 0x10
54
55/* NHM global control register */
56#define NHM_UNC_PERF_GLOBAL_CTL 0x391
57#define NHM_UNC_FIXED_CTR 0x394
58#define NHM_UNC_FIXED_CTR_CTRL 0x395
59
60/* NHM uncore global control */
61#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
62#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
63
64/* NHM uncore register */
65#define NHM_UNC_PERFEVTSEL0 0x3c0
66#define NHM_UNC_UNCORE_PMC0 0x3b0
67
68/* SNB-EP Box level control */
69#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
70#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
71#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8)
72#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16)
73#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
74 SNBEP_PMON_BOX_CTL_RST_CTRS | \
75 SNBEP_PMON_BOX_CTL_FRZ_EN)
76/* SNB-EP event control */
77#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff
78#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00
79#define SNBEP_PMON_CTL_RST (1 << 17)
80#define SNBEP_PMON_CTL_EDGE_DET (1 << 18)
81#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) /* only for QPI */
82#define SNBEP_PMON_CTL_EN (1 << 22)
83#define SNBEP_PMON_CTL_INVERT (1 << 23)
84#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000
85#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
86 SNBEP_PMON_CTL_UMASK_MASK | \
87 SNBEP_PMON_CTL_EDGE_DET | \
88 SNBEP_PMON_CTL_INVERT | \
89 SNBEP_PMON_CTL_TRESH_MASK)
90
91/* SNB-EP Ubox event control */
92#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000
93#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \
94 (SNBEP_PMON_CTL_EV_SEL_MASK | \
95 SNBEP_PMON_CTL_UMASK_MASK | \
96 SNBEP_PMON_CTL_EDGE_DET | \
97 SNBEP_PMON_CTL_INVERT | \
98 SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
99
100#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19)
101#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
102 SNBEP_CBO_PMON_CTL_TID_EN)
103
104/* SNB-EP PCU event control */
105#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000
106#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000
107#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30)
108#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31)
109#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
110 (SNBEP_PMON_CTL_EV_SEL_MASK | \
111 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
112 SNBEP_PMON_CTL_EDGE_DET | \
113 SNBEP_PMON_CTL_INVERT | \
114 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
115 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
116 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
117
118/* SNB-EP pci control register */
119#define SNBEP_PCI_PMON_BOX_CTL 0xf4
120#define SNBEP_PCI_PMON_CTL0 0xd8
121/* SNB-EP pci counter register */
122#define SNBEP_PCI_PMON_CTR0 0xa0
123
124/* SNB-EP home agent register */
125#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40
126#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44
127#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48
128/* SNB-EP memory controller register */
129#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0
130#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0
131/* SNB-EP QPI register */
132#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228
133#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c
134#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238
135#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c
136
137/* SNB-EP Ubox register */
138#define SNBEP_U_MSR_PMON_CTR0 0xc16
139#define SNBEP_U_MSR_PMON_CTL0 0xc10
140
141#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08
142#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09
143
144/* SNB-EP Cbo register */
145#define SNBEP_C0_MSR_PMON_CTR0 0xd16
146#define SNBEP_C0_MSR_PMON_CTL0 0xd10
147#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04
148#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14
149#define SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK 0xfffffc1f
150#define SNBEP_CBO_MSR_OFFSET 0x20
151
152/* SNB-EP PCU register */
153#define SNBEP_PCU_MSR_PMON_CTR0 0xc36
154#define SNBEP_PCU_MSR_PMON_CTL0 0xc30
155#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24
156#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34
157#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff
158#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc
159#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd
160
161struct intel_uncore_ops;
162struct intel_uncore_pmu;
163struct intel_uncore_box;
164struct uncore_event_desc;
165
166struct intel_uncore_type {
167 const char *name;
168 int num_counters;
169 int num_boxes;
170 int perf_ctr_bits;
171 int fixed_ctr_bits;
172 unsigned perf_ctr;
173 unsigned event_ctl;
174 unsigned event_mask;
175 unsigned fixed_ctr;
176 unsigned fixed_ctl;
177 unsigned box_ctl;
178 unsigned msr_offset;
179 unsigned num_shared_regs:8;
180 unsigned single_fixed:1;
181 struct event_constraint unconstrainted;
182 struct event_constraint *constraints;
183 struct intel_uncore_pmu *pmus;
184 struct intel_uncore_ops *ops;
185 struct uncore_event_desc *event_descs;
186 const struct attribute_group *attr_groups[3];
187};
188
189#define format_group attr_groups[0]
190
191struct intel_uncore_ops {
192 void (*init_box)(struct intel_uncore_box *);
193 void (*disable_box)(struct intel_uncore_box *);
194 void (*enable_box)(struct intel_uncore_box *);
195 void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
196 void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
197 u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
198 int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
199 struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
200 struct perf_event *);
201 void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
202};
203
204struct intel_uncore_pmu {
205 struct pmu pmu;
206 char name[UNCORE_PMU_NAME_LEN];
207 int pmu_idx;
208 int func_id;
209 struct intel_uncore_type *type;
210 struct intel_uncore_box ** __percpu box;
211 struct list_head box_list;
212};
213
214struct intel_uncore_extra_reg {
215 raw_spinlock_t lock;
216 u64 config1;
217 atomic_t ref;
218};
219
220struct intel_uncore_box {
221 int phys_id;
222 int n_active; /* number of active events */
223 int n_events;
224 int cpu; /* cpu to collect events */
225 unsigned long flags;
226 atomic_t refcnt;
227 struct perf_event *events[UNCORE_PMC_IDX_MAX];
228 struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
229 unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
230 u64 tags[UNCORE_PMC_IDX_MAX];
231 struct pci_dev *pci_dev;
232 struct intel_uncore_pmu *pmu;
233 struct hrtimer hrtimer;
234 struct list_head list;
235 struct intel_uncore_extra_reg shared_regs[0];
236};
237
238#define UNCORE_BOX_FLAG_INITIATED 0
239
240struct uncore_event_desc {
241 struct kobj_attribute attr;
242 const char *config;
243};
244
245#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
246{ \
247 .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
248 .config = _config, \
249}
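
A sketch of intended use (the event name and config string are illustrative): each table ends with an all-zero entry so uncore_type_init() can count it by walking attr.attr.name.

	static struct uncore_event_desc snb_uncore_events[] = {
		INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff"),
		{ /* end: all zeroes */ },
	};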
250
251#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
252static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
253 struct kobj_attribute *attr, \
254 char *page) \
255{ \
256 BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
257 return sprintf(page, _format "\n"); \
258} \
259static struct kobj_attribute format_attr_##_var = \
260 __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
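
And a sketch of how these format attributes are typically instantiated and grouped (names illustrative; the bit ranges mirror the SNB masks above):

	DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
	DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");

	static struct attribute *snb_uncore_formats_attr[] = {
		&format_attr_event.attr,
		&format_attr_umask.attr,
		NULL,
	};

	static struct attribute_group snb_uncore_format_group = {
		.name = "format",
		.attrs = snb_uncore_formats_attr,
	};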
261
262
263static ssize_t uncore_event_show(struct kobject *kobj,
264 struct kobj_attribute *attr, char *buf)
265{
266 struct uncore_event_desc *event =
267 container_of(attr, struct uncore_event_desc, attr);
268 return sprintf(buf, "%s", event->config);
269}
270
271static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
272{
273 return box->pmu->type->box_ctl;
274}
275
276static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
277{
278 return box->pmu->type->fixed_ctl;
279}
280
281static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
282{
283 return box->pmu->type->fixed_ctr;
284}
285
286static inline
287unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
288{
289 return idx * 4 + box->pmu->type->event_ctl;
290}
291
292static inline
293unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
294{
295 return idx * 8 + box->pmu->type->perf_ctr;
296}
297
298static inline
299unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
300{
301 if (!box->pmu->type->box_ctl)
302 return 0;
303 return box->pmu->type->box_ctl +
304 box->pmu->type->msr_offset * box->pmu->pmu_idx;
305}
306
307static inline
308unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
309{
310 if (!box->pmu->type->fixed_ctl)
311 return 0;
312 return box->pmu->type->fixed_ctl +
313 box->pmu->type->msr_offset * box->pmu->pmu_idx;
314}
315
316static inline
317unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
318{
319 return box->pmu->type->fixed_ctr +
320 box->pmu->type->msr_offset * box->pmu->pmu_idx;
321}
322
323static inline
324unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
325{
326 return idx + box->pmu->type->event_ctl +
327 box->pmu->type->msr_offset * box->pmu->pmu_idx;
328}
329
330static inline
331unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
332{
333 return idx + box->pmu->type->perf_ctr +
334 box->pmu->type->msr_offset * box->pmu->pmu_idx;
335}
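
A worked address computation using the SNB C-box constants above, to show how one register layout serves every box:

	/*
	 * For the third C-box (pmu_idx == 2), counter idx == 1:
	 *
	 *	uncore_msr_perf_ctr(box, 1)
	 *		= 1 + SNB_UNC_CBO_0_PER_CTR0 + SNB_UNC_CBO_MSR_OFFSET * 2
	 *		= 1 + 0x706 + 0x20
	 *		= 0x727
	 *
	 * i.e. each box's registers sit at a fixed stride from box 0.
	 */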
336
337static inline
338unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
339{
340 if (box->pci_dev)
341 return uncore_pci_fixed_ctl(box);
342 else
343 return uncore_msr_fixed_ctl(box);
344}
345
346static inline
347unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
348{
349 if (box->pci_dev)
350 return uncore_pci_fixed_ctr(box);
351 else
352 return uncore_msr_fixed_ctr(box);
353}
354
355static inline
356unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
357{
358 if (box->pci_dev)
359 return uncore_pci_event_ctl(box, idx);
360 else
361 return uncore_msr_event_ctl(box, idx);
362}
363
364static inline
365unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
366{
367 if (box->pci_dev)
368 return uncore_pci_perf_ctr(box, idx);
369 else
370 return uncore_msr_perf_ctr(box, idx);
371}
372
373static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
374{
375 return box->pmu->type->perf_ctr_bits;
376}
377
378static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
379{
380 return box->pmu->type->fixed_ctr_bits;
381}
382
383static inline int uncore_num_counters(struct intel_uncore_box *box)
384{
385 return box->pmu->type->num_counters;
386}
387
388static inline void uncore_disable_box(struct intel_uncore_box *box)
389{
390 if (box->pmu->type->ops->disable_box)
391 box->pmu->type->ops->disable_box(box);
392}
393
394static inline void uncore_enable_box(struct intel_uncore_box *box)
395{
396 if (box->pmu->type->ops->enable_box)
397 box->pmu->type->ops->enable_box(box);
398}
399
400static inline void uncore_disable_event(struct intel_uncore_box *box,
401 struct perf_event *event)
402{
403 box->pmu->type->ops->disable_event(box, event);
404}
405
406static inline void uncore_enable_event(struct intel_uncore_box *box,
407 struct perf_event *event)
408{
409 box->pmu->type->ops->enable_event(box, event);
410}
411
412static inline u64 uncore_read_counter(struct intel_uncore_box *box,
413 struct perf_event *event)
414{
415 return box->pmu->type->ops->read_counter(box, event);
416}
417
418static inline void uncore_box_init(struct intel_uncore_box *box)
419{
420 if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
421 if (box->pmu->type->ops->init_box)
422 box->pmu->type->ops->init_box(box);
423 }
424}
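
The test_and_set_bit() dance above is the standard init-once idiom; in isolation (editorial sketch):

	static void init_once(unsigned long *flags)
	{
		/*
		 * test_and_set_bit() atomically sets the bit and returns its
		 * old value, so exactly one caller sees 0 and runs the setup.
		 */
		if (!test_and_set_bit(0, flags)) {
			/* first caller: one-time initialization goes here */
		}
	}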
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 47124a73dd7..92c7e39a079 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -895,8 +895,8 @@ static void p4_pmu_disable_pebs(void)
 	 * So at moment let leave metrics turned on forever -- it's
 	 * ok for now but need to be revisited!
 	 *
-	 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
-	 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
+	 * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)0);
+	 * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
 	 */
 }
 
@@ -909,7 +909,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
 	 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
 	 * asserted again and again
 	 */
-	(void)checking_wrmsrl(hwc->config_base,
+	(void)wrmsrl_safe(hwc->config_base,
 		(u64)(p4_config_unpack_cccr(hwc->config)) &
 			~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
 }
@@ -943,8 +943,8 @@ static void p4_pmu_enable_pebs(u64 config)
 
 	bind = &p4_pebs_bind_map[idx];
 
-	(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
-	(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
+	(void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
+	(void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
 }
 
 static void p4_pmu_enable_event(struct perf_event *event)
@@ -978,8 +978,8 @@ static void p4_pmu_enable_event(struct perf_event *event)
 	 */
 	p4_pmu_enable_pebs(hwc->config);
 
-	(void)checking_wrmsrl(escr_addr, escr_conf);
-	(void)checking_wrmsrl(hwc->config_base,
+	(void)wrmsrl_safe(escr_addr, escr_conf);
+	(void)wrmsrl_safe(hwc->config_base,
 			(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
 }
 
@@ -1325,7 +1325,7 @@ __init int p4_pmu_init(void)
 	unsigned int low, high;
 
 	/* If we get stripped -- indexing fails */
-	BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
+	BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
 
 	rdmsr(MSR_IA32_MISC_ENABLE, low, high);
 	if (!(low & (1 << 7))) {
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 32bcfc7dd23..e4dd0f7a045 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -71,7 +71,7 @@ p6_pmu_disable_event(struct perf_event *event)
 	if (cpuc->enabled)
 		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
 
-	(void)checking_wrmsrl(hwc->config_base, val);
+	(void)wrmsrl_safe(hwc->config_base, val);
 }
 
 static void p6_pmu_enable_event(struct perf_event *event)
@@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
 	if (cpuc->enabled)
 		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
 
-	(void)checking_wrmsrl(hwc->config_base, val);
+	(void)wrmsrl_safe(hwc->config_base, val);
 }
 
 PMU_FORMAT_ATTR(event, "config:0-7" );
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index addf9e82a7f..ee8e9abc859 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,7 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	const struct cpuid_bit *cb;
 
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-		{ X86_FEATURE_DTS,	CR_EAX, 0, 0x00000006, 0 },
+		{ X86_FEATURE_DTHERM,	CR_EAX, 0, 0x00000006, 0 },
 		{ X86_FEATURE_IDA,	CR_EAX, 1, 0x00000006, 0 },
 		{ X86_FEATURE_ARAT,	CR_EAX, 2, 0x00000006, 0 },
 		{ X86_FEATURE_PLN,	CR_EAX, 4, 0x00000006, 0 },
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
deleted file mode 100644
index a640ae5ad20..00000000000
--- a/arch/x86/kernel/cpu/sched.c
+++ /dev/null
@@ -1,55 +0,0 @@
1#include <linux/sched.h>
2#include <linux/math64.h>
3#include <linux/percpu.h>
4#include <linux/irqflags.h>
5
6#include <asm/cpufeature.h>
7#include <asm/processor.h>
8
9#ifdef CONFIG_SMP
10
11static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
12
13static unsigned long scale_aperfmperf(void)
14{
15 struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
16 unsigned long ratio, flags;
17
18 local_irq_save(flags);
19 get_aperfmperf(&val);
20 local_irq_restore(flags);
21
22 ratio = calc_aperfmperf_ratio(old, &val);
23 *old = val;
24
25 return ratio;
26}
27
28unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
29{
30 /*
31 * do aperf/mperf on the cpu level because it includes things
32 * like turbo mode, which are relevant to full cores.
33 */
34 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
35 return scale_aperfmperf();
36
37 /*
38 * maybe have something cpufreq here
39 */
40
41 return default_scale_freq_power(sd, cpu);
42}
43
44unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
45{
46 /*
47 * aperf/mperf already includes the smt gain
48 */
49 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
50 return SCHED_LOAD_SCALE;
51
52 return default_scale_smt_power(sd, cpu);
53}
54
55#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 571246d81ed..ae42418bc50 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,8 +27,8 @@ static int die_counter;
 
 void printk_address(unsigned long address, int reliable)
 {
-	printk(" [<%p>] %s%pB\n", (void *) address,
-	       reliable ? "" : "? ", (void *) address);
+	pr_cont(" [<%p>] %s%pB\n",
+		(void *)address, reliable ? "" : "? ", (void *)address);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -271,6 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 		       current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
 		return 1;
 
+	print_modules();
 	show_regs(regs);
 #ifdef CONFIG_X86_32
 	if (user_mode_vm(regs)) {
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index e0b1d783daa..1038a417ea5 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -73,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		if (kstack_end(stack))
 			break;
 		if (i && ((i % STACKSLOTS_PER_LINE) == 0))
-			printk(KERN_CONT "\n");
-		printk(KERN_CONT " %08lx", *stack++);
+			pr_cont("\n");
+		pr_cont(" %08lx", *stack++);
 		touch_nmi_watchdog();
 	}
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
@@ -86,12 +86,11 @@ void show_regs(struct pt_regs *regs)
 {
 	int i;
 
-	print_modules();
 	__show_regs(regs, !user_mode_vm(regs));
 
-	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
+	pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
 		TASK_COMM_LEN, current->comm, task_pid_nr(current),
 		current_thread_info(), current, task_thread_info(current));
 	/*
 	 * When in-kernel, we also print out the stack and code at the
 	 * time of the fault..
@@ -102,10 +101,10 @@ void show_regs(struct pt_regs *regs)
 		unsigned char c;
 		u8 *ip;
 
-		printk(KERN_EMERG "Stack:\n");
+		pr_emerg("Stack:\n");
 		show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
 
-		printk(KERN_EMERG "Code: ");
+		pr_emerg("Code:");
 
 		ip = (u8 *)regs->ip - code_prologue;
 		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
@@ -116,16 +115,16 @@ void show_regs(struct pt_regs *regs)
 		for (i = 0; i < code_len; i++, ip++) {
 			if (ip < (u8 *)PAGE_OFFSET ||
 			    probe_kernel_address(ip, c)) {
-				printk(KERN_CONT " Bad EIP value.");
+				pr_cont(" Bad EIP value.");
 				break;
 			}
 			if (ip == (u8 *)regs->ip)
-				printk(KERN_CONT "<%02x> ", c);
+				pr_cont(" <%02x>", c);
 			else
-				printk(KERN_CONT "%02x ", c);
+				pr_cont(" %02x", c);
 		}
 	}
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 }
 
 int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 791b76122aa..b653675d528 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -228,20 +228,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		if (stack >= irq_stack && stack <= irq_stack_end) {
 			if (stack == irq_stack_end) {
 				stack = (unsigned long *) (irq_stack_end[-1]);
-				printk(KERN_CONT " <EOI> ");
+				pr_cont(" <EOI> ");
 			}
 		} else {
 		if (((long) stack & (THREAD_SIZE-1)) == 0)
 			break;
 		}
 		if (i && ((i % STACKSLOTS_PER_LINE) == 0))
-			printk(KERN_CONT "\n");
-		printk(KERN_CONT " %016lx", *stack++);
+			pr_cont("\n");
+		pr_cont(" %016lx", *stack++);
 		touch_nmi_watchdog();
 	}
 	preempt_enable();
 
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
@@ -254,10 +254,9 @@ void show_regs(struct pt_regs *regs)
 
 	sp = regs->sp;
 	printk("CPU %d ", cpu);
-	print_modules();
 	__show_regs(regs, 1);
-	printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
+	printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n",
 	       cur->comm, cur->pid, task_thread_info(cur), cur);
 
 	/*
 	 * When in-kernel, we also print out the stack and code at the
@@ -284,16 +283,16 @@ void show_regs(struct pt_regs *regs)
 		for (i = 0; i < code_len; i++, ip++) {
 			if (ip < (u8 *)PAGE_OFFSET ||
 			    probe_kernel_address(ip, c)) {
-				printk(KERN_CONT " Bad RIP value.");
+				pr_cont(" Bad RIP value.");
 				break;
 			}
 			if (ip == (u8 *)regs->ip)
-				printk(KERN_CONT "<%02x> ", c);
+				pr_cont("<%02x> ", c);
 			else
-				printk(KERN_CONT "%02x ", c);
+				pr_cont("%02x ", c);
 		}
 	}
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 }
 
 int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 62d61e9976e..41857970517 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -113,7 +113,9 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
 	int x = e820x->nr_map;
 
 	if (x >= ARRAY_SIZE(e820x->map)) {
-		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+		printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
+		       (unsigned long long) start,
+		       (unsigned long long) (start + size - 1));
 		return;
 	}
 
@@ -133,19 +135,19 @@ static void __init e820_print_type(u32 type)
 	switch (type) {
 	case E820_RAM:
 	case E820_RESERVED_KERN:
-		printk(KERN_CONT "(usable)");
+		printk(KERN_CONT "usable");
 		break;
 	case E820_RESERVED:
-		printk(KERN_CONT "(reserved)");
+		printk(KERN_CONT "reserved");
 		break;
 	case E820_ACPI:
-		printk(KERN_CONT "(ACPI data)");
+		printk(KERN_CONT "ACPI data");
 		break;
 	case E820_NVS:
-		printk(KERN_CONT "(ACPI NVS)");
+		printk(KERN_CONT "ACPI NVS");
 		break;
 	case E820_UNUSABLE:
-		printk(KERN_CONT "(unusable)");
+		printk(KERN_CONT "unusable");
 		break;
 	default:
 		printk(KERN_CONT "type %u", type);
@@ -158,10 +160,10 @@ void __init e820_print_map(char *who)
 	int i;
 
 	for (i = 0; i < e820.nr_map; i++) {
-		printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+		printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
 		       (unsigned long long) e820.map[i].addr,
 		       (unsigned long long)
-		       (e820.map[i].addr + e820.map[i].size));
+		       (e820.map[i].addr + e820.map[i].size - 1));
 		e820_print_type(e820.map[i].type);
 		printk(KERN_CONT "\n");
 	}
@@ -428,9 +430,8 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
 		size = ULLONG_MAX - start;
 
 	end = start + size;
-	printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
-	       (unsigned long long) start,
-	       (unsigned long long) end);
+	printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ",
+	       (unsigned long long) start, (unsigned long long) (end - 1));
 	e820_print_type(old_type);
 	printk(KERN_CONT " ==> ");
 	e820_print_type(new_type);
@@ -509,9 +510,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
 		size = ULLONG_MAX - start;
 
 	end = start + size;
-	printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
-	       (unsigned long long) start,
-	       (unsigned long long) end);
+	printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ",
+	       (unsigned long long) start, (unsigned long long) (end - 1));
 	if (checktype)
 		e820_print_type(old_type);
 	printk(KERN_CONT "\n");
@@ -567,7 +567,7 @@ void __init update_e820(void)
 	if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
 		return;
 	e820.nr_map = nr_map;
-	printk(KERN_INFO "modified physical RAM map:\n");
+	printk(KERN_INFO "e820: modified physical RAM map:\n");
 	e820_print_map("modified");
 }
 static void __init update_e820_saved(void)
@@ -637,8 +637,8 @@ __init void e820_setup_gap(void)
 	if (!found) {
 		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
 		printk(KERN_ERR
-			"PCI: Warning: Cannot find a gap in the 32bit address range\n"
-			"PCI: Unassigned devices with 32bit resource registers may break!\n");
+			"e820: cannot find a gap in the 32bit address range\n"
+			"e820: PCI devices with unassigned 32bit BARs may break!\n");
 	}
 #endif
 
@@ -648,8 +648,8 @@ __init void e820_setup_gap(void)
 	pci_mem_start = gapstart;
 
 	printk(KERN_INFO
-	       "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
-	       pci_mem_start, gapstart, gapsize);
+	       "e820: [mem %#010lx-%#010lx] available for PCI devices\n",
+	       gapstart, gapstart + gapsize - 1);
 }
 
 /**
@@ -667,7 +667,7 @@ void __init parse_e820_ext(struct setup_data *sdata)
 	extmap = (struct e820entry *)(sdata->data);
 	__append_e820_map(extmap, entries);
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-	printk(KERN_INFO "extended physical RAM map:\n");
+	printk(KERN_INFO "e820: extended physical RAM map:\n");
 	e820_print_map("extended");
 }
 
@@ -734,7 +734,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
 	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 	if (addr) {
 		e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
-		printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
+		printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n");
 		update_e820_saved();
 	}
 
@@ -784,7 +784,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 	if (last_pfn > max_arch_pfn)
 		last_pfn = max_arch_pfn;
 
-	printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
+	printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
 	       last_pfn, max_arch_pfn);
 	return last_pfn;
 }
@@ -888,7 +888,7 @@ void __init finish_e820_parsing(void)
 			early_panic("Invalid user supplied memory map");
 		e820.nr_map = nr;
 
-		printk(KERN_INFO "user-defined physical RAM map:\n");
+		printk(KERN_INFO "e820: user-defined physical RAM map:\n");
 		e820_print_map("user");
 	}
 }
@@ -996,8 +996,9 @@ void __init e820_reserve_resources_late(void)
 			end = MAX_RESOURCE_SIZE;
 		if (start >= end)
 			continue;
-		printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
-		       start, end);
+		printk(KERN_DEBUG
+		       "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n",
+		       start, end);
 		reserve_region_with_split(&iomem_resource, start, end,
 					  "RAM buffer");
 	}
@@ -1047,7 +1048,7 @@ void __init setup_memory_map(void)
 
 	who = x86_init.resources.memory_setup();
 	memcpy(&e820_saved, &e820, sizeof(struct e820map));
-	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+	printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
 	e820_print_map(who);
 }
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 5e4771266f1..9b9f18b4991 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -119,7 +119,7 @@ static __init void early_serial_init(char *s)
119 unsigned char c; 119 unsigned char c;
120 unsigned divisor; 120 unsigned divisor;
121 unsigned baud = DEFAULT_BAUD; 121 unsigned baud = DEFAULT_BAUD;
122 ssize_t ret; 122 char *e;
123 123
124 if (*s == ',') 124 if (*s == ',')
125 ++s; 125 ++s;
@@ -127,14 +127,14 @@ static __init void early_serial_init(char *s)
127 if (*s) { 127 if (*s) {
128 unsigned port; 128 unsigned port;
129 if (!strncmp(s, "0x", 2)) { 129 if (!strncmp(s, "0x", 2)) {
130 ret = kstrtoint(s, 16, &early_serial_base); 130 early_serial_base = simple_strtoul(s, &e, 16);
131 } else { 131 } else {
132 static const int __initconst bases[] = { 0x3f8, 0x2f8 }; 132 static const int __initconst bases[] = { 0x3f8, 0x2f8 };
133 133
134 if (!strncmp(s, "ttyS", 4)) 134 if (!strncmp(s, "ttyS", 4))
135 s += 4; 135 s += 4;
136 ret = kstrtouint(s, 10, &port); 136 port = simple_strtoul(s, &e, 10);
137 if (ret || port > 1) 137 if (port > 1 || s == e)
138 port = 0; 138 port = 0;
139 early_serial_base = bases[port]; 139 early_serial_base = bases[port];
140 } 140 }
@@ -149,8 +149,8 @@ static __init void early_serial_init(char *s)
149 outb(0x3, early_serial_base + MCR); /* DTR + RTS */ 149 outb(0x3, early_serial_base + MCR); /* DTR + RTS */
150 150
151 if (*s) { 151 if (*s) {
152 ret = kstrtouint(s, 0, &baud); 152 baud = simple_strtoul(s, &e, 0);
153 if (ret || baud == 0) 153 if (baud == 0 || s == e)
154 baud = DEFAULT_BAUD; 154 baud = DEFAULT_BAUD;
155 } 155 }
156 156
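
The switch back to simple_strtoul() matters because the substrings parsed here ("ttyS0,115200", "0x3f8,115200n8", ...) are not bare numbers: kstrto*() rejects any trailing non-digits outright, while simple_strtoul() consumes the leading digits and reports where it stopped. A minimal sketch of the difference (the sample string is illustrative):

	unsigned int baud;
	unsigned long b;
	char *e;
	int ret;

	ret = kstrtouint("115200n8", 0, &baud);   /* ret == -EINVAL: whole string must be numeric */
	b = simple_strtoul("115200n8", &e, 0);    /* b == 115200, e points at the 'n' */
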
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 01ccf9b7147..623f2883747 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -316,7 +316,6 @@ ret_from_exception:
316 preempt_stop(CLBR_ANY) 316 preempt_stop(CLBR_ANY)
317ret_from_intr: 317ret_from_intr:
318 GET_THREAD_INFO(%ebp) 318 GET_THREAD_INFO(%ebp)
319resume_userspace_sig:
320#ifdef CONFIG_VM86 319#ifdef CONFIG_VM86
321 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS 320 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
322 movb PT_CS(%esp), %al 321 movb PT_CS(%esp), %al
@@ -615,9 +614,13 @@ work_notifysig: # deal with pending signals and
615 # vm86-space 614 # vm86-space
616 TRACE_IRQS_ON 615 TRACE_IRQS_ON
617 ENABLE_INTERRUPTS(CLBR_NONE) 616 ENABLE_INTERRUPTS(CLBR_NONE)
617 movb PT_CS(%esp), %bl
618 andb $SEGMENT_RPL_MASK, %bl
619 cmpb $USER_RPL, %bl
620 jb resume_kernel
618 xorl %edx, %edx 621 xorl %edx, %edx
619 call do_notify_resume 622 call do_notify_resume
620 jmp resume_userspace_sig 623 jmp resume_userspace
621 624
622 ALIGN 625 ALIGN
623work_notifysig_v86: 626work_notifysig_v86:
@@ -630,9 +633,13 @@ work_notifysig_v86:
630#endif 633#endif
631 TRACE_IRQS_ON 634 TRACE_IRQS_ON
632 ENABLE_INTERRUPTS(CLBR_NONE) 635 ENABLE_INTERRUPTS(CLBR_NONE)
636 movb PT_CS(%esp), %bl
637 andb $SEGMENT_RPL_MASK, %bl
638 cmpb $USER_RPL, %bl
639 jb resume_kernel
633 xorl %edx, %edx 640 xorl %edx, %edx
634 call do_notify_resume 641 call do_notify_resume
635 jmp resume_userspace_sig 642 jmp resume_userspace
636END(work_pending) 643END(work_pending)
637 644
638 # perform syscall exit tracing 645 # perform syscall exit tracing
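
Both work_notifysig paths now bail out to resume_kernel when the interrupted context was not user space, and the old resume_userspace_sig label goes away. A hedged C rendering of the added check (not code from the patch):

	/* regs->cs holds the interrupted code segment; user mode runs at RPL 3 */
	if ((regs->cs & SEGMENT_RPL_MASK) < USER_RPL)
		goto resume_kernel;          /* no signal work for kernel context */
	do_notify_resume(regs, NULL, thread_info_flags);
	/* then exit through the plain resume_userspace path */
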
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 320852d0202..111f6bbd8b3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -191,6 +191,44 @@ ENDPROC(native_usergs_sysret64)
191.endm 191.endm
192 192
193/* 193/*
194 * When dynamic function tracer is enabled it will add a breakpoint
195 * to all locations that it is about to modify, sync CPUs, update
196 * all the code, sync CPUs, then remove the breakpoints. During
197 * this window, if lockdep is enabled, a TRACE_IRQS_ON/OFF call can
198 * jump back into the debug handler outside of the IST protection.
199 *
200 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
201 * make sure the stack pointer does not get reset back to the top
202 * of the debug stack, and instead just reuses the current stack.
203 */
204#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
205
206.macro TRACE_IRQS_OFF_DEBUG
207 call debug_stack_set_zero
208 TRACE_IRQS_OFF
209 call debug_stack_reset
210.endm
211
212.macro TRACE_IRQS_ON_DEBUG
213 call debug_stack_set_zero
214 TRACE_IRQS_ON
215 call debug_stack_reset
216.endm
217
218.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
219 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
220 jnc 1f
221 TRACE_IRQS_ON_DEBUG
2221:
223.endm
224
225#else
226# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
227# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
228# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
229#endif
230
231/*
194 * C code is not supposed to know about undefined top of stack. Every time 232 * C code is not supposed to know about undefined top of stack. Every time
195 * a C function with an pt_regs argument is called from the SYSCALL based 233 * a C function with an pt_regs argument is called from the SYSCALL based
196 * fast path FIXUP_TOP_OF_STACK is needed. 234 * fast path FIXUP_TOP_OF_STACK is needed.
@@ -1098,7 +1136,7 @@ ENTRY(\sym)
1098 subq $ORIG_RAX-R15, %rsp 1136 subq $ORIG_RAX-R15, %rsp
1099 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1137 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1100 call save_paranoid 1138 call save_paranoid
1101 TRACE_IRQS_OFF 1139 TRACE_IRQS_OFF_DEBUG
1102 movq %rsp,%rdi /* pt_regs pointer */ 1140 movq %rsp,%rdi /* pt_regs pointer */
1103 xorl %esi,%esi /* no error code */ 1141 xorl %esi,%esi /* no error code */
1104 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) 1142 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
@@ -1393,7 +1431,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
1393ENTRY(paranoid_exit) 1431ENTRY(paranoid_exit)
1394 DEFAULT_FRAME 1432 DEFAULT_FRAME
1395 DISABLE_INTERRUPTS(CLBR_NONE) 1433 DISABLE_INTERRUPTS(CLBR_NONE)
1396 TRACE_IRQS_OFF 1434 TRACE_IRQS_OFF_DEBUG
1397 testl %ebx,%ebx /* swapgs needed? */ 1435 testl %ebx,%ebx /* swapgs needed? */
1398 jnz paranoid_restore 1436 jnz paranoid_restore
1399 testl $3,CS(%rsp) 1437 testl $3,CS(%rsp)
@@ -1404,7 +1442,7 @@ paranoid_swapgs:
1404 RESTORE_ALL 8 1442 RESTORE_ALL 8
1405 jmp irq_return 1443 jmp irq_return
1406paranoid_restore: 1444paranoid_restore:
1407 TRACE_IRQS_IRETQ 0 1445 TRACE_IRQS_IRETQ_DEBUG 0
1408 RESTORE_ALL 8 1446 RESTORE_ALL 8
1409 jmp irq_return 1447 jmp irq_return
1410paranoid_userspace: 1448paranoid_userspace:
@@ -1720,10 +1758,30 @@ end_repeat_nmi:
1720 */ 1758 */
1721 call save_paranoid 1759 call save_paranoid
1722 DEFAULT_FRAME 0 1760 DEFAULT_FRAME 0
1761
1762 /*
1763 * Save off the CR2 register. If we take a page fault in the NMI then
1764 * it could corrupt the CR2 value. If the NMI preempts a page fault
1765 * handler before it was able to read the CR2 register, and then the
1766 * NMI itself takes a page fault, the page fault that was preempted
1767 * will read the CR2 of the NMI's page fault instead of that of the
1768 * original fault. Save it off and restore it if it changes.
1769 * Use the r12 callee-saved register.
1770 */
1771 movq %cr2, %r12
1772
1723 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1773 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1724 movq %rsp,%rdi 1774 movq %rsp,%rdi
1725 movq $-1,%rsi 1775 movq $-1,%rsi
1726 call do_nmi 1776 call do_nmi
1777
1778 /* Did the NMI take a page fault? Restore cr2 if it did */
1779 movq %cr2, %rcx
1780 cmpq %rcx, %r12
1781 je 1f
1782 movq %r12, %cr2
17831:
1784
1727 testl %ebx,%ebx /* swapgs needed? */ 1785 testl %ebx,%ebx /* swapgs needed? */
1728 jnz nmi_restore 1786 jnz nmi_restore
1729nmi_swapgs: 1787nmi_swapgs:
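
A hedged C model of the CR2 save/restore added above (the real code parks the value in %r12 because it is callee-saved across the do_nmi call):

	unsigned long cr2 = read_cr2();  /* snapshot before running the handler */
	do_nmi(regs, -1);
	if (read_cr2() != cr2)           /* the NMI body itself took a page fault */
		write_cr2(cr2);          /* put the interrupted fault's CR2 back */
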
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 32ff36596ab..c3a7cb4bf6e 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -100,7 +100,7 @@ static const unsigned char *ftrace_nop_replace(void)
100} 100}
101 101
102static int 102static int
103ftrace_modify_code(unsigned long ip, unsigned const char *old_code, 103ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
104 unsigned const char *new_code) 104 unsigned const char *new_code)
105{ 105{
106 unsigned char replaced[MCOUNT_INSN_SIZE]; 106 unsigned char replaced[MCOUNT_INSN_SIZE];
@@ -141,7 +141,20 @@ int ftrace_make_nop(struct module *mod,
141 old = ftrace_call_replace(ip, addr); 141 old = ftrace_call_replace(ip, addr);
142 new = ftrace_nop_replace(); 142 new = ftrace_nop_replace();
143 143
144 return ftrace_modify_code(rec->ip, old, new); 144 /*
145 * On boot up, and when modules are loaded, the call to MCOUNT_ADDR
146 * is converted to a nop, and will never point back to MCOUNT_ADDR
147 * again. This code is either running before SMP (on boot up)
148 * or before the code will ever be executed (module load).
149 * We do not want to use the breakpoint version in this case,
150 * just modify the code directly.
151 */
152 if (addr == MCOUNT_ADDR)
153 return ftrace_modify_code_direct(rec->ip, old, new);
154
155 /* Normal cases use add_brk_on_nop */
156 WARN_ONCE(1, "invalid use of ftrace_make_nop");
157 return -EINVAL;
145} 158}
146 159
147int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) 160int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -152,9 +165,47 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
152 old = ftrace_nop_replace(); 165 old = ftrace_nop_replace();
153 new = ftrace_call_replace(ip, addr); 166 new = ftrace_call_replace(ip, addr);
154 167
155 return ftrace_modify_code(rec->ip, old, new); 168 /* Should only be called when module is loaded */
169 return ftrace_modify_code_direct(rec->ip, old, new);
156} 170}
157 171
172/*
173 * The modifying_ftrace_code is used to tell the breakpoint
174 * handler to call ftrace_int3_handler(). If it fails to
175 * call this handler for a breakpoint added by ftrace, then
176 * the kernel may crash.
177 *
178 * As atomic writes on x86 do not need a barrier, we do not
179 * need to add smp_mb()s for this to work. A CPU also cannot
180 * read modifying_ftrace_code before it actually executes the
181 * breakpoint, so the int3 handler is guaranteed to see the
182 * incremented count. Here's the flow that is required:
183 *
184 * CPU-0 CPU-1
185 *
186 * atomic_inc(mfc);
187 * write int3s
188 * <trap-int3> // implicit (r)mb
189 * if (atomic_read(mfc))
190 * call ftrace_int3_handler()
191 *
192 * Then when we are finished:
193 *
194 * atomic_dec(mfc);
195 *
196 * If we hit a breakpoint that was not set by ftrace, it does not
197 * matter if ftrace_int3_handler() is called or not. It will
198 * simply be ignored. But it is crucial that an ftrace nop/caller
199 * breakpoint is handled. No other user should ever place a
200 * breakpoint on an ftrace nop/caller location. It must only
201 * be done by this code.
202 */
203atomic_t modifying_ftrace_code __read_mostly;
204
205static int
206ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
207 unsigned const char *new_code);
208
158int ftrace_update_ftrace_func(ftrace_func_t func) 209int ftrace_update_ftrace_func(ftrace_func_t func)
159{ 210{
160 unsigned long ip = (unsigned long)(&ftrace_call); 211 unsigned long ip = (unsigned long)(&ftrace_call);
@@ -163,13 +214,17 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
163 214
164 memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); 215 memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
165 new = ftrace_call_replace(ip, (unsigned long)func); 216 new = ftrace_call_replace(ip, (unsigned long)func);
217
218 /* See comment above by declaration of modifying_ftrace_code */
219 atomic_inc(&modifying_ftrace_code);
220
166 ret = ftrace_modify_code(ip, old, new); 221 ret = ftrace_modify_code(ip, old, new);
167 222
223 atomic_dec(&modifying_ftrace_code);
224
168 return ret; 225 return ret;
169} 226}
170 227
171int modifying_ftrace_code __read_mostly;
172
173/* 228/*
174 * A breakpoint was added to the code address we are about to 229 * A breakpoint was added to the code address we are about to
175 * modify, and this is the handle that will just skip over it. 230 * modify, and this is the handle that will just skip over it.
@@ -489,13 +544,46 @@ void ftrace_replace_code(int enable)
489 } 544 }
490} 545}
491 546
547static int
548ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
549 unsigned const char *new_code)
550{
551 int ret;
552
553 ret = add_break(ip, old_code);
554 if (ret)
555 goto out;
556
557 run_sync();
558
559 ret = add_update_code(ip, new_code);
560 if (ret)
561 goto fail_update;
562
563 run_sync();
564
565 ret = ftrace_write(ip, new_code, 1);
566 if (ret) {
567 ret = -EPERM;
568 goto out;
569 }
570 run_sync();
571 out:
572 return ret;
573
574 fail_update:
575 probe_kernel_write((void *)ip, &old_code[0], 1);
576 goto out;
577}
578
492void arch_ftrace_update_code(int command) 579void arch_ftrace_update_code(int command)
493{ 580{
494 modifying_ftrace_code++; 581 /* See comment above by declaration of modifying_ftrace_code */
582 atomic_inc(&modifying_ftrace_code);
495 583
496 ftrace_modify_all_code(command); 584 ftrace_modify_all_code(command);
497 585
498 modifying_ftrace_code--; 586 atomic_dec(&modifying_ftrace_code);
499} 587}
500 588
501int __init ftrace_dyn_arch_init(void *data) 589int __init ftrace_dyn_arch_init(void *data)
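
The counter added above is consumed on the trap side: do_int3() checks modifying_ftrace_code and lets ftrace skip its own breakpoints. A condensed sketch of that half of the protocol (simplified from ftrace_int3_handler(); the counter test really lives in the caller):

	/* Returns 1 if the int3 belongs to ftrace and was emulated as a nop. */
	static int ftrace_int3_sketch(struct pt_regs *regs)
	{
		unsigned long ip = regs->ip - 1;     /* int3 already advanced ip */

		if (!atomic_read(&modifying_ftrace_code))
			return 0;                    /* no ftrace update in flight */
		if (!ftrace_location(ip))
			return 0;                    /* not an mcount call site */

		regs->ip += MCOUNT_INSN_SIZE - 1;    /* skip the patched instruction */
		return 1;
	}
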
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 51ff18616d5..c18f59d1010 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -14,7 +14,6 @@
14#include <asm/sections.h> 14#include <asm/sections.h>
15#include <asm/e820.h> 15#include <asm/e820.h>
16#include <asm/page.h> 16#include <asm/page.h>
17#include <asm/trampoline.h>
18#include <asm/apic.h> 17#include <asm/apic.h>
19#include <asm/io_apic.h> 18#include <asm/io_apic.h>
20#include <asm/bios_ebda.h> 19#include <asm/bios_ebda.h>
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 3a3b779f41d..037df57a99a 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,7 +24,6 @@
24#include <asm/sections.h> 24#include <asm/sections.h>
25#include <asm/kdebug.h> 25#include <asm/kdebug.h>
26#include <asm/e820.h> 26#include <asm/e820.h>
27#include <asm/trampoline.h>
28#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
29 28
30static void __init zap_identity_mappings(void) 29static void __init zap_identity_mappings(void)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 463c9797ca6..d42ab17b739 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -274,10 +274,7 @@ num_subarch_entries = (. - subarch_entries) / 4
274 * If cpu hotplug is not supported then this code can go in init section 274 * If cpu hotplug is not supported then this code can go in init section
275 * which will be freed later 275 * which will be freed later
276 */ 276 */
277
278__CPUINIT 277__CPUINIT
279
280#ifdef CONFIG_SMP
281ENTRY(startup_32_smp) 278ENTRY(startup_32_smp)
282 cld 279 cld
283 movl $(__BOOT_DS),%eax 280 movl $(__BOOT_DS),%eax
@@ -288,7 +285,7 @@ ENTRY(startup_32_smp)
288 movl pa(stack_start),%ecx 285 movl pa(stack_start),%ecx
289 movl %eax,%ss 286 movl %eax,%ss
290 leal -__PAGE_OFFSET(%ecx),%esp 287 leal -__PAGE_OFFSET(%ecx),%esp
291#endif /* CONFIG_SMP */ 288
292default_entry: 289default_entry:
293 290
294/* 291/*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 7a40f244732..94bf9cc2c7e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -139,10 +139,6 @@ ident_complete:
139 /* Fixup phys_base */ 139 /* Fixup phys_base */
140 addq %rbp, phys_base(%rip) 140 addq %rbp, phys_base(%rip)
141 141
142 /* Fixup trampoline */
143 addq %rbp, trampoline_level4_pgt + 0(%rip)
144 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
145
146 /* Due to ENTRY(), sometimes the empty space gets filled with 142 /* Due to ENTRY(), sometimes the empty space gets filled with
147 * zeros. Better take a jmp than relying on empty space being 143 * zeros. Better take a jmp than relying on empty space being
148 * filled with 0x90 (nop) 144 * filled with 0x90 (nop)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad0de0c2714..1460a5df92f 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -94,13 +94,18 @@ static int hpet_verbose;
94 94
95static int __init hpet_setup(char *str) 95static int __init hpet_setup(char *str)
96{ 96{
97 if (str) { 97 while (str) {
98 char *next = strchr(str, ',');
99
100 if (next)
101 *next++ = 0;
98 if (!strncmp("disable", str, 7)) 102 if (!strncmp("disable", str, 7))
99 boot_hpet_disable = 1; 103 boot_hpet_disable = 1;
100 if (!strncmp("force", str, 5)) 104 if (!strncmp("force", str, 5))
101 hpet_force_user = 1; 105 hpet_force_user = 1;
102 if (!strncmp("verbose", str, 7)) 106 if (!strncmp("verbose", str, 7))
103 hpet_verbose = 1; 107 hpet_verbose = 1;
108 str = next;
104 } 109 }
105 return 1; 110 return 1;
106} 111}
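
Previously only the leading keyword of the option string could match; the strchr() loop above splits on commas so every token takes effect. An illustrative boot parameter exercising the new parser:

	hpet=force,verbose
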
@@ -319,8 +324,6 @@ static void hpet_set_mode(enum clock_event_mode mode,
319 now = hpet_readl(HPET_COUNTER); 324 now = hpet_readl(HPET_COUNTER);
320 cmp = now + (unsigned int) delta; 325 cmp = now + (unsigned int) delta;
321 cfg = hpet_readl(HPET_Tn_CFG(timer)); 326 cfg = hpet_readl(HPET_Tn_CFG(timer));
322 /* Make sure we use edge triggered interrupts */
323 cfg &= ~HPET_TN_LEVEL;
324 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | 327 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
325 HPET_TN_SETVAL | HPET_TN_32BIT; 328 HPET_TN_SETVAL | HPET_TN_32BIT;
326 hpet_writel(cfg, HPET_Tn_CFG(timer)); 329 hpet_writel(cfg, HPET_Tn_CFG(timer));
@@ -787,15 +790,16 @@ static int hpet_clocksource_register(void)
787 return 0; 790 return 0;
788} 791}
789 792
793static u32 *hpet_boot_cfg;
794
790/** 795/**
791 * hpet_enable - Try to setup the HPET timer. Returns 1 on success. 796 * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
792 */ 797 */
793int __init hpet_enable(void) 798int __init hpet_enable(void)
794{ 799{
795 unsigned long hpet_period; 800 u32 hpet_period, cfg, id;
796 unsigned int id;
797 u64 freq; 801 u64 freq;
798 int i; 802 unsigned int i, last;
799 803
800 if (!is_hpet_capable()) 804 if (!is_hpet_capable())
801 return 0; 805 return 0;
@@ -847,15 +851,45 @@ int __init hpet_enable(void)
847 id = hpet_readl(HPET_ID); 851 id = hpet_readl(HPET_ID);
848 hpet_print_config(); 852 hpet_print_config();
849 853
854 last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
855
850#ifdef CONFIG_HPET_EMULATE_RTC 856#ifdef CONFIG_HPET_EMULATE_RTC
851 /* 857 /*
852 * The legacy routing mode needs at least two channels, tick timer 858 * The legacy routing mode needs at least two channels, tick timer
853 * and the rtc emulation channel. 859 * and the rtc emulation channel.
854 */ 860 */
855 if (!(id & HPET_ID_NUMBER)) 861 if (!last)
856 goto out_nohpet; 862 goto out_nohpet;
857#endif 863#endif
858 864
865 cfg = hpet_readl(HPET_CFG);
866 hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg),
867 GFP_KERNEL);
868 if (hpet_boot_cfg)
869 *hpet_boot_cfg = cfg;
870 else
871 pr_warn("HPET initial state will not be saved\n");
872 cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
873 hpet_writel(cfg, HPET_CFG);
874 if (cfg)
875 pr_warn("HPET: Unrecognized bits %#x set in global cfg\n",
876 cfg);
877
878 for (i = 0; i <= last; ++i) {
879 cfg = hpet_readl(HPET_Tn_CFG(i));
880 if (hpet_boot_cfg)
881 hpet_boot_cfg[i + 1] = cfg;
882 cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB);
883 hpet_writel(cfg, HPET_Tn_CFG(i));
884 cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP
885 | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
886 | HPET_TN_FSB | HPET_TN_FSB_CAP);
887 if (cfg)
888 pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n",
889 cfg, i);
890 }
891 hpet_print_config();
892
859 if (hpet_clocksource_register()) 893 if (hpet_clocksource_register())
860 goto out_nohpet; 894 goto out_nohpet;
861 895
@@ -923,14 +957,28 @@ fs_initcall(hpet_late_init);
923void hpet_disable(void) 957void hpet_disable(void)
924{ 958{
925 if (is_hpet_capable() && hpet_virt_address) { 959 if (is_hpet_capable() && hpet_virt_address) {
926 unsigned int cfg = hpet_readl(HPET_CFG); 960 unsigned int cfg = hpet_readl(HPET_CFG), id, last;
927 961
928 if (hpet_legacy_int_enabled) { 962 if (hpet_boot_cfg)
963 cfg = *hpet_boot_cfg;
964 else if (hpet_legacy_int_enabled) {
929 cfg &= ~HPET_CFG_LEGACY; 965 cfg &= ~HPET_CFG_LEGACY;
930 hpet_legacy_int_enabled = 0; 966 hpet_legacy_int_enabled = 0;
931 } 967 }
932 cfg &= ~HPET_CFG_ENABLE; 968 cfg &= ~HPET_CFG_ENABLE;
933 hpet_writel(cfg, HPET_CFG); 969 hpet_writel(cfg, HPET_CFG);
970
971 if (!hpet_boot_cfg)
972 return;
973
974 id = hpet_readl(HPET_ID);
975 last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
976
977 for (id = 0; id <= last; ++id)
978 hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id));
979
980 if (*hpet_boot_cfg & HPET_CFG_ENABLE)
981 hpet_writel(*hpet_boot_cfg, HPET_CFG);
934 } 982 }
935} 983}
936 984
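
A hedged summary of how the snapshot taken in hpet_enable() pairs with the restore above:

	/* hpet_boot_cfg[0]     <-> HPET_CFG        (global enable/legacy bits)
	 * hpet_boot_cfg[i + 1] <-> HPET_Tn_CFG(i)  (per-comparator config)
	 *
	 * hpet_enable() records the firmware state, then clears ENABLE,
	 * LEVEL and FSB on every comparator (which is also why
	 * hpet_set_mode() no longer needs its own HPET_TN_LEVEL clearing);
	 * hpet_disable() plays the snapshot back so a kexec'd kernel or a
	 * handover sees the device in its boot state. */
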
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3dafc6003b7..1f5f1d5d2a0 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -294,9 +294,9 @@ void fixup_irqs(void)
294 raw_spin_unlock(&desc->lock); 294 raw_spin_unlock(&desc->lock);
295 295
296 if (break_affinity && set_affinity) 296 if (break_affinity && set_affinity)
297 printk("Broke affinity for irq %i\n", irq); 297 pr_notice("Broke affinity for irq %i\n", irq);
298 else if (!set_affinity) 298 else if (!set_affinity)
299 printk("Cannot set affinity for irq %i\n", irq); 299 pr_notice("Cannot set affinity for irq %i\n", irq);
300 } 300 }
301 301
302 /* 302 /*
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8bfb6146f75..3f61904365c 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -444,12 +444,12 @@ void kgdb_roundup_cpus(unsigned long flags)
444 444
445/** 445/**
446 * kgdb_arch_handle_exception - Handle architecture specific GDB packets. 446 * kgdb_arch_handle_exception - Handle architecture specific GDB packets.
447 * @vector: The error vector of the exception that happened. 447 * @e_vector: The error vector of the exception that happened.
448 * @signo: The signal number of the exception that happened. 448 * @signo: The signal number of the exception that happened.
449 * @err_code: The error code of the exception that happened. 449 * @err_code: The error code of the exception that happened.
450 * @remcom_in_buffer: The buffer of the packet we have read. 450 * @remcomInBuffer: The buffer of the packet we have read.
451 * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into. 451 * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into.
452 * @regs: The &struct pt_regs of the current process. 452 * @linux_regs: The &struct pt_regs of the current process.
453 * 453 *
454 * This function MUST handle the 'c' and 's' command packets, 454 * This function MUST handle the 'c' and 's' command packets,
455 * as well packets to set / remove a hardware breakpoint, if used. 455 * as well packets to set / remove a hardware breakpoint, if used.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e554e5ad2fe..c1d61ee4b4f 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,6 +39,9 @@
39#include <asm/desc.h> 39#include <asm/desc.h>
40#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
41#include <asm/idle.h> 41#include <asm/idle.h>
42#include <asm/apic.h>
43#include <asm/apicdef.h>
44#include <asm/hypervisor.h>
42 45
43static int kvmapf = 1; 46static int kvmapf = 1;
44 47
@@ -283,6 +286,22 @@ static void kvm_register_steal_time(void)
283 cpu, __pa(st)); 286 cpu, __pa(st));
284} 287}
285 288
289static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
290
291static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
292{
293 /*
294 * This relies on __test_and_clear_bit to modify the memory
295 * in a way that is atomic with respect to the local CPU.
296 * The hypervisor only accesses this memory from the local CPU so
297 * there's no need for lock or memory barriers.
298 * An optimization barrier is implied in apic write.
299 */
300 if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi)))
301 return;
302 apic_write(APIC_EOI, APIC_EOI_ACK);
303}
304
286void __cpuinit kvm_guest_cpu_init(void) 305void __cpuinit kvm_guest_cpu_init(void)
287{ 306{
288 if (!kvm_para_available()) 307 if (!kvm_para_available())
@@ -300,11 +319,20 @@ void __cpuinit kvm_guest_cpu_init(void)
300 smp_processor_id()); 319 smp_processor_id());
301 } 320 }
302 321
322 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
323 unsigned long pa;
324 /* Size alignment is implied but just to make it explicit. */
325 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
326 __get_cpu_var(kvm_apic_eoi) = 0;
327 pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED;
328 wrmsrl(MSR_KVM_PV_EOI_EN, pa);
329 }
330
303 if (has_steal_clock) 331 if (has_steal_clock)
304 kvm_register_steal_time(); 332 kvm_register_steal_time();
305} 333}
306 334
307static void kvm_pv_disable_apf(void *unused) 335static void kvm_pv_disable_apf(void)
308{ 336{
309 if (!__get_cpu_var(apf_reason).enabled) 337 if (!__get_cpu_var(apf_reason).enabled)
310 return; 338 return;
@@ -316,11 +344,23 @@ static void kvm_pv_disable_apf(void *unused)
316 smp_processor_id()); 344 smp_processor_id());
317} 345}
318 346
347static void kvm_pv_guest_cpu_reboot(void *unused)
348{
349 /*
350 * We disable PV EOI before we load a new kernel by kexec,
351 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
352 * New kernel can re-enable when it boots.
353 */
354 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
355 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
356 kvm_pv_disable_apf();
357}
358
319static int kvm_pv_reboot_notify(struct notifier_block *nb, 359static int kvm_pv_reboot_notify(struct notifier_block *nb,
320 unsigned long code, void *unused) 360 unsigned long code, void *unused)
321{ 361{
322 if (code == SYS_RESTART) 362 if (code == SYS_RESTART)
323 on_each_cpu(kvm_pv_disable_apf, NULL, 1); 363 on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
324 return NOTIFY_DONE; 364 return NOTIFY_DONE;
325} 365}
326 366
@@ -371,7 +411,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
371static void kvm_guest_cpu_offline(void *dummy) 411static void kvm_guest_cpu_offline(void *dummy)
372{ 412{
373 kvm_disable_steal_time(); 413 kvm_disable_steal_time();
374 kvm_pv_disable_apf(NULL); 414 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
415 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
416 kvm_pv_disable_apf();
375 apf_task_wake_all(); 417 apf_task_wake_all();
376} 418}
377 419
@@ -424,6 +466,9 @@ void __init kvm_guest_init(void)
424 pv_time_ops.steal_clock = kvm_steal_clock; 466 pv_time_ops.steal_clock = kvm_steal_clock;
425 } 467 }
426 468
469 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
470 apic_set_eoi_write(kvm_guest_apic_eoi_write);
471
427#ifdef CONFIG_SMP 472#ifdef CONFIG_SMP
428 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 473 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
429 register_cpu_notifier(&kvm_cpu_notifier); 474 register_cpu_notifier(&kvm_cpu_notifier);
@@ -432,6 +477,19 @@ void __init kvm_guest_init(void)
432#endif 477#endif
433} 478}
434 479
480static bool __init kvm_detect(void)
481{
482 if (!kvm_para_available())
483 return false;
484 return true;
485}
486
487const struct hypervisor_x86 x86_hyper_kvm __refconst = {
488 .name = "KVM",
489 .detect = kvm_detect,
490};
491EXPORT_SYMBOL_GPL(x86_hyper_kvm);
492
435static __init int activate_jump_labels(void) 493static __init int activate_jump_labels(void)
436{ 494{
437 if (has_steal_clock) { 495 if (has_steal_clock) {
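
Taken together, the kvm.c hunks implement a small guest-side contract around MSR_KVM_PV_EOI_EN; a hedged recap mirroring the code above (no new API):

	/* enable (per CPU):  wrmsrl(MSR_KVM_PV_EOI_EN,
	 *                           __pa(&kvm_apic_eoi) | KVM_MSR_ENABLED);
	 * EOI fast path:     if the host set KVM_PV_EOI_BIT, clearing it
	 *                    with __test_and_clear_bit() is the EOI itself;
	 *                    otherwise fall back to
	 *                    apic_write(APIC_EOI, APIC_EOI_ACK);
	 * disable (offline, reboot, kexec):
	 *                    wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	 *                    the MSR holds a physical pointer, so it must
	 *                    not outlive this kernel's memory. */
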
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f8492da65bf..f1b42b3a186 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,6 +22,7 @@
22#include <asm/msr.h> 22#include <asm/msr.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/hardirq.h>
25 26
26#include <asm/x86_init.h> 27#include <asm/x86_init.h>
27#include <asm/reboot.h> 28#include <asm/reboot.h>
@@ -114,6 +115,20 @@ static void kvm_get_preset_lpj(void)
114 preset_lpj = lpj; 115 preset_lpj = lpj;
115} 116}
116 117
118bool kvm_check_and_clear_guest_paused(void)
119{
120 bool ret = false;
121 struct pvclock_vcpu_time_info *src;
122
123 src = &__get_cpu_var(hv_clock);
124 if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
125 __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED);
126 ret = true;
127 }
128
129 return ret;
130}
131
117static struct clocksource kvm_clock = { 132static struct clocksource kvm_clock = {
118 .name = "kvm-clock", 133 .name = "kvm-clock",
119 .read = kvm_clock_get_cycles, 134 .read = kvm_clock_get_cycles,
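
kvm_check_and_clear_guest_paused() is a test-and-clear of a flag the hypervisor sets while the VM is stopped; a lockup detector can consume it to suppress false positives after a pause. A hedged usage sketch:

	/* e.g. inside a soft-lockup check: a paused guest is not a stalled CPU */
	if (kvm_check_and_clear_guest_paused())
		return;   /* the clock jumped because the VCPU was stopped */
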
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fbdfc691718..4873e62db6a 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -87,6 +87,7 @@
87#include <asm/microcode.h> 87#include <asm/microcode.h>
88#include <asm/processor.h> 88#include <asm/processor.h>
89#include <asm/cpu_device_id.h> 89#include <asm/cpu_device_id.h>
90#include <asm/perf_event.h>
90 91
91MODULE_DESCRIPTION("Microcode Update Driver"); 92MODULE_DESCRIPTION("Microcode Update Driver");
92MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 93MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -277,7 +278,6 @@ static int reload_for_cpu(int cpu)
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 278 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278 int err = 0; 279 int err = 0;
279 280
280 mutex_lock(&microcode_mutex);
281 if (uci->valid) { 281 if (uci->valid) {
282 enum ucode_state ustate; 282 enum ucode_state ustate;
283 283
@@ -288,7 +288,6 @@ static int reload_for_cpu(int cpu)
288 if (ustate == UCODE_ERROR) 288 if (ustate == UCODE_ERROR)
289 err = -EINVAL; 289 err = -EINVAL;
290 } 290 }
291 mutex_unlock(&microcode_mutex);
292 291
293 return err; 292 return err;
294} 293}
@@ -298,19 +297,31 @@ static ssize_t reload_store(struct device *dev,
298 const char *buf, size_t size) 297 const char *buf, size_t size)
299{ 298{
300 unsigned long val; 299 unsigned long val;
301 int cpu = dev->id; 300 int cpu;
302 ssize_t ret = 0; 301 ssize_t ret = 0, tmp_ret;
303 302
304 ret = kstrtoul(buf, 0, &val); 303 ret = kstrtoul(buf, 0, &val);
305 if (ret) 304 if (ret)
306 return ret; 305 return ret;
307 306
308 if (val == 1) { 307 if (val != 1)
309 get_online_cpus(); 308 return size;
310 if (cpu_online(cpu)) 309
311 ret = reload_for_cpu(cpu); 310 get_online_cpus();
312 put_online_cpus(); 311 mutex_lock(&microcode_mutex);
312 for_each_online_cpu(cpu) {
313 tmp_ret = reload_for_cpu(cpu);
314 if (tmp_ret != 0)
315 pr_warn("Error reloading microcode on CPU %d\n", cpu);
316
317 /* save retval of the first encountered reload error */
318 if (!ret)
319 ret = tmp_ret;
313 } 320 }
321 if (!ret)
322 perf_check_microcode();
323 mutex_unlock(&microcode_mutex);
324 put_online_cpus();
314 325
315 if (!ret) 326 if (!ret)
316 ret = size; 327 ret = size;
@@ -339,7 +350,6 @@ static DEVICE_ATTR(version, 0400, version_show, NULL);
339static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); 350static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
340 351
341static struct attribute *mc_default_attrs[] = { 352static struct attribute *mc_default_attrs[] = {
342 &dev_attr_reload.attr,
343 &dev_attr_version.attr, 353 &dev_attr_version.attr,
344 &dev_attr_processor_flags.attr, 354 &dev_attr_processor_flags.attr,
345 NULL 355 NULL
@@ -504,7 +514,7 @@ static struct notifier_block __refdata mc_cpu_notifier = {
504 514
505#ifdef MODULE 515#ifdef MODULE
506/* Autoload on Intel and AMD systems */ 516/* Autoload on Intel and AMD systems */
507static const struct x86_cpu_id microcode_id[] = { 517static const struct x86_cpu_id __initconst microcode_id[] = {
508#ifdef CONFIG_MICROCODE_INTEL 518#ifdef CONFIG_MICROCODE_INTEL
509 { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, 519 { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
510#endif 520#endif
@@ -516,6 +526,16 @@ static const struct x86_cpu_id microcode_id[] = {
516MODULE_DEVICE_TABLE(x86cpu, microcode_id); 526MODULE_DEVICE_TABLE(x86cpu, microcode_id);
517#endif 527#endif
518 528
529static struct attribute *cpu_root_microcode_attrs[] = {
530 &dev_attr_reload.attr,
531 NULL
532};
533
534static struct attribute_group cpu_root_microcode_group = {
535 .name = "microcode",
536 .attrs = cpu_root_microcode_attrs,
537};
538
519static int __init microcode_init(void) 539static int __init microcode_init(void)
520{ 540{
521 struct cpuinfo_x86 *c = &cpu_data(0); 541 struct cpuinfo_x86 *c = &cpu_data(0);
@@ -540,16 +560,25 @@ static int __init microcode_init(void)
540 mutex_lock(&microcode_mutex); 560 mutex_lock(&microcode_mutex);
541 561
542 error = subsys_interface_register(&mc_cpu_interface); 562 error = subsys_interface_register(&mc_cpu_interface);
543 563 if (!error)
564 perf_check_microcode();
544 mutex_unlock(&microcode_mutex); 565 mutex_unlock(&microcode_mutex);
545 put_online_cpus(); 566 put_online_cpus();
546 567
547 if (error) 568 if (error)
548 goto out_pdev; 569 goto out_pdev;
549 570
571 error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
572 &cpu_root_microcode_group);
573
574 if (error) {
575 pr_err("Error creating microcode group!\n");
576 goto out_driver;
577 }
578
550 error = microcode_dev_init(); 579 error = microcode_dev_init();
551 if (error) 580 if (error)
552 goto out_driver; 581 goto out_ucode_group;
553 582
554 register_syscore_ops(&mc_syscore_ops); 583 register_syscore_ops(&mc_syscore_ops);
555 register_hotcpu_notifier(&mc_cpu_notifier); 584 register_hotcpu_notifier(&mc_cpu_notifier);
@@ -559,7 +588,11 @@ static int __init microcode_init(void)
559 588
560 return 0; 589 return 0;
561 590
562out_driver: 591 out_ucode_group:
592 sysfs_remove_group(&cpu_subsys.dev_root->kobj,
593 &cpu_root_microcode_group);
594
595 out_driver:
563 get_online_cpus(); 596 get_online_cpus();
564 mutex_lock(&microcode_mutex); 597 mutex_lock(&microcode_mutex);
565 598
@@ -568,7 +601,7 @@ out_driver:
568 mutex_unlock(&microcode_mutex); 601 mutex_unlock(&microcode_mutex);
569 put_online_cpus(); 602 put_online_cpus();
570 603
571out_pdev: 604 out_pdev:
572 platform_device_unregister(microcode_pdev); 605 platform_device_unregister(microcode_pdev);
573 return error; 606 return error;
574 607
@@ -584,6 +617,9 @@ static void __exit microcode_exit(void)
584 unregister_hotcpu_notifier(&mc_cpu_notifier); 617 unregister_hotcpu_notifier(&mc_cpu_notifier);
585 unregister_syscore_ops(&mc_syscore_ops); 618 unregister_syscore_ops(&mc_syscore_ops);
586 619
620 sysfs_remove_group(&cpu_subsys.dev_root->kobj,
621 &cpu_root_microcode_group);
622
587 get_online_cpus(); 623 get_online_cpus();
588 mutex_lock(&microcode_mutex); 624 mutex_lock(&microcode_mutex);
589 625
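
With the reload attribute moved into a system-wide "microcode" group, one write now updates every online CPU and reruns perf_check_microcode() on success, instead of acting on a single per-CPU node. Illustrative usage with the new path:

	# reloads microcode on all online CPUs in one step
	echo 1 > /sys/devices/system/cpu/microcode/reload
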
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f21fd94ac89..216a4d754b0 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -15,6 +15,9 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
18#include <linux/moduleloader.h> 21#include <linux/moduleloader.h>
19#include <linux/elf.h> 22#include <linux/elf.h>
20#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
@@ -30,9 +33,14 @@
30#include <asm/pgtable.h> 33#include <asm/pgtable.h>
31 34
32#if 0 35#if 0
33#define DEBUGP printk 36#define DEBUGP(fmt, ...) \
37 printk(KERN_DEBUG fmt, ##__VA_ARGS__)
34#else 38#else
35#define DEBUGP(fmt...) 39#define DEBUGP(fmt, ...) \
40do { \
41 if (0) \
42 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
43} while (0)
36#endif 44#endif
37 45
38void *module_alloc(unsigned long size) 46void *module_alloc(unsigned long size)
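
The do { if (0) printk(...); } while (0) form keeps printf-style format checking alive even when debugging is compiled out, whereas the old empty expansion silently accepted broken callers. A small illustration (the call site is hypothetical):

	DEBUGP("relocated %d entries\n");   /* old macro: compiles silently.
	                                     * new macro: -Wformat warns about
	                                     * the missing argument, yet gcc
	                                     * still emits no code for it. */
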
@@ -56,8 +64,8 @@ int apply_relocate(Elf32_Shdr *sechdrs,
56 Elf32_Sym *sym; 64 Elf32_Sym *sym;
57 uint32_t *location; 65 uint32_t *location;
58 66
59 DEBUGP("Applying relocate section %u to %u\n", relsec, 67 DEBUGP("Applying relocate section %u to %u\n",
60 sechdrs[relsec].sh_info); 68 relsec, sechdrs[relsec].sh_info);
61 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { 69 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
62 /* This is where to make the change */ 70 /* This is where to make the change */
63 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr 71 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -73,11 +81,11 @@ int apply_relocate(Elf32_Shdr *sechdrs,
73 *location += sym->st_value; 81 *location += sym->st_value;
74 break; 82 break;
75 case R_386_PC32: 83 case R_386_PC32:
76 /* Add the value, subtract its postition */ 84 /* Add the value, subtract its position */
77 *location += sym->st_value - (uint32_t)location; 85 *location += sym->st_value - (uint32_t)location;
78 break; 86 break;
79 default: 87 default:
80 printk(KERN_ERR "module %s: Unknown relocation: %u\n", 88 pr_err("%s: Unknown relocation: %u\n",
81 me->name, ELF32_R_TYPE(rel[i].r_info)); 89 me->name, ELF32_R_TYPE(rel[i].r_info));
82 return -ENOEXEC; 90 return -ENOEXEC;
83 } 91 }
@@ -97,8 +105,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
97 void *loc; 105 void *loc;
98 u64 val; 106 u64 val;
99 107
100 DEBUGP("Applying relocate section %u to %u\n", relsec, 108 DEBUGP("Applying relocate section %u to %u\n",
101 sechdrs[relsec].sh_info); 109 relsec, sechdrs[relsec].sh_info);
102 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { 110 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
103 /* This is where to make the change */ 111 /* This is where to make the change */
104 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr 112 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -110,8 +118,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
110 + ELF64_R_SYM(rel[i].r_info); 118 + ELF64_R_SYM(rel[i].r_info);
111 119
112 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", 120 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
113 (int)ELF64_R_TYPE(rel[i].r_info), 121 (int)ELF64_R_TYPE(rel[i].r_info),
114 sym->st_value, rel[i].r_addend, (u64)loc); 122 sym->st_value, rel[i].r_addend, (u64)loc);
115 123
116 val = sym->st_value + rel[i].r_addend; 124 val = sym->st_value + rel[i].r_addend;
117 125
@@ -140,7 +148,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
140#endif 148#endif
141 break; 149 break;
142 default: 150 default:
143 printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", 151 pr_err("%s: Unknown rela relocation: %llu\n",
144 me->name, ELF64_R_TYPE(rel[i].r_info)); 152 me->name, ELF64_R_TYPE(rel[i].r_info));
145 return -ENOEXEC; 153 return -ENOEXEC;
146 } 154 }
@@ -148,9 +156,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
148 return 0; 156 return 0;
149 157
150overflow: 158overflow:
151 printk(KERN_ERR "overflow in relocation type %d val %Lx\n", 159 pr_err("overflow in relocation type %d val %Lx\n",
152 (int)ELF64_R_TYPE(rel[i].r_info), val); 160 (int)ELF64_R_TYPE(rel[i].r_info), val);
153 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", 161 pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
154 me->name); 162 me->name);
155 return -ENOEXEC; 163 return -ENOEXEC;
156} 164}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b02d4dd6b8a..d2b56489d70 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,7 +27,6 @@
27#include <asm/proto.h> 27#include <asm/proto.h>
28#include <asm/bios_ebda.h> 28#include <asm/bios_ebda.h>
29#include <asm/e820.h> 29#include <asm/e820.h>
30#include <asm/trampoline.h>
31#include <asm/setup.h> 30#include <asm/setup.h>
32#include <asm/smp.h> 31#include <asm/smp.h>
33 32
@@ -568,8 +567,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
568 struct mpf_intel *mpf; 567 struct mpf_intel *mpf;
569 unsigned long mem; 568 unsigned long mem;
570 569
571 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 570 apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
572 bp, length); 571 base, base + length - 1);
573 BUILD_BUG_ON(sizeof(*mpf) != 16); 572 BUILD_BUG_ON(sizeof(*mpf) != 16);
574 573
575 while (length > 0) { 574 while (length > 0) {
@@ -584,8 +583,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
584#endif 583#endif
585 mpf_found = mpf; 584 mpf_found = mpf;
586 585
587 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", 586 printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
588 mpf, (u64)virt_to_phys(mpf)); 587 (unsigned long long) virt_to_phys(mpf),
588 (unsigned long long) virt_to_phys(mpf) +
589 sizeof(*mpf) - 1, mpf);
589 590
590 mem = virt_to_phys(mpf); 591 mem = virt_to_phys(mpf);
591 memblock_reserve(mem, sizeof(*mpf)); 592 memblock_reserve(mem, sizeof(*mpf));
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 90875279ef3..f84f5c57de3 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -365,8 +365,9 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
365#ifdef CONFIG_X86_32 365#ifdef CONFIG_X86_32
366/* 366/*
367 * For i386, NMIs use the same stack as the kernel, and we can 367 * For i386, NMIs use the same stack as the kernel, and we can
368 * add a workaround to the iret problem in C. Simply have 3 states 368 * add a workaround to the iret problem in C (preventing nested
369 * the NMI can be in. 369 * NMIs if an NMI takes a trap). Simply have 3 states the NMI
370 * can be in:
370 * 371 *
371 * 1) not running 372 * 1) not running
372 * 2) executing 373 * 2) executing
@@ -383,32 +384,50 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
383 * If an NMI hits a breakpoint that executes an iret, another 384 * If an NMI hits a breakpoint that executes an iret, another
384 * NMI can preempt it. We do not want to allow this new NMI 385 * NMI can preempt it. We do not want to allow this new NMI
385 * to run, but we want to execute it when the first one finishes. 386 * to run, but we want to execute it when the first one finishes.
386 * We set the state to "latched", and the first NMI will perform 387 * We set the state to "latched", and the exit of the first NMI will
387 * an cmpxchg on the state, and if it doesn't successfully 388 * perform a dec_return, if the result is zero (NOT_RUNNING), then
388 * reset the state to "not running" it will restart the next 389 * it will simply exit the NMI handler. If not, the dec_return
389 * NMI. 390 * would have set the state to NMI_EXECUTING (what we want it to
391 * be when we are running). In this case, we simply jump back
392 * to rerun the NMI handler again, and restart the 'latched' NMI.
393 *
394 * No trap (breakpoint or page fault) should be hit before nmi_restart,
395 * thus there is no race between the first check of state for NOT_RUNNING
396 * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
397 * at this point.
398 *
399 * In case the NMI takes a page fault, we need to save off the CR2
400 * because the NMI may have preempted a page fault whose CR2 has not
401 * been read yet; the NMI's own fault would clobber it. As nested NMIs must be restarted
402 * and they can not take breakpoints or page faults, the update of the
403 * CR2 must be done before converting the nmi state back to NOT_RUNNING.
404 * Otherwise, there would be a race of another nested NMI coming in
405 * after setting state to NOT_RUNNING but before updating the nmi_cr2.
390 */ 406 */
391enum nmi_states { 407enum nmi_states {
392 NMI_NOT_RUNNING, 408 NMI_NOT_RUNNING = 0,
393 NMI_EXECUTING, 409 NMI_EXECUTING,
394 NMI_LATCHED, 410 NMI_LATCHED,
395}; 411};
396static DEFINE_PER_CPU(enum nmi_states, nmi_state); 412static DEFINE_PER_CPU(enum nmi_states, nmi_state);
413static DEFINE_PER_CPU(unsigned long, nmi_cr2);
397 414
398#define nmi_nesting_preprocess(regs) \ 415#define nmi_nesting_preprocess(regs) \
399 do { \ 416 do { \
400 if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ 417 if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
401 __get_cpu_var(nmi_state) = NMI_LATCHED; \ 418 this_cpu_write(nmi_state, NMI_LATCHED); \
402 return; \ 419 return; \
403 } \ 420 } \
404 nmi_restart: \ 421 this_cpu_write(nmi_state, NMI_EXECUTING); \
405 __get_cpu_var(nmi_state) = NMI_EXECUTING; \ 422 this_cpu_write(nmi_cr2, read_cr2()); \
406 } while (0) 423 } while (0); \
424 nmi_restart:
407 425
408#define nmi_nesting_postprocess() \ 426#define nmi_nesting_postprocess() \
409 do { \ 427 do { \
410 if (cmpxchg(&__get_cpu_var(nmi_state), \ 428 if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
411 NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ 429 write_cr2(this_cpu_read(nmi_cr2)); \
430 if (this_cpu_dec_return(nmi_state)) \
412 goto nmi_restart; \ 431 goto nmi_restart; \
413 } while (0) 432 } while (0)
414#else /* x86_64 */ 433#else /* x86_64 */
@@ -444,14 +463,16 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
444 */ 463 */
445 if (unlikely(is_debug_stack(regs->sp))) { 464 if (unlikely(is_debug_stack(regs->sp))) {
446 debug_stack_set_zero(); 465 debug_stack_set_zero();
447 __get_cpu_var(update_debug_stack) = 1; 466 this_cpu_write(update_debug_stack, 1);
448 } 467 }
449} 468}
450 469
451static inline void nmi_nesting_postprocess(void) 470static inline void nmi_nesting_postprocess(void)
452{ 471{
453 if (unlikely(__get_cpu_var(update_debug_stack))) 472 if (unlikely(this_cpu_read(update_debug_stack))) {
454 debug_stack_reset(); 473 debug_stack_reset();
474 this_cpu_write(update_debug_stack, 0);
475 }
455} 476}
456#endif 477#endif
457 478
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index e31bf8d5c4d..6d9582ec032 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -42,7 +42,8 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
42static void __init init_nmi_testsuite(void) 42static void __init init_nmi_testsuite(void)
43{ 43{
44 /* trap all the unknown NMIs we may generate */ 44 /* trap all the unknown NMIs we may generate */
45 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); 45 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk",
46 __initdata);
46} 47}
47 48
48static void __init cleanup_nmi_testsuite(void) 49static void __init cleanup_nmi_testsuite(void)
@@ -65,7 +66,7 @@ static void __init test_nmi_ipi(struct cpumask *mask)
65 unsigned long timeout; 66 unsigned long timeout;
66 67
67 if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, 68 if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
68 NMI_FLAG_FIRST, "nmi_selftest")) { 69 NMI_FLAG_FIRST, "nmi_selftest", __initdata)) {
69 nmi_fail = FAILURE; 70 nmi_fail = FAILURE;
70 return; 71 return;
71 } 72 }
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 9ce885996fd..17fff18a103 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -352,9 +352,7 @@ struct pv_cpu_ops pv_cpu_ops = {
352#endif 352#endif
353 .wbinvd = native_wbinvd, 353 .wbinvd = native_wbinvd,
354 .read_msr = native_read_msr_safe, 354 .read_msr = native_read_msr_safe,
355 .rdmsr_regs = native_rdmsr_safe_regs,
356 .write_msr = native_write_msr_safe, 355 .write_msr = native_write_msr_safe,
357 .wrmsr_regs = native_wrmsr_safe_regs,
358 .read_tsc = native_read_tsc, 356 .read_tsc = native_read_tsc,
359 .read_pmc = native_read_pmc, 357 .read_pmc = native_read_pmc,
360 .read_tscp = native_read_tscp, 358 .read_tscp = native_read_tscp,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index b72838bae64..299d49302e7 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -22,6 +22,8 @@
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */ 23 */
24 24
25#define pr_fmt(fmt) "Calgary: " fmt
26
25#include <linux/kernel.h> 27#include <linux/kernel.h>
26#include <linux/init.h> 28#include <linux/init.h>
27#include <linux/types.h> 29#include <linux/types.h>
@@ -245,7 +247,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
245 offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, 247 offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
246 npages, 0, boundary_size, 0); 248 npages, 0, boundary_size, 0);
247 if (offset == ~0UL) { 249 if (offset == ~0UL) {
248 printk(KERN_WARNING "Calgary: IOMMU full.\n"); 250 pr_warn("IOMMU full\n");
249 spin_unlock_irqrestore(&tbl->it_lock, flags); 251 spin_unlock_irqrestore(&tbl->it_lock, flags);
250 if (panic_on_overflow) 252 if (panic_on_overflow)
251 panic("Calgary: fix the allocator.\n"); 253 panic("Calgary: fix the allocator.\n");
@@ -271,8 +273,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
271 entry = iommu_range_alloc(dev, tbl, npages); 273 entry = iommu_range_alloc(dev, tbl, npages);
272 274
273 if (unlikely(entry == DMA_ERROR_CODE)) { 275 if (unlikely(entry == DMA_ERROR_CODE)) {
274 printk(KERN_WARNING "Calgary: failed to allocate %u pages in " 276 pr_warn("failed to allocate %u pages in iommu %p\n",
275 "iommu %p\n", npages, tbl); 277 npages, tbl);
276 return DMA_ERROR_CODE; 278 return DMA_ERROR_CODE;
277 } 279 }
278 280
@@ -561,8 +563,7 @@ static void calgary_tce_cache_blast(struct iommu_table *tbl)
561 i++; 563 i++;
562 } while ((val & 0xff) != 0xff && i < 100); 564 } while ((val & 0xff) != 0xff && i < 100);
563 if (i == 100) 565 if (i == 100)
564 printk(KERN_WARNING "Calgary: PCI bus not quiesced, " 566 pr_warn("PCI bus not quiesced, continuing anyway\n");
565 "continuing anyway\n");
566 567
567 /* invalidate TCE cache */ 568 /* invalidate TCE cache */
568 target = calgary_reg(bbar, tar_offset(tbl->it_busno)); 569 target = calgary_reg(bbar, tar_offset(tbl->it_busno));
@@ -604,8 +605,7 @@ begin:
604 i++; 605 i++;
605 } while ((val64 & 0xff) != 0xff && i < 100); 606 } while ((val64 & 0xff) != 0xff && i < 100);
606 if (i == 100) 607 if (i == 100)
607 printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " 608 pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n");
608 "continuing anyway\n");
609 609
610 /* 3. poll Page Migration DEBUG for SoftStopFault */ 610 /* 3. poll Page Migration DEBUG for SoftStopFault */
611 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); 611 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
@@ -617,8 +617,7 @@ begin:
617 if (++count < 100) 617 if (++count < 100)
618 goto begin; 618 goto begin;
619 else { 619 else {
620 printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " 620 pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n");
621 "aborting TCE cache flush sequence!\n");
622 return; /* pray for the best */ 621 return; /* pray for the best */
623 } 622 }
624 } 623 }
@@ -840,8 +839,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl)
840 plssr = be32_to_cpu(readl(target)); 839 plssr = be32_to_cpu(readl(target));
841 840
842 /* If no error, the agent ID in the CSR is not valid */ 841 /* If no error, the agent ID in the CSR is not valid */
843 printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " 842 pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n",
844 "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); 843 tbl->it_busno, csr, plssr);
845} 844}
846 845
847static void calioc2_dump_error_regs(struct iommu_table *tbl) 846static void calioc2_dump_error_regs(struct iommu_table *tbl)
@@ -867,22 +866,21 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl)
867 target = calgary_reg(bbar, phboff | 0x800); 866 target = calgary_reg(bbar, phboff | 0x800);
868 mck = be32_to_cpu(readl(target)); 867 mck = be32_to_cpu(readl(target));
869 868
870 printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", 869 pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno);
871 tbl->it_busno);
872 870
873 printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", 871 pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
874 csr, plssr, csmr, mck); 872 csr, plssr, csmr, mck);
875 873
876 /* dump rest of error regs */ 874 /* dump rest of error regs */
877 printk(KERN_EMERG "Calgary: "); 875 pr_emerg("");
878 for (i = 0; i < ARRAY_SIZE(errregs); i++) { 876 for (i = 0; i < ARRAY_SIZE(errregs); i++) {
879 /* err regs are at 0x810 - 0x870 */ 877 /* err regs are at 0x810 - 0x870 */
880 erroff = (0x810 + (i * 0x10)); 878 erroff = (0x810 + (i * 0x10));
881 target = calgary_reg(bbar, phboff | erroff); 879 target = calgary_reg(bbar, phboff | erroff);
882 errregs[i] = be32_to_cpu(readl(target)); 880 errregs[i] = be32_to_cpu(readl(target));
883 printk("0x%08x@0x%lx ", errregs[i], erroff); 881 pr_cont("0x%08x@0x%lx ", errregs[i], erroff);
884 } 882 }
885 printk("\n"); 883 pr_cont("\n");
886 884
887 /* root complex status */ 885 /* root complex status */
888 target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); 886 target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 3003250ac51..de2b7ad7027 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,15 +45,6 @@ int iommu_detected __read_mostly = 0;
45 */ 45 */
46int iommu_pass_through __read_mostly; 46int iommu_pass_through __read_mostly;
47 47
48/*
49 * Group multi-function PCI devices into a single device-group for the
50 * iommu_device_group interface. This tells the iommu driver to pretend
51 * it cannot distinguish between functions of a device, exposing only one
52 * group for the device. Useful for disallowing use of individual PCI
53 * functions from userspace drivers.
54 */
55int iommu_group_mf __read_mostly;
56
57extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; 48extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
58 49
59/* Dummy device used for NULL arguments (normally ISA). */ 50/* Dummy device used for NULL arguments (normally ISA). */
@@ -101,13 +92,18 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
101{ 92{
102 unsigned long dma_mask; 93 unsigned long dma_mask;
103 struct page *page; 94 struct page *page;
95 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
104 dma_addr_t addr; 96 dma_addr_t addr;
105 97
106 dma_mask = dma_alloc_coherent_mask(dev, flag); 98 dma_mask = dma_alloc_coherent_mask(dev, flag);
107 99
108 flag |= __GFP_ZERO; 100 flag |= __GFP_ZERO;
109again: 101again:
110 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); 102 page = NULL;
103 if (!(flag & GFP_ATOMIC))
104 page = dma_alloc_from_contiguous(dev, count, get_order(size));
105 if (!page)
106 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
111 if (!page) 107 if (!page)
112 return NULL; 108 return NULL;
113 109
@@ -127,6 +123,16 @@ again:
127 return page_address(page); 123 return page_address(page);
128} 124}
129 125
126void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
127 dma_addr_t dma_addr, struct dma_attrs *attrs)
128{
129 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
130 struct page *page = virt_to_page(vaddr);
131
132 if (!dma_release_from_contiguous(dev, page, count))
133 free_pages((unsigned long)vaddr, get_order(size));
134}
135
130/* 136/*
131 * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel 137 * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
132 * parameter documentation. 138 * parameter documentation.
@@ -179,8 +185,6 @@ static __init int iommu_setup(char *p)
179#endif 185#endif
180 if (!strncmp(p, "pt", 2)) 186 if (!strncmp(p, "pt", 2))
181 iommu_pass_through = 1; 187 iommu_pass_through = 1;
182 if (!strncmp(p, "group_mf", 8))
183 iommu_group_mf = 1;
184 188
185 gart_parse_options(p); 189 gart_parse_options(p);
186 190
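
The allocation and free paths above pair up through CMA's ownership test; a hedged summary:

	/* alloc: try dma_alloc_from_contiguous() first (it may migrate
	 *        pages and sleep, hence the !(flag & GFP_ATOMIC) guard),
	 *        falling back to alloc_pages_node();
	 * free:  dma_release_from_contiguous() returns false for pages
	 *        that did not come from the CMA area, and only then does
	 *        free_pages() run. This shared free path is what lets
	 *        pci-nommu.c below drop its private nommu_free_coherent(). */
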
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index f96050685b4..871be4a84c7 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -74,12 +74,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
74 return nents; 74 return nents;
75} 75}
76 76
77static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
78 dma_addr_t dma_addr, struct dma_attrs *attrs)
79{
80 free_pages((unsigned long)vaddr, get_order(size));
81}
82
83static void nommu_sync_single_for_device(struct device *dev, 77static void nommu_sync_single_for_device(struct device *dev,
84 dma_addr_t addr, size_t size, 78 dma_addr_t addr, size_t size,
85 enum dma_data_direction dir) 79 enum dma_data_direction dir)
@@ -97,7 +91,7 @@ static void nommu_sync_sg_for_device(struct device *dev,
97 91
98struct dma_map_ops nommu_dma_ops = { 92struct dma_map_ops nommu_dma_ops = {
99 .alloc = dma_generic_alloc_coherent, 93 .alloc = dma_generic_alloc_coherent,
100 .free = nommu_free_coherent, 94 .free = dma_generic_free_coherent,
101 .map_sg = nommu_map_sg, 95 .map_sg = nommu_map_sg,
102 .map_page = nommu_map_page, 96 .map_page = nommu_map_page,
103 .sync_single_for_device = nommu_sync_single_for_device, 97 .sync_single_for_device = nommu_sync_single_for_device,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 735279e54e5..ef6a8456f71 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/errno.h> 3#include <linux/errno.h>
2#include <linux/kernel.h> 4#include <linux/kernel.h>
3#include <linux/mm.h> 5#include <linux/mm.h>
@@ -145,16 +147,14 @@ void show_regs_common(void)
145 /* Board Name is optional */ 147 /* Board Name is optional */
146 board = dmi_get_system_info(DMI_BOARD_NAME); 148 board = dmi_get_system_info(DMI_BOARD_NAME);
147 149
148 printk(KERN_CONT "\n"); 150 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n",
149 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s", 151 current->pid, current->comm, print_tainted(),
150 current->pid, current->comm, print_tainted(), 152 init_utsname()->release,
151 init_utsname()->release, 153 (int)strcspn(init_utsname()->version, " "),
152 (int)strcspn(init_utsname()->version, " "), 154 init_utsname()->version,
153 init_utsname()->version); 155 vendor, product,
154 printk(KERN_CONT " %s %s", vendor, product); 156 board ? "/" : "",
155 if (board) 157 board ? board : "");
156 printk(KERN_CONT "/%s", board);
157 printk(KERN_CONT "\n");
158} 158}
159 159
160void flush_thread(void) 160void flush_thread(void)
@@ -645,7 +645,7 @@ static void amd_e400_idle(void)
645 amd_e400_c1e_detected = true; 645 amd_e400_c1e_detected = true;
646 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 646 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
647 mark_tsc_unstable("TSC halt in AMD C1E"); 647 mark_tsc_unstable("TSC halt in AMD C1E");
648 printk(KERN_INFO "System has AMD C1E enabled\n"); 648 pr_info("System has AMD C1E enabled\n");
649 } 649 }
650 } 650 }
651 651
@@ -659,8 +659,7 @@ static void amd_e400_idle(void)
659 */ 659 */
660 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, 660 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
661 &cpu); 661 &cpu);
662 printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", 662 pr_info("Switch to broadcast mode on CPU%d\n", cpu);
663 cpu);
664 } 663 }
665 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 664 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
666 665
@@ -681,8 +680,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
681{ 680{
682#ifdef CONFIG_SMP 681#ifdef CONFIG_SMP
683 if (pm_idle == poll_idle && smp_num_siblings > 1) { 682 if (pm_idle == poll_idle && smp_num_siblings > 1) {
684 printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," 683 pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
685 " performance may degrade.\n");
686 } 684 }
687#endif 685#endif
688 if (pm_idle) 686 if (pm_idle)
@@ -692,11 +690,11 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
692 /* 690 /*
693 * One CPU supports mwait => All CPUs supports mwait 691 * One CPU supports mwait => All CPUs supports mwait
694 */ 692 */
695 printk(KERN_INFO "using mwait in idle threads.\n"); 693 pr_info("using mwait in idle threads\n");
696 pm_idle = mwait_idle; 694 pm_idle = mwait_idle;
697 } else if (cpu_has_amd_erratum(amd_erratum_400)) { 695 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
698 /* E400: APIC timer interrupt does not wake up CPU from C1e */ 696 /* E400: APIC timer interrupt does not wake up CPU from C1e */
699 printk(KERN_INFO "using AMD E400 aware idle routine\n"); 697 pr_info("using AMD E400 aware idle routine\n");
700 pm_idle = amd_e400_idle; 698 pm_idle = amd_e400_idle;
701 } else 699 } else
702 pm_idle = default_idle; 700 pm_idle = default_idle;
@@ -715,7 +713,7 @@ static int __init idle_setup(char *str)
715 return -EINVAL; 713 return -EINVAL;
716 714
717 if (!strcmp(str, "poll")) { 715 if (!strcmp(str, "poll")) {
718 printk("using polling idle threads.\n"); 716 pr_info("using polling idle threads\n");
719 pm_idle = poll_idle; 717 pm_idle = poll_idle;
720 boot_option_idle_override = IDLE_POLL; 718 boot_option_idle_override = IDLE_POLL;
721 } else if (!strcmp(str, "mwait")) { 719 } else if (!strcmp(str, "mwait")) {
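
Most of the process.c churn is mechanical: defining pr_fmt at the top of the file makes every pr_info()/pr_warn_once() call prepend the module name, so neither the KERN_* level nor a hand-written prefix is needed at each call site. Roughly how the expansion works (a sketch, not the exact printk.h text):

    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt  /* must precede printk.h */
    #include <linux/printk.h>

    /* pr_info("using mwait in idle threads\n") now expands to roughly:
     *   printk(KERN_INFO KBUILD_MODNAME ": " "using mwait in idle threads\n");
     * i.e. "process: using mwait in idle threads" at KERN_INFO level.
     */
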
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 61cdf7fdf09..0a980c9d7cb 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -117,10 +117,10 @@ void release_thread(struct task_struct *dead_task)
117{ 117{
118 if (dead_task->mm) { 118 if (dead_task->mm) {
119 if (dead_task->mm->context.size) { 119 if (dead_task->mm->context.size) {
120 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", 120 pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n",
121 dead_task->comm, 121 dead_task->comm,
122 dead_task->mm->context.ldt, 122 dead_task->mm->context.ldt,
123 dead_task->mm->context.size); 123 dead_task->mm->context.size);
124 BUG(); 124 BUG();
125 } 125 }
126 } 126 }
@@ -466,7 +466,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
466 task->thread.gs = addr; 466 task->thread.gs = addr;
467 if (doit) { 467 if (doit) {
468 load_gs_index(0); 468 load_gs_index(0);
469 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 469 ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
470 } 470 }
471 } 471 }
472 put_cpu(); 472 put_cpu();
@@ -494,7 +494,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
494 /* set the selector to 0 to not confuse 494 /* set the selector to 0 to not confuse
495 __switch_to */ 495 __switch_to */
496 loadsegment(fs, 0); 496 loadsegment(fs, 0);
497 ret = checking_wrmsrl(MSR_FS_BASE, addr); 497 ret = wrmsrl_safe(MSR_FS_BASE, addr);
498 } 498 }
499 } 499 }
500 put_cpu(); 500 put_cpu();
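
checking_wrmsrl() is renamed wrmsrl_safe() throughout; it is the exception-tolerant MSR write, returning an error code instead of taking an unhandled fault. A hedged sketch of the calling convention as used in do_arch_prctl() above:

    /* Sketch: exception-tolerant MSR write as used in do_arch_prctl() */
    static long set_gs_base(unsigned long addr)
    {
            /* returns non-zero if the WRMSR faulted, e.g. a bad address */
            return wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
    }
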
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 13b1990c7c5..c4c6a5c2bf0 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1211,12 +1211,6 @@ static long x32_arch_ptrace(struct task_struct *child,
1211 0, sizeof(struct user_i387_struct), 1211 0, sizeof(struct user_i387_struct),
1212 datap); 1212 datap);
1213 1213
1214 /* normal 64bit interface to access TLS data.
1215 Works just like arch_prctl, except that the arguments
1216 are reversed. */
1217 case PTRACE_ARCH_PRCTL:
1218 return do_arch_prctl(child, data, addr);
1219
1220 default: 1214 default:
1221 return compat_ptrace_request(child, request, addr, data); 1215 return compat_ptrace_request(child, request, addr, data);
1222 } 1216 }
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 03920a15a63..1b27de56356 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
512 512
513#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) 513#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
514/* Set correct numa_node information for AMD NB functions */ 514/* Set correct numa_node information for AMD NB functions */
515static void __init quirk_amd_nb_node(struct pci_dev *dev) 515static void __devinit quirk_amd_nb_node(struct pci_dev *dev)
516{ 516{
517 struct pci_dev *nb_ht; 517 struct pci_dev *nb_ht;
518 unsigned int devfn; 518 unsigned int devfn;
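
The annotation change on quirk_amd_nb_node() matters for hotplug: PCI header fixups can run when a device appears after boot, by which time __init memory has been discarded, so __devinit is the correct lifetime. A hypothetical fixup in the same shape (the device ID and body are invented for illustration):

    /* Hypothetical PCI header fixup; must not be __init since it can run
     * at device hotplug time, after init memory is freed */
    static void __devinit quirk_example_node(struct pci_dev *dev)
    {
            set_dev_node(&dev->dev, 0);     /* illustrative body only */
    }
    DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1100, quirk_example_node);
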
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 77215c23fba..52190a938b4 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/module.h> 3#include <linux/module.h>
2#include <linux/reboot.h> 4#include <linux/reboot.h>
3#include <linux/init.h> 5#include <linux/init.h>
@@ -20,13 +22,12 @@
20#include <asm/virtext.h> 22#include <asm/virtext.h>
21#include <asm/cpu.h> 23#include <asm/cpu.h>
22#include <asm/nmi.h> 24#include <asm/nmi.h>
25#include <asm/smp.h>
23 26
24#ifdef CONFIG_X86_32 27#include <linux/ctype.h>
25# include <linux/ctype.h> 28#include <linux/mc146818rtc.h>
26# include <linux/mc146818rtc.h> 29#include <asm/realmode.h>
27#else 30#include <asm/x86_init.h>
28# include <asm/x86_init.h>
29#endif
30 31
31/* 32/*
32 * Power off function, if any 33 * Power off function, if any
@@ -48,7 +49,7 @@ int reboot_force;
48 */ 49 */
49static int reboot_default = 1; 50static int reboot_default = 1;
50 51
51#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 52#ifdef CONFIG_SMP
52static int reboot_cpu = -1; 53static int reboot_cpu = -1;
53#endif 54#endif
54 55
@@ -66,8 +67,8 @@ bool port_cf9_safe = false;
66 * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] 67 * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
67 * warm Don't set the cold reboot flag 68 * warm Don't set the cold reboot flag
68 * cold Set the cold reboot flag 69 * cold Set the cold reboot flag
69 * bios Reboot by jumping through the BIOS (only for X86_32) 70 * bios Reboot by jumping through the BIOS
70 * smp Reboot by executing reset on BSP or other CPU (only for X86_32) 71 * smp Reboot by executing reset on BSP or other CPU
71 * triple Force a triple fault (init) 72 * triple Force a triple fault (init)
72 * kbd Use the keyboard controller. cold reset (default) 73 * kbd Use the keyboard controller. cold reset (default)
73 * acpi Use the RESET_REG in the FADT 74 * acpi Use the RESET_REG in the FADT
@@ -94,7 +95,6 @@ static int __init reboot_setup(char *str)
94 reboot_mode = 0; 95 reboot_mode = 0;
95 break; 96 break;
96 97
97#ifdef CONFIG_X86_32
98#ifdef CONFIG_SMP 98#ifdef CONFIG_SMP
99 case 's': 99 case 's':
100 if (isdigit(*(str+1))) { 100 if (isdigit(*(str+1))) {
@@ -111,7 +111,6 @@ static int __init reboot_setup(char *str)
111#endif /* CONFIG_SMP */ 111#endif /* CONFIG_SMP */
112 112
113 case 'b': 113 case 'b':
114#endif
115 case 'a': 114 case 'a':
116 case 'k': 115 case 'k':
117 case 't': 116 case 't':
@@ -137,7 +136,6 @@ static int __init reboot_setup(char *str)
137__setup("reboot=", reboot_setup); 136__setup("reboot=", reboot_setup);
138 137
139 138
140#ifdef CONFIG_X86_32
141/* 139/*
142 * Reboot options and system auto-detection code provided by 140 * Reboot options and system auto-detection code provided by
143 * Dell Inc. so their systems "just work". :-) 141 * Dell Inc. so their systems "just work". :-)
@@ -151,21 +149,14 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
151{ 149{
152 if (reboot_type != BOOT_BIOS) { 150 if (reboot_type != BOOT_BIOS) {
153 reboot_type = BOOT_BIOS; 151 reboot_type = BOOT_BIOS;
154 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); 152 pr_info("%s series board detected. Selecting %s-method for reboots.\n",
153 "BIOS", d->ident);
155 } 154 }
156 return 0; 155 return 0;
157} 156}
158 157
159extern const unsigned char machine_real_restart_asm[]; 158void __noreturn machine_real_restart(unsigned int type)
160extern const u64 machine_real_restart_gdt[3];
161
162void machine_real_restart(unsigned int type)
163{ 159{
164 void *restart_va;
165 unsigned long restart_pa;
166 void (*restart_lowmem)(unsigned int);
167 u64 *lowmem_gdt;
168
169 local_irq_disable(); 160 local_irq_disable();
170 161
171 /* 162 /*
@@ -185,40 +176,28 @@ void machine_real_restart(unsigned int type)
185 /* 176 /*
186 * Switch back to the initial page table. 177 * Switch back to the initial page table.
187 */ 178 */
179#ifdef CONFIG_X86_32
188 load_cr3(initial_page_table); 180 load_cr3(initial_page_table);
189 181#else
190 /* 182 write_cr3(real_mode_header->trampoline_pgd);
191 * Write 0x1234 to absolute memory location 0x472. The BIOS reads 183#endif
192 * this on booting to tell it to "Bypass memory test (also warm
193 * boot)". This seems like a fairly standard thing that gets set by
194 * REBOOT.COM programs, and the previous reset routine did this
195 * too. */
196 *((unsigned short *)0x472) = reboot_mode;
197
198 /* Patch the GDT in the low memory trampoline */
199 lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
200
201 restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
202 restart_pa = virt_to_phys(restart_va);
203 restart_lowmem = (void (*)(unsigned int))restart_pa;
204
205 /* GDT[0]: GDT self-pointer */
206 lowmem_gdt[0] =
207 (u64)(sizeof(machine_real_restart_gdt) - 1) +
208 ((u64)virt_to_phys(lowmem_gdt) << 16);
209 /* GDT[1]: 64K real mode code segment */
210 lowmem_gdt[1] =
211 GDT_ENTRY(0x009b, restart_pa, 0xffff);
212 184
213 /* Jump to the identity-mapped low memory code */ 185 /* Jump to the identity-mapped low memory code */
214 restart_lowmem(type); 186#ifdef CONFIG_X86_32
187 asm volatile("jmpl *%0" : :
188 "rm" (real_mode_header->machine_real_restart_asm),
189 "a" (type));
190#else
191 asm volatile("ljmpl *%0" : :
192 "m" (real_mode_header->machine_real_restart_asm),
193 "D" (type));
194#endif
195 unreachable();
215} 196}
216#ifdef CONFIG_APM_MODULE 197#ifdef CONFIG_APM_MODULE
217EXPORT_SYMBOL(machine_real_restart); 198EXPORT_SYMBOL(machine_real_restart);
218#endif 199#endif
219 200
220#endif /* CONFIG_X86_32 */
221
222/* 201/*
223 * Some Apple MacBook and MacBookPro models need reboot=p to be able to reboot 202 * Some Apple MacBook and MacBookPro models need reboot=p to be able to reboot
224 */ 203 */
@@ -226,8 +205,8 @@ static int __init set_pci_reboot(const struct dmi_system_id *d)
226{ 205{
227 if (reboot_type != BOOT_CF9) { 206 if (reboot_type != BOOT_CF9) {
228 reboot_type = BOOT_CF9; 207 reboot_type = BOOT_CF9;
229 printk(KERN_INFO "%s series board detected. " 208 pr_info("%s series board detected. Selecting %s-method for reboots.\n",
230 "Selecting PCI-method for reboots.\n", d->ident); 209 "PCI", d->ident);
231 } 210 }
232 return 0; 211 return 0;
233} 212}
@@ -236,17 +215,16 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
236{ 215{
237 if (reboot_type != BOOT_KBD) { 216 if (reboot_type != BOOT_KBD) {
238 reboot_type = BOOT_KBD; 217 reboot_type = BOOT_KBD;
239 printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident); 218 pr_info("%s series board detected. Selecting %s-method for reboot.\n",
219 "KBD", d->ident);
240 } 220 }
241 return 0; 221 return 0;
242} 222}
243 223
244/* 224/*
245 * This is a single dmi_table handling all reboot quirks. Note that 225 * This is a single dmi_table handling all reboot quirks.
246 * REBOOT_BIOS is only available for 32bit
247 */ 226 */
248static struct dmi_system_id __initdata reboot_dmi_table[] = { 227static struct dmi_system_id __initdata reboot_dmi_table[] = {
249#ifdef CONFIG_X86_32
250 { /* Handle problems with rebooting on Dell E520's */ 228 { /* Handle problems with rebooting on Dell E520's */
251 .callback = set_bios_reboot, 229 .callback = set_bios_reboot,
252 .ident = "Dell E520", 230 .ident = "Dell E520",
@@ -396,7 +374,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
396 DMI_MATCH(DMI_BOARD_NAME, "P4S800"), 374 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
397 }, 375 },
398 }, 376 },
399#endif /* CONFIG_X86_32 */
400 377
401 { /* Handle reboot issue on Acer Aspire one */ 378 { /* Handle reboot issue on Acer Aspire one */
402 .callback = set_kbd_reboot, 379 .callback = set_kbd_reboot,
@@ -470,6 +447,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
470 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), 447 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
471 }, 448 },
472 }, 449 },
450 { /* Handle problems with rebooting on the Precision M6600. */
451 .callback = set_pci_reboot,
452 .ident = "Dell Precision M6600",
453 .matches = {
454 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
455 DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
456 },
457 },
473 { } 458 { }
474}; 459};
475 460
@@ -595,13 +580,11 @@ static void native_machine_emergency_restart(void)
595 reboot_type = BOOT_KBD; 580 reboot_type = BOOT_KBD;
596 break; 581 break;
597 582
598#ifdef CONFIG_X86_32
599 case BOOT_BIOS: 583 case BOOT_BIOS:
600 machine_real_restart(MRR_BIOS); 584 machine_real_restart(MRR_BIOS);
601 585
602 reboot_type = BOOT_KBD; 586 reboot_type = BOOT_KBD;
603 break; 587 break;
604#endif
605 588
606 case BOOT_ACPI: 589 case BOOT_ACPI:
607 acpi_reboot(); 590 acpi_reboot();
@@ -643,12 +626,10 @@ void native_machine_shutdown(void)
643 /* The boot cpu is always logical cpu 0 */ 626 /* The boot cpu is always logical cpu 0 */
644 int reboot_cpu_id = 0; 627 int reboot_cpu_id = 0;
645 628
646#ifdef CONFIG_X86_32
647 /* See if a command line override has been given */ 629 /* See if a command line override has been given */
648 if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) && 630 if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
649 cpu_online(reboot_cpu)) 631 cpu_online(reboot_cpu))
650 reboot_cpu_id = reboot_cpu; 632 reboot_cpu_id = reboot_cpu;
651#endif
652 633
653 /* Make certain the cpu I'm about to reboot on is online */ 634 /* Make certain the cpu I'm about to reboot on is online */
654 if (!cpu_online(reboot_cpu_id)) 635 if (!cpu_online(reboot_cpu_id))
@@ -658,9 +639,11 @@ void native_machine_shutdown(void)
658 set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); 639 set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
659 640
660 /* 641 /*
661 * O.K., now that I'm on the appropriate processor, 642 * O.K., now that I'm on the appropriate processor, stop all of the
662 * stop all of the others. 643 * others. Also disable local irqs so we don't receive the per-cpu
644 * timer interrupt, which may trigger the scheduler's load balancing.
663 */ 645 */
646 local_irq_disable();
664 stop_other_cpus(); 647 stop_other_cpus();
665#endif 648#endif
666 649
@@ -687,7 +670,7 @@ static void __machine_emergency_restart(int emergency)
687 670
688static void native_machine_restart(char *__unused) 671static void native_machine_restart(char *__unused)
689{ 672{
690 printk("machine restart\n"); 673 pr_notice("machine restart\n");
691 674
692 if (!reboot_force) 675 if (!reboot_force)
693 machine_shutdown(); 676 machine_shutdown();
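
The DMI table above is the extension point for boards with broken default reboot paths: each entry pairs a callback that switches reboot_type (set_bios_reboot, set_kbd_reboot, set_pci_reboot) with DMI match strings. A hypothetical entry in the same shape (vendor and product strings invented for illustration):

    {       /* Hypothetical: a board that only resets reliably via port CF9 */
            .callback = set_pci_reboot,
            .ident = "Example X100",
            .matches = {
                    DMI_MATCH(DMI_SYS_VENDOR, "Example Inc."),
                    DMI_MATCH(DMI_PRODUCT_NAME, "X100"),
            },
    },
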
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 58a07b10812..f4b9b80e1b9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -49,6 +49,7 @@
49#include <asm/pci-direct.h> 49#include <asm/pci-direct.h>
50#include <linux/init_ohci1394_dma.h> 50#include <linux/init_ohci1394_dma.h>
51#include <linux/kvm_para.h> 51#include <linux/kvm_para.h>
52#include <linux/dma-contiguous.h>
52 53
53#include <linux/errno.h> 54#include <linux/errno.h>
54#include <linux/kernel.h> 55#include <linux/kernel.h>
@@ -72,7 +73,7 @@
72 73
73#include <asm/mtrr.h> 74#include <asm/mtrr.h>
74#include <asm/apic.h> 75#include <asm/apic.h>
75#include <asm/trampoline.h> 76#include <asm/realmode.h>
76#include <asm/e820.h> 77#include <asm/e820.h>
77#include <asm/mpspec.h> 78#include <asm/mpspec.h>
78#include <asm/setup.h> 79#include <asm/setup.h>
@@ -333,8 +334,8 @@ static void __init relocate_initrd(void)
333 memblock_reserve(ramdisk_here, area_size); 334 memblock_reserve(ramdisk_here, area_size);
334 initrd_start = ramdisk_here + PAGE_OFFSET; 335 initrd_start = ramdisk_here + PAGE_OFFSET;
335 initrd_end = initrd_start + ramdisk_size; 336 initrd_end = initrd_start + ramdisk_size;
336 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", 337 printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
337 ramdisk_here, ramdisk_here + ramdisk_size); 338 ramdisk_here, ramdisk_here + ramdisk_size - 1);
338 339
339 q = (char *)initrd_start; 340 q = (char *)initrd_start;
340 341
@@ -365,8 +366,8 @@ static void __init relocate_initrd(void)
365 /* high pages is not converted by early_res_to_bootmem */ 366 /* high pages is not converted by early_res_to_bootmem */
366 ramdisk_image = boot_params.hdr.ramdisk_image; 367 ramdisk_image = boot_params.hdr.ramdisk_image;
367 ramdisk_size = boot_params.hdr.ramdisk_size; 368 ramdisk_size = boot_params.hdr.ramdisk_size;
368 printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to" 369 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
369 " %08llx - %08llx\n", 370 " [mem %#010llx-%#010llx]\n",
370 ramdisk_image, ramdisk_image + ramdisk_size - 1, 371 ramdisk_image, ramdisk_image + ramdisk_size - 1,
371 ramdisk_here, ramdisk_here + ramdisk_size - 1); 372 ramdisk_here, ramdisk_here + ramdisk_size - 1);
372} 373}
@@ -391,8 +392,8 @@ static void __init reserve_initrd(void)
391 ramdisk_size, end_of_lowmem>>1); 392 ramdisk_size, end_of_lowmem>>1);
392 } 393 }
393 394
394 printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, 395 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
395 ramdisk_end); 396 ramdisk_end - 1);
396 397
397 398
398 if (ramdisk_end <= end_of_lowmem) { 399 if (ramdisk_end <= end_of_lowmem) {
@@ -905,10 +906,10 @@ void __init setup_arch(char **cmdline_p)
905 setup_bios_corruption_check(); 906 setup_bios_corruption_check();
906#endif 907#endif
907 908
908 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", 909 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
909 max_pfn_mapped<<PAGE_SHIFT); 910 (max_pfn_mapped<<PAGE_SHIFT) - 1);
910 911
911 setup_trampolines(); 912 setup_real_mode();
912 913
913 init_gbpages(); 914 init_gbpages();
914 915
@@ -925,6 +926,7 @@ void __init setup_arch(char **cmdline_p)
925 } 926 }
926#endif 927#endif
927 memblock.current_limit = get_max_mapped(); 928 memblock.current_limit = get_max_mapped();
929 dma_contiguous_reserve(0);
928 930
929 /* 931 /*
930 * NOTE: On x86-32, only from this point on, fixmaps are ready for use. 932 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -966,6 +968,8 @@ void __init setup_arch(char **cmdline_p)
966 if (boot_cpu_data.cpuid_level >= 0) { 968 if (boot_cpu_data.cpuid_level >= 0) {
967 /* A CPU has %cr4 if and only if it has CPUID */ 969 /* A CPU has %cr4 if and only if it has CPUID */
968 mmu_cr4_features = read_cr4(); 970 mmu_cr4_features = read_cr4();
971 if (trampoline_cr4_features)
972 *trampoline_cr4_features = mmu_cr4_features;
969 } 973 }
970 974
971#ifdef CONFIG_X86_32 975#ifdef CONFIG_X86_32
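
The message conversions in setup.c adopt the "[mem %#010llx-%#010llx]" range style with an inclusive end address, which is why every end operand gains a "- 1". The convention in isolation:

    /* Sketch: print a physical range [start, start + size) in the
     * inclusive style used by the hunks above */
    printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n",
           (unsigned long long)start,
           (unsigned long long)(start + size - 1));
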
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b68ccadd2ff..b280908a376 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,6 +6,9 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
9#include <linux/sched.h> 12#include <linux/sched.h>
10#include <linux/mm.h> 13#include <linux/mm.h>
11#include <linux/smp.h> 14#include <linux/smp.h>
@@ -18,6 +21,7 @@
18#include <linux/personality.h> 21#include <linux/personality.h>
19#include <linux/uaccess.h> 22#include <linux/uaccess.h>
20#include <linux/user-return-notifier.h> 23#include <linux/user-return-notifier.h>
24#include <linux/uprobes.h>
21 25
22#include <asm/processor.h> 26#include <asm/processor.h>
23#include <asm/ucontext.h> 27#include <asm/ucontext.h>
@@ -554,7 +558,6 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
554 sizeof(frame->extramask)))) 558 sizeof(frame->extramask))))
555 goto badframe; 559 goto badframe;
556 560
557 sigdelsetmask(&set, ~_BLOCKABLE);
558 set_current_blocked(&set); 561 set_current_blocked(&set);
559 562
560 if (restore_sigcontext(regs, &frame->sc, &ax)) 563 if (restore_sigcontext(regs, &frame->sc, &ax))
@@ -580,7 +583,6 @@ long sys_rt_sigreturn(struct pt_regs *regs)
580 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) 583 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
581 goto badframe; 584 goto badframe;
582 585
583 sigdelsetmask(&set, ~_BLOCKABLE);
584 set_current_blocked(&set); 586 set_current_blocked(&set);
585 587
586 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 588 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
@@ -646,42 +648,28 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
646 struct pt_regs *regs) 648 struct pt_regs *regs)
647{ 649{
648 int usig = signr_convert(sig); 650 int usig = signr_convert(sig);
649 sigset_t *set = &current->blocked; 651 sigset_t *set = sigmask_to_save();
650 int ret;
651
652 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
653 set = &current->saved_sigmask;
654 652
655 /* Set up the stack frame */ 653 /* Set up the stack frame */
656 if (is_ia32) { 654 if (is_ia32) {
657 if (ka->sa.sa_flags & SA_SIGINFO) 655 if (ka->sa.sa_flags & SA_SIGINFO)
658 ret = ia32_setup_rt_frame(usig, ka, info, set, regs); 656 return ia32_setup_rt_frame(usig, ka, info, set, regs);
659 else 657 else
660 ret = ia32_setup_frame(usig, ka, set, regs); 658 return ia32_setup_frame(usig, ka, set, regs);
661#ifdef CONFIG_X86_X32_ABI 659#ifdef CONFIG_X86_X32_ABI
662 } else if (is_x32) { 660 } else if (is_x32) {
663 ret = x32_setup_rt_frame(usig, ka, info, 661 return x32_setup_rt_frame(usig, ka, info,
664 (compat_sigset_t *)set, regs); 662 (compat_sigset_t *)set, regs);
665#endif 663#endif
666 } else { 664 } else {
667 ret = __setup_rt_frame(sig, ka, info, set, regs); 665 return __setup_rt_frame(sig, ka, info, set, regs);
668 }
669
670 if (ret) {
671 force_sigsegv(sig, current);
672 return -EFAULT;
673 } 666 }
674
675 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
676 return ret;
677} 667}
678 668
679static int 669static void
680handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 670handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
681 struct pt_regs *regs) 671 struct pt_regs *regs)
682{ 672{
683 int ret;
684
685 /* Are we from a system call? */ 673 /* Are we from a system call? */
686 if (syscall_get_nr(current, regs) >= 0) { 674 if (syscall_get_nr(current, regs) >= 0) {
687 /* If so, check system call restarting.. */ 675 /* If so, check system call restarting.. */
@@ -712,10 +700,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
712 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 700 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
713 regs->flags &= ~X86_EFLAGS_TF; 701 regs->flags &= ~X86_EFLAGS_TF;
714 702
715 ret = setup_rt_frame(sig, ka, info, regs); 703 if (setup_rt_frame(sig, ka, info, regs) < 0) {
716 704 force_sigsegv(sig, current);
717 if (ret) 705 return;
718 return ret; 706 }
719 707
720 /* 708 /*
721 * Clear the direction flag as per the ABI for function entry. 709 * Clear the direction flag as per the ABI for function entry.
@@ -730,12 +718,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
730 */ 718 */
731 regs->flags &= ~X86_EFLAGS_TF; 719 regs->flags &= ~X86_EFLAGS_TF;
732 720
733 block_sigmask(ka, sig); 721 signal_delivered(sig, info, ka, regs,
734 722 test_thread_flag(TIF_SINGLESTEP));
735 tracehook_signal_handler(sig, info, ka, regs,
736 test_thread_flag(TIF_SINGLESTEP));
737
738 return 0;
739} 723}
740 724
741#ifdef CONFIG_X86_32 725#ifdef CONFIG_X86_32
@@ -756,16 +740,6 @@ static void do_signal(struct pt_regs *regs)
756 siginfo_t info; 740 siginfo_t info;
757 int signr; 741 int signr;
758 742
759 /*
760 * We want the common case to go fast, which is why we may in certain
761 * cases get here from kernel mode. Just return without doing anything
762 * if so.
763 * X86_32: vm86 regs switched out by assembly code before reaching
764 * here, so testing against kernel CS suffices.
765 */
766 if (!user_mode(regs))
767 return;
768
769 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 743 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
770 if (signr > 0) { 744 if (signr > 0) {
771 /* Whee! Actually deliver the signal. */ 745 /* Whee! Actually deliver the signal. */
@@ -795,10 +769,7 @@ static void do_signal(struct pt_regs *regs)
795 * If there's no signal to deliver, we just put the saved sigmask 769 * If there's no signal to deliver, we just put the saved sigmask
796 * back. 770 * back.
797 */ 771 */
798 if (current_thread_info()->status & TS_RESTORE_SIGMASK) { 772 restore_saved_sigmask();
799 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
800 set_current_blocked(&current->saved_sigmask);
801 }
802} 773}
803 774
804/* 775/*
@@ -814,6 +785,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
814 mce_notify_process(); 785 mce_notify_process();
815#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ 786#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
816 787
788 if (thread_info_flags & _TIF_UPROBE) {
789 clear_thread_flag(TIF_UPROBE);
790 uprobe_notify_resume(regs);
791 }
792
817 /* deal with pending signal delivery */ 793 /* deal with pending signal delivery */
818 if (thread_info_flags & _TIF_SIGPENDING) 794 if (thread_info_flags & _TIF_SIGPENDING)
819 do_signal(regs); 795 do_signal(regs);
@@ -821,8 +797,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
821 if (thread_info_flags & _TIF_NOTIFY_RESUME) { 797 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
822 clear_thread_flag(TIF_NOTIFY_RESUME); 798 clear_thread_flag(TIF_NOTIFY_RESUME);
823 tracehook_notify_resume(regs); 799 tracehook_notify_resume(regs);
824 if (current->replacement_session_keyring)
825 key_replace_session_keyring();
826 } 800 }
827 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) 801 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
828 fire_user_return_notifiers(); 802 fire_user_return_notifiers();
@@ -843,7 +817,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
843 me->comm, me->pid, where, frame, 817 me->comm, me->pid, where, frame,
844 regs->ip, regs->sp, regs->orig_ax); 818 regs->ip, regs->sp, regs->orig_ax);
845 print_vma_addr(" in ", regs->ip); 819 print_vma_addr(" in ", regs->ip);
846 printk(KERN_CONT "\n"); 820 pr_cont("\n");
847 } 821 }
848 822
849 force_sig(SIGSEGV, me); 823 force_sig(SIGSEGV, me);
@@ -930,7 +904,6 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
930 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) 904 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
931 goto badframe; 905 goto badframe;
932 906
933 sigdelsetmask(&set, ~_BLOCKABLE);
934 set_current_blocked(&set); 907 set_current_blocked(&set);
935 908
936 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 909 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
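
The signal.c conversion swaps the open-coded TS_RESTORE_SIGMASK handling for generic helpers: sigmask_to_save() yields the mask to record in the frame, signal_delivered() performs the post-delivery bookkeeping (including the single-step notification), and restore_saved_sigmask() covers the no-signal path. A condensed sketch of the resulting flow, with setup_frame() standing in for the ia32/x32/native frame builders:

    /* Sketch: delivery flow after the generic-helper conversion */
    static void deliver_one_signal(int sig, siginfo_t *info,
                                   struct k_sigaction *ka,
                                   struct pt_regs *regs)
    {
            sigset_t *set = sigmask_to_save();      /* blocked or saved mask */

            if (setup_frame(sig, ka, info, set, regs) < 0) {
                    force_sigsegv(sig, current);    /* frame setup failed */
                    return;
            }

            regs->flags &= ~X86_EFLAGS_TF;          /* clear TF per the ABI */
            signal_delivered(sig, info, ka, regs,
                             test_thread_flag(TIF_SINGLESTEP));
    }
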
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 433529e29be..c1a310fb830 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1,4 +1,4 @@
1/* 1 /*
2 * x86 SMP booting functions 2 * x86 SMP booting functions
3 * 3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> 4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
@@ -39,6 +39,8 @@
39 * Glauber Costa : i386 and x86_64 integration 39 * Glauber Costa : i386 and x86_64 integration
40 */ 40 */
41 41
42#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
43
42#include <linux/init.h> 44#include <linux/init.h>
43#include <linux/smp.h> 45#include <linux/smp.h>
44#include <linux/module.h> 46#include <linux/module.h>
@@ -57,7 +59,7 @@
57#include <asm/nmi.h> 59#include <asm/nmi.h>
58#include <asm/irq.h> 60#include <asm/irq.h>
59#include <asm/idle.h> 61#include <asm/idle.h>
60#include <asm/trampoline.h> 62#include <asm/realmode.h>
61#include <asm/cpu.h> 63#include <asm/cpu.h>
62#include <asm/numa.h> 64#include <asm/numa.h>
63#include <asm/pgtable.h> 65#include <asm/pgtable.h>
@@ -73,6 +75,8 @@
73#include <asm/smpboot_hooks.h> 75#include <asm/smpboot_hooks.h>
74#include <asm/i8259.h> 76#include <asm/i8259.h>
75 77
78#include <asm/realmode.h>
79
76/* State of each CPU */ 80/* State of each CPU */
77DEFINE_PER_CPU(int, cpu_state) = { 0 }; 81DEFINE_PER_CPU(int, cpu_state) = { 0 };
78 82
@@ -182,7 +186,7 @@ static void __cpuinit smp_callin(void)
182 * boards) 186 * boards)
183 */ 187 */
184 188
185 pr_debug("CALLIN, before setup_local_APIC().\n"); 189 pr_debug("CALLIN, before setup_local_APIC()\n");
186 if (apic->smp_callin_clear_local_apic) 190 if (apic->smp_callin_clear_local_apic)
187 apic->smp_callin_clear_local_apic(); 191 apic->smp_callin_clear_local_apic();
188 setup_local_APIC(); 192 setup_local_APIC();
@@ -253,22 +257,13 @@ notrace static void __cpuinit start_secondary(void *unused)
253 check_tsc_sync_target(); 257 check_tsc_sync_target();
254 258
255 /* 259 /*
256 * We need to hold call_lock, so there is no inconsistency
257 * between the time smp_call_function() determines number of
258 * IPI recipients, and the time when the determination is made
259 * for which cpus receive the IPI. Holding this
260 * lock helps us to not include this cpu in a currently in progress
261 * smp_call_function().
262 *
263 * We need to hold vector_lock so that the set of online cpus 260 * We need to hold vector_lock so that the set of online cpus
264 * does not change while we are assigning vectors to cpus. Holding 261 * does not change while we are assigning vectors to cpus. Holding
265 * this lock ensures we don't half assign or remove an irq from a cpu. 262 * this lock ensures we don't half assign or remove an irq from a cpu.
266 */ 263 */
267 ipi_call_lock();
268 lock_vector_lock(); 264 lock_vector_lock();
269 set_cpu_online(smp_processor_id(), true); 265 set_cpu_online(smp_processor_id(), true);
270 unlock_vector_lock(); 266 unlock_vector_lock();
271 ipi_call_unlock();
272 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 267 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
273 x86_platform.nmi_init(); 268 x86_platform.nmi_init();
274 269
@@ -347,9 +342,12 @@ static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
347 342
348static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) 343static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
349{ 344{
350 if (c->phys_proc_id == o->phys_proc_id) 345 if (c->phys_proc_id == o->phys_proc_id) {
351 return topology_sane(c, o, "mc"); 346 if (cpu_has(c, X86_FEATURE_AMD_DCM))
347 return true;
352 348
349 return topology_sane(c, o, "mc");
350 }
353 return false; 351 return false;
354} 352}
355 353
@@ -380,6 +378,15 @@ void __cpuinit set_cpu_sibling_map(int cpu)
380 if ((i == cpu) || (has_mc && match_llc(c, o))) 378 if ((i == cpu) || (has_mc && match_llc(c, o)))
381 link_mask(llc_shared, cpu, i); 379 link_mask(llc_shared, cpu, i);
382 380
381 }
382
383 /*
384 * This needs a separate iteration over the cpus because we rely on all
385 * cpu_sibling_mask links to be set-up.
386 */
387 for_each_cpu(i, cpu_sibling_setup_mask) {
388 o = &cpu_data(i);
389
383 if ((i == cpu) || (has_mc && match_mc(c, o))) { 390 if ((i == cpu) || (has_mc && match_mc(c, o))) {
384 link_mask(core, cpu, i); 391 link_mask(core, cpu, i);
385 392
@@ -408,15 +415,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
408/* maps the cpu to the sched domain representing multi-core */ 415/* maps the cpu to the sched domain representing multi-core */
409const struct cpumask *cpu_coregroup_mask(int cpu) 416const struct cpumask *cpu_coregroup_mask(int cpu)
410{ 417{
411 struct cpuinfo_x86 *c = &cpu_data(cpu); 418 return cpu_llc_shared_mask(cpu);
412 /*
413 * For perf, we return last level cache shared map.
414 * And for power savings, we return cpu_core_map
415 */
416 if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
417 return cpu_core_mask(cpu);
418 else
419 return cpu_llc_shared_mask(cpu);
420} 419}
421 420
422static void impress_friends(void) 421static void impress_friends(void)
@@ -426,17 +425,16 @@ static void impress_friends(void)
426 /* 425 /*
427 * Allow the user to impress friends. 426 * Allow the user to impress friends.
428 */ 427 */
429 pr_debug("Before bogomips.\n"); 428 pr_debug("Before bogomips\n");
430 for_each_possible_cpu(cpu) 429 for_each_possible_cpu(cpu)
431 if (cpumask_test_cpu(cpu, cpu_callout_mask)) 430 if (cpumask_test_cpu(cpu, cpu_callout_mask))
432 bogosum += cpu_data(cpu).loops_per_jiffy; 431 bogosum += cpu_data(cpu).loops_per_jiffy;
433 printk(KERN_INFO 432 pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
434 "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
435 num_online_cpus(), 433 num_online_cpus(),
436 bogosum/(500000/HZ), 434 bogosum/(500000/HZ),
437 (bogosum/(5000/HZ))%100); 435 (bogosum/(5000/HZ))%100);
438 436
439 pr_debug("Before bogocount - setting activated=1.\n"); 437 pr_debug("Before bogocount - setting activated=1\n");
440} 438}
441 439
442void __inquire_remote_apic(int apicid) 440void __inquire_remote_apic(int apicid)
@@ -446,18 +444,17 @@ void __inquire_remote_apic(int apicid)
446 int timeout; 444 int timeout;
447 u32 status; 445 u32 status;
448 446
449 printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); 447 pr_info("Inquiring remote APIC 0x%x...\n", apicid);
450 448
451 for (i = 0; i < ARRAY_SIZE(regs); i++) { 449 for (i = 0; i < ARRAY_SIZE(regs); i++) {
452 printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); 450 pr_info("... APIC 0x%x %s: ", apicid, names[i]);
453 451
454 /* 452 /*
455 * Wait for idle. 453 * Wait for idle.
456 */ 454 */
457 status = safe_apic_wait_icr_idle(); 455 status = safe_apic_wait_icr_idle();
458 if (status) 456 if (status)
459 printk(KERN_CONT 457 pr_cont("a previous APIC delivery may have failed\n");
460 "a previous APIC delivery may have failed\n");
461 458
462 apic_icr_write(APIC_DM_REMRD | regs[i], apicid); 459 apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
463 460
@@ -470,10 +467,10 @@ void __inquire_remote_apic(int apicid)
470 switch (status) { 467 switch (status) {
471 case APIC_ICR_RR_VALID: 468 case APIC_ICR_RR_VALID:
472 status = apic_read(APIC_RRR); 469 status = apic_read(APIC_RRR);
473 printk(KERN_CONT "%08x\n", status); 470 pr_cont("%08x\n", status);
474 break; 471 break;
475 default: 472 default:
476 printk(KERN_CONT "failed\n"); 473 pr_cont("failed\n");
477 } 474 }
478 } 475 }
479} 476}
@@ -507,12 +504,12 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
507 apic_write(APIC_ESR, 0); 504 apic_write(APIC_ESR, 0);
508 accept_status = (apic_read(APIC_ESR) & 0xEF); 505 accept_status = (apic_read(APIC_ESR) & 0xEF);
509 } 506 }
510 pr_debug("NMI sent.\n"); 507 pr_debug("NMI sent\n");
511 508
512 if (send_status) 509 if (send_status)
513 printk(KERN_ERR "APIC never delivered???\n"); 510 pr_err("APIC never delivered???\n");
514 if (accept_status) 511 if (accept_status)
515 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); 512 pr_err("APIC delivery error (%lx)\n", accept_status);
516 513
517 return (send_status | accept_status); 514 return (send_status | accept_status);
518} 515}
@@ -534,7 +531,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
534 apic_read(APIC_ESR); 531 apic_read(APIC_ESR);
535 } 532 }
536 533
537 pr_debug("Asserting INIT.\n"); 534 pr_debug("Asserting INIT\n");
538 535
539 /* 536 /*
540 * Turn INIT on target chip 537 * Turn INIT on target chip
@@ -550,7 +547,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
550 547
551 mdelay(10); 548 mdelay(10);
552 549
553 pr_debug("Deasserting INIT.\n"); 550 pr_debug("Deasserting INIT\n");
554 551
555 /* Target chip */ 552 /* Target chip */
556 /* Send IPI */ 553 /* Send IPI */
@@ -583,14 +580,14 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
583 /* 580 /*
584 * Run STARTUP IPI loop. 581 * Run STARTUP IPI loop.
585 */ 582 */
586 pr_debug("#startup loops: %d.\n", num_starts); 583 pr_debug("#startup loops: %d\n", num_starts);
587 584
588 for (j = 1; j <= num_starts; j++) { 585 for (j = 1; j <= num_starts; j++) {
589 pr_debug("Sending STARTUP #%d.\n", j); 586 pr_debug("Sending STARTUP #%d\n", j);
590 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 587 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
591 apic_write(APIC_ESR, 0); 588 apic_write(APIC_ESR, 0);
592 apic_read(APIC_ESR); 589 apic_read(APIC_ESR);
593 pr_debug("After apic_write.\n"); 590 pr_debug("After apic_write\n");
594 591
595 /* 592 /*
596 * STARTUP IPI 593 * STARTUP IPI
@@ -607,7 +604,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
607 */ 604 */
608 udelay(300); 605 udelay(300);
609 606
610 pr_debug("Startup point 1.\n"); 607 pr_debug("Startup point 1\n");
611 608
612 pr_debug("Waiting for send to finish...\n"); 609 pr_debug("Waiting for send to finish...\n");
613 send_status = safe_apic_wait_icr_idle(); 610 send_status = safe_apic_wait_icr_idle();
@@ -622,12 +619,12 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
622 if (send_status || accept_status) 619 if (send_status || accept_status)
623 break; 620 break;
624 } 621 }
625 pr_debug("After Startup.\n"); 622 pr_debug("After Startup\n");
626 623
627 if (send_status) 624 if (send_status)
628 printk(KERN_ERR "APIC never delivered???\n"); 625 pr_err("APIC never delivered???\n");
629 if (accept_status) 626 if (accept_status)
630 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); 627 pr_err("APIC delivery error (%lx)\n", accept_status);
631 628
632 return (send_status | accept_status); 629 return (send_status | accept_status);
633} 630}
@@ -641,11 +638,11 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
641 if (system_state == SYSTEM_BOOTING) { 638 if (system_state == SYSTEM_BOOTING) {
642 if (node != current_node) { 639 if (node != current_node) {
643 if (current_node > (-1)) 640 if (current_node > (-1))
644 pr_cont(" Ok.\n"); 641 pr_cont(" OK\n");
645 current_node = node; 642 current_node = node;
646 pr_info("Booting Node %3d, Processors ", node); 643 pr_info("Booting Node %3d, Processors ", node);
647 } 644 }
648 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); 645 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : "");
649 return; 646 return;
650 } else 647 } else
651 pr_info("Booting Node %d Processor %d APIC 0x%x\n", 648 pr_info("Booting Node %d Processor %d APIC 0x%x\n",
@@ -660,8 +657,12 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
660 */ 657 */
661static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) 658static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
662{ 659{
660 volatile u32 *trampoline_status =
661 (volatile u32 *) __va(real_mode_header->trampoline_status);
662 /* start_ip had better be page-aligned! */
663 unsigned long start_ip = real_mode_header->trampoline_start;
664
663 unsigned long boot_error = 0; 665 unsigned long boot_error = 0;
664 unsigned long start_ip;
665 int timeout; 666 int timeout;
666 667
667 alternatives_smp_switch(1); 668 alternatives_smp_switch(1);
@@ -684,9 +685,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
684 initial_code = (unsigned long)start_secondary; 685 initial_code = (unsigned long)start_secondary;
685 stack_start = idle->thread.sp; 686 stack_start = idle->thread.sp;
686 687
687 /* start_ip had better be page-aligned! */
688 start_ip = trampoline_address();
689
690 /* So we see what's up */ 688 /* So we see what's up */
691 announce_cpu(cpu, apicid); 689 announce_cpu(cpu, apicid);
692 690
@@ -724,9 +722,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
724 /* 722 /*
725 * allow APs to start initializing. 723 * allow APs to start initializing.
726 */ 724 */
727 pr_debug("Before Callout %d.\n", cpu); 725 pr_debug("Before Callout %d\n", cpu);
728 cpumask_set_cpu(cpu, cpu_callout_mask); 726 cpumask_set_cpu(cpu, cpu_callout_mask);
729 pr_debug("After Callout %d.\n", cpu); 727 pr_debug("After Callout %d\n", cpu);
730 728
731 /* 729 /*
732 * Wait 5s total for a response 730 * Wait 5s total for a response
@@ -749,13 +747,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
749 pr_debug("CPU%d: has booted.\n", cpu); 747 pr_debug("CPU%d: has booted.\n", cpu);
750 } else { 748 } else {
751 boot_error = 1; 749 boot_error = 1;
752 if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) 750 if (*trampoline_status == 0xA5A5A5A5)
753 == 0xA5A5A5A5)
754 /* trampoline started but...? */ 751 /* trampoline started but...? */
755 pr_err("CPU%d: Stuck ??\n", cpu); 752 pr_err("CPU%d: Stuck ??\n", cpu);
756 else 753 else
757 /* trampoline code not run */ 754 /* trampoline code not run */
758 pr_err("CPU%d: Not responding.\n", cpu); 755 pr_err("CPU%d: Not responding\n", cpu);
759 if (apic->inquire_remote_apic) 756 if (apic->inquire_remote_apic)
760 apic->inquire_remote_apic(apicid); 757 apic->inquire_remote_apic(apicid);
761 } 758 }
@@ -776,7 +773,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
776 } 773 }
777 774
778 /* mark "stuck" area as not stuck */ 775 /* mark "stuck" area as not stuck */
779 *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0; 776 *trampoline_status = 0;
780 777
781 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 778 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
782 /* 779 /*
@@ -800,7 +797,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
800 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 797 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
801 !physid_isset(apicid, phys_cpu_present_map) || 798 !physid_isset(apicid, phys_cpu_present_map) ||
802 !apic->apic_id_valid(apicid)) { 799 !apic->apic_id_valid(apicid)) {
803 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); 800 pr_err("%s: bad cpu %d\n", __func__, cpu);
804 return -EINVAL; 801 return -EINVAL;
805 } 802 }
806 803
@@ -881,9 +878,8 @@ static int __init smp_sanity_check(unsigned max_cpus)
881 unsigned int cpu; 878 unsigned int cpu;
882 unsigned nr; 879 unsigned nr;
883 880
884 printk(KERN_WARNING 881 pr_warn("More than 8 CPUs detected - skipping them\n"
885 "More than 8 CPUs detected - skipping them.\n" 882 "Use CONFIG_X86_BIGSMP\n");
886 "Use CONFIG_X86_BIGSMP.\n");
887 883
888 nr = 0; 884 nr = 0;
889 for_each_present_cpu(cpu) { 885 for_each_present_cpu(cpu) {
@@ -904,8 +900,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
904#endif 900#endif
905 901
906 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 902 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
907 printk(KERN_WARNING 903 pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
908 "weird, boot CPU (#%d) not listed by the BIOS.\n",
909 hard_smp_processor_id()); 904 hard_smp_processor_id());
910 905
911 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 906 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
@@ -917,11 +912,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
917 */ 912 */
918 if (!smp_found_config && !acpi_lapic) { 913 if (!smp_found_config && !acpi_lapic) {
919 preempt_enable(); 914 preempt_enable();
920 printk(KERN_NOTICE "SMP motherboard not detected.\n"); 915 pr_notice("SMP motherboard not detected\n");
921 disable_smp(); 916 disable_smp();
922 if (APIC_init_uniprocessor()) 917 if (APIC_init_uniprocessor())
923 printk(KERN_NOTICE "Local APIC not detected." 918 pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
924 " Using dummy APIC emulation.\n");
925 return -1; 919 return -1;
926 } 920 }
927 921
@@ -930,9 +924,8 @@ static int __init smp_sanity_check(unsigned max_cpus)
930 * CPU too, but we do it for the sake of robustness anyway. 924 * CPU too, but we do it for the sake of robustness anyway.
931 */ 925 */
932 if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { 926 if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
933 printk(KERN_NOTICE 927 pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
934 "weird, boot CPU (#%d) not listed by the BIOS.\n", 928 boot_cpu_physical_apicid);
935 boot_cpu_physical_apicid);
936 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 929 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
937 } 930 }
938 preempt_enable(); 931 preempt_enable();
@@ -945,8 +938,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
945 if (!disable_apic) { 938 if (!disable_apic) {
946 pr_err("BIOS bug, local APIC #%d not detected!...\n", 939 pr_err("BIOS bug, local APIC #%d not detected!...\n",
947 boot_cpu_physical_apicid); 940 boot_cpu_physical_apicid);
948 pr_err("... forcing use of dummy APIC emulation." 941 pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
949 "(tell your hw vendor)\n");
950 } 942 }
951 smpboot_clear_io_apic(); 943 smpboot_clear_io_apic();
952 disable_ioapic_support(); 944 disable_ioapic_support();
@@ -959,7 +951,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
959 * If SMP should be disabled, then really disable it! 951 * If SMP should be disabled, then really disable it!
960 */ 952 */
961 if (!max_cpus) { 953 if (!max_cpus) {
962 printk(KERN_INFO "SMP mode deactivated.\n"); 954 pr_info("SMP mode deactivated\n");
963 smpboot_clear_io_apic(); 955 smpboot_clear_io_apic();
964 956
965 connect_bsp_APIC(); 957 connect_bsp_APIC();
@@ -1011,7 +1003,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1011 1003
1012 1004
1013 if (smp_sanity_check(max_cpus) < 0) { 1005 if (smp_sanity_check(max_cpus) < 0) {
1014 printk(KERN_INFO "SMP disabled\n"); 1006 pr_info("SMP disabled\n");
1015 disable_smp(); 1007 disable_smp();
1016 goto out; 1008 goto out;
1017 } 1009 }
@@ -1049,7 +1041,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1049 * Set up local APIC timer on boot CPU. 1041 * Set up local APIC timer on boot CPU.
1050 */ 1042 */
1051 1043
1052 printk(KERN_INFO "CPU%d: ", 0); 1044 pr_info("CPU%d: ", 0);
1053 print_cpu_info(&cpu_data(0)); 1045 print_cpu_info(&cpu_data(0));
1054 x86_init.timers.setup_percpu_clockev(); 1046 x86_init.timers.setup_percpu_clockev();
1055 1047
@@ -1099,7 +1091,7 @@ void __init native_smp_prepare_boot_cpu(void)
1099 1091
1100void __init native_smp_cpus_done(unsigned int max_cpus) 1092void __init native_smp_cpus_done(unsigned int max_cpus)
1101{ 1093{
1102 pr_debug("Boot done.\n"); 1094 pr_debug("Boot done\n");
1103 1095
1104 nmi_selftest(); 1096 nmi_selftest();
1105 impress_friends(); 1097 impress_friends();
@@ -1160,8 +1152,7 @@ __init void prefill_possible_map(void)
1160 1152
1161 /* nr_cpu_ids could be reduced via nr_cpus= */ 1153 /* nr_cpu_ids could be reduced via nr_cpus= */
1162 if (possible > nr_cpu_ids) { 1154 if (possible > nr_cpu_ids) {
1163 printk(KERN_WARNING 1155 pr_warn("%d Processors exceeds NR_CPUS limit of %d\n",
1164 "%d Processors exceeds NR_CPUS limit of %d\n",
1165 possible, nr_cpu_ids); 1156 possible, nr_cpu_ids);
1166 possible = nr_cpu_ids; 1157 possible = nr_cpu_ids;
1167 } 1158 }
@@ -1170,13 +1161,12 @@ __init void prefill_possible_map(void)
1170 if (!setup_max_cpus) 1161 if (!setup_max_cpus)
1171#endif 1162#endif
1172 if (possible > i) { 1163 if (possible > i) {
1173 printk(KERN_WARNING 1164 pr_warn("%d Processors exceeds max_cpus limit of %u\n",
1174 "%d Processors exceeds max_cpus limit of %u\n",
1175 possible, setup_max_cpus); 1165 possible, setup_max_cpus);
1176 possible = i; 1166 possible = i;
1177 } 1167 }
1178 1168
1179 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1169 pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
1180 possible, max_t(int, possible - num_processors, 0)); 1170 possible, max_t(int, possible - num_processors, 0));
1181 1171
1182 for (i = 0; i < possible; i++) 1172 for (i = 0; i < possible; i++)
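
With the trampoline moved into the real-mode blob, its symbols are no longer linker-visible to the kernel proper; boot code instead reads physical addresses out of real_mode_header and maps them as needed, exactly as do_boot_cpu() now does for the status word. The access pattern in isolation:

    /* Sketch: trampoline access via real_mode_header, as in do_boot_cpu() */
    static void check_trampoline_status(void)
    {
            volatile u32 *status =
                    (volatile u32 *)__va(real_mode_header->trampoline_status);

            if (*status == 0xA5A5A5A5)  /* marker written by the AP code */
                    pr_err("trampoline started but CPU is stuck\n");
            *status = 0;                /* mark "stuck" area as not stuck */
    }
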
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 6410744ac5c..f84fe00fad4 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -32,7 +32,7 @@
32#include <linux/mm.h> 32#include <linux/mm.h>
33#include <linux/tboot.h> 33#include <linux/tboot.h>
34 34
35#include <asm/trampoline.h> 35#include <asm/realmode.h>
36#include <asm/processor.h> 36#include <asm/processor.h>
37#include <asm/bootparam.h> 37#include <asm/bootparam.h>
38#include <asm/pgtable.h> 38#include <asm/pgtable.h>
@@ -44,7 +44,7 @@
44#include <asm/e820.h> 44#include <asm/e820.h>
45#include <asm/io.h> 45#include <asm/io.h>
46 46
47#include "acpi/realmode/wakeup.h" 47#include "../realmode/rm/wakeup.h"
48 48
49/* Global pointer to shared data; NULL means no measured launch. */ 49/* Global pointer to shared data; NULL means no measured launch. */
50struct tboot *tboot __read_mostly; 50struct tboot *tboot __read_mostly;
@@ -201,7 +201,8 @@ static int tboot_setup_sleep(void)
201 add_mac_region(e820.map[i].addr, e820.map[i].size); 201 add_mac_region(e820.map[i].addr, e820.map[i].size);
202 } 202 }
203 203
204 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; 204 tboot->acpi_sinfo.kernel_s3_resume_vector =
205 real_mode_header->wakeup_start;
205 206
206 return 0; 207 return 0;
207} 208}
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
deleted file mode 100644
index a73b61055ad..00000000000
--- a/arch/x86/kernel/trampoline.c
+++ /dev/null
@@ -1,42 +0,0 @@
1#include <linux/io.h>
2#include <linux/memblock.h>
3
4#include <asm/trampoline.h>
5#include <asm/cacheflush.h>
6#include <asm/pgtable.h>
7
8unsigned char *x86_trampoline_base;
9
10void __init setup_trampolines(void)
11{
12 phys_addr_t mem;
13 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
14
15 /* Has to be in very low memory so we can execute real-mode AP code. */
16 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
17 if (!mem)
18 panic("Cannot allocate trampoline\n");
19
20 x86_trampoline_base = __va(mem);
21 memblock_reserve(mem, size);
22
23 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
24 x86_trampoline_base, (unsigned long long)mem, size);
25
26 memcpy(x86_trampoline_base, x86_trampoline_start, size);
27}
28
29/*
30 * setup_trampolines() gets called very early, to guarantee the
31 * availability of low memory. This is before the proper kernel page
32 * tables are set up, so we cannot set page permissions in that
33 * function. Thus, we use an arch_initcall instead.
34 */
35static int __init configure_trampolines(void)
36{
37 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
38
39 set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
40 return 0;
41}
42arch_initcall(configure_trampolines);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
deleted file mode 100644
index 451c0a7ef7f..00000000000
--- a/arch/x86/kernel/trampoline_32.S
+++ /dev/null
@@ -1,83 +0,0 @@
1/*
2 *
3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 *
7 * This is only used for booting secondary CPUs in SMP machine
8 *
9 * Entry: CS:IP point to the start of our code, we are
10 * in real mode with no stack, but the rest of the
11 * trampoline page to make our stack and everything else
12 * is a mystery.
13 *
14 * We jump into arch/x86/kernel/head_32.S.
15 *
16 * On entry to trampoline_data, the processor is in real mode
17 * with 16-bit addressing and 16-bit data. CS has some value
18 * and IP is zero. Thus, data addresses need to be absolute
19 * (no relocation) and are taken with regard to r_base.
20 *
21 * If you work on this file, check the object module with
22 * objdump --reloc to make sure there are no relocation
23 * entries except for:
24 *
25 * TYPE VALUE
26 * R_386_32 startup_32_smp
27 * R_386_32 boot_gdt
28 */
29
30#include <linux/linkage.h>
31#include <linux/init.h>
32#include <asm/segment.h>
33#include <asm/page_types.h>
34
35#ifdef CONFIG_SMP
36
37 .section ".x86_trampoline","a"
38 .balign PAGE_SIZE
39 .code16
40
41ENTRY(trampoline_data)
42r_base = .
43 wbinvd # Needed for NUMA-Q should be harmless for others
44 mov %cs, %ax # Code and data in the same place
45 mov %ax, %ds
46
47 cli # We should be safe anyway
48
49 movl $0xA5A5A5A5, trampoline_status - r_base
50 # write marker for master knows we're running
51
52 /* GDT tables in non default location kernel can be beyond 16MB and
53 * lgdt will not be able to load the address as in real mode default
54 * operand size is 16bit. Use lgdtl instead to force operand size
55 * to 32 bit.
56 */
57
58 lidtl boot_idt_descr - r_base # load idt with 0, 0
59 lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
60
61 xor %ax, %ax
62 inc %ax # protected mode (PE) bit
63 lmsw %ax # into protected mode
64 # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
65 ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
66
67 # These need to be in the same 64K segment as the above;
68 # hence we don't use the boot_gdt_descr defined in head.S
69boot_gdt_descr:
70 .word __BOOT_DS + 7 # gdt limit
71 .long boot_gdt - __PAGE_OFFSET # gdt base
72
73boot_idt_descr:
74 .word 0 # idt limit = 0
75 .long 0 # idt base = 0L
76
77ENTRY(trampoline_status)
78 .long 0
79
80.globl trampoline_end
81trampoline_end:
82
83#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ff08457a025..b481341c936 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -9,6 +9,9 @@
 /*
  * Handle hardware traps and faults.
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/interrupt.h>
 #include <linux/kallsyms.h>
 #include <linux/spinlock.h>
@@ -143,12 +146,11 @@ trap_signal:
 #ifdef CONFIG_X86_64
 	if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
 	    printk_ratelimit()) {
-		printk(KERN_INFO
-		       "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
-		       tsk->comm, tsk->pid, str,
-		       regs->ip, regs->sp, error_code);
+		pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+			tsk->comm, tsk->pid, str,
+			regs->ip, regs->sp, error_code);
 		print_vma_addr(" in ", regs->ip);
-		printk("\n");
+		pr_cont("\n");
 	}
 #endif
 
@@ -269,12 +271,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
 
 	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
 	    printk_ratelimit()) {
-		printk(KERN_INFO
-			"%s[%d] general protection ip:%lx sp:%lx error:%lx",
+		pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
			tsk->comm, task_pid_nr(tsk),
			regs->ip, regs->sp, error_code);
 		print_vma_addr(" in ", regs->ip);
-		printk("\n");
+		pr_cont("\n");
 	}
 
 	force_sig(SIGSEGV, tsk);
@@ -303,8 +304,12 @@ gp_in_kernel:
 dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
 {
 #ifdef CONFIG_DYNAMIC_FTRACE
-	/* ftrace must be first, everything else may cause a recursive crash */
-	if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs))
+	/*
+	 * ftrace must be first, everything else may cause a recursive crash.
+	 * See note by declaration of modifying_ftrace_code in ftrace.c
+	 */
+	if (unlikely(atomic_read(&modifying_ftrace_code)) &&
+	    ftrace_int3_handler(regs))
 		return;
 #endif
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
@@ -566,7 +571,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 	conditional_sti(regs);
 #if 0
 	/* No need to warn about this any longer. */
-	printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
+	pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
 #endif
 }
 
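The conversions in this hunk rely on the pr_fmt mechanism: pr_info() and friends expand to printk(KERN_INFO pr_fmt(fmt), ...), so defining pr_fmt before any header that pulls in printk.h prefixes every message in the file. A self-contained sketch, with a literal "demo: " prefix standing in for KBUILD_MODNAME ": ":

	/* Must be defined before <linux/printk.h> is included. */
	#define pr_fmt(fmt) "demo: " fmt

	#include <linux/printk.h>

	static void report(int khz)
	{
		pr_info("calibrated to %d kHz\n", khz);	/* logs "demo: calibrated to ... kHz" */
	}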
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index fc0a147e372..cfa5d4f7ca5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/init.h>
@@ -84,8 +86,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
 #ifdef CONFIG_X86_TSC
 int __init notsc_setup(char *str)
 {
-	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
-			"cannot disable TSC completely.\n");
+	pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n");
 	tsc_disabled = 1;
 	return 1;
 }
@@ -373,7 +374,7 @@ static unsigned long quick_pit_calibrate(void)
 			goto success;
 		}
 	}
-	printk("Fast TSC calibration failed\n");
+	pr_err("Fast TSC calibration failed\n");
 	return 0;
 
 success:
@@ -392,7 +393,7 @@ success:
 	 */
 	delta *= PIT_TICK_RATE;
 	do_div(delta, i*256*1000);
-	printk("Fast TSC calibration using PIT\n");
+	pr_info("Fast TSC calibration using PIT\n");
 	return delta;
 }
 
@@ -487,9 +488,8 @@ unsigned long native_calibrate_tsc(void)
 	 * use the reference value, as it is more precise.
 	 */
 	if (delta >= 90 && delta <= 110) {
-		printk(KERN_INFO
-		       "TSC: PIT calibration matches %s. %d loops\n",
-		       hpet ? "HPET" : "PMTIMER", i + 1);
+		pr_info("PIT calibration matches %s. %d loops\n",
+			hpet ? "HPET" : "PMTIMER", i + 1);
 		return tsc_ref_min;
 	}
 
@@ -511,38 +511,36 @@ unsigned long native_calibrate_tsc(void)
 	 */
 	if (tsc_pit_min == ULONG_MAX) {
 		/* PIT gave no useful value */
-		printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
+		pr_warn("Unable to calibrate against PIT\n");
 
 		/* We don't have an alternative source, disable TSC */
 		if (!hpet && !ref1 && !ref2) {
-			printk("TSC: No reference (HPET/PMTIMER) available\n");
+			pr_notice("No reference (HPET/PMTIMER) available\n");
 			return 0;
 		}
 
 		/* The alternative source failed as well, disable TSC */
 		if (tsc_ref_min == ULONG_MAX) {
-			printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
-			       "failed.\n");
+			pr_warn("HPET/PMTIMER calibration failed\n");
 			return 0;
 		}
 
 		/* Use the alternative source */
-		printk(KERN_INFO "TSC: using %s reference calibration\n",
-		       hpet ? "HPET" : "PMTIMER");
+		pr_info("using %s reference calibration\n",
+			hpet ? "HPET" : "PMTIMER");
 
 		return tsc_ref_min;
 	}
 
 	/* We don't have an alternative source, use the PIT calibration value */
 	if (!hpet && !ref1 && !ref2) {
-		printk(KERN_INFO "TSC: Using PIT calibration value\n");
+		pr_info("Using PIT calibration value\n");
 		return tsc_pit_min;
 	}
 
 	/* The alternative source failed, use the PIT calibration value */
 	if (tsc_ref_min == ULONG_MAX) {
-		printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
-		       "Using PIT calibration\n");
+		pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
 		return tsc_pit_min;
 	}
 
@@ -551,9 +549,9 @@ unsigned long native_calibrate_tsc(void)
 	 * the PIT value as we know that there are PMTIMERs around
 	 * running at double speed. At least we let the user know:
 	 */
-	printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
-	       hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
-	printk(KERN_INFO "TSC: Using PIT calibration value\n");
+	pr_warn("PIT calibration deviates from %s: %lu %lu\n",
+		hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
+	pr_info("Using PIT calibration value\n");
 	return tsc_pit_min;
 }
 
@@ -785,7 +783,7 @@ void mark_tsc_unstable(char *reason)
 	tsc_unstable = 1;
 	sched_clock_stable = 0;
 	disable_sched_clock_irqtime();
-	printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
+	pr_info("Marking TSC unstable due to %s\n", reason);
 	/* Change only the rating, when not registered */
 	if (clocksource_tsc.mult)
 		clocksource_mark_unstable(&clocksource_tsc);
@@ -912,9 +910,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)
 		goto out;
 
 	tsc_khz = freq;
-	printk(KERN_INFO "Refined TSC clocksource calibration: "
-	       "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
-	       (unsigned long)tsc_khz % 1000);
+	pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
+		(unsigned long)tsc_khz / 1000,
+		(unsigned long)tsc_khz % 1000);
 
 out:
 	clocksource_register_khz(&clocksource_tsc, tsc_khz);
@@ -970,9 +968,9 @@ void __init tsc_init(void)
 		return;
 	}
 
-	printk("Detected %lu.%03lu MHz processor.\n",
+	pr_info("Detected %lu.%03lu MHz processor\n",
 		(unsigned long)cpu_khz / 1000,
 		(unsigned long)cpu_khz % 1000);
 
 	/*
	 * Secondary CPUs do not run through tsc_init(), so set up
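For context on the delta test visible in native_calibrate_tsc() above: the PIT result is scaled to a percentage of the HPET/PMTIMER reference result, and the two are considered to agree when that ratio lands in the 90-110 band. A condensed restatement of the comparison (calibrations_agree is an illustrative name, not a kernel function):

	/* True when the PIT result is within +/-10% of the reference result. */
	static bool calibrations_agree(unsigned long tsc_pit_min, unsigned long tsc_ref_min)
	{
		unsigned long delta = tsc_pit_min * 100 / tsc_ref_min;	/* 100 == exact match */

		return delta >= 90 && delta <= 110;
	}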
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
new file mode 100644
index 00000000000..36fd42091fa
--- /dev/null
+++ b/arch/x86/kernel/uprobes.c
@@ -0,0 +1,675 @@
1/*
2 * User-space Probes (UProbes) for x86
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2008-2011
19 * Authors:
20 * Srikar Dronamraju
21 * Jim Keniston
22 */
23#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/ptrace.h>
26#include <linux/uprobes.h>
27#include <linux/uaccess.h>
28
29#include <linux/kdebug.h>
30#include <asm/processor.h>
31#include <asm/insn.h>
32
33/* Post-execution fixups. */
34
35/* No fixup needed */
36#define UPROBE_FIX_NONE 0x0
37
38/* Adjust IP back to vicinity of actual insn */
39#define UPROBE_FIX_IP 0x1
40
41/* Adjust the return address of a call insn */
42#define UPROBE_FIX_CALL 0x2
43
44#define UPROBE_FIX_RIP_AX 0x8000
45#define UPROBE_FIX_RIP_CX 0x4000
46
47#define UPROBE_TRAP_NR UINT_MAX
48
49/* Adaptations for mhiramat x86 decoder v14. */
50#define OPCODE1(insn) ((insn)->opcode.bytes[0])
51#define OPCODE2(insn) ((insn)->opcode.bytes[1])
52#define OPCODE3(insn) ((insn)->opcode.bytes[2])
53#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value)
54
55#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
56 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
57 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
58 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
59 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
60 << (row % 32))
61
62/*
63 * Good-instruction tables for 32-bit apps. This is non-const and volatile
64 * to keep gcc from statically optimizing it out, as variable_test_bit makes
65 * some versions of gcc think only *(unsigned long*) is used.
66 */
67static volatile u32 good_insns_32[256 / 32] = {
68 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
69 /* ---------------------------------------------- */
70 W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
71 W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
72 W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
73 W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
74 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
75 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
76 W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
77 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
78 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
79 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
80 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
81 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
82 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
83 W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
84 W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
85 W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
86 /* ---------------------------------------------- */
87 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
88};
89
90/* Using this for both 64-bit and 32-bit apps */
91static volatile u32 good_2byte_insns[256 / 32] = {
92 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
93 /* ---------------------------------------------- */
94 W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
95 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
96 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
97 W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
98 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
99 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
100 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
101 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
102 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
103 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
104 W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
105 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
106 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
107 W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
108 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
109 W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
110 /* ---------------------------------------------- */
111 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
112};
113
114#ifdef CONFIG_X86_64
115/* Good-instruction tables for 64-bit apps */
116static volatile u32 good_insns_64[256 / 32] = {
117 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
118 /* ---------------------------------------------- */
119 W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
120 W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
121 W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
122 W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
123 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
124 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
125 W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
126 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
127 W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
128 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
129 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
130 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
131 W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
132 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
133 W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
134 W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
135 /* ---------------------------------------------- */
136 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
137};
138#endif
139#undef W
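/*
 * Editorial sketch, not part of this patch: each table above is a
 * 256-bit bitmap with one bit per opcode byte; W() packs sixteen 0/1
 * flags into a 32-bit word at the right offset.  A lookup is then just
 * test_bit() on the opcode value, e.g.:
 */
#if 0	/* illustration only */
static bool is_good_insn_32(u8 opcode)
{
	/* set bit => uprobes knows how to single-step this opcode */
	return test_bit(opcode, (unsigned long *)good_insns_32);
}
#endif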
140
141/*
142 * opcodes we'll probably never support:
143 *
144 * 6c-6d, e4-e5, ec-ed - in
145 * 6e-6f, e6-e7, ee-ef - out
146 * cc, cd - int3, int
147 * cf - iret
148 * d6 - illegal instruction
149 * f1 - int1/icebp
150 * f4 - hlt
151 * fa, fb - cli, sti
152 * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
153 *
154 * invalid opcodes in 64-bit mode:
155 *
156 * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
157 * 63 - we support this opcode in x86_64 but not in i386.
158 *
159 * opcodes we may need to refine support for:
160 *
161 * 0f - 2-byte instructions: For many of these instructions, the validity
162 * depends on the prefix and/or the reg field. On such instructions, we
163 * just consider the opcode combination valid if it corresponds to any
164 * valid instruction.
165 *
166 * 8f - Group 1 - only reg = 0 is OK
167 * c6-c7 - Group 11 - only reg = 0 is OK
168 * d9-df - fpu insns with some illegal encodings
169 * f2, f3 - repnz, repz prefixes. These are also the first byte for
170 * certain floating-point instructions, such as addsd.
171 *
172 * fe - Group 4 - only reg = 0 or 1 is OK
173 * ff - Group 5 - only reg = 0-6 is OK
174 *
175 * others -- Do we need to support these?
176 *
177 * 0f - (floating-point?) prefetch instructions
178 * 07, 17, 1f - pop es, pop ss, pop ds
179 * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
180 * but 64 and 65 (fs: and gs:) seem to be used, so we support them
181 * 67 - addr16 prefix
182 * ce - into
183 * f0 - lock prefix
184 */
185
186/*
187 * TODO:
188 * - Where necessary, examine the modrm byte and allow only valid instructions
189 * in the different Groups and fpu instructions.
190 */
191
192static bool is_prefix_bad(struct insn *insn)
193{
194 int i;
195
196 for (i = 0; i < insn->prefixes.nbytes; i++) {
197 switch (insn->prefixes.bytes[i]) {
198 case 0x26: /* INAT_PFX_ES */
199 case 0x2E: /* INAT_PFX_CS */
200 case 0x36: /* INAT_PFX_DS */
201 case 0x3E: /* INAT_PFX_SS */
202 case 0xF0: /* INAT_PFX_LOCK */
203 return true;
204 }
205 }
206 return false;
207}
208
209static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
210{
211 insn_init(insn, auprobe->insn, false);
212
213 /* Skip good instruction prefixes; reject "bad" ones. */
214 insn_get_opcode(insn);
215 if (is_prefix_bad(insn))
216 return -ENOTSUPP;
217
218 if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32))
219 return 0;
220
221 if (insn->opcode.nbytes == 2) {
222 if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
223 return 0;
224 }
225
226 return -ENOTSUPP;
227}
228
229/*
230 * Figure out which fixups arch_uprobe_post_xol() will need to perform, and
231 * annotate arch_uprobe->fixups accordingly. To start with,
232 * arch_uprobe->fixups is either zero or it reflects rip-related fixups.
233 */
234static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
235{
236 bool fix_ip = true, fix_call = false; /* defaults */
237 int reg;
238
239 insn_get_opcode(insn); /* should be a nop */
240
241 switch (OPCODE1(insn)) {
242 case 0xc3: /* ret/lret */
243 case 0xcb:
244 case 0xc2:
245 case 0xca:
246 /* ip is correct */
247 fix_ip = false;
248 break;
249 case 0xe8: /* call relative - Fix return addr */
250 fix_call = true;
251 break;
252 case 0x9a: /* call absolute - Fix return addr, not ip */
253 fix_call = true;
254 fix_ip = false;
255 break;
256 case 0xff:
257 insn_get_modrm(insn);
258 reg = MODRM_REG(insn);
259 if (reg == 2 || reg == 3) {
260 /* call or lcall, indirect */
261 /* Fix return addr; ip is correct. */
262 fix_call = true;
263 fix_ip = false;
264 } else if (reg == 4 || reg == 5) {
265 /* jmp or ljmp, indirect */
266 /* ip is correct. */
267 fix_ip = false;
268 }
269 break;
270 case 0xea: /* jmp absolute -- ip is correct */
271 fix_ip = false;
272 break;
273 default:
274 break;
275 }
276 if (fix_ip)
277 auprobe->fixups |= UPROBE_FIX_IP;
278 if (fix_call)
279 auprobe->fixups |= UPROBE_FIX_CALL;
280}
281
282#ifdef CONFIG_X86_64
283/*
284 * If arch_uprobe->insn doesn't use rip-relative addressing, return
285 * immediately. Otherwise, rewrite the instruction so that it accesses
286 * its memory operand indirectly through a scratch register. Set
287 * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address
288 * accordingly. (The contents of the scratch register will be saved
289 * before we single-step the modified instruction, and restored
290 * afterward.)
291 *
292 * We do this because a rip-relative instruction can access only a
293 * relatively small area (+/- 2 GB from the instruction), and the XOL
294 * area typically lies beyond that area. At least for instructions
295 * that store to memory, we can't execute the original instruction
296 * and "fix things up" later, because the misdirected store could be
297 * disastrous.
298 *
299 * Some useful facts about rip-relative instructions:
300 *
301 * - There's always a modrm byte.
302 * - There's never a SIB byte.
303 * - The displacement is always 4 bytes.
304 */
305static void
306handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
307{
308 u8 *cursor;
309 u8 reg;
310
311 if (mm->context.ia32_compat)
312 return;
313
314 auprobe->rip_rela_target_address = 0x0;
315 if (!insn_rip_relative(insn))
316 return;
317
318 /*
319 * insn_rip_relative() would have decoded rex_prefix, modrm.
320 * Clear REX.b bit (extension of MODRM.rm field):
321 * we want to encode rax/rcx, not r8/r9.
322 */
323 if (insn->rex_prefix.nbytes) {
324 cursor = auprobe->insn + insn_offset_rex_prefix(insn);
325 *cursor &= 0xfe; /* Clearing REX.B bit */
326 }
327
328 /*
329 * Point cursor at the modrm byte. The next 4 bytes are the
330 * displacement. Beyond the displacement, for some instructions,
331 * is the immediate operand.
332 */
333 cursor = auprobe->insn + insn_offset_modrm(insn);
334 insn_get_length(insn);
335
336 /*
337 * Convert from rip-relative addressing to indirect addressing
338 * via a scratch register. Change the r/m field from 0x5 (%rip)
339 * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
340 */
341 reg = MODRM_REG(insn);
342 if (reg == 0) {
343 /*
344 * The register operand (if any) is either the A register
345 * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
346 * REX prefix) %r8. In any case, we know the C register
347 * is NOT the register operand, so we use %rcx (register
348 * #1) for the scratch register.
349 */
350 auprobe->fixups = UPROBE_FIX_RIP_CX;
351 /* Change modrm from 00 000 101 to 00 000 001. */
352 *cursor = 0x1;
353 } else {
354 /* Use %rax (register #0) for the scratch register. */
355 auprobe->fixups = UPROBE_FIX_RIP_AX;
356 /* Change modrm from 00 xxx 101 to 00 xxx 000 */
357 *cursor = (reg << 3);
358 }
359
360 /* Target address = address of next instruction + (signed) offset */
361 auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value;
362
363 /* Displacement field is gone; slide immediate field (if any) over. */
364 if (insn->immediate.nbytes) {
365 cursor++;
366 memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes);
367 }
368 return;
369}
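/*
 * Editorial example, not part of this patch: "mov 0x1234(%rip),%eax"
 * encodes as 8b 05 34 12 00 00 with ModRM 0x05 (mod=00, reg=000, rm=101).
 * Since reg is 0 (the A register), %rcx becomes the scratch register and
 * the ModRM byte is rewritten to 0x01 (rm=001), i.e. "mov (%rcx),%eax":
 */
#if 0	/* illustration only */
static void riprel_rewrite_example(void)
{
	u8 insn[6] = { 0x8b, 0x05, 0x34, 0x12, 0x00, 0x00 };

	insn[1] = 0x01;	/* rip-relative -> indirect via %rcx */
	/*
	 * The four displacement bytes are now dead, so the executed copy
	 * is 4 bytes shorter than the original instruction; the
	 * "*correction += 4" in handle_riprel_post_xol() compensates
	 * for exactly this difference.
	 */
}
#endif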
370
371static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
372{
373 insn_init(insn, auprobe->insn, true);
374
375 /* Skip good instruction prefixes; reject "bad" ones. */
376 insn_get_opcode(insn);
377 if (is_prefix_bad(insn))
378 return -ENOTSUPP;
379
380 if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64))
381 return 0;
382
383 if (insn->opcode.nbytes == 2) {
384 if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
385 return 0;
386 }
387 return -ENOTSUPP;
388}
389
390static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
391{
392 if (mm->context.ia32_compat)
393 return validate_insn_32bits(auprobe, insn);
394 return validate_insn_64bits(auprobe, insn);
395}
396#else /* 32-bit: */
397static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
398{
399 /* No RIP-relative addressing on 32-bit */
400}
401
402static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
403{
404 return validate_insn_32bits(auprobe, insn);
405}
406#endif /* CONFIG_X86_64 */
407
408/**
409 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
410 * @auprobe: the probepoint information.
411 * @mm: the probed address space.
412 * @addr: virtual address at which to install the probepoint.
413 * Return 0 on success or a negative errno on error.
414 */
415int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
416{
417 int ret;
418 struct insn insn;
419
420 auprobe->fixups = 0;
421 ret = validate_insn_bits(auprobe, mm, &insn);
422 if (ret != 0)
423 return ret;
424
425 handle_riprel_insn(auprobe, mm, &insn);
426 prepare_fixups(auprobe, &insn);
427
428 return 0;
429}
430
431#ifdef CONFIG_X86_64
432/*
433 * If we're emulating a rip-relative instruction, save the contents
434 * of the scratch register and store the target address in that register.
435 */
436static void
437pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
438 struct arch_uprobe_task *autask)
439{
440 if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
441 autask->saved_scratch_register = regs->ax;
442 regs->ax = current->utask->vaddr;
443 regs->ax += auprobe->rip_rela_target_address;
444 } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
445 autask->saved_scratch_register = regs->cx;
446 regs->cx = current->utask->vaddr;
447 regs->cx += auprobe->rip_rela_target_address;
448 }
449}
450#else
451static void
452pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
453 struct arch_uprobe_task *autask)
454{
455 /* No RIP-relative addressing on 32-bit */
456}
457#endif
458
459/*
460 * arch_uprobe_pre_xol - prepare to execute out of line.
461 * @auprobe: the probepoint information.
462 * @regs: reflects the saved user state of current task.
463 */
464int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
465{
466 struct arch_uprobe_task *autask;
467
468 autask = &current->utask->autask;
469 autask->saved_trap_nr = current->thread.trap_nr;
470 current->thread.trap_nr = UPROBE_TRAP_NR;
471 regs->ip = current->utask->xol_vaddr;
472 pre_xol_rip_insn(auprobe, regs, autask);
473
474 return 0;
475}
476
477/*
478 * This function is called by arch_uprobe_post_xol() to adjust the return
479 * address pushed by a call instruction executed out of line.
480 */
481static int adjust_ret_addr(unsigned long sp, long correction)
482{
483 int rasize, ncopied;
484 long ra = 0;
485
486 if (is_ia32_task())
487 rasize = 4;
488 else
489 rasize = 8;
490
491 ncopied = copy_from_user(&ra, (void __user *)sp, rasize);
492 if (unlikely(ncopied))
493 return -EFAULT;
494
495 ra += correction;
496 ncopied = copy_to_user((void __user *)sp, &ra, rasize);
497 if (unlikely(ncopied))
498 return -EFAULT;
499
500 return 0;
501}
502
503#ifdef CONFIG_X86_64
504static bool is_riprel_insn(struct arch_uprobe *auprobe)
505{
506 return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0);
507}
508
509static void
510handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
511{
512 if (is_riprel_insn(auprobe)) {
513 struct arch_uprobe_task *autask;
514
515 autask = &current->utask->autask;
516 if (auprobe->fixups & UPROBE_FIX_RIP_AX)
517 regs->ax = autask->saved_scratch_register;
518 else
519 regs->cx = autask->saved_scratch_register;
520
521 /*
522 * The original instruction includes a displacement, and so
523 * is 4 bytes longer than what we've just single-stepped.
524 * Fall through to handle stuff like "jmpq *...(%rip)" and
525 * "callq *...(%rip)".
526 */
527 if (correction)
528 *correction += 4;
529 }
530}
531#else
532static void
533handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
534{
535 /* No RIP-relative addressing on 32-bit */
536}
537#endif
538
539/*
540 * If the xol insn itself traps and generates a signal (say,
541 * SIGILL/SIGSEGV/etc.), then detect the case where a single-stepped
542 * instruction jumps back to its own address. It is assumed that anything
543 * like do_page_fault/do_trap/etc. sets thread.trap_nr != -1.
544 *
545 * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr;
546 * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not the
547 * UPROBE_TRAP_NR (== -1) value set by arch_uprobe_pre_xol().
548 */
549bool arch_uprobe_xol_was_trapped(struct task_struct *t)
550{
551 if (t->thread.trap_nr != UPROBE_TRAP_NR)
552 return true;
553
554 return false;
555}
556
557/*
558 * Called after single-stepping. To avoid the SMP problems that can
559 * occur when we temporarily put back the original opcode to
560 * single-step, we single-stepped a copy of the instruction.
561 *
562 * This function prepares to resume execution after the single-step.
563 * We have to fix things up as follows:
564 *
565 * Typically, the new ip is relative to the copied instruction. We need
566 * to make it relative to the original instruction (FIX_IP). Exceptions
567 * are return instructions and absolute or indirect jump or call instructions.
568 *
569 * If the single-stepped instruction was a call, the return address that
570 * is atop the stack is the address following the copied instruction. We
571 * need to make it the address following the original instruction (FIX_CALL).
572 *
573 * If the original instruction was a rip-relative instruction such as
574 * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
575 * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
576 * We need to restore the contents of the scratch register and adjust
577 * the ip, keeping in mind that the instruction we executed is 4 bytes
578 * shorter than the original instruction (since we squeezed out the offset
579 * field). (FIX_RIP_AX or FIX_RIP_CX)
580 */
581int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
582{
583 struct uprobe_task *utask;
584 long correction;
585 int result = 0;
586
587 WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
588
589 utask = current->utask;
590 current->thread.trap_nr = utask->autask.saved_trap_nr;
591 correction = (long)(utask->vaddr - utask->xol_vaddr);
592 handle_riprel_post_xol(auprobe, regs, &correction);
593 if (auprobe->fixups & UPROBE_FIX_IP)
594 regs->ip += correction;
595
596 if (auprobe->fixups & UPROBE_FIX_CALL)
597 result = adjust_ret_addr(regs->sp, correction);
598
599 return result;
600}
601
602/* callback routine for handling exceptions. */
603int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
604{
605 struct die_args *args = data;
606 struct pt_regs *regs = args->regs;
607 int ret = NOTIFY_DONE;
608
609 /* We are only interested in userspace traps */
610 if (regs && !user_mode_vm(regs))
611 return NOTIFY_DONE;
612
613 switch (val) {
614 case DIE_INT3:
615 if (uprobe_pre_sstep_notifier(regs))
616 ret = NOTIFY_STOP;
617
618 break;
619
620 case DIE_DEBUG:
621 if (uprobe_post_sstep_notifier(regs))
622 ret = NOTIFY_STOP;
623
624 default:
625 break;
626 }
627
628 return ret;
629}
630
631/*
632 * This function gets called when the XOL instruction either gets trapped or
633 * the thread has a fatal signal, so reset the instruction pointer to its
634 * probed address.
635 */
636void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
637{
638 struct uprobe_task *utask = current->utask;
639
640 current->thread.trap_nr = utask->autask.saved_trap_nr;
641 handle_riprel_post_xol(auprobe, regs, NULL);
642 instruction_pointer_set(regs, utask->vaddr);
643}
644
645/*
646 * Skip these instructions as per the currently known x86 ISA.
647 * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 }
648 */
649bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
650{
651 int i;
652
653 for (i = 0; i < MAX_UINSN_BYTES; i++) {
654 if ((auprobe->insn[i] == 0x66))
655 continue;
656
657 if (auprobe->insn[i] == 0x90)
658 return true;
659
660 if (i == (MAX_UINSN_BYTES - 1))
661 break;
662
663 if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x1f))
664 return true;
665
666 if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x19))
667 return true;
668
669 if ((auprobe->insn[i] == 0x87) && (auprobe->insn[i+1] == 0xc0))
670 return true;
671
672 break;
673 }
674 return false;
675}
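A hedged summary of the byte patterns the function above emulates as no-ops (the array is editorial, not from the patch):

	/*
	 * NOP-like sequences skipped without single-stepping; any number of
	 * leading 0x66 operand-size prefixes is tolerated first.
	 */
	static const unsigned char nop_like[][2] = {
		{ 0x90 },		/* nop */
		{ 0x0f, 0x1f },		/* multi-byte nop family */
		{ 0x0f, 0x19 },		/* reserved-nop family */
		{ 0x87, 0xc0 },		/* xchg %eax,%eax */
	};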
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 255f58ae71e..54abcc0baf2 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -28,6 +28,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
@@ -137,14 +139,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	local_irq_enable();
 
 	if (!current->thread.vm86_info) {
-		printk("no vm86_info: BAD\n");
+		pr_alert("no vm86_info: BAD\n");
 		do_exit(SIGSEGV);
 	}
 	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
 	tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
 	tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
 	if (tmp) {
-		printk("vm86: could not access userspace vm86_info\n");
+		pr_alert("could not access userspace vm86_info\n");
 		do_exit(SIGSEGV);
 	}
 
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0f703f10901..22a1530146a 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -197,18 +197,6 @@ SECTIONS
 
 	INIT_DATA_SECTION(16)
 
-	/*
-	 * Code and data for a variety of lowlevel trampolines, to be
-	 * copied into base memory (< 1 MiB) during initialization.
-	 * Since it is copied early, the main copy can be discarded
-	 * afterwards.
-	 */
-	.x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
-		x86_trampoline_start = .;
-		*(.x86_trampoline)
-		x86_trampoline_end = .;
-	}
-
 	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
 		__x86_cpu_dev_start = .;
 		*(.x86_cpu_dev.init)
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 8eeb55a551b..992f890283e 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -16,6 +16,7 @@
 #include <linux/pci_ids.h>
 #include <linux/pci_regs.h>
 #include <linux/smp.h>
+#include <linux/irq.h>
 
 #include <asm/apic.h>
 #include <asm/pci-direct.h>
@@ -95,6 +96,18 @@ static void __init set_vsmp_pv_ops(void)
 	ctl = readl(address + 4);
 	printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n",
 	       cap, ctl);
+
+	/* If possible, let the vSMP foundation route the interrupt optimally */
+#ifdef CONFIG_SMP
+	if (cap & ctl & BIT(8)) {
+		ctl &= ~BIT(8);
+#ifdef CONFIG_PROC_FS
+		/* Don't let users change irq affinity via procfs */
+		no_irq_affinity = 1;
+#endif
+	}
+#endif
+
 	if (cap & ctl & (1 << 4)) {
 		/* Setup irq ops and turn on vSMP IRQ fastpath handling */
 		pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
@@ -102,12 +115,11 @@ static void __init set_vsmp_pv_ops(void)
 		pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
 		pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
 		pv_init_ops.patch = vsmp_patch;
-
 		ctl &= ~(1 << 4);
-		writel(ctl, address + 4);
-		ctl = readl(address + 4);
-		printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
 	}
+	writel(ctl, address + 4);
+	ctl = readl(address + 4);
+	pr_info("vSMP CTL: control set to:0x%08x\n", ctl);
 
 	early_iounmap(address, 8);
 }
@@ -187,12 +199,36 @@ static void __init vsmp_cap_cpus(void)
 #endif
 }
 
+static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+	return hard_smp_processor_id() >> index_msb;
+}
+
+/*
+ * In vSMP, all cpus should be capable of handling interrupts, regardless of
+ * the APIC used.
+ */
+static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask,
+					  const struct cpumask *mask)
+{
+	cpumask_setall(retmask);
+}
+
+static void vsmp_apic_post_init(void)
+{
+	/* need to update phys_pkg_id */
+	apic->phys_pkg_id = apicid_phys_pkg_id;
+	apic->vector_allocation_domain = fill_vector_allocation_domain;
+}
+
 void __init vsmp_init(void)
 {
 	detect_vsmp_box();
 	if (!is_vsmp_box())
 		return;
 
+	x86_platform.apic_post_init = vsmp_apic_post_init;
+
 	vsmp_cap_cpus();
 
 	set_vsmp_pv_ops();
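The cap & ctl & BIT(8) test above follows the vSMP capability/control convention already used for bit 4 in this function: a feature is advertised in the capability word and claimed by clearing its bit in the control word and writing that word back. A minimal sketch of the pattern (claim_vsmp_feature and cfg are illustrative names, not from the patch):

	#include <linux/bitops.h>	/* BIT() */
	#include <asm/io.h>		/* readl()/writel() */

	/* Claim feature bit 'n' from a cap/ctl register pair mapped at 'cfg'. */
	static bool claim_vsmp_feature(void __iomem *cfg, unsigned int n)
	{
		u32 cap = readl(cfg);		/* capabilities word */
		u32 ctl = readl(cfg + 4);	/* control word */

		if (!(cap & ctl & BIT(n)))
			return false;		/* not offered, or already claimed */

		writel(ctl & ~BIT(n), cfg + 4);	/* clearing the bit claims it */
		return true;
	}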
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 7515cf0e180..8d141b30904 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -18,6 +18,8 @@
  * use the vDSO.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -111,18 +113,13 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
 			      const char *message)
 {
-	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
-	struct task_struct *tsk;
-
-	if (!show_unhandled_signals || !__ratelimit(&rs))
+	if (!show_unhandled_signals)
 		return;
 
-	tsk = current;
-
-	printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
-	       level, tsk->comm, task_pid_nr(tsk),
-	       message, regs->ip, regs->cs,
-	       regs->sp, regs->ax, regs->si, regs->di);
+	pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+			      level, current->comm, task_pid_nr(current),
+			      message, regs->ip, regs->cs,
+			      regs->sp, regs->ax, regs->si, regs->di);
 }
 
 static int addr_to_vsyscall_nr(unsigned long addr)
@@ -139,6 +136,19 @@ static int addr_to_vsyscall_nr(unsigned long addr)
 	return nr;
 }
 
+#ifdef CONFIG_SECCOMP
+static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
+{
+	if (!seccomp_mode(&tsk->seccomp))
+		return 0;
+	task_pt_regs(tsk)->orig_ax = syscall_nr;
+	task_pt_regs(tsk)->ax = syscall_nr;
+	return __secure_computing(syscall_nr);
+}
+#else
+#define vsyscall_seccomp(_tsk, _nr) 0
+#endif
+
 static bool write_ok_or_segv(unsigned long ptr, size_t size)
 {
 	/*
@@ -174,6 +184,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	int vsyscall_nr;
 	int prev_sig_on_uaccess_error;
 	long ret;
+	int skip;
 
 	/*
	 * No point in checking CS -- the only way to get here is a user mode
@@ -205,9 +216,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	}
 
 	tsk = current;
-	if (seccomp_mode(&tsk->seccomp))
-		do_exit(SIGKILL);
-
 	/*
	 * With a real vsyscall, page faults cause SIGSEGV. We want to
	 * preserve that behavior to make writing exploits harder.
@@ -222,8 +230,13 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
	 * address 0".
	 */
 	ret = -EFAULT;
+	skip = 0;
 	switch (vsyscall_nr) {
 	case 0:
+		skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
+		if (skip)
+			break;
+
 		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
 		    !write_ok_or_segv(regs->si, sizeof(struct timezone)))
 			break;
@@ -234,6 +247,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 		break;
 
 	case 1:
+		skip = vsyscall_seccomp(tsk, __NR_time);
+		if (skip)
+			break;
+
 		if (!write_ok_or_segv(regs->di, sizeof(time_t)))
 			break;
 
@@ -241,6 +258,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 		break;
 
 	case 2:
+		skip = vsyscall_seccomp(tsk, __NR_getcpu);
+		if (skip)
+			break;
+
 		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
 		    !write_ok_or_segv(regs->si, sizeof(unsigned)))
 			break;
@@ -253,6 +274,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
 
+	if (skip) {
+		if ((long)regs->ax <= 0L) /* seccomp errno emulation */
+			goto do_ret;
+		goto done; /* seccomp trace/trap */
+	}
+
 	if (ret == -EFAULT) {
 		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
 		warn_bad_vsyscall(KERN_INFO, regs,
@@ -271,10 +298,11 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 	regs->ax = ret;
 
+do_ret:
 	/* Emulate a ret instruction. */
 	regs->ip = caller;
 	regs->sp += 8;
-
+done:
 	return true;
 
 sigsegv:
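For orientation, the three case labels correspond to the fixed legacy vsyscall slots, spaced 1024 bytes apart, that addr_to_vsyscall_nr() (unchanged by this patch) decodes. A sketch of that mapping, assuming the historical 0xffffffffff600000 base (vsyscall_nr_of is an illustrative name):

	/* 0 = gettimeofday, 1 = time, 2 = getcpu; -1 = not a vsyscall address */
	static int vsyscall_nr_of(unsigned long addr)
	{
		const unsigned long base = 0xffffffffff600000UL;
		int nr;

		if ((addr & ~0xC00UL) != base)	/* must be base + nr * 1024 */
			return -1;
		nr = (addr & 0xC00UL) >> 10;
		return nr >= 3 ? -1 : nr;	/* only three slots exist */
	}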
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 9796c2f3d07..6020f6f5927 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -28,6 +28,7 @@ EXPORT_SYMBOL(__put_user_8);
 
 EXPORT_SYMBOL(copy_user_generic_string);
 EXPORT_SYMBOL(copy_user_generic_unrolled);
+EXPORT_SYMBOL(copy_user_enhanced_fast_string);
 EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index bd18149b2b0..3d3e2070911 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -3,6 +3,9 @@
  *
  * Author: Suresh Siddha <suresh.b.siddha@intel.com>
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/bootmem.h>
 #include <linux/compat.h>
 #include <asm/i387.h>
@@ -162,7 +165,7 @@ int save_i387_xstate(void __user *buf)
 	BUG_ON(sig_xstate_size < xstate_size);
 
 	if ((unsigned long)buf % 64)
-		printk("save_i387_xstate: bad fpstate %p\n", buf);
+		pr_err("%s: bad fpstate %p\n", __func__, buf);
 
 	if (!used_math())
 		return 0;
@@ -422,7 +425,7 @@ static void __init xstate_enable_boot_cpu(void)
 	pcntxt_mask = eax + ((u64)edx << 32);
 
 	if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
-		printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
+		pr_err("FP/SSE not shown under xsave features 0x%llx\n",
 		       pcntxt_mask);
 		BUG();
 	}
@@ -445,9 +448,8 @@ static void __init xstate_enable_boot_cpu(void)
 
 	setup_xstate_init();
 
-	printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
-	       "cntxt size 0x%x\n",
-	       pcntxt_mask, xstate_size);
+	pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
+		pcntxt_mask, xstate_size);
 }
 
 /*
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1a7fe868f37..a28f338843e 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -36,6 +36,7 @@ config KVM
 	select TASKSTATS
 	select TASK_DELAY_ACCT
 	select PERF_EVENTS
+	select HAVE_KVM_MSI
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9fed5bedaad..0595f1397b7 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	unsigned f_lm = 0;
 #endif
 	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
+	unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
 
 	/* cpuid 1.edx */
 	const u32 kvm_supported_word0_x86_features =
@@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
 		F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
-		0 /* Reserved, DCA */ | F(XMM4_1) |
+		F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
 		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
 		F(F16C) | F(RDRAND);
@@ -247,7 +248,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 7.0.ebx */
 	const u32 kvm_supported_word9_x86_features =
-		F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS);
+		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+		F(BMI2) | F(ERMS) | f_invpcid | F(RTM);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
@@ -397,7 +399,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	case KVM_CPUID_SIGNATURE: {
 		char signature[12] = "KVMKVMKVM\0\0";
 		u32 *sigptr = (u32 *)signature;
-		entry->eax = 0;
+		entry->eax = KVM_CPUID_FEATURES;
 		entry->ebx = sigptr[0];
 		entry->ecx = sigptr[1];
 		entry->edx = sigptr[2];
@@ -408,6 +410,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
 			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
 			     (1 << KVM_FEATURE_ASYNC_PF) |
+			     (1 << KVM_FEATURE_PV_EOI) |
 			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
 
 		if (sched_info_on())
@@ -638,33 +641,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
 	return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
 }
 
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 {
-	u32 function, index;
+	u32 function = *eax, index = *ecx;
 	struct kvm_cpuid_entry2 *best;
 
-	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
 	best = kvm_find_cpuid_entry(vcpu, function, index);
 
 	if (!best)
 		best = check_cpuid_limit(vcpu, function, index);
 
 	if (best) {
-		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
-		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
-		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
-		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
-	}
+		*eax = best->eax;
+		*ebx = best->ebx;
+		*ecx = best->ecx;
+		*edx = best->edx;
+	} else
+		*eax = *ebx = *ecx = *edx = 0;
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+	u32 function, eax, ebx, ecx, edx;
+
+	function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
-	trace_kvm_cpuid(function,
-			kvm_register_read(vcpu, VCPU_REGS_RAX),
-			kvm_register_read(vcpu, VCPU_REGS_RBX),
-			kvm_register_read(vcpu, VCPU_REGS_RCX),
-			kvm_register_read(vcpu, VCPU_REGS_RDX));
+	trace_kvm_cpuid(function, eax, ebx, ecx, edx);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 26d1fb437eb..a10e4601685 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 			      struct kvm_cpuid2 *cpuid,
 			      struct kvm_cpuid_entry2 __user *entries);
+void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
 
 
 static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
@@ -51,4 +52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
 	return best && (best->ecx & bit(X86_FEATURE_OSVW));
 }
 
+static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	return best && (best->ecx & bit(X86_FEATURE_PCID));
+}
+
 #endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 83756223f8a..97d9a9914ba 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -142,6 +142,10 @@
142#define Src2FS (OpFS << Src2Shift) 142#define Src2FS (OpFS << Src2Shift)
143#define Src2GS (OpGS << Src2Shift) 143#define Src2GS (OpGS << Src2Shift)
144#define Src2Mask (OpMask << Src2Shift) 144#define Src2Mask (OpMask << Src2Shift)
145#define Mmx ((u64)1 << 40) /* MMX Vector instruction */
146#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
147#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */
148#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
145 149
146#define X2(x...) x, x 150#define X2(x...) x, x
147#define X3(x...) X2(x), x 151#define X3(x...) X2(x), x
@@ -429,11 +433,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
429 return ctxt->ops->intercept(ctxt, &info, stage); 433 return ctxt->ops->intercept(ctxt, &info, stage);
430} 434}
431 435
436static void assign_masked(ulong *dest, ulong src, ulong mask)
437{
438 *dest = (*dest & ~mask) | (src & mask);
439}
440
432static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) 441static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
433{ 442{
434 return (1UL << (ctxt->ad_bytes << 3)) - 1; 443 return (1UL << (ctxt->ad_bytes << 3)) - 1;
435} 444}
436 445
446static ulong stack_mask(struct x86_emulate_ctxt *ctxt)
447{
448 u16 sel;
449 struct desc_struct ss;
450
451 if (ctxt->mode == X86EMUL_MODE_PROT64)
452 return ~0UL;
453 ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS);
454 return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */
455}
456
457static int stack_size(struct x86_emulate_ctxt *ctxt)
458{
459 return (__fls(stack_mask(ctxt)) + 1) >> 3;
460}
461
437/* Access/update address held in a register, based on addressing mode. */ 462/* Access/update address held in a register, based on addressing mode. */
438static inline unsigned long 463static inline unsigned long
439address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) 464address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
@@ -557,6 +582,29 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
557 ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); 582 ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
558} 583}
559 584
585/*
586 * x86 defines three classes of vector instructions: explicitly
587 * aligned, explicitly unaligned, and the rest, which change behaviour
588 * depending on whether they're AVX encoded or not.
589 *
590 * Also included is CMPXCHG16B which is not a vector instruction, yet it is
591 * subject to the same check.
592 */
593static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
594{
595 if (likely(size < 16))
596 return false;
597
598 if (ctxt->d & Aligned)
599 return true;
600 else if (ctxt->d & Unaligned)
601 return false;
602 else if (ctxt->d & Avx)
603 return false;
604 else
605 return true;
606}
607
560static int __linearize(struct x86_emulate_ctxt *ctxt, 608static int __linearize(struct x86_emulate_ctxt *ctxt,
561 struct segmented_address addr, 609 struct segmented_address addr,
562 unsigned size, bool write, bool fetch, 610 unsigned size, bool write, bool fetch,
@@ -621,6 +669,8 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
621 } 669 }
622 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) 670 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
623 la &= (u32)-1; 671 la &= (u32)-1;
672 if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
673 return emulate_gp(ctxt, 0);
624 *linear = la; 674 *linear = la;
625 return X86EMUL_CONTINUE; 675 return X86EMUL_CONTINUE;
626bad: 676bad:
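
insn_aligned() together with the new check in __linearize() models the hardware alignment rule: a 16-byte access through a legacy SSE encoding takes #GP(0) unless the instruction is explicitly unaligned or AVX-encoded. A standalone model of the rule, using illustrative flag values rather than the emulator's decode bits:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define F_ALIGNED	(1u << 0)	/* e.g. MOVDQA */
#define F_UNALIGNED	(1u << 1)	/* e.g. MOVDQU */
#define F_AVX		(1u << 2)	/* VEX-encoded forms */

static bool insn_aligned(unsigned flags, unsigned size)
{
	if (size < 16)
		return false;		/* small accesses: never checked */
	if (flags & F_ALIGNED)
		return true;
	if (flags & (F_UNALIGNED | F_AVX))
		return false;
	return true;			/* legacy SSE default: must be aligned */
}

int main(void)
{
	uint64_t la = 0x1008;		/* 8-byte aligned only */

	if (insn_aligned(F_ALIGNED, 16) && (la & 15))
		puts("MOVDQA at 0x1008 -> #GP(0)");
	if (!(insn_aligned(F_UNALIGNED, 16) && (la & 15)))
		puts("MOVDQU at 0x1008 -> ok");
	return 0;
}
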
@@ -859,6 +909,40 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
859 ctxt->ops->put_fpu(ctxt); 909 ctxt->ops->put_fpu(ctxt);
860} 910}
861 911
912static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
913{
914 ctxt->ops->get_fpu(ctxt);
915 switch (reg) {
916 case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
917 case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
918 case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
919 case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
920 case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
921 case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
922 case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
923 case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
924 default: BUG();
925 }
926 ctxt->ops->put_fpu(ctxt);
927}
928
929static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
930{
931 ctxt->ops->get_fpu(ctxt);
932 switch (reg) {
933 case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
934 case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
935 case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
936 case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
937 case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
938 case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
939 case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
940 case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
941 default: BUG();
942 }
943 ctxt->ops->put_fpu(ctxt);
944}
945
862static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 946static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
863 struct operand *op) 947 struct operand *op)
864{ 948{
@@ -875,6 +959,13 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
875 read_sse_reg(ctxt, &op->vec_val, reg); 959 read_sse_reg(ctxt, &op->vec_val, reg);
876 return; 960 return;
877 } 961 }
962 if (ctxt->d & Mmx) {
963 reg &= 7;
964 op->type = OP_MM;
965 op->bytes = 8;
966 op->addr.mm = reg;
967 return;
968 }
878 969
879 op->type = OP_REG; 970 op->type = OP_REG;
880 if (ctxt->d & ByteOp) { 971 if (ctxt->d & ByteOp) {
@@ -888,6 +979,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
888 op->orig_val = op->val; 979 op->orig_val = op->val;
889} 980}
890 981
982static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
983{
984 if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP)
985 ctxt->modrm_seg = VCPU_SREG_SS;
986}
987
891static int decode_modrm(struct x86_emulate_ctxt *ctxt, 988static int decode_modrm(struct x86_emulate_ctxt *ctxt,
892 struct operand *op) 989 struct operand *op)
893{ 990{
@@ -902,7 +999,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
902 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ 999 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
903 } 1000 }
904 1001
905 ctxt->modrm = insn_fetch(u8, ctxt);
906 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; 1002 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
907 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; 1003 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
908 ctxt->modrm_rm |= (ctxt->modrm & 0x07); 1004 ctxt->modrm_rm |= (ctxt->modrm & 0x07);
@@ -920,6 +1016,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
920 read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); 1016 read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
921 return rc; 1017 return rc;
922 } 1018 }
1019 if (ctxt->d & Mmx) {
1020 op->type = OP_MM;
1021 op->bytes = 8;
 1022 op->addr.mm = ctxt->modrm_rm & 7;
1023 return rc;
1024 }
923 fetch_register_operand(op); 1025 fetch_register_operand(op);
924 return rc; 1026 return rc;
925 } 1027 }
@@ -986,15 +1088,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
986 1088
987 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) 1089 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
988 modrm_ea += insn_fetch(s32, ctxt); 1090 modrm_ea += insn_fetch(s32, ctxt);
989 else 1091 else {
990 modrm_ea += ctxt->regs[base_reg]; 1092 modrm_ea += ctxt->regs[base_reg];
1093 adjust_modrm_seg(ctxt, base_reg);
1094 }
991 if (index_reg != 4) 1095 if (index_reg != 4)
992 modrm_ea += ctxt->regs[index_reg] << scale; 1096 modrm_ea += ctxt->regs[index_reg] << scale;
993 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { 1097 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
994 if (ctxt->mode == X86EMUL_MODE_PROT64) 1098 if (ctxt->mode == X86EMUL_MODE_PROT64)
995 ctxt->rip_relative = 1; 1099 ctxt->rip_relative = 1;
996 } else 1100 } else {
997 modrm_ea += ctxt->regs[ctxt->modrm_rm]; 1101 base_reg = ctxt->modrm_rm;
1102 modrm_ea += ctxt->regs[base_reg];
1103 adjust_modrm_seg(ctxt, base_reg);
1104 }
998 switch (ctxt->modrm_mod) { 1105 switch (ctxt->modrm_mod) {
999 case 0: 1106 case 0:
1000 if (ctxt->modrm_rm == 5) 1107 if (ctxt->modrm_rm == 5)
@@ -1189,7 +1296,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1189 1296
1190/* allowed just for 8 bytes segments */ 1297/* allowed just for 8 bytes segments */
1191static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1298static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1192 u16 selector, struct desc_struct *desc) 1299 u16 selector, struct desc_struct *desc,
1300 ulong *desc_addr_p)
1193{ 1301{
1194 struct desc_ptr dt; 1302 struct desc_ptr dt;
1195 u16 index = selector >> 3; 1303 u16 index = selector >> 3;
@@ -1200,7 +1308,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1200 if (dt.size < index * 8 + 7) 1308 if (dt.size < index * 8 + 7)
1201 return emulate_gp(ctxt, selector & 0xfffc); 1309 return emulate_gp(ctxt, selector & 0xfffc);
1202 1310
1203 addr = dt.address + index * 8; 1311 *desc_addr_p = addr = dt.address + index * 8;
1204 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, 1312 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
1205 &ctxt->exception); 1313 &ctxt->exception);
1206} 1314}
@@ -1227,11 +1335,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1227static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1335static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1228 u16 selector, int seg) 1336 u16 selector, int seg)
1229{ 1337{
1230 struct desc_struct seg_desc; 1338 struct desc_struct seg_desc, old_desc;
1231 u8 dpl, rpl, cpl; 1339 u8 dpl, rpl, cpl;
1232 unsigned err_vec = GP_VECTOR; 1340 unsigned err_vec = GP_VECTOR;
1233 u32 err_code = 0; 1341 u32 err_code = 0;
1234 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ 1342 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
1343 ulong desc_addr;
1235 int ret; 1344 int ret;
1236 1345
1237 memset(&seg_desc, 0, sizeof seg_desc); 1346 memset(&seg_desc, 0, sizeof seg_desc);
@@ -1249,8 +1358,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1249 goto load; 1358 goto load;
1250 } 1359 }
1251 1360
1252 /* NULL selector is not valid for TR, CS and SS */ 1361 rpl = selector & 3;
1253 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) 1362 cpl = ctxt->ops->cpl(ctxt);
1363
1364 /* NULL selector is not valid for TR, CS and SS (except for long mode) */
1365 if ((seg == VCPU_SREG_CS
1366 || (seg == VCPU_SREG_SS
1367 && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl))
1368 || seg == VCPU_SREG_TR)
1254 && null_selector) 1369 && null_selector)
1255 goto exception; 1370 goto exception;
1256 1371
@@ -1261,7 +1376,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1261 if (null_selector) /* for NULL selector skip all following checks */ 1376 if (null_selector) /* for NULL selector skip all following checks */
1262 goto load; 1377 goto load;
1263 1378
1264 ret = read_segment_descriptor(ctxt, selector, &seg_desc); 1379 ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
1265 if (ret != X86EMUL_CONTINUE) 1380 if (ret != X86EMUL_CONTINUE)
1266 return ret; 1381 return ret;
1267 1382
@@ -1277,9 +1392,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1277 goto exception; 1392 goto exception;
1278 } 1393 }
1279 1394
1280 rpl = selector & 3;
1281 dpl = seg_desc.dpl; 1395 dpl = seg_desc.dpl;
1282 cpl = ctxt->ops->cpl(ctxt);
1283 1396
1284 switch (seg) { 1397 switch (seg) {
1285 case VCPU_SREG_SS: 1398 case VCPU_SREG_SS:
@@ -1309,6 +1422,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1309 case VCPU_SREG_TR: 1422 case VCPU_SREG_TR:
1310 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) 1423 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
1311 goto exception; 1424 goto exception;
1425 old_desc = seg_desc;
1426 seg_desc.type |= 2; /* busy */
1427 ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
1428 sizeof(seg_desc), &ctxt->exception);
1429 if (ret != X86EMUL_CONTINUE)
1430 return ret;
1312 break; 1431 break;
1313 case VCPU_SREG_LDTR: 1432 case VCPU_SREG_LDTR:
1314 if (seg_desc.s || seg_desc.type != 2) 1433 if (seg_desc.s || seg_desc.type != 2)
@@ -1387,6 +1506,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1387 case OP_XMM: 1506 case OP_XMM:
1388 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); 1507 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
1389 break; 1508 break;
1509 case OP_MM:
1510 write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm);
1511 break;
1390 case OP_NONE: 1512 case OP_NONE:
1391 /* no writeback */ 1513 /* no writeback */
1392 break; 1514 break;
@@ -1396,17 +1518,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1396 return X86EMUL_CONTINUE; 1518 return X86EMUL_CONTINUE;
1397} 1519}
1398 1520
1399static int em_push(struct x86_emulate_ctxt *ctxt) 1521static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
1400{ 1522{
1401 struct segmented_address addr; 1523 struct segmented_address addr;
1402 1524
1403 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); 1525 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes);
1404 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); 1526 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
1405 addr.seg = VCPU_SREG_SS; 1527 addr.seg = VCPU_SREG_SS;
1406 1528
1529 return segmented_write(ctxt, addr, data, bytes);
1530}
1531
1532static int em_push(struct x86_emulate_ctxt *ctxt)
1533{
1407 /* Disable writeback. */ 1534 /* Disable writeback. */
1408 ctxt->dst.type = OP_NONE; 1535 ctxt->dst.type = OP_NONE;
1409 return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); 1536 return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
1410} 1537}
1411 1538
1412static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1539static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1478,6 +1605,33 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)
1478 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); 1605 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
1479} 1606}
1480 1607
1608static int em_enter(struct x86_emulate_ctxt *ctxt)
1609{
1610 int rc;
1611 unsigned frame_size = ctxt->src.val;
1612 unsigned nesting_level = ctxt->src2.val & 31;
1613
1614 if (nesting_level)
1615 return X86EMUL_UNHANDLEABLE;
1616
1617 rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt));
1618 if (rc != X86EMUL_CONTINUE)
1619 return rc;
1620 assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP],
1621 stack_mask(ctxt));
1622 assign_masked(&ctxt->regs[VCPU_REGS_RSP],
1623 ctxt->regs[VCPU_REGS_RSP] - frame_size,
1624 stack_mask(ctxt));
1625 return X86EMUL_CONTINUE;
1626}
1627
1628static int em_leave(struct x86_emulate_ctxt *ctxt)
1629{
1630 assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP],
1631 stack_mask(ctxt));
1632 return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes);
1633}
1634
1481static int em_push_sreg(struct x86_emulate_ctxt *ctxt) 1635static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
1482{ 1636{
1483 int seg = ctxt->src2.val; 1637 int seg = ctxt->src2.val;
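
em_enter() only handles nesting level 0, where ENTER reduces to push rbp; mov rbp, rsp; sub rsp, frame_size (each step masked to the stack width as above); deeper nesting levels bail out as X86EMUL_UNHANDLEABLE. A toy model of the level-0 case on a 64-bit stack, with arbitrary starting values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t rsp = 0x8000, rbp = 0x1234;
	uint64_t frame_size = 0x20;

	rsp -= 8;		/* push(rbp): old frame pointer saved */
	rbp  = rsp;		/* new frame pointer */
	rsp -= frame_size;	/* reserve the local frame */

	printf("rbp=%#llx rsp=%#llx\n",
	       (unsigned long long)rbp, (unsigned long long)rsp);
	/* rbp=0x7ff8 rsp=0x7fd8 */
	return 0;
}
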
@@ -1915,8 +2069,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
1915 u32 eax, ebx, ecx, edx; 2069 u32 eax, ebx, ecx, edx;
1916 2070
1917 eax = ecx = 0; 2071 eax = ecx = 0;
1918 return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) 2072 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
1919 && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 2073 return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
1920 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 2074 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
1921 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; 2075 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
1922} 2076}
@@ -1935,32 +2089,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
1935 2089
1936 eax = 0x00000000; 2090 eax = 0x00000000;
1937 ecx = 0x00000000; 2091 ecx = 0x00000000;
1938 if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { 2092 ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
1939 /* 2093 /*
1940 * Intel ("GenuineIntel") 2094 * Intel ("GenuineIntel")
1941 * remark: Intel CPUs only support "syscall" in 64bit 2095 * remark: Intel CPUs only support "syscall" in 64bit
 1942 * longmode. Also a 64bit guest with a 2096 * longmode. Also a 64bit guest with a
1943 * 32bit compat-app running will #UD !! While this 2097 * 32bit compat-app running will #UD !! While this
1944 * behaviour can be fixed (by emulating) into AMD 2098 * behaviour can be fixed (by emulating) into AMD
1945 * response - CPUs of AMD can't behave like Intel. 2099 * response - CPUs of AMD can't behave like Intel.
1946 */ 2100 */
1947 if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && 2101 if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
1948 ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && 2102 ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
1949 edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) 2103 edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
1950 return false; 2104 return false;
1951 2105
1952 /* AMD ("AuthenticAMD") */ 2106 /* AMD ("AuthenticAMD") */
1953 if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && 2107 if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
1954 ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && 2108 ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
1955 edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) 2109 edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
1956 return true; 2110 return true;
1957 2111
1958 /* AMD ("AMDisbetter!") */ 2112 /* AMD ("AMDisbetter!") */
1959 if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && 2113 if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
1960 ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && 2114 ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
1961 edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) 2115 edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
1962 return true; 2116 return true;
1963 }
1964 2117
1965 /* default: (not Intel, not AMD), apply Intel's stricter rules... */ 2118 /* default: (not Intel, not AMD), apply Intel's stricter rules... */
1966 return false; 2119 return false;
@@ -2469,13 +2622,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2469 ulong old_tss_base = 2622 ulong old_tss_base =
2470 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); 2623 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
2471 u32 desc_limit; 2624 u32 desc_limit;
2625 ulong desc_addr;
2472 2626
2473 /* FIXME: old_tss_base == ~0 ? */ 2627 /* FIXME: old_tss_base == ~0 ? */
2474 2628
2475 ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); 2629 ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);
2476 if (ret != X86EMUL_CONTINUE) 2630 if (ret != X86EMUL_CONTINUE)
2477 return ret; 2631 return ret;
2478 ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); 2632 ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);
2479 if (ret != X86EMUL_CONTINUE) 2633 if (ret != X86EMUL_CONTINUE)
2480 return ret; 2634 return ret;
2481 2635
@@ -2790,7 +2944,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
2790 2944
2791static int em_mov(struct x86_emulate_ctxt *ctxt) 2945static int em_mov(struct x86_emulate_ctxt *ctxt)
2792{ 2946{
2793 ctxt->dst.val = ctxt->src.val; 2947 memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
2794 return X86EMUL_CONTINUE; 2948 return X86EMUL_CONTINUE;
2795} 2949}
2796 2950
@@ -2870,10 +3024,22 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
2870 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); 3024 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
2871} 3025}
2872 3026
2873static int em_movdqu(struct x86_emulate_ctxt *ctxt) 3027static int em_lldt(struct x86_emulate_ctxt *ctxt)
2874{ 3028{
2875 memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes); 3029 u16 sel = ctxt->src.val;
2876 return X86EMUL_CONTINUE; 3030
3031 /* Disable writeback. */
3032 ctxt->dst.type = OP_NONE;
3033 return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
3034}
3035
3036static int em_ltr(struct x86_emulate_ctxt *ctxt)
3037{
3038 u16 sel = ctxt->src.val;
3039
3040 /* Disable writeback. */
3041 ctxt->dst.type = OP_NONE;
3042 return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR);
2877} 3043}
2878 3044
2879static int em_invlpg(struct x86_emulate_ctxt *ctxt) 3045static int em_invlpg(struct x86_emulate_ctxt *ctxt)
@@ -2917,11 +3083,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2917 return X86EMUL_CONTINUE; 3083 return X86EMUL_CONTINUE;
2918} 3084}
2919 3085
3086static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
3087 void (*get)(struct x86_emulate_ctxt *ctxt,
3088 struct desc_ptr *ptr))
3089{
3090 struct desc_ptr desc_ptr;
3091
3092 if (ctxt->mode == X86EMUL_MODE_PROT64)
3093 ctxt->op_bytes = 8;
3094 get(ctxt, &desc_ptr);
3095 if (ctxt->op_bytes == 2) {
3096 ctxt->op_bytes = 4;
3097 desc_ptr.address &= 0x00ffffff;
3098 }
3099 /* Disable writeback. */
3100 ctxt->dst.type = OP_NONE;
3101 return segmented_write(ctxt, ctxt->dst.addr.mem,
3102 &desc_ptr, 2 + ctxt->op_bytes);
3103}
3104
3105static int em_sgdt(struct x86_emulate_ctxt *ctxt)
3106{
3107 return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt);
3108}
3109
3110static int em_sidt(struct x86_emulate_ctxt *ctxt)
3111{
3112 return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
3113}
3114
2920static int em_lgdt(struct x86_emulate_ctxt *ctxt) 3115static int em_lgdt(struct x86_emulate_ctxt *ctxt)
2921{ 3116{
2922 struct desc_ptr desc_ptr; 3117 struct desc_ptr desc_ptr;
2923 int rc; 3118 int rc;
2924 3119
3120 if (ctxt->mode == X86EMUL_MODE_PROT64)
3121 ctxt->op_bytes = 8;
2925 rc = read_descriptor(ctxt, ctxt->src.addr.mem, 3122 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
2926 &desc_ptr.size, &desc_ptr.address, 3123 &desc_ptr.size, &desc_ptr.address,
2927 ctxt->op_bytes); 3124 ctxt->op_bytes);
@@ -2949,6 +3146,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
2949 struct desc_ptr desc_ptr; 3146 struct desc_ptr desc_ptr;
2950 int rc; 3147 int rc;
2951 3148
3149 if (ctxt->mode == X86EMUL_MODE_PROT64)
3150 ctxt->op_bytes = 8;
2952 rc = read_descriptor(ctxt, ctxt->src.addr.mem, 3151 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
2953 &desc_ptr.size, &desc_ptr.address, 3152 &desc_ptr.size, &desc_ptr.address,
2954 ctxt->op_bytes); 3153 ctxt->op_bytes);
@@ -3061,34 +3260,48 @@ static int em_btc(struct x86_emulate_ctxt *ctxt)
3061 3260
3062static int em_bsf(struct x86_emulate_ctxt *ctxt) 3261static int em_bsf(struct x86_emulate_ctxt *ctxt)
3063{ 3262{
3064 u8 zf; 3263 emulate_2op_SrcV_nobyte(ctxt, "bsf");
3065
3066 __asm__ ("bsf %2, %0; setz %1"
3067 : "=r"(ctxt->dst.val), "=q"(zf)
3068 : "r"(ctxt->src.val));
3069
3070 ctxt->eflags &= ~X86_EFLAGS_ZF;
3071 if (zf) {
3072 ctxt->eflags |= X86_EFLAGS_ZF;
3073 /* Disable writeback. */
3074 ctxt->dst.type = OP_NONE;
3075 }
3076 return X86EMUL_CONTINUE; 3264 return X86EMUL_CONTINUE;
3077} 3265}
3078 3266
3079static int em_bsr(struct x86_emulate_ctxt *ctxt) 3267static int em_bsr(struct x86_emulate_ctxt *ctxt)
3080{ 3268{
3081 u8 zf; 3269 emulate_2op_SrcV_nobyte(ctxt, "bsr");
3270 return X86EMUL_CONTINUE;
3271}
3082 3272
3083 __asm__ ("bsr %2, %0; setz %1" 3273static int em_cpuid(struct x86_emulate_ctxt *ctxt)
3084 : "=r"(ctxt->dst.val), "=q"(zf) 3274{
3085 : "r"(ctxt->src.val)); 3275 u32 eax, ebx, ecx, edx;
3276
3277 eax = ctxt->regs[VCPU_REGS_RAX];
3278 ecx = ctxt->regs[VCPU_REGS_RCX];
3279 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
3280 ctxt->regs[VCPU_REGS_RAX] = eax;
3281 ctxt->regs[VCPU_REGS_RBX] = ebx;
3282 ctxt->regs[VCPU_REGS_RCX] = ecx;
3283 ctxt->regs[VCPU_REGS_RDX] = edx;
3284 return X86EMUL_CONTINUE;
3285}
3086 3286
3087 ctxt->eflags &= ~X86_EFLAGS_ZF; 3287static int em_lahf(struct x86_emulate_ctxt *ctxt)
3088 if (zf) { 3288{
3089 ctxt->eflags |= X86_EFLAGS_ZF; 3289 ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL;
3090 /* Disable writeback. */ 3290 ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8;
3091 ctxt->dst.type = OP_NONE; 3291 return X86EMUL_CONTINUE;
3292}
3293
3294static int em_bswap(struct x86_emulate_ctxt *ctxt)
3295{
3296 switch (ctxt->op_bytes) {
3297#ifdef CONFIG_X86_64
3298 case 8:
3299 asm("bswap %0" : "+r"(ctxt->dst.val));
3300 break;
3301#endif
3302 default:
3303 asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val));
3304 break;
3092 } 3305 }
3093 return X86EMUL_CONTINUE; 3306 return X86EMUL_CONTINUE;
3094} 3307}
@@ -3286,8 +3499,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3286 .check_perm = (_p) } 3499 .check_perm = (_p) }
3287#define N D(0) 3500#define N D(0)
3288#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } 3501#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
3289#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } 3502#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
3290#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } 3503#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
3291#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } 3504#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
3292#define II(_f, _e, _i) \ 3505#define II(_f, _e, _i) \
3293 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } 3506 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
@@ -3307,25 +3520,25 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3307 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) 3520 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
3308 3521
3309static struct opcode group7_rm1[] = { 3522static struct opcode group7_rm1[] = {
3310 DI(SrcNone | ModRM | Priv, monitor), 3523 DI(SrcNone | Priv, monitor),
3311 DI(SrcNone | ModRM | Priv, mwait), 3524 DI(SrcNone | Priv, mwait),
3312 N, N, N, N, N, N, 3525 N, N, N, N, N, N,
3313}; 3526};
3314 3527
3315static struct opcode group7_rm3[] = { 3528static struct opcode group7_rm3[] = {
3316 DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), 3529 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
3317 II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), 3530 II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall),
3318 DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), 3531 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
3319 DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), 3532 DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
3320 DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), 3533 DIP(SrcNone | Prot | Priv, stgi, check_svme),
3321 DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), 3534 DIP(SrcNone | Prot | Priv, clgi, check_svme),
3322 DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), 3535 DIP(SrcNone | Prot | Priv, skinit, check_svme),
3323 DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), 3536 DIP(SrcNone | Prot | Priv, invlpga, check_svme),
3324}; 3537};
3325 3538
3326static struct opcode group7_rm7[] = { 3539static struct opcode group7_rm7[] = {
3327 N, 3540 N,
3328 DIP(SrcNone | ModRM, rdtscp, check_rdtsc), 3541 DIP(SrcNone, rdtscp, check_rdtsc),
3329 N, N, N, N, N, N, 3542 N, N, N, N, N, N,
3330}; 3543};
3331 3544
@@ -3341,81 +3554,86 @@ static struct opcode group1[] = {
3341}; 3554};
3342 3555
3343static struct opcode group1A[] = { 3556static struct opcode group1A[] = {
3344 I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N, 3557 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
3345}; 3558};
3346 3559
3347static struct opcode group3[] = { 3560static struct opcode group3[] = {
3348 I(DstMem | SrcImm | ModRM, em_test), 3561 I(DstMem | SrcImm, em_test),
3349 I(DstMem | SrcImm | ModRM, em_test), 3562 I(DstMem | SrcImm, em_test),
3350 I(DstMem | SrcNone | ModRM | Lock, em_not), 3563 I(DstMem | SrcNone | Lock, em_not),
3351 I(DstMem | SrcNone | ModRM | Lock, em_neg), 3564 I(DstMem | SrcNone | Lock, em_neg),
3352 I(SrcMem | ModRM, em_mul_ex), 3565 I(SrcMem, em_mul_ex),
3353 I(SrcMem | ModRM, em_imul_ex), 3566 I(SrcMem, em_imul_ex),
3354 I(SrcMem | ModRM, em_div_ex), 3567 I(SrcMem, em_div_ex),
3355 I(SrcMem | ModRM, em_idiv_ex), 3568 I(SrcMem, em_idiv_ex),
3356}; 3569};
3357 3570
3358static struct opcode group4[] = { 3571static struct opcode group4[] = {
3359 I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), 3572 I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
3360 I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), 3573 I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
3361 N, N, N, N, N, N, 3574 N, N, N, N, N, N,
3362}; 3575};
3363 3576
3364static struct opcode group5[] = { 3577static struct opcode group5[] = {
3365 I(DstMem | SrcNone | ModRM | Lock, em_grp45), 3578 I(DstMem | SrcNone | Lock, em_grp45),
3366 I(DstMem | SrcNone | ModRM | Lock, em_grp45), 3579 I(DstMem | SrcNone | Lock, em_grp45),
3367 I(SrcMem | ModRM | Stack, em_grp45), 3580 I(SrcMem | Stack, em_grp45),
3368 I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), 3581 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
3369 I(SrcMem | ModRM | Stack, em_grp45), 3582 I(SrcMem | Stack, em_grp45),
3370 I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45), 3583 I(SrcMemFAddr | ImplicitOps, em_grp45),
3371 I(SrcMem | ModRM | Stack, em_grp45), N, 3584 I(SrcMem | Stack, em_grp45), N,
3372}; 3585};
3373 3586
3374static struct opcode group6[] = { 3587static struct opcode group6[] = {
3375 DI(ModRM | Prot, sldt), 3588 DI(Prot, sldt),
3376 DI(ModRM | Prot, str), 3589 DI(Prot, str),
3377 DI(ModRM | Prot | Priv, lldt), 3590 II(Prot | Priv | SrcMem16, em_lldt, lldt),
3378 DI(ModRM | Prot | Priv, ltr), 3591 II(Prot | Priv | SrcMem16, em_ltr, ltr),
3379 N, N, N, N, 3592 N, N, N, N,
3380}; 3593};
3381 3594
3382static struct group_dual group7 = { { 3595static struct group_dual group7 = { {
3383 DI(ModRM | Mov | DstMem | Priv, sgdt), 3596 II(Mov | DstMem | Priv, em_sgdt, sgdt),
3384 DI(ModRM | Mov | DstMem | Priv, sidt), 3597 II(Mov | DstMem | Priv, em_sidt, sidt),
3385 II(ModRM | SrcMem | Priv, em_lgdt, lgdt), 3598 II(SrcMem | Priv, em_lgdt, lgdt),
3386 II(ModRM | SrcMem | Priv, em_lidt, lidt), 3599 II(SrcMem | Priv, em_lidt, lidt),
3387 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, 3600 II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
3388 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), 3601 II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
3389 II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), 3602 II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
3390}, { 3603}, {
3391 I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), 3604 I(SrcNone | Priv | VendorSpecific, em_vmcall),
3392 EXT(0, group7_rm1), 3605 EXT(0, group7_rm1),
3393 N, EXT(0, group7_rm3), 3606 N, EXT(0, group7_rm3),
3394 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, 3607 II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
3395 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), 3608 II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
3609 EXT(0, group7_rm7),
3396} }; 3610} };
3397 3611
3398static struct opcode group8[] = { 3612static struct opcode group8[] = {
3399 N, N, N, N, 3613 N, N, N, N,
3400 I(DstMem | SrcImmByte | ModRM, em_bt), 3614 I(DstMem | SrcImmByte, em_bt),
3401 I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts), 3615 I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
3402 I(DstMem | SrcImmByte | ModRM | Lock, em_btr), 3616 I(DstMem | SrcImmByte | Lock, em_btr),
3403 I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc), 3617 I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
3404}; 3618};
3405 3619
3406static struct group_dual group9 = { { 3620static struct group_dual group9 = { {
3407 N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, 3621 N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
3408}, { 3622}, {
3409 N, N, N, N, N, N, N, N, 3623 N, N, N, N, N, N, N, N,
3410} }; 3624} };
3411 3625
3412static struct opcode group11[] = { 3626static struct opcode group11[] = {
3413 I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov), 3627 I(DstMem | SrcImm | Mov | PageTable, em_mov),
3414 X7(D(Undefined)), 3628 X7(D(Undefined)),
3415}; 3629};
3416 3630
3417static struct gprefix pfx_0f_6f_0f_7f = { 3631static struct gprefix pfx_0f_6f_0f_7f = {
3418 N, N, N, I(Sse, em_movdqu), 3632 I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
3633};
3634
3635static struct gprefix pfx_vmovntpx = {
3636 I(0, em_mov), N, N, N,
3419}; 3637};
3420 3638
3421static struct opcode opcode_table[256] = { 3639static struct opcode opcode_table[256] = {
@@ -3464,10 +3682,10 @@ static struct opcode opcode_table[256] = {
3464 /* 0x70 - 0x7F */ 3682 /* 0x70 - 0x7F */
3465 X16(D(SrcImmByte)), 3683 X16(D(SrcImmByte)),
3466 /* 0x80 - 0x87 */ 3684 /* 0x80 - 0x87 */
3467 G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), 3685 G(ByteOp | DstMem | SrcImm, group1),
3468 G(DstMem | SrcImm | ModRM | Group, group1), 3686 G(DstMem | SrcImm, group1),
3469 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), 3687 G(ByteOp | DstMem | SrcImm | No64, group1),
3470 G(DstMem | SrcImmByte | ModRM | Group, group1), 3688 G(DstMem | SrcImmByte, group1),
3471 I2bv(DstMem | SrcReg | ModRM, em_test), 3689 I2bv(DstMem | SrcReg | ModRM, em_test),
3472 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), 3690 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
3473 /* 0x88 - 0x8F */ 3691 /* 0x88 - 0x8F */
@@ -3483,7 +3701,7 @@ static struct opcode opcode_table[256] = {
3483 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), 3701 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
3484 I(SrcImmFAddr | No64, em_call_far), N, 3702 I(SrcImmFAddr | No64, em_call_far), N,
3485 II(ImplicitOps | Stack, em_pushf, pushf), 3703 II(ImplicitOps | Stack, em_pushf, pushf),
3486 II(ImplicitOps | Stack, em_popf, popf), N, N, 3704 II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf),
3487 /* 0xA0 - 0xA7 */ 3705 /* 0xA0 - 0xA7 */
3488 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 3706 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
3489 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), 3707 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
@@ -3506,7 +3724,8 @@ static struct opcode opcode_table[256] = {
3506 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), 3724 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
3507 G(ByteOp, group11), G(0, group11), 3725 G(ByteOp, group11), G(0, group11),
3508 /* 0xC8 - 0xCF */ 3726 /* 0xC8 - 0xCF */
3509 N, N, N, I(ImplicitOps | Stack, em_ret_far), 3727 I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
3728 N, I(ImplicitOps | Stack, em_ret_far),
3510 D(ImplicitOps), DI(SrcImmByte, intn), 3729 D(ImplicitOps), DI(SrcImmByte, intn),
3511 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), 3730 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
3512 /* 0xD0 - 0xD7 */ 3731 /* 0xD0 - 0xD7 */
@@ -3549,7 +3768,8 @@ static struct opcode twobyte_table[256] = {
3549 IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), 3768 IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
3550 IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), 3769 IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
3551 N, N, N, N, 3770 N, N, N, N,
3552 N, N, N, N, N, N, N, N, 3771 N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
3772 N, N, N, N,
3553 /* 0x30 - 0x3F */ 3773 /* 0x30 - 0x3F */
3554 II(ImplicitOps | Priv, em_wrmsr, wrmsr), 3774 II(ImplicitOps | Priv, em_wrmsr, wrmsr),
3555 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), 3775 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
@@ -3579,7 +3799,7 @@ static struct opcode twobyte_table[256] = {
3579 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3799 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
3580 /* 0xA0 - 0xA7 */ 3800 /* 0xA0 - 0xA7 */
3581 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), 3801 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
3582 DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), 3802 II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
3583 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3803 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3584 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 3804 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
3585 /* 0xA8 - 0xAF */ 3805 /* 0xA8 - 0xAF */
@@ -3602,11 +3822,12 @@ static struct opcode twobyte_table[256] = {
3602 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), 3822 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
3603 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), 3823 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
3604 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 3824 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3605 /* 0xC0 - 0xCF */ 3825 /* 0xC0 - 0xC7 */
3606 D2bv(DstMem | SrcReg | ModRM | Lock), 3826 D2bv(DstMem | SrcReg | ModRM | Lock),
3607 N, D(DstMem | SrcReg | ModRM | Mov), 3827 N, D(DstMem | SrcReg | ModRM | Mov),
3608 N, N, N, GD(0, &group9), 3828 N, N, N, GD(0, &group9),
3609 N, N, N, N, N, N, N, N, 3829 /* 0xC8 - 0xCF */
3830 X8(I(DstReg, em_bswap)),
3610 /* 0xD0 - 0xDF */ 3831 /* 0xD0 - 0xDF */
3611 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 3832 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
3612 /* 0xE0 - 0xEF */ 3833 /* 0xE0 - 0xEF */
@@ -3897,17 +4118,16 @@ done_prefixes:
3897 } 4118 }
3898 ctxt->d = opcode.flags; 4119 ctxt->d = opcode.flags;
3899 4120
4121 if (ctxt->d & ModRM)
4122 ctxt->modrm = insn_fetch(u8, ctxt);
4123
3900 while (ctxt->d & GroupMask) { 4124 while (ctxt->d & GroupMask) {
3901 switch (ctxt->d & GroupMask) { 4125 switch (ctxt->d & GroupMask) {
3902 case Group: 4126 case Group:
3903 ctxt->modrm = insn_fetch(u8, ctxt);
3904 --ctxt->_eip;
3905 goffset = (ctxt->modrm >> 3) & 7; 4127 goffset = (ctxt->modrm >> 3) & 7;
3906 opcode = opcode.u.group[goffset]; 4128 opcode = opcode.u.group[goffset];
3907 break; 4129 break;
3908 case GroupDual: 4130 case GroupDual:
3909 ctxt->modrm = insn_fetch(u8, ctxt);
3910 --ctxt->_eip;
3911 goffset = (ctxt->modrm >> 3) & 7; 4131 goffset = (ctxt->modrm >> 3) & 7;
3912 if ((ctxt->modrm >> 6) == 3) 4132 if ((ctxt->modrm >> 6) == 3)
3913 opcode = opcode.u.gdual->mod3[goffset]; 4133 opcode = opcode.u.gdual->mod3[goffset];
@@ -3960,6 +4180,8 @@ done_prefixes:
3960 4180
3961 if (ctxt->d & Sse) 4181 if (ctxt->d & Sse)
3962 ctxt->op_bytes = 16; 4182 ctxt->op_bytes = 16;
4183 else if (ctxt->d & Mmx)
4184 ctxt->op_bytes = 8;
3963 4185
3964 /* ModRM and SIB bytes. */ 4186 /* ModRM and SIB bytes. */
3965 if (ctxt->d & ModRM) { 4187 if (ctxt->d & ModRM) {
@@ -4030,6 +4252,35 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
4030 return false; 4252 return false;
4031} 4253}
4032 4254
4255static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
4256{
4257 bool fault = false;
4258
4259 ctxt->ops->get_fpu(ctxt);
4260 asm volatile("1: fwait \n\t"
4261 "2: \n\t"
4262 ".pushsection .fixup,\"ax\" \n\t"
4263 "3: \n\t"
4264 "movb $1, %[fault] \n\t"
4265 "jmp 2b \n\t"
4266 ".popsection \n\t"
4267 _ASM_EXTABLE(1b, 3b)
4268 : [fault]"+qm"(fault));
4269 ctxt->ops->put_fpu(ctxt);
4270
4271 if (unlikely(fault))
4272 return emulate_exception(ctxt, MF_VECTOR, 0, false);
4273
4274 return X86EMUL_CONTINUE;
4275}
4276
4277static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
4278 struct operand *op)
4279{
4280 if (op->type == OP_MM)
4281 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
4282}
4283
4033int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) 4284int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4034{ 4285{
4035 struct x86_emulate_ops *ops = ctxt->ops; 4286 struct x86_emulate_ops *ops = ctxt->ops;
@@ -4054,18 +4305,31 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4054 goto done; 4305 goto done;
4055 } 4306 }
4056 4307
4057 if ((ctxt->d & Sse) 4308 if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
4058 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) 4309 || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
4059 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
4060 rc = emulate_ud(ctxt); 4310 rc = emulate_ud(ctxt);
4061 goto done; 4311 goto done;
4062 } 4312 }
4063 4313
4064 if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { 4314 if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
4065 rc = emulate_nm(ctxt); 4315 rc = emulate_nm(ctxt);
4066 goto done; 4316 goto done;
4067 } 4317 }
4068 4318
4319 if (ctxt->d & Mmx) {
4320 rc = flush_pending_x87_faults(ctxt);
4321 if (rc != X86EMUL_CONTINUE)
4322 goto done;
4323 /*
4324 * Now that we know the fpu is exception safe, we can fetch
4325 * operands from it.
4326 */
4327 fetch_possible_mmx_operand(ctxt, &ctxt->src);
4328 fetch_possible_mmx_operand(ctxt, &ctxt->src2);
4329 if (!(ctxt->d & Mov))
4330 fetch_possible_mmx_operand(ctxt, &ctxt->dst);
4331 }
4332
4069 if (unlikely(ctxt->guest_mode) && ctxt->intercept) { 4333 if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
4070 rc = emulator_check_intercept(ctxt, ctxt->intercept, 4334 rc = emulator_check_intercept(ctxt, ctxt->intercept,
4071 X86_ICPT_PRE_EXCEPT); 4335 X86_ICPT_PRE_EXCEPT);
@@ -4327,12 +4591,12 @@ twobyte_insn:
4327 break; 4591 break;
4328 case 0xb6 ... 0xb7: /* movzx */ 4592 case 0xb6 ... 0xb7: /* movzx */
4329 ctxt->dst.bytes = ctxt->op_bytes; 4593 ctxt->dst.bytes = ctxt->op_bytes;
4330 ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val 4594 ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
4331 : (u16) ctxt->src.val; 4595 : (u16) ctxt->src.val;
4332 break; 4596 break;
4333 case 0xbe ... 0xbf: /* movsx */ 4597 case 0xbe ... 0xbf: /* movsx */
4334 ctxt->dst.bytes = ctxt->op_bytes; 4598 ctxt->dst.bytes = ctxt->op_bytes;
4335 ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : 4599 ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
4336 (s16) ctxt->src.val; 4600 (s16) ctxt->src.val;
4337 break; 4601 break;
4338 case 0xc0 ... 0xc1: /* xadd */ 4602 case 0xc0 ... 0xc1: /* xadd */
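
With G() and GD() now implying ModRM, the decoder fetches the ModRM byte exactly once, right after the opcode lookup, instead of refetching it and rewinding _eip inside each group handler. For reference, a standalone sketch of the field split the group dispatch relies on (example byte chosen arbitrarily):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t modrm = 0xd1;			/* 11 010 001 */
	unsigned mod = (modrm & 0xc0) >> 6;	/* addressing mode (3 = register) */
	unsigned reg = (modrm & 0x38) >> 3;	/* goffset into the group table */
	unsigned rm  =  modrm & 0x07;

	printf("mod=%u reg=%u rm=%u\n", mod, reg, rm);	/* mod=3 reg=2 rm=1 */
	return 0;
}
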
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index d68f99df690..adba28f88d1 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -34,7 +34,6 @@
34 34
35#include <linux/kvm_host.h> 35#include <linux/kvm_host.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/workqueue.h>
38 37
39#include "irq.h" 38#include "irq.h"
40#include "i8254.h" 39#include "i8254.h"
@@ -249,7 +248,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
249 /* in this case, we had multiple outstanding pit interrupts 248 /* in this case, we had multiple outstanding pit interrupts
250 * that we needed to inject. Reinject 249 * that we needed to inject. Reinject
251 */ 250 */
252 queue_work(ps->pit->wq, &ps->pit->expired); 251 queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
253 ps->irq_ack = 1; 252 ps->irq_ack = 1;
254 spin_unlock(&ps->inject_lock); 253 spin_unlock(&ps->inject_lock);
255} 254}
@@ -270,7 +269,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
270static void destroy_pit_timer(struct kvm_pit *pit) 269static void destroy_pit_timer(struct kvm_pit *pit)
271{ 270{
272 hrtimer_cancel(&pit->pit_state.pit_timer.timer); 271 hrtimer_cancel(&pit->pit_state.pit_timer.timer);
273 cancel_work_sync(&pit->expired); 272 flush_kthread_work(&pit->expired);
274} 273}
275 274
276static bool kpit_is_periodic(struct kvm_timer *ktimer) 275static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -284,7 +283,7 @@ static struct kvm_timer_ops kpit_ops = {
284 .is_periodic = kpit_is_periodic, 283 .is_periodic = kpit_is_periodic,
285}; 284};
286 285
287static void pit_do_work(struct work_struct *work) 286static void pit_do_work(struct kthread_work *work)
288{ 287{
289 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); 288 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
290 struct kvm *kvm = pit->kvm; 289 struct kvm *kvm = pit->kvm;
@@ -328,7 +327,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
328 327
329 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 328 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
330 atomic_inc(&ktimer->pending); 329 atomic_inc(&ktimer->pending);
331 queue_work(pt->wq, &pt->expired); 330 queue_kthread_work(&pt->worker, &pt->expired);
332 } 331 }
333 332
334 if (ktimer->t_ops->is_periodic(ktimer)) { 333 if (ktimer->t_ops->is_periodic(ktimer)) {
@@ -353,7 +352,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
353 352
354 /* TODO The new value only affected after the retriggered */ 353 /* TODO The new value only affected after the retriggered */
355 hrtimer_cancel(&pt->timer); 354 hrtimer_cancel(&pt->timer);
356 cancel_work_sync(&ps->pit->expired); 355 flush_kthread_work(&ps->pit->expired);
357 pt->period = interval; 356 pt->period = interval;
358 ps->is_periodic = is_period; 357 ps->is_periodic = is_period;
359 358
@@ -669,6 +668,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
669{ 668{
670 struct kvm_pit *pit; 669 struct kvm_pit *pit;
671 struct kvm_kpit_state *pit_state; 670 struct kvm_kpit_state *pit_state;
671 struct pid *pid;
672 pid_t pid_nr;
672 int ret; 673 int ret;
673 674
674 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); 675 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
@@ -685,14 +686,20 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
685 mutex_lock(&pit->pit_state.lock); 686 mutex_lock(&pit->pit_state.lock);
686 spin_lock_init(&pit->pit_state.inject_lock); 687 spin_lock_init(&pit->pit_state.inject_lock);
687 688
688 pit->wq = create_singlethread_workqueue("kvm-pit-wq"); 689 pid = get_pid(task_tgid(current));
689 if (!pit->wq) { 690 pid_nr = pid_vnr(pid);
691 put_pid(pid);
692
693 init_kthread_worker(&pit->worker);
694 pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
695 "kvm-pit/%d", pid_nr);
696 if (IS_ERR(pit->worker_task)) {
690 mutex_unlock(&pit->pit_state.lock); 697 mutex_unlock(&pit->pit_state.lock);
691 kvm_free_irq_source_id(kvm, pit->irq_source_id); 698 kvm_free_irq_source_id(kvm, pit->irq_source_id);
692 kfree(pit); 699 kfree(pit);
693 return NULL; 700 return NULL;
694 } 701 }
695 INIT_WORK(&pit->expired, pit_do_work); 702 init_kthread_work(&pit->expired, pit_do_work);
696 703
697 kvm->arch.vpit = pit; 704 kvm->arch.vpit = pit;
698 pit->kvm = kvm; 705 pit->kvm = kvm;
@@ -736,7 +743,7 @@ fail:
736 kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); 743 kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
737 kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); 744 kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
738 kvm_free_irq_source_id(kvm, pit->irq_source_id); 745 kvm_free_irq_source_id(kvm, pit->irq_source_id);
739 destroy_workqueue(pit->wq); 746 kthread_stop(pit->worker_task);
740 kfree(pit); 747 kfree(pit);
741 return NULL; 748 return NULL;
742} 749}
@@ -756,10 +763,10 @@ void kvm_free_pit(struct kvm *kvm)
756 mutex_lock(&kvm->arch.vpit->pit_state.lock); 763 mutex_lock(&kvm->arch.vpit->pit_state.lock);
757 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 764 timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
758 hrtimer_cancel(timer); 765 hrtimer_cancel(timer);
759 cancel_work_sync(&kvm->arch.vpit->expired); 766 flush_kthread_work(&kvm->arch.vpit->expired);
767 kthread_stop(kvm->arch.vpit->worker_task);
760 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); 768 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
761 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 769 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
762 destroy_workqueue(kvm->arch.vpit->wq);
763 kfree(kvm->arch.vpit); 770 kfree(kvm->arch.vpit);
764 } 771 }
765} 772}
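
The conversion above moves PIT interrupt reinjection from a shared singlethreaded workqueue onto a dedicated kthread_worker, giving each VM its own "kvm-pit/<pid>" thread. As a rough user-space analogy (pthreads standing in for the kthread_worker API, not the kernel implementation), the lifecycle kvm_create_pit()/kvm_free_pit() now manage looks like this:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct worker {				/* models struct kthread_worker */
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool queued, busy, stop;
};

static void do_work(void)		/* stands in for pit_do_work() */
{
	puts("reinjecting pending PIT interrupts");
}

static void *worker_main(void *arg)	/* models kthread_worker_fn() */
{
	struct worker *w = arg;

	pthread_mutex_lock(&w->lock);
	for (;;) {
		while (!w->queued && !w->stop)
			pthread_cond_wait(&w->cond, &w->lock);
		if (w->stop)
			break;
		w->queued = false;
		w->busy = true;
		pthread_mutex_unlock(&w->lock);
		do_work();
		pthread_mutex_lock(&w->lock);
		w->busy = false;
		pthread_cond_broadcast(&w->cond);	/* wake any flusher */
	}
	pthread_mutex_unlock(&w->lock);
	return NULL;
}

int main(void)
{
	struct worker w = { PTHREAD_MUTEX_INITIALIZER,
			    PTHREAD_COND_INITIALIZER, false, false, false };
	pthread_t task;

	pthread_create(&task, NULL, worker_main, &w);	/* kthread_run() */

	pthread_mutex_lock(&w.lock);
	w.queued = true;				/* queue_kthread_work() */
	pthread_cond_broadcast(&w.cond);
	while (w.queued || w.busy)			/* flush_kthread_work() */
		pthread_cond_wait(&w.cond, &w.lock);
	pthread_mutex_unlock(&w.lock);

	pthread_mutex_lock(&w.lock);			/* kthread_stop() */
	w.stop = true;
	pthread_cond_broadcast(&w.cond);
	pthread_mutex_unlock(&w.lock);
	pthread_join(task, NULL);
	return 0;
}

The flush step is the guarantee destroy_pit_timer() and kvm_free_pit() rely on: it blocks until a queued pit_do_work() instance has finished running.
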
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 51a97426e79..fdf40425ea1 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -1,6 +1,8 @@
1#ifndef __I8254_H 1#ifndef __I8254_H
2#define __I8254_H 2#define __I8254_H
3 3
4#include <linux/kthread.h>
5
4#include "iodev.h" 6#include "iodev.h"
5 7
6struct kvm_kpit_channel_state { 8struct kvm_kpit_channel_state {
@@ -39,8 +41,9 @@ struct kvm_pit {
39 struct kvm_kpit_state pit_state; 41 struct kvm_kpit_state pit_state;
40 int irq_source_id; 42 int irq_source_id;
41 struct kvm_irq_mask_notifier mask_notifier; 43 struct kvm_irq_mask_notifier mask_notifier;
42 struct workqueue_struct *wq; 44 struct kthread_worker worker;
43 struct work_struct expired; 45 struct task_struct *worker_task;
46 struct kthread_work expired;
44}; 47};
45 48
46#define KVM_PIT_BASE_ADDRESS 0x40 49#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 81cf4fa4a2b..1df8fb9e1d5 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s)
188 pic_unlock(s); 188 pic_unlock(s);
189} 189}
190 190
191int kvm_pic_set_irq(void *opaque, int irq, int level) 191int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
192{ 192{
193 struct kvm_pic *s = opaque;
194 int ret = -1; 193 int ret = -1;
195 194
196 pic_lock(s); 195 pic_lock(s);
197 if (irq >= 0 && irq < PIC_NUM_PINS) { 196 if (irq >= 0 && irq < PIC_NUM_PINS) {
198 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 197 int irq_level = __kvm_irq_line_state(&s->irq_states[irq],
198 irq_source_id, level);
199 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
199 pic_update_irq(s); 200 pic_update_irq(s);
200 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 201 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
201 s->pics[irq >> 3].imr, ret == 0); 202 s->pics[irq >> 3].imr, ret == 0);
@@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
205 return ret; 206 return ret;
206} 207}
207 208
209void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
210{
211 int i;
212
213 pic_lock(s);
214 for (i = 0; i < PIC_NUM_PINS; i++)
215 __clear_bit(irq_source_id, &s->irq_states[i]);
216 pic_unlock(s);
217}
218
208/* 219/*
209 * acknowledge interrupt 'irq' 220 * acknowledge interrupt 'irq'
210 */ 221 */
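
kvm_pic_set_irq() now folds per-source state into the line level via __kvm_irq_line_state(): each pin keeps one bit per irq_source_id, and the pin stays asserted while any source drives it high, so two sources can no longer clobber each other's level. A standalone sketch of that bookkeeping (the helper itself lives in generic KVM code, not in this hunk):

#include <stdio.h>

static int irq_line_state(unsigned long *state, int source_id, int level)
{
	if (level)
		*state |= 1UL << source_id;
	else
		*state &= ~(1UL << source_id);
	return !!*state;	/* pin level = OR of all sources */
}

int main(void)
{
	unsigned long pin = 0;

	printf("%d\n", irq_line_state(&pin, 0, 1));	/* 1: source 0 asserts */
	printf("%d\n", irq_line_state(&pin, 1, 1));	/* 1: source 1 too */
	printf("%d\n", irq_line_state(&pin, 0, 0));	/* 1: source 1 still holds */
	printf("%d\n", irq_line_state(&pin, 1, 0));	/* 0: pin released */
	return 0;
}
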
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 858432287ab..ce878788a39 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -92,6 +92,11 @@ static inline int apic_test_and_clear_vector(int vec, void *bitmap)
92 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 92 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
93} 93}
94 94
95static inline int apic_test_vector(int vec, void *bitmap)
96{
97 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
98}
99
95static inline void apic_set_vector(int vec, void *bitmap) 100static inline void apic_set_vector(int vec, void *bitmap)
96{ 101{
97 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 102 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -102,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap)
102 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 107 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
103} 108}
104 109
110static inline int __apic_test_and_set_vector(int vec, void *bitmap)
111{
112 return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
113}
114
115static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
116{
117 return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
118}
119
105static inline int apic_hw_enabled(struct kvm_lapic *apic) 120static inline int apic_hw_enabled(struct kvm_lapic *apic)
106{ 121{
107 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; 122 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
@@ -205,6 +220,16 @@ static int find_highest_vector(void *bitmap)
205 return fls(word[word_offset << 2]) - 1 + (word_offset << 5); 220 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
206} 221}
207 222
223static u8 count_vectors(void *bitmap)
224{
225 u32 *word = bitmap;
226 int word_offset;
227 u8 count = 0;
228 for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset)
229 count += hweight32(word[word_offset << 2]);
230 return count;
231}
232
208static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 233static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
209{ 234{
210 apic->irr_pending = true; 235 apic->irr_pending = true;
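
count_vectors() walks the 256-bit ISR as eight 32-bit words, but the APIC register layout places those words 0x10 bytes apart, which is why the loop indexes word[word_offset << 2] on a u32 pointer. A standalone model with the same stride, assuming MAX_APIC_VECTOR == 256:

#include <stdint.h>
#include <stdio.h>

#define MAX_APIC_VECTOR 256

static unsigned hweight32(uint32_t w)	/* portable popcount */
{
	unsigned n = 0;
	for (; w; w &= w - 1)
		n++;
	return n;
}

static uint8_t count_vectors(void *bitmap)
{
	uint32_t *word = bitmap;
	int word_offset;
	uint8_t count = 0;

	for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset)
		count += hweight32(word[word_offset << 2]);
	return count;
}

int main(void)
{
	uint32_t isr[32] = { 0 };	/* eight words at 16-byte strides */

	isr[0 << 2] = 0x3;		/* vectors 0 and 1 */
	isr[7 << 2] = 1u << 31;		/* vector 255 */
	printf("%u vectors in service\n", (unsigned)count_vectors(isr));	/* 3 */
	return 0;
}
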
@@ -237,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
237 apic->irr_pending = true; 262 apic->irr_pending = true;
238} 263}
239 264
265static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
266{
267 if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
268 ++apic->isr_count;
269 BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
270 /*
 271 * An ISR (in-service register) bit is set when an interrupt is
 272 * injected. Since the highest pending vector is always the one
 273 * injected, the most recently set bit matches the highest bit in ISR.
274 */
275 apic->highest_isr_cache = vec;
276}
277
278static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
279{
280 if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
281 --apic->isr_count;
282 BUG_ON(apic->isr_count < 0);
283 apic->highest_isr_cache = -1;
284}
285
240int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 286int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
241{ 287{
242 struct kvm_lapic *apic = vcpu->arch.apic; 288 struct kvm_lapic *apic = vcpu->arch.apic;
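
apic_set_isr()/apic_clear_isr() keep two accelerators for apic_find_highest_isr(): isr_count short-circuits the common empty case, and highest_isr_cache stays valid because injection always picks the highest pending vector (see the comment above). A toy model of the lookup, assuming a 64-vector bitmap instead of the APIC's 256:

#include <stdint.h>
#include <stdio.h>

static uint64_t isr;		/* one bit per vector; 64 vectors in the toy */
static int isr_count;
static int highest_isr_cache = -1;

static void set_isr(int vec)	/* injection always picks the highest vector */
{
	if (!(isr & (1ULL << vec)))
		isr_count++;
	isr |= 1ULL << vec;
	highest_isr_cache = vec;
}

static void clear_isr(int vec)	/* EOI clears the in-service vector */
{
	if (isr & (1ULL << vec))
		isr_count--;
	isr &= ~(1ULL << vec);
	highest_isr_cache = -1;
}

static int find_highest_isr(void)
{
	int v;

	if (!isr_count)
		return -1;			/* fast path 1: nothing in service */
	if (highest_isr_cache != -1)
		return highest_isr_cache;	/* fast path 2: cached injection */
	for (v = 63; v >= 0; v--)		/* slow path: scan the bitmap */
		if (isr & (1ULL << v))
			return v;
	return -1;
}

int main(void)
{
	set_isr(32);
	set_isr(48);	/* nested, higher-priority interrupt */
	printf("highest=%d\n", find_highest_isr());	/* 48, from the cache */
	clear_isr(48);
	printf("highest=%d\n", find_highest_isr());	/* 32, via the scan */
	return 0;
}
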
@@ -265,9 +311,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
265 irq->level, irq->trig_mode); 311 irq->level, irq->trig_mode);
266} 312}
267 313
314static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
315{
316
317 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
318 sizeof(val));
319}
320
321static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
322{
323
324 return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
325 sizeof(*val));
326}
327
328static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
329{
330 return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
331}
332
333static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
334{
335 u8 val;
336 if (pv_eoi_get_user(vcpu, &val) < 0)
337 apic_debug("Can't read EOI MSR value: 0x%llx\n",
 338 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
339 return val & 0x1;
340}
341
342static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
343{
344 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
345 apic_debug("Can't set EOI MSR value: 0x%llx\n",
 346 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
347 return;
348 }
349 __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
350}
351
352static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
353{
354 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
355 apic_debug("Can't clear EOI MSR value: 0x%llx\n",
 356 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
357 return;
358 }
359 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
360}
361
268static inline int apic_find_highest_isr(struct kvm_lapic *apic) 362static inline int apic_find_highest_isr(struct kvm_lapic *apic)
269{ 363{
270 int result; 364 int result;
365 if (!apic->isr_count)
366 return -1;
367 if (likely(apic->highest_isr_cache != -1))
368 return apic->highest_isr_cache;
271 369
272 result = find_highest_vector(apic->regs + APIC_ISR); 370 result = find_highest_vector(apic->regs + APIC_ISR);
273 ASSERT(result == -1 || result >= 16); 371 ASSERT(result == -1 || result >= 16);
@@ -477,27 +575,33 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
477 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; 575 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
478} 576}
479 577
480static void apic_set_eoi(struct kvm_lapic *apic) 578static int apic_set_eoi(struct kvm_lapic *apic)
481{ 579{
482 int vector = apic_find_highest_isr(apic); 580 int vector = apic_find_highest_isr(apic);
483 int trigger_mode; 581
582 trace_kvm_eoi(apic, vector);
583
484 /* 584 /*
 485 * Not every EOI write will have a corresponding ISR, 585 * Not every EOI write will have a corresponding ISR,
 486 * one example is when the kernel checks the timer on setup_IO_APIC 586 * one example is when the kernel checks the timer on setup_IO_APIC
487 */ 587 */
488 if (vector == -1) 588 if (vector == -1)
489 return; 589 return vector;
490 590
491 apic_clear_vector(vector, apic->regs + APIC_ISR); 591 apic_clear_isr(vector, apic);
492 apic_update_ppr(apic); 592 apic_update_ppr(apic);
493 593
494 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) 594 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
495 trigger_mode = IOAPIC_LEVEL_TRIG; 595 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
496 else 596 int trigger_mode;
497 trigger_mode = IOAPIC_EDGE_TRIG; 597 if (apic_test_vector(vector, apic->regs + APIC_TMR))
498 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) 598 trigger_mode = IOAPIC_LEVEL_TRIG;
599 else
600 trigger_mode = IOAPIC_EDGE_TRIG;
499 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 601 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
602 }
500 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 603 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
604 return vector;
501} 605}
502 606
503static void apic_send_ipi(struct kvm_lapic *apic) 607static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1074,13 +1178,17 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1074 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 1178 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
1075 } 1179 }
1076 apic->irr_pending = false; 1180 apic->irr_pending = false;
1181 apic->isr_count = 0;
1182 apic->highest_isr_cache = -1;
1077 update_divide_count(apic); 1183 update_divide_count(apic);
1078 atomic_set(&apic->lapic_timer.pending, 0); 1184 atomic_set(&apic->lapic_timer.pending, 0);
1079 if (kvm_vcpu_is_bsp(vcpu)) 1185 if (kvm_vcpu_is_bsp(vcpu))
1080 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 1186 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
1187 vcpu->arch.pv_eoi.msr_val = 0;
1081 apic_update_ppr(apic); 1188 apic_update_ppr(apic);
1082 1189
1083 vcpu->arch.apic_arb_prio = 0; 1190 vcpu->arch.apic_arb_prio = 0;
1191 vcpu->arch.apic_attention = 0;
1084 1192
1085 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" 1193 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
1086 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, 1194 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
@@ -1240,7 +1348,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1240 if (vector == -1) 1348 if (vector == -1)
1241 return -1; 1349 return -1;
1242 1350
1243 apic_set_vector(vector, apic->regs + APIC_ISR); 1351 apic_set_isr(vector, apic);
1244 apic_update_ppr(apic); 1352 apic_update_ppr(apic);
1245 apic_clear_irr(vector, apic); 1353 apic_clear_irr(vector, apic);
1246 return vector; 1354 return vector;
@@ -1259,6 +1367,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1259 update_divide_count(apic); 1367 update_divide_count(apic);
1260 start_apic_timer(apic); 1368 start_apic_timer(apic);
1261 apic->irr_pending = true; 1369 apic->irr_pending = true;
1370 apic->isr_count = count_vectors(apic->regs + APIC_ISR);
1371 apic->highest_isr_cache = -1;
1262 kvm_make_request(KVM_REQ_EVENT, vcpu); 1372 kvm_make_request(KVM_REQ_EVENT, vcpu);
1263} 1373}
1264 1374
@@ -1275,12 +1385,52 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1275 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 1385 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1276} 1386}
1277 1387
1388/*
1389 * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
1390 *
1391 * Detect whether the guest triggered PV EOI since the
1392 * last entry. If yes, set EOI on the guest's behalf.
1393 * Clear PV EOI in guest memory in any case.
1394 */
1395static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
1396 struct kvm_lapic *apic)
1397{
1398 bool pending;
1399 int vector;
1400 /*
1401 * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
1402 * and KVM_PV_EOI_ENABLED in guest memory as follows:
1403 *
1404 * KVM_APIC_PV_EOI_PENDING is unset:
1405 * -> host disabled PV EOI.
1406 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
1407 * -> host enabled PV EOI, guest did not execute EOI yet.
1408 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
1409 * -> host enabled PV EOI, guest executed EOI.
1410 */
1411 BUG_ON(!pv_eoi_enabled(vcpu));
1412 pending = pv_eoi_get_pending(vcpu);
1413 /*
1414 * Clear pending bit in any case: it will be set again on vmentry.
1415 * While this might not be ideal from a performance point of view,
1416 * this makes sure pv eoi is only enabled when we know it's safe.
1417 */
1418 pv_eoi_clr_pending(vcpu);
1419 if (pending)
1420 return;
1421 vector = apic_set_eoi(apic);
1422 trace_kvm_pv_eoi(apic, vector);
1423}
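The three-way state table in the comment above can be condensed into a tiny decision function. This is an illustrative sketch only; the enum and helper below are hypothetical, while the two flag names come from the patch:

enum pv_eoi_state {
	PV_EOI_DISABLED,	/* KVM_APIC_PV_EOI_PENDING unset in host */
	PV_EOI_ARMED,		/* both flags set: guest has not EOI'd yet */
	PV_EOI_CONSUMED,	/* pending set, enabled clear: guest EOI'd */
};

static enum pv_eoi_state pv_eoi_state(bool host_pending, bool guest_enabled)
{
	if (!host_pending)
		return PV_EOI_DISABLED;
	return guest_enabled ? PV_EOI_ARMED : PV_EOI_CONSUMED;
}

apic_sync_pv_eoi_from_guest() acts on the PV_EOI_CONSUMED case by calling apic_set_eoi() on the guest's behalf.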
1424
1278void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) 1425void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1279{ 1426{
1280 u32 data; 1427 u32 data;
1281 void *vapic; 1428 void *vapic;
1282 1429
1283 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) 1430 if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
1431 apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
1432
1433 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1284 return; 1434 return;
1285 1435
1286 vapic = kmap_atomic(vcpu->arch.apic->vapic_page); 1436 vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
@@ -1290,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1290 apic_set_tpr(vcpu->arch.apic, data & 0xff); 1440 apic_set_tpr(vcpu->arch.apic, data & 0xff);
1291} 1441}
1292 1442
1443/*
1444 * apic_sync_pv_eoi_to_guest - called before vmentry
1445 *
1446 * Detect whether it's safe to enable PV EOI and
1447 * if yes do so.
1448 */
1449static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
1450 struct kvm_lapic *apic)
1451{
1452 if (!pv_eoi_enabled(vcpu) ||
1453 /* IRR set or many bits in ISR: could be nested. */
1454 apic->irr_pending ||
1455 /* Cache not set: could be safe but we don't bother. */
1456 apic->highest_isr_cache == -1 ||
1457 /* Need EOI to update ioapic. */
1458 kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
1459 /*
1460 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
1461 * so we need not do anything here.
1462 */
1463 return;
1464 }
1465
1466 pv_eoi_set_pending(apic->vcpu);
1467}
1468
1293void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) 1469void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1294{ 1470{
1295 u32 data, tpr; 1471 u32 data, tpr;
1296 int max_irr, max_isr; 1472 int max_irr, max_isr;
1297 struct kvm_lapic *apic; 1473 struct kvm_lapic *apic = vcpu->arch.apic;
1298 void *vapic; 1474 void *vapic;
1299 1475
1300 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) 1476 apic_sync_pv_eoi_to_guest(vcpu, apic);
1477
1478 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1301 return; 1479 return;
1302 1480
1303 apic = vcpu->arch.apic;
1304 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; 1481 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
1305 max_irr = apic_find_highest_irr(apic); 1482 max_irr = apic_find_highest_irr(apic);
1306 if (max_irr < 0) 1483 if (max_irr < 0)
@@ -1317,10 +1494,11 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1317 1494
1318void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) 1495void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
1319{ 1496{
1320 if (!irqchip_in_kernel(vcpu->kvm))
1321 return;
1322
1323 vcpu->arch.apic->vapic_addr = vapic_addr; 1497 vcpu->arch.apic->vapic_addr = vapic_addr;
1498 if (vapic_addr)
1499 __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
1500 else
1501 __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
1324} 1502}
1325 1503
1326int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1504int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
@@ -1385,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
1385 1563
1386 return 0; 1564 return 0;
1387} 1565}
1566
1567int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
1568{
1569 u64 addr = data & ~KVM_MSR_ENABLED;
1570 if (!IS_ALIGNED(addr, 4))
1571 return 1;
1572
1573 vcpu->arch.pv_eoi.msr_val = data;
1574 if (!pv_eoi_enabled(vcpu))
1575 return 0;
1576 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
1577 addr);
1578}
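For context, the guest side of this MSR (defined in the matching kvm_para.h change listed in the diffstat, not shown in this hunk) hands KVM the physical address of a flag word, with bit 0 doubling as the enable bit; that is why the host insists on 4-byte alignment above. A hedged sketch of the guest enable path, where kvm_apic_eoi stands in for the guest's per-cpu EOI flag:

	/* 'kvm_apic_eoi' is an illustrative name for a 4-byte-aligned
	 * guest variable; bit 0 of the MSR value is the enable flag. */
	wrmsrl(MSR_KVM_PV_EOI_EN,
	       __pa(&kvm_apic_eoi) | KVM_MSR_ENABLED);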
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 6f4ce2575d0..4af5405ae1e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,6 +13,15 @@ struct kvm_lapic {
13 u32 divide_count; 13 u32 divide_count;
14 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
15 bool irr_pending; 15 bool irr_pending;
16 /* Number of bits set in ISR. */
17 s16 isr_count;
18 /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
19 int highest_isr_cache;
20 /**
21 * APIC register page. The layout matches the register layout seen by
22 * the guest 1:1, because it is accessed by the vmx microcode.
23 * Note: Only one register, the TPR, is used by the microcode.
24 */
16 void *regs; 25 void *regs;
17 gpa_t vapic_addr; 26 gpa_t vapic_addr;
18 struct page *vapic_page; 27 struct page *vapic_page;
@@ -60,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
60{ 69{
61 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; 70 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
62} 71}
72
73int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
63#endif 74#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4cb16426884..01ca0042393 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -90,7 +90,7 @@ module_param(dbg, bool, 0644);
90 90
91#define PTE_PREFETCH_NUM 8 91#define PTE_PREFETCH_NUM 8
92 92
93#define PT_FIRST_AVAIL_BITS_SHIFT 9 93#define PT_FIRST_AVAIL_BITS_SHIFT 10
94#define PT64_SECOND_AVAIL_BITS_SHIFT 52 94#define PT64_SECOND_AVAIL_BITS_SHIFT 52
95 95
96#define PT64_LEVEL_BITS 9 96#define PT64_LEVEL_BITS 9
@@ -135,8 +135,6 @@ module_param(dbg, bool, 0644);
135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
136 | PT64_NX_MASK) 136 | PT64_NX_MASK)
137 137
138#define PTE_LIST_EXT 4
139
140#define ACC_EXEC_MASK 1 138#define ACC_EXEC_MASK 1
141#define ACC_WRITE_MASK PT_WRITABLE_MASK 139#define ACC_WRITE_MASK PT_WRITABLE_MASK
142#define ACC_USER_MASK PT_USER_MASK 140#define ACC_USER_MASK PT_USER_MASK
@@ -147,10 +145,14 @@ module_param(dbg, bool, 0644);
147#define CREATE_TRACE_POINTS 145#define CREATE_TRACE_POINTS
148#include "mmutrace.h" 146#include "mmutrace.h"
149 147
150#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) 148#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
149#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
151 150
152#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 151#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
153 152
153/* make pte_list_desc fit well in cache line */
154#define PTE_LIST_EXT 3
155
154struct pte_list_desc { 156struct pte_list_desc {
155 u64 *sptes[PTE_LIST_EXT]; 157 u64 *sptes[PTE_LIST_EXT];
156 struct pte_list_desc *more; 158 struct pte_list_desc *more;
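Why PTE_LIST_EXT shrinks from 4 to 3: on 64-bit, three spte pointers plus the more link make the descriptor exactly 32 bytes, so two descriptors pack into one 64-byte cache line. A standalone sanity check (a sketch; the model struct mirrors the one above):

#include <stdint.h>

struct pte_list_desc_model {
	uint64_t *sptes[3];
	struct pte_list_desc_model *more;
};

_Static_assert(sizeof(struct pte_list_desc_model) == 32,
	       "two descriptors per 64-byte cache line");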
@@ -187,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask;
187static u64 __read_mostly shadow_mmio_mask; 189static u64 __read_mostly shadow_mmio_mask;
188 190
189static void mmu_spte_set(u64 *sptep, u64 spte); 191static void mmu_spte_set(u64 *sptep, u64 spte);
192static void mmu_free_roots(struct kvm_vcpu *vcpu);
190 193
191void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) 194void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
192{ 195{
@@ -443,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
443} 446}
444#endif 447#endif
445 448
449static bool spte_is_locklessly_modifiable(u64 spte)
450{
451 return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
452}
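The test reads as "no bit of the mask is missing from the spte": ~spte has a 1 exactly where spte has a 0, so the AND is zero only when both SPTE_HOST_WRITEABLE and SPTE_MMU_WRITEABLE are set. An equivalent, more explicit spelling (a sketch, not part of the patch):

static bool spte_all_bits_set(u64 spte, u64 mask)
{
	return (spte & mask) == mask;	/* same as !(~spte & mask) */
}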
453
446static bool spte_has_volatile_bits(u64 spte) 454static bool spte_has_volatile_bits(u64 spte)
447{ 455{
456 /*
457 * Always atomically update the spte if it can be updated
458 * out of mmu-lock: this ensures the dirty bit is not lost
459 * and helps us get a stable is_writable_pte(), so that a
460 * tlb flush is not missed.
461 */
462 if (spte_is_locklessly_modifiable(spte))
463 return true;
464
448 if (!shadow_accessed_mask) 465 if (!shadow_accessed_mask)
449 return false; 466 return false;
450 467
@@ -477,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
477 494
478/* Rules for using mmu_spte_update: 495/* Rules for using mmu_spte_update:
479 * Update the state bits; the mapped pfn is not changed. 496 * Update the state bits; the mapped pfn is not changed.
497 *
498 * Whenever we overwrite a writable spte with a read-only one we
499 * should flush remote TLBs. Otherwise rmap_write_protect
500 * will find a read-only spte, even though the writable spte
501 * might be cached on a CPU's TLB; the return value indicates this
502 * case.
480 */ 503 */
481static void mmu_spte_update(u64 *sptep, u64 new_spte) 504static bool mmu_spte_update(u64 *sptep, u64 new_spte)
482{ 505{
483 u64 mask, old_spte = *sptep; 506 u64 old_spte = *sptep;
507 bool ret = false;
484 508
485 WARN_ON(!is_rmap_spte(new_spte)); 509 WARN_ON(!is_rmap_spte(new_spte));
486 510
487 if (!is_shadow_present_pte(old_spte)) 511 if (!is_shadow_present_pte(old_spte)) {
488 return mmu_spte_set(sptep, new_spte); 512 mmu_spte_set(sptep, new_spte);
489 513 return ret;
490 new_spte |= old_spte & shadow_dirty_mask; 514 }
491
492 mask = shadow_accessed_mask;
493 if (is_writable_pte(old_spte))
494 mask |= shadow_dirty_mask;
495 515
496 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) 516 if (!spte_has_volatile_bits(old_spte))
497 __update_clear_spte_fast(sptep, new_spte); 517 __update_clear_spte_fast(sptep, new_spte);
498 else 518 else
499 old_spte = __update_clear_spte_slow(sptep, new_spte); 519 old_spte = __update_clear_spte_slow(sptep, new_spte);
500 520
521 /*
522 * An spte updated out of mmu-lock is safe, since
523 * we always update it atomically; see the comments in
524 * spte_has_volatile_bits().
525 */
526 if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
527 ret = true;
528
501 if (!shadow_accessed_mask) 529 if (!shadow_accessed_mask)
502 return; 530 return ret;
503 531
504 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) 532 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
505 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 533 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
506 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) 534 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
507 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 535 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
536
537 return ret;
508} 538}
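Callers now act on the return value instead of comparing the old and new sptes themselves; set_spte() later in this patch reduces to:

	if (mmu_spte_update(sptep, spte))
		kvm_flush_remote_tlbs(vcpu->kvm);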
509 539
510/* 540/*
@@ -550,19 +580,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
550 580
551static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) 581static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
552{ 582{
553 rcu_read_lock(); 583 /*
554 atomic_inc(&vcpu->kvm->arch.reader_counter); 584 * Prevent page table teardown by making any free-er wait during
555 585 * kvm_flush_remote_tlbs() IPI to all active vcpus.
556 /* Increase the counter before walking shadow page table */ 586 */
557 smp_mb__after_atomic_inc(); 587 local_irq_disable();
588 vcpu->mode = READING_SHADOW_PAGE_TABLES;
589 /*
590 * Make sure a following spte read is not reordered ahead of the write
591 * to vcpu->mode.
592 */
593 smp_mb();
558} 594}
559 595
560static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) 596static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
561{ 597{
562 /* Decrease the counter after walking shadow page table finished */ 598 /*
563 smp_mb__before_atomic_dec(); 599 * Make sure the write to vcpu->mode is not reordered in front of
564 atomic_dec(&vcpu->kvm->arch.reader_counter); 600 * reads to sptes. If it is, kvm_commit_zap_page() can see us
565 rcu_read_unlock(); 601 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
602 */
603 smp_mb();
604 vcpu->mode = OUTSIDE_GUEST_MODE;
605 local_irq_enable();
566} 606}
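The two smp_mb() calls above pair with the one added to kvm_mmu_commit_zap_page() further down; the IPI behind kvm_flush_remote_tlbs() cannot be serviced while a walker sits inside the irq-off region, so the zapper waits for every lockless walk to finish. Schematically (an editorial summary of the comments in these hunks, not patch content):

/*
 *   walker (vcpu)                        zapper
 *   -------------                        ------
 *   local_irq_disable()
 *   mode = READING_SHADOW_PAGE_TABLES
 *   smp_mb()                             smp_mb()
 *   ... read sptes ...                   kvm_flush_remote_tlbs()
 *   smp_mb()                               -> IPI, waits for walkers
 *   mode = OUTSIDE_GUEST_MODE
 *   local_irq_enable()                   free shadow pages
 */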
567 607
568static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 608static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -641,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
641 mmu_page_header_cache); 681 mmu_page_header_cache);
642} 682}
643 683
644static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, 684static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
645 size_t size)
646{ 685{
647 void *p; 686 void *p;
648 687
@@ -653,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
653 692
654static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) 693static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
655{ 694{
656 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, 695 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
657 sizeof(struct pte_list_desc));
658} 696}
659 697
660static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) 698static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
@@ -841,32 +879,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
841 return count; 879 return count;
842} 880}
843 881
844static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
845{
846 struct pte_list_desc *desc;
847 u64 *prev_spte;
848 int i;
849
850 if (!*pte_list)
851 return NULL;
852 else if (!(*pte_list & 1)) {
853 if (!spte)
854 return (u64 *)*pte_list;
855 return NULL;
856 }
857 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
858 prev_spte = NULL;
859 while (desc) {
860 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
861 if (prev_spte == spte)
862 return desc->sptes[i];
863 prev_spte = desc->sptes[i];
864 }
865 desc = desc->more;
866 }
867 return NULL;
868}
869
870static void 882static void
871pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, 883pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
872 int i, struct pte_list_desc *prev_desc) 884 int i, struct pte_list_desc *prev_desc)
@@ -987,11 +999,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
987 return pte_list_add(vcpu, spte, rmapp); 999 return pte_list_add(vcpu, spte, rmapp);
988} 1000}
989 1001
990static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
991{
992 return pte_list_next(rmapp, spte);
993}
994
995static void rmap_remove(struct kvm *kvm, u64 *spte) 1002static void rmap_remove(struct kvm *kvm, u64 *spte)
996{ 1003{
997 struct kvm_mmu_page *sp; 1004 struct kvm_mmu_page *sp;
@@ -1004,106 +1011,248 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
1004 pte_list_remove(spte, rmapp); 1011 pte_list_remove(spte, rmapp);
1005} 1012}
1006 1013
1014/*
1015 * Used by the following functions to iterate through the sptes linked by a
1016 * rmap. All fields are private and not assumed to be used outside.
1017 */
1018struct rmap_iterator {
1019 /* private fields */
1020 struct pte_list_desc *desc; /* holds the sptep if not NULL */
1021 int pos; /* index of the sptep */
1022};
1023
1024/*
1025 * Iteration must be started by this function. This should also be used after
1026 * removing/dropping sptes from the rmap link because in such cases the
1027 * information in the iterator may not be valid.
1028 *
1029 * Returns sptep if found, NULL otherwise.
1030 */
1031static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
1032{
1033 if (!rmap)
1034 return NULL;
1035
1036 if (!(rmap & 1)) {
1037 iter->desc = NULL;
1038 return (u64 *)rmap;
1039 }
1040
1041 iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
1042 iter->pos = 0;
1043 return iter->desc->sptes[iter->pos];
1044}
1045
1046/*
1047 * Must be used with a valid iterator: e.g. after rmap_get_first().
1048 *
1049 * Returns sptep if found, NULL otherwise.
1050 */
1051static u64 *rmap_get_next(struct rmap_iterator *iter)
1052{
1053 if (iter->desc) {
1054 if (iter->pos < PTE_LIST_EXT - 1) {
1055 u64 *sptep;
1056
1057 ++iter->pos;
1058 sptep = iter->desc->sptes[iter->pos];
1059 if (sptep)
1060 return sptep;
1061 }
1062
1063 iter->desc = iter->desc->more;
1064
1065 if (iter->desc) {
1066 iter->pos = 0;
1067 /* desc->sptes[0] cannot be NULL */
1068 return iter->desc->sptes[iter->pos];
1069 }
1070 }
1071
1072 return NULL;
1073}
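A typical traversal with the new iterator, mirroring its uses later in this patch: restart with rmap_get_first() whenever the current spte is dropped, since dropping invalidates the iterator. A sketch; should_drop() is a hypothetical predicate:

	u64 *sptep;
	struct rmap_iterator iter;

	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
		if (should_drop(*sptep)) {
			drop_spte(kvm, sptep);
			/* iterator is now stale: restart from the head */
			sptep = rmap_get_first(*rmapp, &iter);
			continue;
		}
		sptep = rmap_get_next(&iter);
	}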
1074
1007static void drop_spte(struct kvm *kvm, u64 *sptep) 1075static void drop_spte(struct kvm *kvm, u64 *sptep)
1008{ 1076{
1009 if (mmu_spte_clear_track_bits(sptep)) 1077 if (mmu_spte_clear_track_bits(sptep))
1010 rmap_remove(kvm, sptep); 1078 rmap_remove(kvm, sptep);
1011} 1079}
1012 1080
1013int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, 1081
1014 struct kvm_memory_slot *slot) 1082static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1015{ 1083{
1016 unsigned long *rmapp; 1084 if (is_large_pte(*sptep)) {
1017 u64 *spte; 1085 WARN_ON(page_header(__pa(sptep))->role.level ==
1018 int i, write_protected = 0; 1086 PT_PAGE_TABLE_LEVEL);
1019 1087 drop_spte(kvm, sptep);
1020 rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); 1088 --kvm->stat.lpages;
1021 spte = rmap_next(rmapp, NULL); 1089 return true;
1022 while (spte) {
1023 BUG_ON(!(*spte & PT_PRESENT_MASK));
1024 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
1025 if (is_writable_pte(*spte)) {
1026 mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
1027 write_protected = 1;
1028 }
1029 spte = rmap_next(rmapp, spte);
1030 } 1090 }
1031 1091
1032 /* check for huge page mappings */ 1092 return false;
1033 for (i = PT_DIRECTORY_LEVEL; 1093}
1034 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 1094
1035 rmapp = __gfn_to_rmap(gfn, i, slot); 1095static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1036 spte = rmap_next(rmapp, NULL); 1096{
1037 while (spte) { 1097 if (__drop_large_spte(vcpu->kvm, sptep))
1038 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1098 kvm_flush_remote_tlbs(vcpu->kvm);
1039 BUG_ON(!is_large_pte(*spte)); 1099}
1040 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 1100
1041 if (is_writable_pte(*spte)) { 1101/*
1042 drop_spte(kvm, spte); 1102 * Write-protect the specified @sptep; @pt_protect indicates whether
1043 --kvm->stat.lpages; 1103 * spte write-protection is caused by protecting the shadow page table.
1044 spte = NULL; 1104 * @flush indicates whether the TLB needs to be flushed.
1045 write_protected = 1; 1105 *
1046 } 1106 * Note: write protection differs between dirty logging and spte
1047 spte = rmap_next(rmapp, spte); 1107 * protection:
1108 * - for dirty logging, the spte can be set to writable at anytime if
1109 * its dirty bitmap is properly set.
1110 * - for spte protection, the spte can be writable only after unsync-ing
1111 * shadow page.
1112 *
1113 * Return true if the spte is dropped.
1114 */
1115static bool
1116spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
1117{
1118 u64 spte = *sptep;
1119
1120 if (!is_writable_pte(spte) &&
1121 !(pt_protect && spte_is_locklessly_modifiable(spte)))
1122 return false;
1123
1124 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1125
1126 if (__drop_large_spte(kvm, sptep)) {
1127 *flush |= true;
1128 return true;
1129 }
1130
1131 if (pt_protect)
1132 spte &= ~SPTE_MMU_WRITEABLE;
1133 spte = spte & ~PT_WRITABLE_MASK;
1134
1135 *flush |= mmu_spte_update(sptep, spte);
1136 return false;
1137}
1138
1139static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1140 int level, bool pt_protect)
1141{
1142 u64 *sptep;
1143 struct rmap_iterator iter;
1144 bool flush = false;
1145
1146 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1147 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1148 if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
1149 sptep = rmap_get_first(*rmapp, &iter);
1150 continue;
1048 } 1151 }
1152
1153 sptep = rmap_get_next(&iter);
1049 } 1154 }
1050 1155
1051 return write_protected; 1156 return flush;
1052} 1157}
1053 1158
1054static int rmap_write_protect(struct kvm *kvm, u64 gfn) 1159/**
1160 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1161 * @kvm: kvm instance
1162 * @slot: slot to protect
1163 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1164 * @mask: indicates which pages we should protect
1165 *
1166 * Used when we do not need to care about huge page mappings: e.g. during dirty
1167 * logging we do not have any such mappings.
1168 */
1169void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1170 struct kvm_memory_slot *slot,
1171 gfn_t gfn_offset, unsigned long mask)
1172{
1173 unsigned long *rmapp;
1174
1175 while (mask) {
1176 rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
1177 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
1178
1179 /* clear the first set bit */
1180 mask &= mask - 1;
1181 }
1182}
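The loop above visits each set bit of mask exactly once: __ffs() yields the index of the lowest set bit, and mask &= mask - 1 clears that bit. A standalone illustration with made-up values (userspace C, with __builtin_ctzl standing in for __ffs):

#include <stdio.h>

int main(void)
{
	unsigned long mask = 0x29;	/* pages 0, 3 and 5 were dirtied */

	while (mask) {
		printf("write-protect page %d\n", (int)__builtin_ctzl(mask));
		mask &= mask - 1;	/* clear the lowest set bit */
	}
	return 0;
}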
1183
1184static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1055{ 1185{
1056 struct kvm_memory_slot *slot; 1186 struct kvm_memory_slot *slot;
1187 unsigned long *rmapp;
1188 int i;
1189 bool write_protected = false;
1057 1190
1058 slot = gfn_to_memslot(kvm, gfn); 1191 slot = gfn_to_memslot(kvm, gfn);
1059 return kvm_mmu_rmap_write_protect(kvm, gfn, slot); 1192
1193 for (i = PT_PAGE_TABLE_LEVEL;
1194 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
1195 rmapp = __gfn_to_rmap(gfn, i, slot);
1196 write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
1197 }
1198
1199 return write_protected;
1060} 1200}
1061 1201
1062static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, 1202static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1063 unsigned long data) 1203 unsigned long data)
1064{ 1204{
1065 u64 *spte; 1205 u64 *sptep;
1206 struct rmap_iterator iter;
1066 int need_tlb_flush = 0; 1207 int need_tlb_flush = 0;
1067 1208
1068 while ((spte = rmap_next(rmapp, NULL))) { 1209 while ((sptep = rmap_get_first(*rmapp, &iter))) {
1069 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1210 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1070 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 1211 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
1071 drop_spte(kvm, spte); 1212
1213 drop_spte(kvm, sptep);
1072 need_tlb_flush = 1; 1214 need_tlb_flush = 1;
1073 } 1215 }
1216
1074 return need_tlb_flush; 1217 return need_tlb_flush;
1075} 1218}
1076 1219
1077static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, 1220static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1078 unsigned long data) 1221 unsigned long data)
1079{ 1222{
1223 u64 *sptep;
1224 struct rmap_iterator iter;
1080 int need_flush = 0; 1225 int need_flush = 0;
1081 u64 *spte, new_spte; 1226 u64 new_spte;
1082 pte_t *ptep = (pte_t *)data; 1227 pte_t *ptep = (pte_t *)data;
1083 pfn_t new_pfn; 1228 pfn_t new_pfn;
1084 1229
1085 WARN_ON(pte_huge(*ptep)); 1230 WARN_ON(pte_huge(*ptep));
1086 new_pfn = pte_pfn(*ptep); 1231 new_pfn = pte_pfn(*ptep);
1087 spte = rmap_next(rmapp, NULL); 1232
1088 while (spte) { 1233 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1089 BUG_ON(!is_shadow_present_pte(*spte)); 1234 BUG_ON(!is_shadow_present_pte(*sptep));
1090 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 1235 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
1236
1091 need_flush = 1; 1237 need_flush = 1;
1238
1092 if (pte_write(*ptep)) { 1239 if (pte_write(*ptep)) {
1093 drop_spte(kvm, spte); 1240 drop_spte(kvm, sptep);
1094 spte = rmap_next(rmapp, NULL); 1241 sptep = rmap_get_first(*rmapp, &iter);
1095 } else { 1242 } else {
1096 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 1243 new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1097 new_spte |= (u64)new_pfn << PAGE_SHIFT; 1244 new_spte |= (u64)new_pfn << PAGE_SHIFT;
1098 1245
1099 new_spte &= ~PT_WRITABLE_MASK; 1246 new_spte &= ~PT_WRITABLE_MASK;
1100 new_spte &= ~SPTE_HOST_WRITEABLE; 1247 new_spte &= ~SPTE_HOST_WRITEABLE;
1101 new_spte &= ~shadow_accessed_mask; 1248 new_spte &= ~shadow_accessed_mask;
1102 mmu_spte_clear_track_bits(spte); 1249
1103 mmu_spte_set(spte, new_spte); 1250 mmu_spte_clear_track_bits(sptep);
1104 spte = rmap_next(rmapp, spte); 1251 mmu_spte_set(sptep, new_spte);
1252 sptep = rmap_get_next(&iter);
1105 } 1253 }
1106 } 1254 }
1255
1107 if (need_flush) 1256 if (need_flush)
1108 kvm_flush_remote_tlbs(kvm); 1257 kvm_flush_remote_tlbs(kvm);
1109 1258
@@ -1162,11 +1311,13 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1162static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1311static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1163 unsigned long data) 1312 unsigned long data)
1164{ 1313{
1165 u64 *spte; 1314 u64 *sptep;
1315 struct rmap_iterator uninitialized_var(iter);
1166 int young = 0; 1316 int young = 0;
1167 1317
1168 /* 1318 /*
1169 * Emulate the accessed bit for EPT, by checking if this page has 1319 * In the absence of EPT Access and Dirty bit support,
1320 * emulate the accessed bit for EPT, by checking if this page has
1170 * an EPT mapping, and clearing it if it does. On the next access, 1321 * an EPT mapping, and clearing it if it does. On the next access,
1171 * a new EPT mapping will be established. 1322 * a new EPT mapping will be established.
1172 * This has some overhead, but not as much as the cost of swapping 1323 * This has some overhead, but not as much as the cost of swapping
@@ -1175,25 +1326,25 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1175 if (!shadow_accessed_mask) 1326 if (!shadow_accessed_mask)
1176 return kvm_unmap_rmapp(kvm, rmapp, data); 1327 return kvm_unmap_rmapp(kvm, rmapp, data);
1177 1328
1178 spte = rmap_next(rmapp, NULL); 1329 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1179 while (spte) { 1330 sptep = rmap_get_next(&iter)) {
1180 int _young; 1331 BUG_ON(!is_shadow_present_pte(*sptep));
1181 u64 _spte = *spte; 1332
1182 BUG_ON(!(_spte & PT_PRESENT_MASK)); 1333 if (*sptep & shadow_accessed_mask) {
1183 _young = _spte & PT_ACCESSED_MASK;
1184 if (_young) {
1185 young = 1; 1334 young = 1;
1186 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); 1335 clear_bit((ffs(shadow_accessed_mask) - 1),
1336 (unsigned long *)sptep);
1187 } 1337 }
1188 spte = rmap_next(rmapp, spte);
1189 } 1338 }
1339
1190 return young; 1340 return young;
1191} 1341}
1192 1342
1193static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1343static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1194 unsigned long data) 1344 unsigned long data)
1195{ 1345{
1196 u64 *spte; 1346 u64 *sptep;
1347 struct rmap_iterator iter;
1197 int young = 0; 1348 int young = 0;
1198 1349
1199 /* 1350 /*
@@ -1204,16 +1355,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1204 if (!shadow_accessed_mask) 1355 if (!shadow_accessed_mask)
1205 goto out; 1356 goto out;
1206 1357
1207 spte = rmap_next(rmapp, NULL); 1358 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1208 while (spte) { 1359 sptep = rmap_get_next(&iter)) {
1209 u64 _spte = *spte; 1360 BUG_ON(!is_shadow_present_pte(*sptep));
1210 BUG_ON(!(_spte & PT_PRESENT_MASK)); 1361
1211 young = _spte & PT_ACCESSED_MASK; 1362 if (*sptep & shadow_accessed_mask) {
1212 if (young) {
1213 young = 1; 1363 young = 1;
1214 break; 1364 break;
1215 } 1365 }
1216 spte = rmap_next(rmapp, spte);
1217 } 1366 }
1218out: 1367out:
1219 return young; 1368 return young;
@@ -1328,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1328 u64 *parent_pte, int direct) 1477 u64 *parent_pte, int direct)
1329{ 1478{
1330 struct kvm_mmu_page *sp; 1479 struct kvm_mmu_page *sp;
1331 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, 1480 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
1332 sizeof *sp); 1481 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1333 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1334 if (!direct) 1482 if (!direct)
1335 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, 1483 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1336 PAGE_SIZE);
1337 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 1484 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1338 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 1485 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1339 bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); 1486 bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
@@ -1628,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1628 1775
1629 kvm_mmu_pages_init(parent, &parents, &pages); 1776 kvm_mmu_pages_init(parent, &parents, &pages);
1630 while (mmu_unsync_walk(parent, &pages)) { 1777 while (mmu_unsync_walk(parent, &pages)) {
1631 int protected = 0; 1778 bool protected = false;
1632 1779
1633 for_each_sp(pages, sp, parents, i) 1780 for_each_sp(pages, sp, parents, i)
1634 protected |= rmap_write_protect(vcpu->kvm, sp->gfn); 1781 protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
@@ -1793,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1793 mmu_spte_set(sptep, spte); 1940 mmu_spte_set(sptep, spte);
1794} 1941}
1795 1942
1796static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1797{
1798 if (is_large_pte(*sptep)) {
1799 drop_spte(vcpu->kvm, sptep);
1800 --vcpu->kvm->stat.lpages;
1801 kvm_flush_remote_tlbs(vcpu->kvm);
1802 }
1803}
1804
1805static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, 1943static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1806 unsigned direct_access) 1944 unsigned direct_access)
1807{ 1945{
@@ -1865,10 +2003,11 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1865 2003
1866static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 2004static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1867{ 2005{
1868 u64 *parent_pte; 2006 u64 *sptep;
2007 struct rmap_iterator iter;
1869 2008
1870 while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) 2009 while ((sptep = rmap_get_first(sp->parent_ptes, &iter)))
1871 drop_parent_pte(sp, parent_pte); 2010 drop_parent_pte(sp, sptep);
1872} 2011}
1873 2012
1874static int mmu_zap_unsync_children(struct kvm *kvm, 2013static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -1925,30 +2064,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1925 return ret; 2064 return ret;
1926} 2065}
1927 2066
1928static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
1929{
1930 struct kvm_mmu_page *sp;
1931
1932 list_for_each_entry(sp, invalid_list, link)
1933 kvm_mmu_isolate_page(sp);
1934}
1935
1936static void free_pages_rcu(struct rcu_head *head)
1937{
1938 struct kvm_mmu_page *next, *sp;
1939
1940 sp = container_of(head, struct kvm_mmu_page, rcu);
1941 while (sp) {
1942 if (!list_empty(&sp->link))
1943 next = list_first_entry(&sp->link,
1944 struct kvm_mmu_page, link);
1945 else
1946 next = NULL;
1947 kvm_mmu_free_page(sp);
1948 sp = next;
1949 }
1950}
1951
1952static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2067static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1953 struct list_head *invalid_list) 2068 struct list_head *invalid_list)
1954{ 2069{
@@ -1957,17 +2072,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1957 if (list_empty(invalid_list)) 2072 if (list_empty(invalid_list))
1958 return; 2073 return;
1959 2074
1960 kvm_flush_remote_tlbs(kvm); 2075 /*
1961 2076 * wmb: make sure everyone sees our modifications to the page tables
1962 if (atomic_read(&kvm->arch.reader_counter)) { 2077 * rmb: make sure we see changes to vcpu->mode
1963 kvm_mmu_isolate_pages(invalid_list); 2078 */
1964 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 2079 smp_mb();
1965 list_del_init(invalid_list);
1966 2080
1967 trace_kvm_mmu_delay_free_pages(sp); 2081 /*
1968 call_rcu(&sp->rcu, free_pages_rcu); 2082 * Wait for all vcpus to exit guest mode and/or lockless shadow
1969 return; 2083 * page table walks.
1970 } 2084 */
2085 kvm_flush_remote_tlbs(kvm);
1971 2086
1972 do { 2087 do {
1973 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 2088 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
@@ -1975,7 +2090,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1975 kvm_mmu_isolate_page(sp); 2090 kvm_mmu_isolate_page(sp);
1976 kvm_mmu_free_page(sp); 2091 kvm_mmu_free_page(sp);
1977 } while (!list_empty(invalid_list)); 2092 } while (!list_empty(invalid_list));
1978
1979} 2093}
1980 2094
1981/* 2095/*
@@ -2194,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2194 gfn_t gfn, pfn_t pfn, bool speculative, 2308 gfn_t gfn, pfn_t pfn, bool speculative,
2195 bool can_unsync, bool host_writable) 2309 bool can_unsync, bool host_writable)
2196{ 2310{
2197 u64 spte, entry = *sptep; 2311 u64 spte;
2198 int ret = 0; 2312 int ret = 0;
2199 2313
2200 if (set_mmio_spte(sptep, gfn, pfn, pte_access)) 2314 if (set_mmio_spte(sptep, gfn, pfn, pte_access))
@@ -2208,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2208 spte |= shadow_x_mask; 2322 spte |= shadow_x_mask;
2209 else 2323 else
2210 spte |= shadow_nx_mask; 2324 spte |= shadow_nx_mask;
2325
2211 if (pte_access & ACC_USER_MASK) 2326 if (pte_access & ACC_USER_MASK)
2212 spte |= shadow_user_mask; 2327 spte |= shadow_user_mask;
2328
2213 if (level > PT_PAGE_TABLE_LEVEL) 2329 if (level > PT_PAGE_TABLE_LEVEL)
2214 spte |= PT_PAGE_SIZE_MASK; 2330 spte |= PT_PAGE_SIZE_MASK;
2215 if (tdp_enabled) 2331 if (tdp_enabled)
@@ -2234,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2234 goto done; 2350 goto done;
2235 } 2351 }
2236 2352
2237 spte |= PT_WRITABLE_MASK; 2353 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2238 2354
2239 if (!vcpu->arch.mmu.direct_map 2355 if (!vcpu->arch.mmu.direct_map
2240 && !(pte_access & ACC_WRITE_MASK)) { 2356 && !(pte_access & ACC_WRITE_MASK)) {
@@ -2263,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2263 __func__, gfn); 2379 __func__, gfn);
2264 ret = 1; 2380 ret = 1;
2265 pte_access &= ~ACC_WRITE_MASK; 2381 pte_access &= ~ACC_WRITE_MASK;
2266 if (is_writable_pte(spte)) 2382 spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
2267 spte &= ~PT_WRITABLE_MASK;
2268 } 2383 }
2269 } 2384 }
2270 2385
@@ -2272,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2272 mark_page_dirty(vcpu->kvm, gfn); 2387 mark_page_dirty(vcpu->kvm, gfn);
2273 2388
2274set_pte: 2389set_pte:
2275 mmu_spte_update(sptep, spte); 2390 if (mmu_spte_update(sptep, spte))
2276 /*
2277 * If we overwrite a writable spte with a read-only one we
2278 * should flush remote TLBs. Otherwise rmap_write_protect
2279 * will find a read-only spte, even though the writable spte
2280 * might be cached on a CPU's TLB.
2281 */
2282 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2283 kvm_flush_remote_tlbs(vcpu->kvm); 2391 kvm_flush_remote_tlbs(vcpu->kvm);
2284done: 2392done:
2285 return ret; 2393 return ret;
@@ -2354,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2354 2462
2355static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 2463static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2356{ 2464{
2465 mmu_free_roots(vcpu);
2357} 2466}
2358 2467
2359static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2468static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2546,8 +2655,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2546 *gfnp = gfn; 2655 *gfnp = gfn;
2547 kvm_release_pfn_clean(pfn); 2656 kvm_release_pfn_clean(pfn);
2548 pfn &= ~mask; 2657 pfn &= ~mask;
2549 if (!get_page_unless_zero(pfn_to_page(pfn))) 2658 kvm_get_pfn(pfn);
2550 BUG();
2551 *pfnp = pfn; 2659 *pfnp = pfn;
2552 } 2660 }
2553 } 2661 }
@@ -2577,18 +2685,116 @@ exit:
2577 return ret; 2685 return ret;
2578} 2686}
2579 2687
2688static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
2689{
2690 /*
2691 * #PF can be fast only if the shadow page table is present and the
2692 * fault is caused by write-protection; that means we just need to
2693 * change the W bit of the spte, which can be done out of mmu-lock.
2694 */
2695 if (!(error_code & PFERR_PRESENT_MASK) ||
2696 !(error_code & PFERR_WRITE_MASK))
2697 return false;
2698
2699 return true;
2700}
2701
2702static bool
2703fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
2704{
2705 struct kvm_mmu_page *sp = page_header(__pa(sptep));
2706 gfn_t gfn;
2707
2708 WARN_ON(!sp->role.direct);
2709
2710 /*
2711 * The gfn of a direct spte is stable since it is calculated
2712 * from sp->gfn.
2713 */
2714 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
2715
2716 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
2717 mark_page_dirty(vcpu->kvm, gfn);
2718
2719 return true;
2720}
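The cmpxchg64() is what makes the fix safe without mmu-lock: the W bit is set only if the spte still holds the value observed during the lockless walk; if it raced with a concurrent update, the vcpu simply faults again. The same pattern in miniature (a userspace sketch using a GCC builtin; PT_WRITABLE_MASK is bit 1 of an x86 PTE):

#include <stdbool.h>
#include <stdint.h>

#define PT_WRITABLE_MASK (1ULL << 1)

static bool fix_spte(uint64_t *sptep, uint64_t observed)
{
	/* succeeds only if *sptep is unchanged since 'observed' was read */
	return __sync_bool_compare_and_swap(sptep, observed,
					    observed | PT_WRITABLE_MASK);
}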
2721
2722/*
2723 * Return value:
2724 * - true: let the vcpu access the same address again.
2725 * - false: let the real page fault path fix it.
2726 */
2727static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2728 u32 error_code)
2729{
2730 struct kvm_shadow_walk_iterator iterator;
2731 bool ret = false;
2732 u64 spte = 0ull;
2733
2734 if (!page_fault_can_be_fast(vcpu, error_code))
2735 return false;
2736
2737 walk_shadow_page_lockless_begin(vcpu);
2738 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
2739 if (!is_shadow_present_pte(spte) || iterator.level < level)
2740 break;
2741
2742 /*
2743 * If the mapping has been changed, let the vcpu fault on the
2744 * same address again.
2745 */
2746 if (!is_rmap_spte(spte)) {
2747 ret = true;
2748 goto exit;
2749 }
2750
2751 if (!is_last_spte(spte, level))
2752 goto exit;
2753
2754 /*
2755 * Check if this is a spurious fault caused by a lazily flushed TLB.
2756 *
2757 * Need not check the access of upper level table entries since
2758 * they are always ACC_ALL.
2759 */
2760 if (is_writable_pte(spte)) {
2761 ret = true;
2762 goto exit;
2763 }
2764
2765 /*
2766 * Currently, to simplify the code, only an spte write-protected
2767 * by dirty logging can be fixed on the fast path.
2768 */
2769 if (!spte_is_locklessly_modifiable(spte))
2770 goto exit;
2771
2772 /*
2773 * Currently, fast page fault only works for direct mappings, since
2774 * the gfn is not stable for an indirect shadow page.
2775 * See Documentation/virtual/kvm/locking.txt for more detail.
2776 */
2777 ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
2778exit:
2779 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
2780 spte, ret);
2781 walk_shadow_page_lockless_end(vcpu);
2782
2783 return ret;
2784}
2785
2580static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2786static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2581 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2787 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2582 2788
2583static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, 2789static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2584 bool prefault) 2790 gfn_t gfn, bool prefault)
2585{ 2791{
2586 int r; 2792 int r;
2587 int level; 2793 int level;
2588 int force_pt_level; 2794 int force_pt_level;
2589 pfn_t pfn; 2795 pfn_t pfn;
2590 unsigned long mmu_seq; 2796 unsigned long mmu_seq;
2591 bool map_writable; 2797 bool map_writable, write = error_code & PFERR_WRITE_MASK;
2592 2798
2593 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); 2799 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2594 if (likely(!force_pt_level)) { 2800 if (likely(!force_pt_level)) {
@@ -2605,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2605 } else 2811 } else
2606 level = PT_PAGE_TABLE_LEVEL; 2812 level = PT_PAGE_TABLE_LEVEL;
2607 2813
2814 if (fast_page_fault(vcpu, v, level, error_code))
2815 return 0;
2816
2608 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2817 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2609 smp_rmb(); 2818 smp_rmb();
2610 2819
@@ -2993,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2993 gfn = gva >> PAGE_SHIFT; 3202 gfn = gva >> PAGE_SHIFT;
2994 3203
2995 return nonpaging_map(vcpu, gva & PAGE_MASK, 3204 return nonpaging_map(vcpu, gva & PAGE_MASK,
2996 error_code & PFERR_WRITE_MASK, gfn, prefault); 3205 error_code, gfn, prefault);
2997} 3206}
2998 3207
2999static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) 3208static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3073,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3073 } else 3282 } else
3074 level = PT_PAGE_TABLE_LEVEL; 3283 level = PT_PAGE_TABLE_LEVEL;
3075 3284
3285 if (fast_page_fault(vcpu, gpa, level, error_code))
3286 return 0;
3287
3076 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3288 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3077 smp_rmb(); 3289 smp_rmb();
3078 3290
@@ -3554,7 +3766,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
3554 * Skip write-flooding detected for the sp whose level is 1, because 3766 * Skip write-flooding detected for the sp whose level is 1, because
3555 * it can become unsync, then the guest page is not write-protected. 3767 * it can become unsync, then the guest page is not write-protected.
3556 */ 3768 */
3557 if (sp->role.level == 1) 3769 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
3558 return false; 3770 return false;
3559 3771
3560 return ++sp->write_flooding_count >= 3; 3772 return ++sp->write_flooding_count >= 3;
@@ -3837,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3837void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 4049void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3838{ 4050{
3839 struct kvm_mmu_page *sp; 4051 struct kvm_mmu_page *sp;
4052 bool flush = false;
3840 4053
3841 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 4054 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
3842 int i; 4055 int i;
@@ -3851,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3851 !is_last_spte(pt[i], sp->role.level)) 4064 !is_last_spte(pt[i], sp->role.level))
3852 continue; 4065 continue;
3853 4066
3854 if (is_large_pte(pt[i])) { 4067 spte_write_protect(kvm, &pt[i], &flush, false);
3855 drop_spte(kvm, &pt[i]);
3856 --kvm->stat.lpages;
3857 continue;
3858 }
3859
3860 /* avoid RMW */
3861 if (is_writable_pte(pt[i]))
3862 mmu_spte_update(&pt[i],
3863 pt[i] & ~PT_WRITABLE_MASK);
3864 } 4068 }
3865 } 4069 }
3866 kvm_flush_remote_tlbs(kvm); 4070 kvm_flush_remote_tlbs(kvm);
@@ -3886,6 +4090,9 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3886{ 4090{
3887 struct kvm_mmu_page *page; 4091 struct kvm_mmu_page *page;
3888 4092
4093 if (list_empty(&kvm->arch.active_mmu_pages))
4094 return;
4095
3889 page = container_of(kvm->arch.active_mmu_pages.prev, 4096 page = container_of(kvm->arch.active_mmu_pages.prev,
3890 struct kvm_mmu_page, link); 4097 struct kvm_mmu_page, link);
3891 kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 4098 kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
@@ -3894,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3894static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) 4101static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3895{ 4102{
3896 struct kvm *kvm; 4103 struct kvm *kvm;
3897 struct kvm *kvm_freed = NULL;
3898 int nr_to_scan = sc->nr_to_scan; 4104 int nr_to_scan = sc->nr_to_scan;
3899 4105
3900 if (nr_to_scan == 0) 4106 if (nr_to_scan == 0)
@@ -3906,22 +4112,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3906 int idx; 4112 int idx;
3907 LIST_HEAD(invalid_list); 4113 LIST_HEAD(invalid_list);
3908 4114
4115 /*
4116 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
4117 * here. We may skip a VM instance erroneously, but we do not
4118 * want to shrink a VM that only started to populate its MMU
4119 * anyway.
4120 */
4121 if (!nr_to_scan--)
4122 break;
4123
4124 if (!kvm->arch.n_used_mmu_pages)
4125 continue;
4126
3909 idx = srcu_read_lock(&kvm->srcu); 4127 idx = srcu_read_lock(&kvm->srcu);
3910 spin_lock(&kvm->mmu_lock); 4128 spin_lock(&kvm->mmu_lock);
3911 if (!kvm_freed && nr_to_scan > 0 &&
3912 kvm->arch.n_used_mmu_pages > 0) {
3913 kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3914 &invalid_list);
3915 kvm_freed = kvm;
3916 }
3917 nr_to_scan--;
3918 4129
4130 kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
3919 kvm_mmu_commit_zap_page(kvm, &invalid_list); 4131 kvm_mmu_commit_zap_page(kvm, &invalid_list);
4132
3920 spin_unlock(&kvm->mmu_lock); 4133 spin_unlock(&kvm->mmu_lock);
3921 srcu_read_unlock(&kvm->srcu, idx); 4134 srcu_read_unlock(&kvm->srcu, idx);
4135
4136 list_move_tail(&kvm->vm_list, &vm_list);
4137 break;
3922 } 4138 }
3923 if (kvm_freed)
3924 list_move_tail(&kvm_freed->vm_list, &vm_list);
3925 4139
3926 raw_spin_unlock(&kvm_lock); 4140 raw_spin_unlock(&kvm_lock);
3927 4141
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 715da5a19a5..7d7d0b9e23e 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -192,7 +192,8 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
192{ 192{
193 struct kvm_memory_slot *slot; 193 struct kvm_memory_slot *slot;
194 unsigned long *rmapp; 194 unsigned long *rmapp;
195 u64 *spte; 195 u64 *sptep;
196 struct rmap_iterator iter;
196 197
197 if (sp->role.direct || sp->unsync || sp->role.invalid) 198 if (sp->role.direct || sp->unsync || sp->role.invalid)
198 return; 199 return;
@@ -200,13 +201,12 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
200 slot = gfn_to_memslot(kvm, sp->gfn); 201 slot = gfn_to_memslot(kvm, sp->gfn);
201 rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; 202 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
202 203
203 spte = rmap_next(rmapp, NULL); 204 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
204 while (spte) { 205 sptep = rmap_get_next(&iter)) {
205 if (is_writable_pte(*spte)) 206 if (is_writable_pte(*sptep))
206 audit_printk(kvm, "shadow page has writable " 207 audit_printk(kvm, "shadow page has writable "
207 "mappings: gfn %llx role %x\n", 208 "mappings: gfn %llx role %x\n",
208 sp->gfn, sp->role.word); 209 sp->gfn, sp->role.word);
209 spte = rmap_next(rmapp, spte);
210 } 210 }
211} 211}
212 212
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 89fb0e81322..cd6e98333ba 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -54,8 +54,8 @@
54 */ 54 */
55TRACE_EVENT( 55TRACE_EVENT(
56 kvm_mmu_pagetable_walk, 56 kvm_mmu_pagetable_walk,
57 TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), 57 TP_PROTO(u64 addr, u32 pferr),
58 TP_ARGS(addr, write_fault, user_fault, fetch_fault), 58 TP_ARGS(addr, pferr),
59 59
60 TP_STRUCT__entry( 60 TP_STRUCT__entry(
61 __field(__u64, addr) 61 __field(__u64, addr)
@@ -64,8 +64,7 @@ TRACE_EVENT(
64 64
65 TP_fast_assign( 65 TP_fast_assign(
66 __entry->addr = addr; 66 __entry->addr = addr;
67 __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) 67 __entry->pferr = pferr;
68 | (!!fetch_fault << 4);
69 ), 68 ),
70 69
71 TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, 70 TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
@@ -243,6 +242,44 @@ TRACE_EVENT(
243 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, 242 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
244 __entry->access) 243 __entry->access)
245); 244);
245
246#define __spte_satisfied(__spte) \
247 (__entry->retry && is_writable_pte(__entry->__spte))
248
249TRACE_EVENT(
250 fast_page_fault,
251 TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
252 u64 *sptep, u64 old_spte, bool retry),
253 TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
254
255 TP_STRUCT__entry(
256 __field(int, vcpu_id)
257 __field(gva_t, gva)
258 __field(u32, error_code)
259 __field(u64 *, sptep)
260 __field(u64, old_spte)
261 __field(u64, new_spte)
262 __field(bool, retry)
263 ),
264
265 TP_fast_assign(
266 __entry->vcpu_id = vcpu->vcpu_id;
267 __entry->gva = gva;
268 __entry->error_code = error_code;
269 __entry->sptep = sptep;
270 __entry->old_spte = old_spte;
271 __entry->new_spte = *sptep;
272 __entry->retry = retry;
273 ),
274
275 TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
276 " new %llx spurious %d fixed %d", __entry->vcpu_id,
277 __entry->gva, __print_flags(__entry->error_code, "|",
278 kvm_mmu_trace_pferr_flags), __entry->sptep,
279 __entry->old_spte, __entry->new_spte,
280 __spte_satisfied(old_spte), __spte_satisfied(new_spte)
281 )
282);
246#endif /* _TRACE_KVMMMU_H */ 283#endif /* _TRACE_KVMMMU_H */
247 284
248#undef TRACE_INCLUDE_PATH 285#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df5a70311be..bb7cf01cae7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
154 const int fetch_fault = access & PFERR_FETCH_MASK; 154 const int fetch_fault = access & PFERR_FETCH_MASK;
155 u16 errcode = 0; 155 u16 errcode = 0;
156 156
157 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 157 trace_kvm_mmu_pagetable_walk(addr, access);
158 fetch_fault);
159retry_walk: 158retry_walk:
160 eperm = false; 159 eperm = false;
161 walker->level = mmu->root_level; 160 walker->level = mmu->root_level;
@@ -658,7 +657,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
658{ 657{
659 int offset = 0; 658 int offset = 0;
660 659
661 WARN_ON(sp->role.level != 1); 660 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
662 661
663 if (PTTYPE == 32) 662 if (PTTYPE == 32)
664 offset = sp->role.quadrant << PT64_LEVEL_BITS; 663 offset = sp->role.quadrant << PT64_LEVEL_BITS;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 2e88438ffd8..9b7ec1150ab 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -80,10 +80,10 @@ static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
80 80
81static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) 81static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
82{ 82{
83 if (idx < X86_PMC_IDX_FIXED) 83 if (idx < INTEL_PMC_IDX_FIXED)
84 return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); 84 return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
85 else 85 else
86 return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); 86 return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED);
87} 87}
88 88
89void kvm_deliver_pmi(struct kvm_vcpu *vcpu) 89void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
@@ -291,7 +291,7 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx)
291 if (pmc_is_gp(pmc)) 291 if (pmc_is_gp(pmc))
292 reprogram_gp_counter(pmc, pmc->eventsel); 292 reprogram_gp_counter(pmc, pmc->eventsel);
293 else { 293 else {
294 int fidx = idx - X86_PMC_IDX_FIXED; 294 int fidx = idx - INTEL_PMC_IDX_FIXED;
295 reprogram_fixed_counter(pmc, 295 reprogram_fixed_counter(pmc,
296 fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); 296 fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
297 } 297 }
@@ -452,7 +452,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
452 return; 452 return;
453 453
454 pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, 454 pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
455 X86_PMC_MAX_GENERIC); 455 INTEL_PMC_MAX_GENERIC);
456 pmu->counter_bitmask[KVM_PMC_GP] = 456 pmu->counter_bitmask[KVM_PMC_GP] =
457 ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; 457 ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
458 bitmap_len = (entry->eax >> 24) & 0xff; 458 bitmap_len = (entry->eax >> 24) & 0xff;
@@ -462,13 +462,13 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
462 pmu->nr_arch_fixed_counters = 0; 462 pmu->nr_arch_fixed_counters = 0;
463 } else { 463 } else {
464 pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), 464 pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
465 X86_PMC_MAX_FIXED); 465 INTEL_PMC_MAX_FIXED);
466 pmu->counter_bitmask[KVM_PMC_FIXED] = 466 pmu->counter_bitmask[KVM_PMC_FIXED] =
467 ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; 467 ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
468 } 468 }
469 469
470 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | 470 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
471 (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); 471 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
472 pmu->global_ctrl_mask = ~pmu->global_ctrl; 472 pmu->global_ctrl_mask = ~pmu->global_ctrl;
473} 473}
474 474
@@ -478,15 +478,15 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
478 struct kvm_pmu *pmu = &vcpu->arch.pmu; 478 struct kvm_pmu *pmu = &vcpu->arch.pmu;
479 479
480 memset(pmu, 0, sizeof(*pmu)); 480 memset(pmu, 0, sizeof(*pmu));
481 for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { 481 for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
482 pmu->gp_counters[i].type = KVM_PMC_GP; 482 pmu->gp_counters[i].type = KVM_PMC_GP;
483 pmu->gp_counters[i].vcpu = vcpu; 483 pmu->gp_counters[i].vcpu = vcpu;
484 pmu->gp_counters[i].idx = i; 484 pmu->gp_counters[i].idx = i;
485 } 485 }
486 for (i = 0; i < X86_PMC_MAX_FIXED; i++) { 486 for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
487 pmu->fixed_counters[i].type = KVM_PMC_FIXED; 487 pmu->fixed_counters[i].type = KVM_PMC_FIXED;
488 pmu->fixed_counters[i].vcpu = vcpu; 488 pmu->fixed_counters[i].vcpu = vcpu;
489 pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; 489 pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
490 } 490 }
491 init_irq_work(&pmu->irq_work, trigger_pmi); 491 init_irq_work(&pmu->irq_work, trigger_pmi);
492 kvm_pmu_cpuid_update(vcpu); 492 kvm_pmu_cpuid_update(vcpu);
@@ -498,13 +498,13 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)
498 int i; 498 int i;
499 499
500 irq_work_sync(&pmu->irq_work); 500 irq_work_sync(&pmu->irq_work);
501 for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { 501 for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
502 struct kvm_pmc *pmc = &pmu->gp_counters[i]; 502 struct kvm_pmc *pmc = &pmu->gp_counters[i];
503 stop_counter(pmc); 503 stop_counter(pmc);
504 pmc->counter = pmc->eventsel = 0; 504 pmc->counter = pmc->eventsel = 0;
505 } 505 }
506 506
507 for (i = 0; i < X86_PMC_MAX_FIXED; i++) 507 for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
508 stop_counter(&pmu->fixed_counters[i]); 508 stop_counter(&pmu->fixed_counters[i]);
509 509
510 pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 510 pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e334389e1c7..baead950d6c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -22,6 +22,7 @@
22#include "x86.h" 22#include "x86.h"
23 23
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/mod_devicetable.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
27#include <linux/highmem.h> 28#include <linux/highmem.h>
@@ -42,6 +43,12 @@
42MODULE_AUTHOR("Qumranet"); 43MODULE_AUTHOR("Qumranet");
43MODULE_LICENSE("GPL"); 44MODULE_LICENSE("GPL");
44 45
46static const struct x86_cpu_id svm_cpu_id[] = {
47 X86_FEATURE_MATCH(X86_FEATURE_SVM),
48 {}
49};
50MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
51
45#define IOPM_ALLOC_ORDER 2 52#define IOPM_ALLOC_ORDER 2
46#define MSRPM_ALLOC_ORDER 1 53#define MSRPM_ALLOC_ORDER 1
47 54
@@ -3178,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3178 break; 3185 break;
3179 case MSR_IA32_DEBUGCTLMSR: 3186 case MSR_IA32_DEBUGCTLMSR:
3180 if (!boot_cpu_has(X86_FEATURE_LBRV)) { 3187 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3181 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 3188 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3182 __func__, data); 3189 __func__, data);
3183 break; 3190 break;
3184 } 3191 }
3185 if (data & DEBUGCTL_RESERVED_BITS) 3192 if (data & DEBUGCTL_RESERVED_BITS)
@@ -3198,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3198 case MSR_VM_CR: 3205 case MSR_VM_CR:
3199 return svm_set_vm_cr(vcpu, data); 3206 return svm_set_vm_cr(vcpu, data);
3200 case MSR_VM_IGNNE: 3207 case MSR_VM_IGNNE:
3201 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3208 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3202 break; 3209 break;
3203 default: 3210 default:
3204 return kvm_set_msr_common(vcpu, ecx, data); 3211 return kvm_set_msr_common(vcpu, ecx, data);
@@ -3240,6 +3247,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
3240 svm_clear_vintr(svm); 3247 svm_clear_vintr(svm);
3241 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3248 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3242 mark_dirty(svm->vmcb, VMCB_INTR); 3249 mark_dirty(svm->vmcb, VMCB_INTR);
3250 ++svm->vcpu.stat.irq_window_exits;
3243 /* 3251 /*
3244 * If the user space waits to inject interrupts, exit as soon as 3252 * If the user space waits to inject interrupts, exit as soon as
3245 * possible 3253 * possible
@@ -3247,7 +3255,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
3247 if (!irqchip_in_kernel(svm->vcpu.kvm) && 3255 if (!irqchip_in_kernel(svm->vcpu.kvm) &&
3248 kvm_run->request_interrupt_window && 3256 kvm_run->request_interrupt_window &&
3249 !kvm_cpu_has_interrupt(&svm->vcpu)) { 3257 !kvm_cpu_has_interrupt(&svm->vcpu)) {
3250 ++svm->vcpu.stat.irq_window_exits;
3251 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 3258 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3252 return 0; 3259 return 0;
3253 } 3260 }
@@ -4037,6 +4044,11 @@ static bool svm_rdtscp_supported(void)
4037 return false; 4044 return false;
4038} 4045}
4039 4046
4047static bool svm_invpcid_supported(void)
4048{
4049 return false;
4050}
4051
4040static bool svm_has_wbinvd_exit(void) 4052static bool svm_has_wbinvd_exit(void)
4041{ 4053{
4042 return true; 4054 return true;
@@ -4305,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4305 .cpuid_update = svm_cpuid_update, 4317 .cpuid_update = svm_cpuid_update,
4306 4318
4307 .rdtscp_supported = svm_rdtscp_supported, 4319 .rdtscp_supported = svm_rdtscp_supported,
4320 .invpcid_supported = svm_invpcid_supported,
4308 4321
4309 .set_supported_cpuid = svm_set_supported_cpuid, 4322 .set_supported_cpuid = svm_set_supported_cpuid,
4310 4323
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 911d2641f14..a71faf727ff 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq,
517 __entry->coalesced ? " (coalesced)" : "") 517 __entry->coalesced ? " (coalesced)" : "")
518); 518);
519 519
520TRACE_EVENT(kvm_eoi,
521 TP_PROTO(struct kvm_lapic *apic, int vector),
522 TP_ARGS(apic, vector),
523
524 TP_STRUCT__entry(
525 __field( __u32, apicid )
526 __field( int, vector )
527 ),
528
529 TP_fast_assign(
530 __entry->apicid = apic->vcpu->vcpu_id;
531 __entry->vector = vector;
532 ),
533
534 TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
535);
536
537TRACE_EVENT(kvm_pv_eoi,
538 TP_PROTO(struct kvm_lapic *apic, int vector),
539 TP_ARGS(apic, vector),
540
541 TP_STRUCT__entry(
542 __field( __u32, apicid )
543 __field( int, vector )
544 ),
545
546 TP_fast_assign(
547 __entry->apicid = apic->vcpu->vcpu_id;
548 __entry->vector = vector;
549 ),
550
551 TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
552);
553
520/* 554/*
521 * Tracepoint for nested VMRUN 555 * Tracepoint for nested VMRUN
522 */ 556 */
@@ -710,16 +744,6 @@ TRACE_EVENT(kvm_skinit,
710 __entry->rip, __entry->slb) 744 __entry->rip, __entry->slb)
711); 745);
712 746
713#define __print_insn(insn, ilen) ({ \
714 int i; \
715 const char *ret = p->buffer + p->len; \
716 \
717 for (i = 0; i < ilen; ++i) \
718 trace_seq_printf(p, " %02x", insn[i]); \
719 trace_seq_printf(p, "%c", 0); \
720 ret; \
721 })
722
723#define KVM_EMUL_INSN_F_CR0_PE (1 << 0) 747#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
724#define KVM_EMUL_INSN_F_EFL_VM (1 << 1) 748#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
725#define KVM_EMUL_INSN_F_CS_D (1 << 2) 749#define KVM_EMUL_INSN_F_CS_D (1 << 2)
@@ -786,7 +810,7 @@ TRACE_EVENT(kvm_emulate_insn,
786 810
787 TP_printk("%x:%llx:%s (%s)%s", 811 TP_printk("%x:%llx:%s (%s)%s",
788 __entry->csbase, __entry->rip, 812 __entry->csbase, __entry->rip,
789 __print_insn(__entry->insn, __entry->len), 813 __print_hex(__entry->insn, __entry->len),
790 __print_symbolic(__entry->flags, 814 __print_symbolic(__entry->flags,
791 kvm_trace_symbol_emul_flags), 815 kvm_trace_symbol_emul_flags),
792 __entry->failed ? " failed" : "" 816 __entry->failed ? " failed" : ""
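
The two new events share one template: both record the virtual APIC ID and the vector at EOI time, one for the classic APIC path and one for the paravirtual EOI. A hedged sketch of how a call site fires such an event (the surrounding function is schematic; the real callers live in lapic.c):

#include "trace.h"	/* provides trace_kvm_eoi()/trace_kvm_pv_eoi() */

static void demo_eoi(struct kvm_lapic *apic, int vector)
{
	/* Renders through TP_printk as: kvm_eoi: apicid 0 vector 236 */
	trace_kvm_eoi(apic, vector);
}

The last hunk above drops the local __print_insn() helper in favor of the generic __print_hex() ftrace formatter, which produces the same space-separated hex dump of the emulated instruction bytes.
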
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4ff0ab9bc3c..c39b60707e0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/moduleparam.h> 29#include <linux/moduleparam.h>
30#include <linux/mod_devicetable.h>
30#include <linux/ftrace_event.h> 31#include <linux/ftrace_event.h>
31#include <linux/slab.h> 32#include <linux/slab.h>
32#include <linux/tboot.h> 33#include <linux/tboot.h>
@@ -51,6 +52,12 @@
51MODULE_AUTHOR("Qumranet"); 52MODULE_AUTHOR("Qumranet");
52MODULE_LICENSE("GPL"); 53MODULE_LICENSE("GPL");
53 54
55static const struct x86_cpu_id vmx_cpu_id[] = {
56 X86_FEATURE_MATCH(X86_FEATURE_VMX),
57 {}
58};
59MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
60
54static bool __read_mostly enable_vpid = 1; 61static bool __read_mostly enable_vpid = 1;
55module_param_named(vpid, enable_vpid, bool, 0444); 62module_param_named(vpid, enable_vpid, bool, 0444);
56 63
@@ -64,7 +71,10 @@ static bool __read_mostly enable_unrestricted_guest = 1;
64module_param_named(unrestricted_guest, 71module_param_named(unrestricted_guest,
65 enable_unrestricted_guest, bool, S_IRUGO); 72 enable_unrestricted_guest, bool, S_IRUGO);
66 73
67static bool __read_mostly emulate_invalid_guest_state = 0; 74static bool __read_mostly enable_ept_ad_bits = 1;
75module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
76
77static bool __read_mostly emulate_invalid_guest_state = true;
68module_param(emulate_invalid_guest_state, bool, S_IRUGO); 78module_param(emulate_invalid_guest_state, bool, S_IRUGO);
69 79
70static bool __read_mostly vmm_exclusive = 1; 80static bool __read_mostly vmm_exclusive = 1;
@@ -386,6 +396,9 @@ struct vcpu_vmx {
386 struct { 396 struct {
387 int loaded; 397 int loaded;
388 u16 fs_sel, gs_sel, ldt_sel; 398 u16 fs_sel, gs_sel, ldt_sel;
399#ifdef CONFIG_X86_64
400 u16 ds_sel, es_sel;
401#endif
389 int gs_ldt_reload_needed; 402 int gs_ldt_reload_needed;
390 int fs_reload_needed; 403 int fs_reload_needed;
391 } host_state; 404 } host_state;
@@ -605,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr);
605static void kvm_cpu_vmxoff(void); 618static void kvm_cpu_vmxoff(void);
606static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 619static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
607static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 620static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
621static void vmx_set_segment(struct kvm_vcpu *vcpu,
622 struct kvm_segment *var, int seg);
623static void vmx_get_segment(struct kvm_vcpu *vcpu,
624 struct kvm_segment *var, int seg);
608 625
609static DEFINE_PER_CPU(struct vmcs *, vmxarea); 626static DEFINE_PER_CPU(struct vmcs *, vmxarea);
610static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 627static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -779,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)
779 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; 796 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
780} 797}
781 798
799static inline bool cpu_has_vmx_ept_ad_bits(void)
800{
801 return vmx_capability.ept & VMX_EPT_AD_BIT;
802}
803
782static inline bool cpu_has_vmx_invept_individual_addr(void) 804static inline bool cpu_has_vmx_invept_individual_addr(void)
783{ 805{
784 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; 806 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -839,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void)
839 SECONDARY_EXEC_RDTSCP; 861 SECONDARY_EXEC_RDTSCP;
840} 862}
841 863
864static inline bool cpu_has_vmx_invpcid(void)
865{
866 return vmcs_config.cpu_based_2nd_exec_ctrl &
867 SECONDARY_EXEC_ENABLE_INVPCID;
868}
869
842static inline bool cpu_has_virtual_nmis(void) 870static inline bool cpu_has_virtual_nmis(void)
843{ 871{
844 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 872 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -1411,6 +1439,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
1411 } 1439 }
1412 1440
1413#ifdef CONFIG_X86_64 1441#ifdef CONFIG_X86_64
1442 savesegment(ds, vmx->host_state.ds_sel);
1443 savesegment(es, vmx->host_state.es_sel);
1444#endif
1445
1446#ifdef CONFIG_X86_64
1414 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); 1447 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1415 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); 1448 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1416#else 1449#else
@@ -1450,6 +1483,19 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
1450 } 1483 }
1451 if (vmx->host_state.fs_reload_needed) 1484 if (vmx->host_state.fs_reload_needed)
1452 loadsegment(fs, vmx->host_state.fs_sel); 1485 loadsegment(fs, vmx->host_state.fs_sel);
1486#ifdef CONFIG_X86_64
1487 if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
1488 loadsegment(ds, vmx->host_state.ds_sel);
1489 loadsegment(es, vmx->host_state.es_sel);
1490 }
1491#else
1492 /*
1493 * The sysexit path does not restore ds/es, so we must set them to
1494 * a reasonable value ourselves.
1495 */
1496 loadsegment(ds, __USER_DS);
1497 loadsegment(es, __USER_DS);
1498#endif
1453 reload_tss(); 1499 reload_tss();
1454#ifdef CONFIG_X86_64 1500#ifdef CONFIG_X86_64
1455 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 1501 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
@@ -1711,6 +1757,11 @@ static bool vmx_rdtscp_supported(void)
1711 return cpu_has_vmx_rdtscp(); 1757 return cpu_has_vmx_rdtscp();
1712} 1758}
1713 1759
1760static bool vmx_invpcid_supported(void)
1761{
1762 return cpu_has_vmx_invpcid() && enable_ept;
1763}
1764
1714/* 1765/*
1715 * Swap MSR entry in host/guest MSR entry array. 1766 * Swap MSR entry in host/guest MSR entry array.
1716 */ 1767 */
@@ -2430,7 +2481,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2430 SECONDARY_EXEC_ENABLE_EPT | 2481 SECONDARY_EXEC_ENABLE_EPT |
2431 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2482 SECONDARY_EXEC_UNRESTRICTED_GUEST |
2432 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 2483 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2433 SECONDARY_EXEC_RDTSCP; 2484 SECONDARY_EXEC_RDTSCP |
2485 SECONDARY_EXEC_ENABLE_INVPCID;
2434 if (adjust_vmx_controls(min2, opt2, 2486 if (adjust_vmx_controls(min2, opt2,
2435 MSR_IA32_VMX_PROCBASED_CTLS2, 2487 MSR_IA32_VMX_PROCBASED_CTLS2,
2436 &_cpu_based_2nd_exec_control) < 0) 2488 &_cpu_based_2nd_exec_control) < 0)
@@ -2617,8 +2669,12 @@ static __init int hardware_setup(void)
2617 !cpu_has_vmx_ept_4levels()) { 2669 !cpu_has_vmx_ept_4levels()) {
2618 enable_ept = 0; 2670 enable_ept = 0;
2619 enable_unrestricted_guest = 0; 2671 enable_unrestricted_guest = 0;
2672 enable_ept_ad_bits = 0;
2620 } 2673 }
2621 2674
2675 if (!cpu_has_vmx_ept_ad_bits())
2676 enable_ept_ad_bits = 0;
2677
2622 if (!cpu_has_vmx_unrestricted_guest()) 2678 if (!cpu_has_vmx_unrestricted_guest())
2623 enable_unrestricted_guest = 0; 2679 enable_unrestricted_guest = 0;
2624 2680
@@ -2742,6 +2798,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2742{ 2798{
2743 unsigned long flags; 2799 unsigned long flags;
2744 struct vcpu_vmx *vmx = to_vmx(vcpu); 2800 struct vcpu_vmx *vmx = to_vmx(vcpu);
2801 struct kvm_segment var;
2745 2802
2746 if (enable_unrestricted_guest) 2803 if (enable_unrestricted_guest)
2747 return; 2804 return;
@@ -2785,20 +2842,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2785 if (emulate_invalid_guest_state) 2842 if (emulate_invalid_guest_state)
2786 goto continue_rmode; 2843 goto continue_rmode;
2787 2844
2788 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); 2845 vmx_get_segment(vcpu, &var, VCPU_SREG_SS);
2789 vmcs_write32(GUEST_SS_LIMIT, 0xffff); 2846 vmx_set_segment(vcpu, &var, VCPU_SREG_SS);
2790 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
2791 2847
2792 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); 2848 vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
2793 vmcs_write32(GUEST_CS_LIMIT, 0xffff); 2849 vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
2794 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
2795 vmcs_writel(GUEST_CS_BASE, 0xf0000);
2796 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
2797 2850
2798 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); 2851 vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
2799 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); 2852 vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
2800 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); 2853
2801 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); 2854 vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
2855 vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
2856
2857 vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
2858 vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
2859
2860 vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
2861 vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
2802 2862
2803continue_rmode: 2863continue_rmode:
2804 kvm_mmu_reset_context(vcpu); 2864 kvm_mmu_reset_context(vcpu);
@@ -2999,6 +3059,8 @@ static u64 construct_eptp(unsigned long root_hpa)
2999 /* TODO write the value reading from MSR */ 3059 /* TODO write the value reading from MSR */
3000 eptp = VMX_EPT_DEFAULT_MT | 3060 eptp = VMX_EPT_DEFAULT_MT |
3001 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; 3061 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
3062 if (enable_ept_ad_bits)
3063 eptp |= VMX_EPT_AD_ENABLE_BIT;
3002 eptp |= (root_hpa & PAGE_MASK); 3064 eptp |= (root_hpa & PAGE_MASK);
3003 3065
3004 return eptp; 3066 return eptp;
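
construct_eptp() packs the EPT-pointer fields defined by the SDM: memory type in bits 2:0, page-walk length minus one in bits 5:3, and, new in this series, the accessed/dirty enable in bit 6. A standalone sketch with the constants written out (the values mirror VMX_EPT_DEFAULT_MT, VMX_EPT_DEFAULT_GAW and VMX_EPT_AD_ENABLE_BIT):

#include <stdint.h>
#include <stdio.h>

#define EPT_MT_WB	6ull		/* write-back memory type */
#define EPT_GAW_4LEVEL	3ull		/* 4-level walk, encoded as levels minus one */
#define EPT_GAW_SHIFT	3
#define EPT_AD_ENABLE	(1ull << 6)	/* accessed/dirty flag enable */
#define PAGE_MASK_4K	(~0xfffull)

static uint64_t construct_eptp(uint64_t root_hpa, int ad_bits)
{
	uint64_t eptp = EPT_MT_WB | (EPT_GAW_4LEVEL << EPT_GAW_SHIFT);

	if (ad_bits)
		eptp |= EPT_AD_ENABLE;
	return eptp | (root_hpa & PAGE_MASK_4K);
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)construct_eptp(0x12345000ull, 1));
	return 0;
}
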
@@ -3125,11 +3187,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
3125 3187
3126static int vmx_get_cpl(struct kvm_vcpu *vcpu) 3188static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3127{ 3189{
3190 struct vcpu_vmx *vmx = to_vmx(vcpu);
3191
3192 /*
3193 * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
3194 * fail; use the cache instead.
3195 */
 3196	if (unlikely(vmx->emulation_required && emulate_invalid_guest_state))
 3197		return vmx->cpl;
3199
3128 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { 3200 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
3129 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3201 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3130 to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); 3202 vmx->cpl = __vmx_get_cpl(vcpu);
3131 } 3203 }
3132 return to_vmx(vcpu)->cpl; 3204
3205 return vmx->cpl;
3133} 3206}
3134 3207
3135 3208
@@ -3137,7 +3210,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
3137{ 3210{
3138 u32 ar; 3211 u32 ar;
3139 3212
3140 if (var->unusable) 3213 if (var->unusable || !var->present)
3141 ar = 1 << 16; 3214 ar = 1 << 16;
3142 else { 3215 else {
3143 ar = var->type & 15; 3216 ar = var->type & 15;
@@ -3149,8 +3222,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
3149 ar |= (var->db & 1) << 14; 3222 ar |= (var->db & 1) << 14;
3150 ar |= (var->g & 1) << 15; 3223 ar |= (var->g & 1) << 15;
3151 } 3224 }
3152 if (ar == 0) /* a 0 value means unusable */
3153 ar = AR_UNUSABLE_MASK;
3154 3225
3155 return ar; 3226 return ar;
3156} 3227}
@@ -3201,6 +3272,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3201 3272
3202 vmcs_write32(sf->ar_bytes, ar); 3273 vmcs_write32(sf->ar_bytes, ar);
3203 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3274 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3275
3276 /*
 3277	 * Fix segments for a real mode guest on hosts that lack
 3278	 * "unrestricted mode" or have it disabled.
 3279	 * This is done to allow migration of guests from hosts with
 3280	 * unrestricted guest support, such as Westmere, to older hosts
 3281	 * without it, such as Nehalem.
3282 */
3283 if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
3284 switch (seg) {
3285 case VCPU_SREG_CS:
3286 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
3287 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
3288 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
3289 vmcs_writel(GUEST_CS_BASE, 0xf0000);
3290 vmcs_write16(GUEST_CS_SELECTOR,
3291 vmcs_readl(GUEST_CS_BASE) >> 4);
3292 break;
3293 case VCPU_SREG_ES:
3294 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
3295 break;
3296 case VCPU_SREG_DS:
3297 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
3298 break;
3299 case VCPU_SREG_GS:
3300 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
3301 break;
3302 case VCPU_SREG_FS:
3303 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
3304 break;
3305 case VCPU_SREG_SS:
3306 vmcs_write16(GUEST_SS_SELECTOR,
3307 vmcs_readl(GUEST_SS_BASE) >> 4);
3308 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
3309 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
3310 break;
3311 }
3312 }
3204} 3313}
3205 3314
3206static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3315static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3633,8 +3742,18 @@ static void vmx_set_constant_host_state(void)
3633 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 3742 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
3634 3743
3635 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 3744 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
3745#ifdef CONFIG_X86_64
3746 /*
3747 * Load null selectors, so we can avoid reloading them in
3748 * __vmx_load_host_state(), in case userspace uses the null selectors
3749 * too (the expected case).
3750 */
3751 vmcs_write16(HOST_DS_SELECTOR, 0);
3752 vmcs_write16(HOST_ES_SELECTOR, 0);
3753#else
3636 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3754 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3637 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3755 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3756#endif
3638 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3757 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3639 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 3758 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
3640 3759
@@ -3693,6 +3812,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3693 if (!enable_ept) { 3812 if (!enable_ept) {
3694 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 3813 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
3695 enable_unrestricted_guest = 0; 3814 enable_unrestricted_guest = 0;
 3815		/* Enabling INVPCID for non-EPT guests may cause a performance regression. */
3816 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
3696 } 3817 }
3697 if (!enable_unrestricted_guest) 3818 if (!enable_unrestricted_guest)
3698 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 3819 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -4451,7 +4572,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
4451 break; 4572 break;
4452 } 4573 }
4453 vcpu->run->exit_reason = 0; 4574 vcpu->run->exit_reason = 0;
4454 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 4575 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
4455 (int)(exit_qualification >> 4) & 3, cr); 4576 (int)(exit_qualification >> 4) & 3, cr);
4456 return 0; 4577 return 0;
4457} 4578}
@@ -4731,6 +4852,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
4731{ 4852{
4732 unsigned long exit_qualification; 4853 unsigned long exit_qualification;
4733 gpa_t gpa; 4854 gpa_t gpa;
4855 u32 error_code;
4734 int gla_validity; 4856 int gla_validity;
4735 4857
4736 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4858 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4755,7 +4877,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
4755 4877
4756 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 4878 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
4757 trace_kvm_page_fault(gpa, exit_qualification); 4879 trace_kvm_page_fault(gpa, exit_qualification);
4758 return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); 4880
 4881	/* Is it a write fault? */
 4882	error_code = exit_qualification & (1U << 1);
 4883	/* Is the EPT page table present? */
4884 error_code |= (exit_qualification >> 3) & 0x1;
4885
4886 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
4759} 4887}
4760 4888
4761static u64 ept_rsvd_mask(u64 spte, int level) 4889static u64 ept_rsvd_mask(u64 spte, int level)
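
The rewritten tail of handle_ept_violation() folds two exit-qualification bits into a #PF-style error code: bit 1 of the qualification reports a write access, and bit 3 reports that the leaf EPT entry permitted reads, i.e. the translation was present. A standalone sketch of the mapping:

#include <stdint.h>
#include <stdio.h>

static uint32_t eptv_to_error_code(uint64_t exit_qualification)
{
	uint32_t error_code;

	error_code = exit_qualification & (1u << 1);	/* write fault -> PFERR write bit */
	error_code |= (exit_qualification >> 3) & 0x1;	/* entry present -> PFERR present bit */
	return error_code;
}

int main(void)
{
	printf("%u\n", eptv_to_error_code(0x2));	/* write miss: 2 */
	printf("%u\n", eptv_to_error_code(0xa));	/* write to a present page: 3 */
	return 0;
}
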
@@ -4870,15 +4998,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4870 int ret = 1; 4998 int ret = 1;
4871 u32 cpu_exec_ctrl; 4999 u32 cpu_exec_ctrl;
4872 bool intr_window_requested; 5000 bool intr_window_requested;
5001 unsigned count = 130;
4873 5002
4874 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5003 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4875 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; 5004 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
4876 5005
4877 while (!guest_state_valid(vcpu)) { 5006 while (!guest_state_valid(vcpu) && count-- != 0) {
4878 if (intr_window_requested 5007 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
4879 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
4880 return handle_interrupt_window(&vmx->vcpu); 5008 return handle_interrupt_window(&vmx->vcpu);
4881 5009
5010 if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
5011 return 1;
5012
4882 err = emulate_instruction(vcpu, 0); 5013 err = emulate_instruction(vcpu, 0);
4883 5014
4884 if (err == EMULATE_DO_MMIO) { 5015 if (err == EMULATE_DO_MMIO) {
@@ -4886,8 +5017,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4886 goto out; 5017 goto out;
4887 } 5018 }
4888 5019
4889 if (err != EMULATE_DONE) 5020 if (err != EMULATE_DONE) {
5021 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5022 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5023 vcpu->run->internal.ndata = 0;
4890 return 0; 5024 return 0;
5025 }
4891 5026
4892 if (signal_pending(current)) 5027 if (signal_pending(current))
4893 goto out; 5028 goto out;
@@ -4895,7 +5030,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4895 schedule(); 5030 schedule();
4896 } 5031 }
4897 5032
4898 vmx->emulation_required = 0; 5033 vmx->emulation_required = !guest_state_valid(vcpu);
4899out: 5034out:
4900 return ret; 5035 return ret;
4901} 5036}
@@ -6256,7 +6391,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6256 } 6391 }
6257 } 6392 }
6258 6393
6259 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
6260 vmx->loaded_vmcs->launched = 1; 6394 vmx->loaded_vmcs->launched = 1;
6261 6395
6262 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 6396 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
@@ -6343,7 +6477,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
6343 return &vmx->vcpu; 6477 return &vmx->vcpu;
6344 6478
6345free_vmcs: 6479free_vmcs:
6346 free_vmcs(vmx->loaded_vmcs->vmcs); 6480 free_loaded_vmcs(vmx->loaded_vmcs);
6347free_msrs: 6481free_msrs:
6348 kfree(vmx->guest_msrs); 6482 kfree(vmx->guest_msrs);
6349uninit_vcpu: 6483uninit_vcpu:
@@ -6430,6 +6564,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
6430 } 6564 }
6431 } 6565 }
6432 } 6566 }
6567
6568 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 6569	/* Expose INVPCID only when PCID is exposed */
6570 best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
6571 if (vmx_invpcid_supported() &&
6572 best && (best->ecx & bit(X86_FEATURE_INVPCID)) &&
6573 guest_cpuid_has_pcid(vcpu)) {
6574 exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
6575 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6576 exec_control);
6577 } else {
6578 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
6579 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6580 exec_control);
6581 if (best)
6582 best->ecx &= ~bit(X86_FEATURE_INVPCID);
6583 }
6433} 6584}
6434 6585
6435static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 6586static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -7164,6 +7315,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7164 .cpuid_update = vmx_cpuid_update, 7315 .cpuid_update = vmx_cpuid_update,
7165 7316
7166 .rdtscp_supported = vmx_rdtscp_supported, 7317 .rdtscp_supported = vmx_rdtscp_supported,
7318 .invpcid_supported = vmx_invpcid_supported,
7167 7319
7168 .set_supported_cpuid = vmx_set_supported_cpuid, 7320 .set_supported_cpuid = vmx_set_supported_cpuid,
7169 7321
@@ -7193,23 +7345,21 @@ static int __init vmx_init(void)
7193 if (!vmx_io_bitmap_a) 7345 if (!vmx_io_bitmap_a)
7194 return -ENOMEM; 7346 return -ENOMEM;
7195 7347
7348 r = -ENOMEM;
7349
7196 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); 7350 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
7197 if (!vmx_io_bitmap_b) { 7351 if (!vmx_io_bitmap_b)
7198 r = -ENOMEM;
7199 goto out; 7352 goto out;
7200 }
7201 7353
7202 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); 7354 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
7203 if (!vmx_msr_bitmap_legacy) { 7355 if (!vmx_msr_bitmap_legacy)
7204 r = -ENOMEM;
7205 goto out1; 7356 goto out1;
7206 } 7357
7207 7358
7208 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 7359 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
7209 if (!vmx_msr_bitmap_longmode) { 7360 if (!vmx_msr_bitmap_longmode)
7210 r = -ENOMEM;
7211 goto out2; 7361 goto out2;
7212 } 7362
7213 7363
7214 /* 7364 /*
7215 * Allow direct access to the PC debug port (it is often used for I/O 7365 * Allow direct access to the PC debug port (it is often used for I/O
@@ -7238,8 +7388,10 @@ static int __init vmx_init(void)
7238 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7388 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
7239 7389
7240 if (enable_ept) { 7390 if (enable_ept) {
7241 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 7391 kvm_mmu_set_mask_ptes(0ull,
7242 VMX_EPT_EXECUTABLE_MASK); 7392 (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
7393 (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
7394 0ull, VMX_EPT_EXECUTABLE_MASK);
7243 ept_set_mmio_spte_mask(); 7395 ept_set_mmio_spte_mask();
7244 kvm_enable_tdp(); 7396 kvm_enable_tdp();
7245 } else 7397 } else
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 185a2b823a2..59b59508ff0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
528 return 1; 528 return 1;
529 } 529 }
530 530
531 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
532 return 1;
533
531 kvm_x86_ops->set_cr0(vcpu, cr0); 534 kvm_x86_ops->set_cr0(vcpu, cr0);
532 535
533 if ((cr0 ^ old_cr0) & X86_CR0_PG) { 536 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
@@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
604 kvm_read_cr3(vcpu))) 607 kvm_read_cr3(vcpu)))
605 return 1; 608 return 1;
606 609
610 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
611 if (!guest_cpuid_has_pcid(vcpu))
612 return 1;
613
 614		/* PCID cannot be enabled when CR3[11:0] != 000H or EFER.LMA = 0 */
615 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
616 return 1;
617 }
618
607 if (kvm_x86_ops->set_cr4(vcpu, cr4)) 619 if (kvm_x86_ops->set_cr4(vcpu, cr4))
608 return 1; 620 return 1;
609 621
610 if ((cr4 ^ old_cr4) & pdptr_bits) 622 if (((cr4 ^ old_cr4) & pdptr_bits) ||
623 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
611 kvm_mmu_reset_context(vcpu); 624 kvm_mmu_reset_context(vcpu);
612 625
613 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) 626 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
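
The new CR4 checks encode the architectural rules for flipping PCIDE on: the guest must advertise PCID in CPUID, CR3[11:0] must be zero, and the vCPU must be in long mode. A standalone sketch of the predicate (the 0xfff value mirrors the X86_CR3_PCID_MASK used above):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define X86_CR3_PCID_MASK	0xfffull	/* CR3[11:0] is the PCID when PCIDE=1 */

static bool pcide_enable_ok(uint64_t cr3, bool long_mode, bool cpuid_has_pcid)
{
	if (!cpuid_has_pcid)
		return false;
	if ((cr3 & X86_CR3_PCID_MASK) || !long_mode)
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", pcide_enable_ok(0x1000, true, true));	/* 1: allowed */
	printf("%d\n", pcide_enable_ok(0x1001, true, true));	/* 0: PCID bits set */
	return 0;
}
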
@@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
626 } 639 }
627 640
628 if (is_long_mode(vcpu)) { 641 if (is_long_mode(vcpu)) {
629 if (cr3 & CR3_L_MODE_RESERVED_BITS) 642 if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
630 return 1; 643 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
644 return 1;
645 } else
646 if (cr3 & CR3_L_MODE_RESERVED_BITS)
647 return 1;
631 } else { 648 } else {
632 if (is_pae(vcpu)) { 649 if (is_pae(vcpu)) {
633 if (cr3 & CR3_PAE_RESERVED_BITS) 650 if (cr3 & CR3_PAE_RESERVED_BITS)
@@ -795,6 +812,7 @@ static u32 msrs_to_save[] = {
795 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 812 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
796 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 813 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
797 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 814 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
815 MSR_KVM_PV_EOI_EN,
798 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 816 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
799 MSR_STAR, 817 MSR_STAR,
800#ifdef CONFIG_X86_64 818#ifdef CONFIG_X86_64
@@ -1437,8 +1455,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1437 break; 1455 break;
1438 } 1456 }
1439 default: 1457 default:
1440 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1458 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1441 "data 0x%llx\n", msr, data); 1459 "data 0x%llx\n", msr, data);
1442 return 1; 1460 return 1;
1443 } 1461 }
1444 return 0; 1462 return 0;
@@ -1470,8 +1488,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1470 case HV_X64_MSR_TPR: 1488 case HV_X64_MSR_TPR:
1471 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); 1489 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1472 default: 1490 default:
1473 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1491 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1474 "data 0x%llx\n", msr, data); 1492 "data 0x%llx\n", msr, data);
1475 return 1; 1493 return 1;
1476 } 1494 }
1477 1495
@@ -1551,15 +1569,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1551 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 1569 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1552 data &= ~(u64)0x8; /* ignore TLB cache disable */ 1570 data &= ~(u64)0x8; /* ignore TLB cache disable */
1553 if (data != 0) { 1571 if (data != 0) {
1554 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1572 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1555 data); 1573 data);
1556 return 1; 1574 return 1;
1557 } 1575 }
1558 break; 1576 break;
1559 case MSR_FAM10H_MMIO_CONF_BASE: 1577 case MSR_FAM10H_MMIO_CONF_BASE:
1560 if (data != 0) { 1578 if (data != 0) {
1561 pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 1579 vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
1562 "0x%llx\n", data); 1580 "0x%llx\n", data);
1563 return 1; 1581 return 1;
1564 } 1582 }
1565 break; 1583 break;
@@ -1574,8 +1592,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1574 thus reserved and should throw a #GP */ 1592 thus reserved and should throw a #GP */
1575 return 1; 1593 return 1;
1576 } 1594 }
1577 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 1595 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
1578 __func__, data); 1596 __func__, data);
1579 break; 1597 break;
1580 case MSR_IA32_UCODE_REV: 1598 case MSR_IA32_UCODE_REV:
1581 case MSR_IA32_UCODE_WRITE: 1599 case MSR_IA32_UCODE_WRITE:
@@ -1653,6 +1671,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1653 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 1671 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
1654 1672
1655 break; 1673 break;
1674 case MSR_KVM_PV_EOI_EN:
1675 if (kvm_lapic_enable_pv_eoi(vcpu, data))
1676 return 1;
1677 break;
1656 1678
1657 case MSR_IA32_MCG_CTL: 1679 case MSR_IA32_MCG_CTL:
1658 case MSR_IA32_MCG_STATUS: 1680 case MSR_IA32_MCG_STATUS:
@@ -1671,8 +1693,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1671 case MSR_K7_EVNTSEL2: 1693 case MSR_K7_EVNTSEL2:
1672 case MSR_K7_EVNTSEL3: 1694 case MSR_K7_EVNTSEL3:
1673 if (data != 0) 1695 if (data != 0)
1674 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1696 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1675 "0x%x data 0x%llx\n", msr, data); 1697 "0x%x data 0x%llx\n", msr, data);
1676 break; 1698 break;
1677 /* at least RHEL 4 unconditionally writes to the perfctr registers, 1699 /* at least RHEL 4 unconditionally writes to the perfctr registers,
1678 * so we ignore writes to make it happy. 1700 * so we ignore writes to make it happy.
@@ -1681,8 +1703,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1681 case MSR_K7_PERFCTR1: 1703 case MSR_K7_PERFCTR1:
1682 case MSR_K7_PERFCTR2: 1704 case MSR_K7_PERFCTR2:
1683 case MSR_K7_PERFCTR3: 1705 case MSR_K7_PERFCTR3:
1684 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1706 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1685 "0x%x data 0x%llx\n", msr, data); 1707 "0x%x data 0x%llx\n", msr, data);
1686 break; 1708 break;
1687 case MSR_P6_PERFCTR0: 1709 case MSR_P6_PERFCTR0:
1688 case MSR_P6_PERFCTR1: 1710 case MSR_P6_PERFCTR1:
@@ -1693,8 +1715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1693 return kvm_pmu_set_msr(vcpu, msr, data); 1715 return kvm_pmu_set_msr(vcpu, msr, data);
1694 1716
1695 if (pr || data != 0) 1717 if (pr || data != 0)
1696 pr_unimpl(vcpu, "disabled perfctr wrmsr: " 1718 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
1697 "0x%x data 0x%llx\n", msr, data); 1719 "0x%x data 0x%llx\n", msr, data);
1698 break; 1720 break;
1699 case MSR_K7_CLK_CTL: 1721 case MSR_K7_CLK_CTL:
1700 /* 1722 /*
@@ -1720,7 +1742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1720 /* Drop writes to this legacy MSR -- see rdmsr 1742 /* Drop writes to this legacy MSR -- see rdmsr
1721 * counterpart for further detail. 1743 * counterpart for further detail.
1722 */ 1744 */
1723 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); 1745 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1724 break; 1746 break;
1725 case MSR_AMD64_OSVW_ID_LENGTH: 1747 case MSR_AMD64_OSVW_ID_LENGTH:
1726 if (!guest_cpuid_has_osvw(vcpu)) 1748 if (!guest_cpuid_has_osvw(vcpu))
@@ -1738,12 +1760,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1738 if (kvm_pmu_msr(vcpu, msr)) 1760 if (kvm_pmu_msr(vcpu, msr))
1739 return kvm_pmu_set_msr(vcpu, msr, data); 1761 return kvm_pmu_set_msr(vcpu, msr, data);
1740 if (!ignore_msrs) { 1762 if (!ignore_msrs) {
1741 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1763 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
1742 msr, data); 1764 msr, data);
1743 return 1; 1765 return 1;
1744 } else { 1766 } else {
1745 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", 1767 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
1746 msr, data); 1768 msr, data);
1747 break; 1769 break;
1748 } 1770 }
1749 } 1771 }
@@ -1846,7 +1868,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1846 data = kvm->arch.hv_hypercall; 1868 data = kvm->arch.hv_hypercall;
1847 break; 1869 break;
1848 default: 1870 default:
1849 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1871 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1850 return 1; 1872 return 1;
1851 } 1873 }
1852 1874
@@ -1877,7 +1899,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1877 data = vcpu->arch.hv_vapic; 1899 data = vcpu->arch.hv_vapic;
1878 break; 1900 break;
1879 default: 1901 default:
1880 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1902 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1881 return 1; 1903 return 1;
1882 } 1904 }
1883 *pdata = data; 1905 *pdata = data;
@@ -2030,10 +2052,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2030 if (kvm_pmu_msr(vcpu, msr)) 2052 if (kvm_pmu_msr(vcpu, msr))
2031 return kvm_pmu_get_msr(vcpu, msr, pdata); 2053 return kvm_pmu_get_msr(vcpu, msr, pdata);
2032 if (!ignore_msrs) { 2054 if (!ignore_msrs) {
2033 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 2055 vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
2034 return 1; 2056 return 1;
2035 } else { 2057 } else {
2036 pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 2058 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
2037 data = 0; 2059 data = 0;
2038 } 2060 }
2039 break; 2061 break;
@@ -2147,6 +2169,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2147 case KVM_CAP_ASYNC_PF: 2169 case KVM_CAP_ASYNC_PF:
2148 case KVM_CAP_GET_TSC_KHZ: 2170 case KVM_CAP_GET_TSC_KHZ:
2149 case KVM_CAP_PCI_2_3: 2171 case KVM_CAP_PCI_2_3:
2172 case KVM_CAP_KVMCLOCK_CTRL:
2150 r = 1; 2173 r = 1;
2151 break; 2174 break;
2152 case KVM_CAP_COALESCED_MMIO: 2175 case KVM_CAP_COALESCED_MMIO:
@@ -2597,6 +2620,23 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2597 return r; 2620 return r;
2598} 2621}
2599 2622
2623/*
2624 * kvm_set_guest_paused() indicates to the guest kernel that it has been
2625 * stopped by the hypervisor. This function will be called from the host only.
2626 * EINVAL is returned when the host attempts to set the flag for a guest that
2627 * does not support pv clocks.
2628 */
2629static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
2630{
2631 struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
2632 if (!vcpu->arch.time_page)
2633 return -EINVAL;
2634 src->flags |= PVCLOCK_GUEST_STOPPED;
2635 mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
2636 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2637 return 0;
2638}
2639
2600long kvm_arch_vcpu_ioctl(struct file *filp, 2640long kvm_arch_vcpu_ioctl(struct file *filp,
2601 unsigned int ioctl, unsigned long arg) 2641 unsigned int ioctl, unsigned long arg)
2602{ 2642{
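
KVM_KVMCLOCK_CTRL sets PVCLOCK_GUEST_STOPPED in the guest's pvclock page so guest watchdogs can tell a hypervisor-initiated pause from a real lockup. A hedged userspace model of the guest-side check-and-clear pattern this enables (the struct here is trimmed to the one field the sketch needs):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PVCLOCK_GUEST_STOPPED	(1 << 1)	/* as in pvclock-abi.h */

struct pvclock_time_info_demo {
	uint8_t flags;
};

/* Mirrors the guest-side check: was I paused? If so, clear and report. */
static bool check_and_clear_guest_paused(struct pvclock_time_info_demo *src)
{
	if (!(src->flags & PVCLOCK_GUEST_STOPPED))
		return false;
	src->flags &= ~PVCLOCK_GUEST_STOPPED;
	return true;	/* watchdogs should skip this interval */
}

int main(void)
{
	struct pvclock_time_info_demo ti = { .flags = PVCLOCK_GUEST_STOPPED };

	printf("%d %d\n", check_and_clear_guest_paused(&ti),
			  check_and_clear_guest_paused(&ti));	/* 1 0 */
	return 0;
}
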
@@ -2873,6 +2913,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2873 r = vcpu->arch.virtual_tsc_khz; 2913 r = vcpu->arch.virtual_tsc_khz;
2874 goto out; 2914 goto out;
2875 } 2915 }
2916 case KVM_KVMCLOCK_CTRL: {
2917 r = kvm_set_guest_paused(vcpu);
2918 goto out;
2919 }
2876 default: 2920 default:
2877 r = -EINVAL; 2921 r = -EINVAL;
2878 } 2922 }
@@ -3045,57 +3089,32 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3045} 3089}
3046 3090
3047/** 3091/**
3048 * write_protect_slot - write protect a slot for dirty logging 3092 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
3049 * @kvm: the kvm instance 3093 * @kvm: kvm instance
3050 * @memslot: the slot we protect 3094 * @log: slot id and address to which we copy the log
3051 * @dirty_bitmap: the bitmap indicating which pages are dirty
3052 * @nr_dirty_pages: the number of dirty pages
3053 * 3095 *
 3054 * We have two ways to find all sptes to protect: 3096 * We need to keep in mind that VCPU threads can write to the bitmap
3055 * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and 3097 * concurrently. So, to avoid losing data, we keep the following order for
3056 * checks ones that have a spte mapping a page in the slot. 3098 * each bit:
3057 * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
3058 * 3099 *
3059 * Generally speaking, if there are not so many dirty pages compared to the 3100 * 1. Take a snapshot of the bit and clear it if needed.
3060 * number of shadow pages, we should use the latter. 3101 * 2. Write protect the corresponding page.
3102 * 3. Flush TLB's if needed.
3103 * 4. Copy the snapshot to the userspace.
3061 * 3104 *
3062 * Note that letting others write into a page marked dirty in the old bitmap 3105 * Between 2 and 3, the guest may write to the page using the remaining TLB
3063 * by using the remaining tlb entry is not a problem. That page will become 3106 * entry. This is not a problem because the page will be reported dirty at
3064 * write protected again when we flush the tlb and then be reported dirty to 3107 * step 4 using the snapshot taken before and step 3 ensures that successive
3065 * the user space by copying the old bitmap. 3108 * writes will be logged for the next call.
3066 */
3067static void write_protect_slot(struct kvm *kvm,
3068 struct kvm_memory_slot *memslot,
3069 unsigned long *dirty_bitmap,
3070 unsigned long nr_dirty_pages)
3071{
3072 spin_lock(&kvm->mmu_lock);
3073
3074 /* Not many dirty pages compared to # of shadow pages. */
3075 if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
3076 unsigned long gfn_offset;
3077
3078 for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
3079 unsigned long gfn = memslot->base_gfn + gfn_offset;
3080
3081 kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
3082 }
3083 kvm_flush_remote_tlbs(kvm);
3084 } else
3085 kvm_mmu_slot_remove_write_access(kvm, memslot->id);
3086
3087 spin_unlock(&kvm->mmu_lock);
3088}
3089
3090/*
3091 * Get (and clear) the dirty memory log for a memory slot.
3092 */ 3109 */
3093int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 3110int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3094 struct kvm_dirty_log *log)
3095{ 3111{
3096 int r; 3112 int r;
3097 struct kvm_memory_slot *memslot; 3113 struct kvm_memory_slot *memslot;
3098 unsigned long n, nr_dirty_pages; 3114 unsigned long n, i;
3115 unsigned long *dirty_bitmap;
3116 unsigned long *dirty_bitmap_buffer;
3117 bool is_dirty = false;
3099 3118
3100 mutex_lock(&kvm->slots_lock); 3119 mutex_lock(&kvm->slots_lock);
3101 3120
@@ -3104,49 +3123,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3104 goto out; 3123 goto out;
3105 3124
3106 memslot = id_to_memslot(kvm->memslots, log->slot); 3125 memslot = id_to_memslot(kvm->memslots, log->slot);
3126
3127 dirty_bitmap = memslot->dirty_bitmap;
3107 r = -ENOENT; 3128 r = -ENOENT;
3108 if (!memslot->dirty_bitmap) 3129 if (!dirty_bitmap)
3109 goto out; 3130 goto out;
3110 3131
3111 n = kvm_dirty_bitmap_bytes(memslot); 3132 n = kvm_dirty_bitmap_bytes(memslot);
3112 nr_dirty_pages = memslot->nr_dirty_pages;
3113 3133
3114 /* If nothing is dirty, don't bother messing with page tables. */ 3134 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
3115 if (nr_dirty_pages) { 3135 memset(dirty_bitmap_buffer, 0, n);
3116 struct kvm_memslots *slots, *old_slots;
3117 unsigned long *dirty_bitmap, *dirty_bitmap_head;
3118 3136
3119 dirty_bitmap = memslot->dirty_bitmap; 3137 spin_lock(&kvm->mmu_lock);
3120 dirty_bitmap_head = memslot->dirty_bitmap_head;
3121 if (dirty_bitmap == dirty_bitmap_head)
3122 dirty_bitmap_head += n / sizeof(long);
3123 memset(dirty_bitmap_head, 0, n);
3124 3138
3125 r = -ENOMEM; 3139 for (i = 0; i < n / sizeof(long); i++) {
3126 slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL); 3140 unsigned long mask;
3127 if (!slots) 3141 gfn_t offset;
3128 goto out;
3129 3142
3130 memslot = id_to_memslot(slots, log->slot); 3143 if (!dirty_bitmap[i])
3131 memslot->nr_dirty_pages = 0; 3144 continue;
3132 memslot->dirty_bitmap = dirty_bitmap_head;
3133 update_memslots(slots, NULL);
3134 3145
3135 old_slots = kvm->memslots; 3146 is_dirty = true;
3136 rcu_assign_pointer(kvm->memslots, slots);
3137 synchronize_srcu_expedited(&kvm->srcu);
3138 kfree(old_slots);
3139 3147
3140 write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); 3148 mask = xchg(&dirty_bitmap[i], 0);
3149 dirty_bitmap_buffer[i] = mask;
3141 3150
3142 r = -EFAULT; 3151 offset = i * BITS_PER_LONG;
3143 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) 3152 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
3144 goto out;
3145 } else {
3146 r = -EFAULT;
3147 if (clear_user(log->dirty_bitmap, n))
3148 goto out;
3149 } 3153 }
3154 if (is_dirty)
3155 kvm_flush_remote_tlbs(kvm);
3156
3157 spin_unlock(&kvm->mmu_lock);
3158
3159 r = -EFAULT;
3160 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
3161 goto out;
3150 3162
3151 r = 0; 3163 r = 0;
3152out: 3164out:
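
The rewritten loop implements the ordering described in the comment above: snapshot-and-clear each bitmap word atomically, write-protect the pages named by the snapshot, then copy the snapshot out. A standalone model of the snapshot step, using a GCC atomic builtin where the kernel uses xchg():

#include <stdio.h>

#define NWORDS 2

int main(void)
{
	unsigned long dirty[NWORDS] = { 0x5, 0x0 };	/* live bitmap, written by vcpus */
	unsigned long snapshot[NWORDS] = { 0 };

	for (int i = 0; i < NWORDS; i++) {
		if (!dirty[i])
			continue;
		/* xchg(&dirty[i], 0) in the kernel; atomic w.r.t. concurrent setters */
		snapshot[i] = __atomic_exchange_n(&dirty[i], 0, __ATOMIC_SEQ_CST);
		/* here the kernel write-protects the pages named by snapshot[i] */
	}
	printf("%#lx %#lx\n", snapshot[0], dirty[0]);	/* snapshot holds 0x5, live word is clear */
	return 0;
}
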
@@ -3728,9 +3740,8 @@ struct read_write_emulator_ops {
3728static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) 3740static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
3729{ 3741{
3730 if (vcpu->mmio_read_completed) { 3742 if (vcpu->mmio_read_completed) {
3731 memcpy(val, vcpu->mmio_data, bytes);
3732 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 3743 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
3733 vcpu->mmio_phys_addr, *(u64 *)val); 3744 vcpu->mmio_fragments[0].gpa, *(u64 *)val);
3734 vcpu->mmio_read_completed = 0; 3745 vcpu->mmio_read_completed = 0;
3735 return 1; 3746 return 1;
3736 } 3747 }
@@ -3766,8 +3777,9 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
3766static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, 3777static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
3767 void *val, int bytes) 3778 void *val, int bytes)
3768{ 3779{
3769 memcpy(vcpu->mmio_data, val, bytes); 3780 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
3770 memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); 3781
3782 memcpy(vcpu->run->mmio.data, frag->data, frag->len);
3771 return X86EMUL_CONTINUE; 3783 return X86EMUL_CONTINUE;
3772} 3784}
3773 3785
@@ -3794,10 +3806,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
3794 gpa_t gpa; 3806 gpa_t gpa;
3795 int handled, ret; 3807 int handled, ret;
3796 bool write = ops->write; 3808 bool write = ops->write;
3797 3809 struct kvm_mmio_fragment *frag;
3798 if (ops->read_write_prepare &&
3799 ops->read_write_prepare(vcpu, val, bytes))
3800 return X86EMUL_CONTINUE;
3801 3810
3802 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); 3811 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
3803 3812
@@ -3823,15 +3832,19 @@ mmio:
3823 bytes -= handled; 3832 bytes -= handled;
3824 val += handled; 3833 val += handled;
3825 3834
3826 vcpu->mmio_needed = 1; 3835 while (bytes) {
3827 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3836 unsigned now = min(bytes, 8U);
3828 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3829 vcpu->mmio_size = bytes;
3830 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
3831 vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
3832 vcpu->mmio_index = 0;
3833 3837
3834 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); 3838 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
3839 frag->gpa = gpa;
3840 frag->data = val;
3841 frag->len = now;
3842
3843 gpa += now;
3844 val += now;
3845 bytes -= now;
3846 }
3847 return X86EMUL_CONTINUE;
3835} 3848}
3836 3849
3837int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, 3850int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
@@ -3840,10 +3853,18 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
3840 struct read_write_emulator_ops *ops) 3853 struct read_write_emulator_ops *ops)
3841{ 3854{
3842 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3855 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3856 gpa_t gpa;
3857 int rc;
3858
3859 if (ops->read_write_prepare &&
3860 ops->read_write_prepare(vcpu, val, bytes))
3861 return X86EMUL_CONTINUE;
3862
3863 vcpu->mmio_nr_fragments = 0;
3843 3864
3844 /* Crossing a page boundary? */ 3865 /* Crossing a page boundary? */
3845 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3866 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
3846 int rc, now; 3867 int now;
3847 3868
3848 now = -addr & ~PAGE_MASK; 3869 now = -addr & ~PAGE_MASK;
3849 rc = emulator_read_write_onepage(addr, val, now, exception, 3870 rc = emulator_read_write_onepage(addr, val, now, exception,
@@ -3856,8 +3877,25 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
3856 bytes -= now; 3877 bytes -= now;
3857 } 3878 }
3858 3879
3859 return emulator_read_write_onepage(addr, val, bytes, exception, 3880 rc = emulator_read_write_onepage(addr, val, bytes, exception,
3860 vcpu, ops); 3881 vcpu, ops);
3882 if (rc != X86EMUL_CONTINUE)
3883 return rc;
3884
3885 if (!vcpu->mmio_nr_fragments)
3886 return rc;
3887
3888 gpa = vcpu->mmio_fragments[0].gpa;
3889
3890 vcpu->mmio_needed = 1;
3891 vcpu->mmio_cur_fragment = 0;
3892
3893 vcpu->run->mmio.len = vcpu->mmio_fragments[0].len;
3894 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
3895 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3896 vcpu->run->mmio.phys_addr = gpa;
3897
3898 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
3861} 3899}
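
emulator_read_write() now records the whole access as a list of at-most-8-byte fragments up front and reports only the first one to userspace; complete_mmio() walks the rest. A standalone sketch of the fragment-building loop:

#include <stdint.h>
#include <stdio.h>

struct frag { uint64_t gpa; unsigned len; };

int main(void)
{
	uint64_t gpa = 0xfee00000;	/* example address */
	unsigned bytes = 20, n = 0;
	struct frag frags[8];

	while (bytes) {
		unsigned now = bytes < 8 ? bytes : 8;

		frags[n++] = (struct frag){ .gpa = gpa, .len = now };
		gpa += now;
		bytes -= now;
	}
	for (unsigned i = 0; i < n; i++)	/* prints fragments of 8, 8 and 4 bytes */
		printf("frag %u: gpa %#llx len %u\n", i,
		       (unsigned long long)frags[i].gpa, frags[i].len);
	return 0;
}
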
3862 3900
3863static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, 3901static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -4100,7 +4138,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
4100 value = kvm_get_cr8(vcpu); 4138 value = kvm_get_cr8(vcpu);
4101 break; 4139 break;
4102 default: 4140 default:
4103 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4141 kvm_err("%s: unexpected cr %u\n", __func__, cr);
4104 return 0; 4142 return 0;
4105 } 4143 }
4106 4144
@@ -4129,7 +4167,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4129 res = kvm_set_cr8(vcpu, val); 4167 res = kvm_set_cr8(vcpu, val);
4130 break; 4168 break;
4131 default: 4169 default:
4132 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4170 kvm_err("%s: unexpected cr %u\n", __func__, cr);
4133 res = -1; 4171 res = -1;
4134 } 4172 }
4135 4173
@@ -4281,26 +4319,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
4281 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); 4319 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
4282} 4320}
4283 4321
4284static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, 4322static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
4285 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) 4323 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
4286{ 4324{
4287 struct kvm_cpuid_entry2 *cpuid = NULL; 4325 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
4288
4289 if (eax && ecx)
4290 cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
4291 *eax, *ecx);
4292
4293 if (cpuid) {
4294 *eax = cpuid->eax;
4295 *ecx = cpuid->ecx;
4296 if (ebx)
4297 *ebx = cpuid->ebx;
4298 if (edx)
4299 *edx = cpuid->edx;
4300 return true;
4301 }
4302
4303 return false;
4304} 4326}
4305 4327
4306static struct x86_emulate_ops emulate_ops = { 4328static struct x86_emulate_ops emulate_ops = {
@@ -5263,10 +5285,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5263 kvm_deliver_pmi(vcpu); 5285 kvm_deliver_pmi(vcpu);
5264 } 5286 }
5265 5287
5266 r = kvm_mmu_reload(vcpu);
5267 if (unlikely(r))
5268 goto out;
5269
5270 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5288 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5271 inject_pending_event(vcpu); 5289 inject_pending_event(vcpu);
5272 5290
@@ -5282,6 +5300,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5282 } 5300 }
5283 } 5301 }
5284 5302
5303 r = kvm_mmu_reload(vcpu);
 5304	if (unlikely(r))
 5305		goto cancel_injection;
5307
5285 preempt_disable(); 5308 preempt_disable();
5286 5309
5287 kvm_x86_ops->prepare_guest_switch(vcpu); 5310 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -5304,9 +5327,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5304 smp_wmb(); 5327 smp_wmb();
5305 local_irq_enable(); 5328 local_irq_enable();
5306 preempt_enable(); 5329 preempt_enable();
5307 kvm_x86_ops->cancel_injection(vcpu);
5308 r = 1; 5330 r = 1;
5309 goto out; 5331 goto cancel_injection;
5310 } 5332 }
5311 5333
5312 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5334 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -5370,9 +5392,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5370 if (unlikely(vcpu->arch.tsc_always_catchup)) 5392 if (unlikely(vcpu->arch.tsc_always_catchup))
5371 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 5393 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5372 5394
5373 kvm_lapic_sync_from_vapic(vcpu); 5395 if (vcpu->arch.apic_attention)
5396 kvm_lapic_sync_from_vapic(vcpu);
5374 5397
5375 r = kvm_x86_ops->handle_exit(vcpu); 5398 r = kvm_x86_ops->handle_exit(vcpu);
5399 return r;
5400
5401cancel_injection:
5402 kvm_x86_ops->cancel_injection(vcpu);
5403 if (unlikely(vcpu->arch.apic_attention))
5404 kvm_lapic_sync_from_vapic(vcpu);
5376out: 5405out:
5377 return r; 5406 return r;
5378} 5407}
@@ -5456,33 +5485,55 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5456 return r; 5485 return r;
5457} 5486}
5458 5487
5488/*
5489 * Implements the following, as a state machine:
5490 *
5491 * read:
5492 * for each fragment
5493 * write gpa, len
5494 * exit
5495 * copy data
5496 * execute insn
5497 *
5498 * write:
5499 * for each fragment
5500 * write gpa, len
5501 * copy data
5502 * exit
5503 */
5459static int complete_mmio(struct kvm_vcpu *vcpu) 5504static int complete_mmio(struct kvm_vcpu *vcpu)
5460{ 5505{
5461 struct kvm_run *run = vcpu->run; 5506 struct kvm_run *run = vcpu->run;
5507 struct kvm_mmio_fragment *frag;
5462 int r; 5508 int r;
5463 5509
5464 if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) 5510 if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
5465 return 1; 5511 return 1;
5466 5512
5467 if (vcpu->mmio_needed) { 5513 if (vcpu->mmio_needed) {
5468 vcpu->mmio_needed = 0; 5514 /* Complete previous fragment */
5515 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
5469 if (!vcpu->mmio_is_write) 5516 if (!vcpu->mmio_is_write)
5470 memcpy(vcpu->mmio_data + vcpu->mmio_index, 5517 memcpy(frag->data, run->mmio.data, frag->len);
5471 run->mmio.data, 8); 5518 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
5472 vcpu->mmio_index += 8; 5519 vcpu->mmio_needed = 0;
5473 if (vcpu->mmio_index < vcpu->mmio_size) { 5520 if (vcpu->mmio_is_write)
5474 run->exit_reason = KVM_EXIT_MMIO; 5521 return 1;
5475 run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; 5522 vcpu->mmio_read_completed = 1;
5476 memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); 5523 goto done;
5477 run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
5478 run->mmio.is_write = vcpu->mmio_is_write;
5479 vcpu->mmio_needed = 1;
5480 return 0;
5481 } 5524 }
5525 /* Initiate next fragment */
5526 ++frag;
5527 run->exit_reason = KVM_EXIT_MMIO;
5528 run->mmio.phys_addr = frag->gpa;
5482 if (vcpu->mmio_is_write) 5529 if (vcpu->mmio_is_write)
5483 return 1; 5530 memcpy(run->mmio.data, frag->data, frag->len);
5484 vcpu->mmio_read_completed = 1; 5531 run->mmio.len = frag->len;
5532 run->mmio.is_write = vcpu->mmio_is_write;
5533 return 0;
5534
5485 } 5535 }
5536done:
5486 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5537 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5487 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); 5538 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5488 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5539 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -6264,7 +6315,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
6264 6315
6265 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6316 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6266 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { 6317 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
6267 vfree(free->arch.lpage_info[i]); 6318 kvm_kvfree(free->arch.lpage_info[i]);
6268 free->arch.lpage_info[i] = NULL; 6319 free->arch.lpage_info[i] = NULL;
6269 } 6320 }
6270 } 6321 }
@@ -6283,7 +6334,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6283 slot->base_gfn, level) + 1; 6334 slot->base_gfn, level) + 1;
6284 6335
6285 slot->arch.lpage_info[i] = 6336 slot->arch.lpage_info[i] =
6286 vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); 6337 kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
6287 if (!slot->arch.lpage_info[i]) 6338 if (!slot->arch.lpage_info[i])
6288 goto out_free; 6339 goto out_free;
6289 6340
@@ -6310,7 +6361,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6310 6361
6311out_free: 6362out_free:
6312 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6363 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6313 vfree(slot->arch.lpage_info[i]); 6364 kvm_kvfree(slot->arch.lpage_info[i]);
6314 slot->arch.lpage_info[i] = NULL; 6365 slot->arch.lpage_info[i] = NULL;
6315 } 6366 }
6316 return -ENOMEM; 6367 return -ENOMEM;
@@ -6399,21 +6450,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6399 kvm_cpu_has_interrupt(vcpu)); 6450 kvm_cpu_has_interrupt(vcpu));
6400} 6451}
6401 6452
6402void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 6453int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
6403{ 6454{
6404 int me; 6455 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
6405 int cpu = vcpu->cpu;
6406
6407 if (waitqueue_active(&vcpu->wq)) {
6408 wake_up_interruptible(&vcpu->wq);
6409 ++vcpu->stat.halt_wakeup;
6410 }
6411
6412 me = get_cpu();
6413 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
6414 if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
6415 smp_send_reschedule(cpu);
6416 put_cpu();
6417} 6456}
6418 6457
6419int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 6458int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
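
With the wakeup and cross-CPU IPI logic hoisted out of arch code, the arch
hook only has to answer "does this vCPU need a reschedule IPI?". The generic
kicker that consumes it presumably ends up looking like this (a sketch of the
common code, with kvm_arch_vcpu_wq() assumed as the per-arch waitqueue
accessor):

	void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
	{
		int me, cpu = vcpu->cpu;
		wait_queue_head_t *wqp = kvm_arch_vcpu_wq(vcpu);

		if (waitqueue_active(wqp)) {
			wake_up_interruptible(wqp);
			++vcpu->stat.halt_wakeup;
		}

		me = get_cpu();
		if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
			if (kvm_arch_vcpu_should_kick(vcpu))
				smp_send_reschedule(cpu);
		put_cpu();
	}
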
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cb80c293cdd..3d1134ddb88 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -64,7 +64,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu)
64 64
65static inline int is_paging(struct kvm_vcpu *vcpu) 65static inline int is_paging(struct kvm_vcpu *vcpu)
66{ 66{
67 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 67 return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
68} 68}
69 69
70static inline u32 bit(int bitno) 70static inline u32 bit(int bitno)
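
Wrapping the CR0.PG test in likely() is purely a branch-layout hint for the
compiler; in the kernel it expands to:

	/* from <linux/compiler.h>; !! normalizes the value to 0/1 */
	#define likely(x)	__builtin_expect(!!(x), 1)
	#define unlikely(x)	__builtin_expect(!!(x), 0)

Guests run with paging enabled almost all of the time, so the hint keeps the
is_paging()-true case on the straight-line code path.
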
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 459b58a8a15..25b7ae8d058 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -115,7 +115,7 @@ EXPORT_SYMBOL(csum_partial_copy_to_user);
115 * @src: source address 115 * @src: source address
116 * @dst: destination address 116 * @dst: destination address
117 * @len: number of bytes to be copied. 117 * @len: number of bytes to be copied.
118 * @isum: initial sum that is added into the result (32bit unfolded) 118 * @sum: initial sum that is added into the result (32bit unfolded)
119 * 119 *
120 * Returns an 32bit unfolded checksum of the buffer. 120 * Returns an 32bit unfolded checksum of the buffer.
121 */ 121 */
diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c
index a311cc59b65..8d6ef78b5d0 100644
--- a/arch/x86/lib/msr-reg-export.c
+++ b/arch/x86/lib/msr-reg-export.c
@@ -1,5 +1,5 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <asm/msr.h> 2#include <asm/msr.h>
3 3
4EXPORT_SYMBOL(native_rdmsr_safe_regs); 4EXPORT_SYMBOL(rdmsr_safe_regs);
5EXPORT_SYMBOL(native_wrmsr_safe_regs); 5EXPORT_SYMBOL(wrmsr_safe_regs);
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index 69fa10623f2..f6d13eefad1 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -6,13 +6,13 @@
6 6
7#ifdef CONFIG_X86_64 7#ifdef CONFIG_X86_64
8/* 8/*
9 * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]); 9 * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]);
10 * 10 *
11 * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] 11 * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi]
12 * 12 *
13 */ 13 */
14.macro op_safe_regs op 14.macro op_safe_regs op
15ENTRY(native_\op\()_safe_regs) 15ENTRY(\op\()_safe_regs)
16 CFI_STARTPROC 16 CFI_STARTPROC
17 pushq_cfi %rbx 17 pushq_cfi %rbx
18 pushq_cfi %rbp 18 pushq_cfi %rbp
@@ -45,13 +45,13 @@ ENTRY(native_\op\()_safe_regs)
45 45
46 _ASM_EXTABLE(1b, 3b) 46 _ASM_EXTABLE(1b, 3b)
47 CFI_ENDPROC 47 CFI_ENDPROC
48ENDPROC(native_\op\()_safe_regs) 48ENDPROC(\op\()_safe_regs)
49.endm 49.endm
50 50
51#else /* X86_32 */ 51#else /* X86_32 */
52 52
53.macro op_safe_regs op 53.macro op_safe_regs op
54ENTRY(native_\op\()_safe_regs) 54ENTRY(\op\()_safe_regs)
55 CFI_STARTPROC 55 CFI_STARTPROC
56 pushl_cfi %ebx 56 pushl_cfi %ebx
57 pushl_cfi %ebp 57 pushl_cfi %ebp
@@ -92,7 +92,7 @@ ENTRY(native_\op\()_safe_regs)
92 92
93 _ASM_EXTABLE(1b, 3b) 93 _ASM_EXTABLE(1b, 3b)
94 CFI_ENDPROC 94 CFI_ENDPROC
95ENDPROC(native_\op\()_safe_regs) 95ENDPROC(\op\()_safe_regs)
96.endm 96.endm
97 97
98#endif 98#endif
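
With the native_ prefix gone these are the canonical entry points rather than
paravirt backends. Callers pass all eight 32-bit GPRs in the documented order;
a hedged usage sketch (the MSR chosen here is just an example):

	static u64 read_hwcr(void)
	{
		/* gprs[] = { eax, ecx, edx, ebx, esp, ebp, esi, edi } */
		u32 gprs[8] = { 0 };

		gprs[1] = MSR_K7_HWCR;		/* ecx selects the MSR */
		if (rdmsr_safe_regs(gprs))	/* nonzero: access faulted */
			return 0;
		return ((u64)gprs[2] << 32) | gprs[0];	/* edx:eax */
	}
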
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index 2e4e4b02c37..4f74d94c8d9 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9 9
10#include <asm/word-at-a-time.h> 10#include <asm/word-at-a-time.h>
11#include <linux/sched.h>
11 12
12/* 13/*
13 * best effort, GUP based copy_from_user() that is NMI-safe 14 * best effort, GUP based copy_from_user() that is NMI-safe
@@ -21,6 +22,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
21 void *map; 22 void *map;
22 int ret; 23 int ret;
23 24
25 if (__range_not_ok(from, n, TASK_SIZE))
26 return len;
27
24 do { 28 do {
25 ret = __get_user_pages_fast(addr, 1, 0, &page); 29 ret = __get_user_pages_fast(addr, 1, 0, &page);
26 if (!ret) 30 if (!ret)
@@ -43,100 +47,3 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
43 return len; 47 return len;
44} 48}
45EXPORT_SYMBOL_GPL(copy_from_user_nmi); 49EXPORT_SYMBOL_GPL(copy_from_user_nmi);
46
47/*
48 * Do a strncpy, return length of string without final '\0'.
49 * 'count' is the user-supplied count (return 'count' if we
50 * hit it), 'max' is the address space maximum (and we return
51 * -EFAULT if we hit it).
52 */
53static inline long do_strncpy_from_user(char *dst, const char __user *src, long count, unsigned long max)
54{
55 long res = 0;
56
57 /*
58 * Truncate 'max' to the user-specified limit, so that
59 * we only have one limit we need to check in the loop
60 */
61 if (max > count)
62 max = count;
63
64 while (max >= sizeof(unsigned long)) {
65 unsigned long c, mask;
66
67 /* Fall back to byte-at-a-time if we get a page fault */
68 if (unlikely(__get_user(c,(unsigned long __user *)(src+res))))
69 break;
70 mask = has_zero(c);
71 if (mask) {
72 mask = (mask - 1) & ~mask;
73 mask >>= 7;
74 *(unsigned long *)(dst+res) = c & mask;
75 return res + count_masked_bytes(mask);
76 }
77 *(unsigned long *)(dst+res) = c;
78 res += sizeof(unsigned long);
79 max -= sizeof(unsigned long);
80 }
81
82 while (max) {
83 char c;
84
85 if (unlikely(__get_user(c,src+res)))
86 return -EFAULT;
87 dst[res] = c;
88 if (!c)
89 return res;
90 res++;
91 max--;
92 }
93
94 /*
95 * Uhhuh. We hit 'max'. But was that the user-specified maximum
96 * too? If so, that's ok - we got as much as the user asked for.
97 */
98 if (res >= count)
99 return res;
100
101 /*
102 * Nope: we hit the address space limit, and we still had more
103 * characters the caller would have wanted. That's an EFAULT.
104 */
105 return -EFAULT;
106}
107
108/**
109 * strncpy_from_user: - Copy a NUL terminated string from userspace.
110 * @dst: Destination address, in kernel space. This buffer must be at
111 * least @count bytes long.
112 * @src: Source address, in user space.
113 * @count: Maximum number of bytes to copy, including the trailing NUL.
114 *
115 * Copies a NUL-terminated string from userspace to kernel space.
116 *
117 * On success, returns the length of the string (not including the trailing
118 * NUL).
119 *
120 * If access to userspace fails, returns -EFAULT (some data may have been
121 * copied).
122 *
123 * If @count is smaller than the length of the string, copies @count bytes
124 * and returns @count.
125 */
126long
127strncpy_from_user(char *dst, const char __user *src, long count)
128{
129 unsigned long max_addr, src_addr;
130
131 if (unlikely(count <= 0))
132 return 0;
133
134 max_addr = current_thread_info()->addr_limit.seg;
135 src_addr = (unsigned long)src;
136 if (likely(src_addr < max_addr)) {
137 unsigned long max = max_addr - src_addr;
138 return do_strncpy_from_user(dst, src, count, max);
139 }
140 return -EFAULT;
141}
142EXPORT_SYMBOL(strncpy_from_user);
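
The deleted strncpy_from_user() is consolidated into generic lib/ code in the
same series rather than dropped. Its word-at-a-time core leans on two helpers
from <asm/word-at-a-time.h>; a 64-bit little-endian sketch of the trick:

	#define REPEAT_BYTE(x)	((~0ul / 0xff) * (x))	/* 0x0101...01 * x */

	static inline unsigned long has_zero(unsigned long v)
	{
		/* sets the high bit of every byte of v that was zero */
		return (v - REPEAT_BYTE(0x01)) & ~v & REPEAT_BYTE(0x80);
	}

	static inline long count_masked_bytes(unsigned long mask)
	{
		/* after the (mask - 1) & ~mask massage above, every byte
		 * before the NUL is 0xff; the multiply sums them into the
		 * top byte, yielding the NUL's byte index */
		return mask * 0x0001020304050608ul >> 56;
	}
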
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 883b216c60b..1781b2f950e 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -95,47 +95,6 @@ __clear_user(void __user *to, unsigned long n)
95} 95}
96EXPORT_SYMBOL(__clear_user); 96EXPORT_SYMBOL(__clear_user);
97 97
98/**
99 * strnlen_user: - Get the size of a string in user space.
100 * @s: The string to measure.
101 * @n: The maximum valid length
102 *
103 * Get the size of a NUL-terminated string in user space.
104 *
105 * Returns the size of the string INCLUDING the terminating NUL.
106 * On exception, returns 0.
107 * If the string is too long, returns a value greater than @n.
108 */
109long strnlen_user(const char __user *s, long n)
110{
111 unsigned long mask = -__addr_ok(s);
112 unsigned long res, tmp;
113
114 might_fault();
115
116 __asm__ __volatile__(
117 " testl %0, %0\n"
118 " jz 3f\n"
119 " andl %0,%%ecx\n"
120 "0: repne; scasb\n"
121 " setne %%al\n"
122 " subl %%ecx,%0\n"
123 " addl %0,%%eax\n"
124 "1:\n"
125 ".section .fixup,\"ax\"\n"
126 "2: xorl %%eax,%%eax\n"
127 " jmp 1b\n"
128 "3: movb $1,%%al\n"
129 " jmp 1b\n"
130 ".previous\n"
131 _ASM_EXTABLE(0b,2b)
132 :"=&r" (n), "=&D" (s), "=&a" (res), "=&c" (tmp)
133 :"0" (n), "1" (s), "2" (0), "3" (mask)
134 :"cc");
135 return res & mask;
136}
137EXPORT_SYMBOL(strnlen_user);
138
139#ifdef CONFIG_X86_INTEL_USERCOPY 98#ifdef CONFIG_X86_INTEL_USERCOPY
140static unsigned long 99static unsigned long
141__copy_user_intel(void __user *to, const void *from, unsigned long size) 100__copy_user_intel(void __user *to, const void *from, unsigned long size)
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0d0326f388c..e5b130bc2d0 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -52,54 +52,6 @@ unsigned long clear_user(void __user *to, unsigned long n)
52} 52}
53EXPORT_SYMBOL(clear_user); 53EXPORT_SYMBOL(clear_user);
54 54
55/*
56 * Return the size of a string (including the ending 0)
57 *
58 * Return 0 on exception, a value greater than N if too long
59 */
60
61long __strnlen_user(const char __user *s, long n)
62{
63 long res = 0;
64 char c;
65
66 while (1) {
67 if (res>n)
68 return n+1;
69 if (__get_user(c, s))
70 return 0;
71 if (!c)
72 return res+1;
73 res++;
74 s++;
75 }
76}
77EXPORT_SYMBOL(__strnlen_user);
78
79long strnlen_user(const char __user *s, long n)
80{
81 if (!access_ok(VERIFY_READ, s, 1))
82 return 0;
83 return __strnlen_user(s, n);
84}
85EXPORT_SYMBOL(strnlen_user);
86
87long strlen_user(const char __user *s)
88{
89 long res = 0;
90 char c;
91
92 for (;;) {
93 if (get_user(c, s))
94 return 0;
95 if (!c)
96 return res+1;
97 res++;
98 s++;
99 }
100}
101EXPORT_SYMBOL(strlen_user);
102
103unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) 55unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
104{ 56{
105 if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { 57 if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) {
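
Both the 32-bit inline-asm strnlen_user() and this 64-bit byte loop give way
to a single generic implementation (lib/strnlen_user.c in the same series)
built on the same word-at-a-time machinery. The contract is unchanged; a
byte-wise behavioural sketch (the real code is word-at-a-time):

	long strnlen_user_sketch(const char __user *s, long n)
	{
		long res;
		char c;

		for (res = 0; res < n; res++, s++) {
			if (get_user(c, s))
				return 0;	/* fault */
			if (!c)
				return res + 1;	/* length incl. the NUL */
		}
		return n + 1;			/* longer than n */
	}
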
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 81913790442..5d7e51f3fd2 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -28,7 +28,7 @@
28# - (66): the last prefix is 0x66 28# - (66): the last prefix is 0x66
29# - (F3): the last prefix is 0xF3 29# - (F3): the last prefix is 0xF3
30# - (F2): the last prefix is 0xF2 30# - (F2): the last prefix is 0xF2
31# 31# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
32 32
33Table: one byte opcode 33Table: one byte opcode
34Referrer: 34Referrer:
@@ -515,12 +515,12 @@ b4: LFS Gv,Mp
515b5: LGS Gv,Mp 515b5: LGS Gv,Mp
516b6: MOVZX Gv,Eb 516b6: MOVZX Gv,Eb
517b7: MOVZX Gv,Ew 517b7: MOVZX Gv,Ew
518b8: JMPE | POPCNT Gv,Ev (F3) 518b8: JMPE (!F3) | POPCNT Gv,Ev (F3)
519b9: Grp10 (1A) 519b9: Grp10 (1A)
520ba: Grp8 Ev,Ib (1A) 520ba: Grp8 Ev,Ib (1A)
521bb: BTC Ev,Gv 521bb: BTC Ev,Gv
522bc: BSF Gv,Ev | TZCNT Gv,Ev (F3) 522bc: BSF Gv,Ev (!F3) | TZCNT Gv,Ev (F3)
523bd: BSR Gv,Ev | LZCNT Gv,Ev (F3) 523bd: BSR Gv,Ev (!F3) | LZCNT Gv,Ev (F3)
524be: MOVSX Gv,Eb 524be: MOVSX Gv,Eb
525bf: MOVSX Gv,Ew 525bf: MOVSX Gv,Ew
526# 0x0f 0xc0-0xcf 526# 0x0f 0xc0-0xcf
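
The (!F3) annotations matter to the instruction-decoder generator: 0f b8/bc/bd
decode to different mnemonics depending on whether the last legacy prefix is
0xF3 (POPCNT/TZCNT/LZCNT) or anything else (JMPE/BSF/BSR). In decoder terms,
roughly (a hypothetical fragment, not the generated table code):

	static const char *mnemonic_0f_bc(u8 last_legacy_prefix)
	{
		/* (F3) -> TZCNT Gv,Ev ; (!F3) -> BSF Gv,Ev */
		return last_legacy_prefix == 0xF3 ? "TZCNT" : "BSF";
	}
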
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 319b6f2fb8b..e0e6990723e 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -62,7 +62,8 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en
62 extra += PMD_SIZE; 62 extra += PMD_SIZE;
63#endif 63#endif
64 /* The first 2/4M doesn't use large pages. */ 64 /* The first 2/4M doesn't use large pages. */
65 extra += mr->end - mr->start; 65 if (mr->start < PMD_SIZE)
66 extra += mr->end - mr->start;
66 67
67 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; 68 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
68 } else 69 } else
@@ -84,8 +85,9 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en
84 pgt_buf_end = pgt_buf_start; 85 pgt_buf_end = pgt_buf_start;
85 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); 86 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
86 87
87 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", 88 printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n",
88 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); 89 end - 1, pgt_buf_start << PAGE_SHIFT,
90 (pgt_buf_top << PAGE_SHIFT) - 1);
89} 91}
90 92
91void __init native_pagetable_reserve(u64 start, u64 end) 93void __init native_pagetable_reserve(u64 start, u64 end)
@@ -132,7 +134,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
132 int nr_range, i; 134 int nr_range, i;
133 int use_pse, use_gbpages; 135 int use_pse, use_gbpages;
134 136
135 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); 137 printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
138 start, end - 1);
136 139
137#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) 140#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
138 /* 141 /*
@@ -251,8 +254,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
251 } 254 }
252 255
253 for (i = 0; i < nr_range; i++) 256 for (i = 0; i < nr_range; i++)
254 printk(KERN_DEBUG " %010lx - %010lx page %s\n", 257 printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
255 mr[i].start, mr[i].end, 258 mr[i].start, mr[i].end - 1,
256 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( 259 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
257 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); 260 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
258 261
@@ -350,8 +353,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
350 * create a kernel page fault: 353 * create a kernel page fault:
351 */ 354 */
352#ifdef CONFIG_DEBUG_PAGEALLOC 355#ifdef CONFIG_DEBUG_PAGEALLOC
353 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", 356 printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
354 begin, end); 357 begin, end - 1);
355 set_memory_np(begin, (end - begin) >> PAGE_SHIFT); 358 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
356#else 359#else
357 /* 360 /*
@@ -382,7 +385,7 @@ void free_initmem(void)
382} 385}
383 386
384#ifdef CONFIG_BLK_DEV_INITRD 387#ifdef CONFIG_BLK_DEV_INITRD
385void free_initrd_mem(unsigned long start, unsigned long end) 388void __init free_initrd_mem(unsigned long start, unsigned long end)
386{ 389{
387 /* 390 /*
388 * end may not be aligned, and we cannot align it, 391
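
Most hunks in this file convert half-open [start, end) internals to the
inclusive [mem %#010lx-%#010lx] convention used elsewhere in the tree, hence
the recurring "end - 1". With illustrative values:

	/* mapping 0x00100000 .. 0x01000000 (exclusive end) */
	printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
	       0x00100000UL, 0x01000000UL - 1);
	/* prints: init_memory_mapping: [mem 0x00100000-0x00ffffff] */
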
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index be1ef574ce9..78fe3f1ac49 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -180,7 +180,7 @@ err_free_memtype:
180 180
181/** 181/**
182 * ioremap_nocache - map bus memory into CPU space 182 * ioremap_nocache - map bus memory into CPU space
183 * @offset: bus address of the memory 183 * @phys_addr: bus address of the memory
184 * @size: size of the resource to map 184 * @size: size of the resource to map
185 * 185 *
186 * ioremap_nocache performs a platform specific sequence of operations to 186 * ioremap_nocache performs a platform specific sequence of operations to
@@ -217,7 +217,7 @@ EXPORT_SYMBOL(ioremap_nocache);
217 217
218/** 218/**
219 * ioremap_wc - map memory into CPU space write combined 219 * ioremap_wc - map memory into CPU space write combined
220 * @offset: bus address of the memory 220 * @phys_addr: bus address of the memory
221 * @size: size of the resource to map 221 * @size: size of the resource to map
222 * 222 *
223 * This version of ioremap ensures that the memory is marked write combining. 223 * This version of ioremap ensures that the memory is marked write combining.
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 19d3fa08b11..2d125be1bae 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -141,8 +141,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
141 141
142 /* whine about and ignore invalid blks */ 142 /* whine about and ignore invalid blks */
143 if (start > end || nid < 0 || nid >= MAX_NUMNODES) { 143 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
144 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", 144 pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
145 nid, start, end); 145 nid, start, end - 1);
146 return 0; 146 return 0;
147 } 147 }
148 148
@@ -210,8 +210,8 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
210 210
211 start = roundup(start, ZONE_ALIGN); 211 start = roundup(start, ZONE_ALIGN);
212 212
213 printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", 213 printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
214 nid, start, end); 214 nid, start, end - 1);
215 215
216 /* 216 /*
217 * Allocate node data. Try remap allocator first, node-local 217 * Allocate node data. Try remap allocator first, node-local
@@ -232,7 +232,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
232 } 232 }
233 233
234 /* report and initialize */ 234 /* report and initialize */
235 printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n", 235 printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]%s\n",
236 nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); 236 nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
237 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); 237 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
238 if (!remapped && tnid != nid) 238 if (!remapped && tnid != nid)
@@ -291,14 +291,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
291 */ 291 */
292 if (bi->end > bj->start && bi->start < bj->end) { 292 if (bi->end > bj->start && bi->start < bj->end) {
293 if (bi->nid != bj->nid) { 293 if (bi->nid != bj->nid) {
294 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", 294 pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
295 bi->nid, bi->start, bi->end, 295 bi->nid, bi->start, bi->end - 1,
296 bj->nid, bj->start, bj->end); 296 bj->nid, bj->start, bj->end - 1);
297 return -EINVAL; 297 return -EINVAL;
298 } 298 }
299 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", 299 pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
300 bi->nid, bi->start, bi->end, 300 bi->nid, bi->start, bi->end - 1,
301 bj->start, bj->end); 301 bj->start, bj->end - 1);
302 } 302 }
303 303
304 /* 304 /*
@@ -320,9 +320,9 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
320 } 320 }
321 if (k < mi->nr_blks) 321 if (k < mi->nr_blks)
322 continue; 322 continue;
323 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n", 323 printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
324 bi->nid, bi->start, bi->end, bj->start, bj->end, 324 bi->nid, bi->start, bi->end - 1, bj->start,
325 start, end); 325 bj->end - 1, start, end - 1);
326 bi->start = start; 326 bi->start = start;
327 bi->end = end; 327 bi->end = end;
328 numa_remove_memblk_from(j--, mi); 328 numa_remove_memblk_from(j--, mi);
@@ -616,8 +616,8 @@ static int __init dummy_numa_init(void)
616{ 616{
617 printk(KERN_INFO "%s\n", 617 printk(KERN_INFO "%s\n",
618 numa_off ? "NUMA turned off" : "No NUMA configuration found"); 618 numa_off ? "NUMA turned off" : "No NUMA configuration found");
619 printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n", 619 printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
620 0LLU, PFN_PHYS(max_pfn)); 620 0LLU, PFN_PHYS(max_pfn) - 1);
621 621
622 node_set(0, numa_nodes_parsed); 622 node_set(0, numa_nodes_parsed);
623 numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); 623 numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 871dd886817..dbbbb47260c 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -68,8 +68,8 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
68 numa_remove_memblk_from(phys_blk, pi); 68 numa_remove_memblk_from(phys_blk, pi);
69 } 69 }
70 70
71 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, 71 printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
72 eb->start, eb->end, (eb->end - eb->start) >> 20); 72 nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
73 return 0; 73 return 0;
74} 74}
75 75
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e1ebde31521..a718e0d2350 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -122,7 +122,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
122 122
123/** 123/**
124 * clflush_cache_range - flush a cache range with clflush 124 * clflush_cache_range - flush a cache range with clflush
125 * @addr: virtual start address 125 * @vaddr: virtual start address
126 * @size: number of bytes to flush 126 * @size: number of bytes to flush
127 * 127 *
128 * clflush is an unordered instruction which needs fencing with mfence 128 * clflush is an unordered instruction which needs fencing with mfence
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index f6ff57b7efa..3d68ef6d226 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -158,31 +158,47 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
158 return req_type; 158 return req_type;
159} 159}
160 160
161struct pagerange_state {
162 unsigned long cur_pfn;
163 int ram;
164 int not_ram;
165};
166
167static int
168pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
169{
170 struct pagerange_state *state = arg;
171
172 state->not_ram |= initial_pfn > state->cur_pfn;
173 state->ram |= total_nr_pages > 0;
174 state->cur_pfn = initial_pfn + total_nr_pages;
175
176 return state->ram && state->not_ram;
177}
178
161static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) 179static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
162{ 180{
163 int ram_page = 0, not_rampage = 0; 181 int ret = 0;
164 unsigned long page_nr; 182 unsigned long start_pfn = start >> PAGE_SHIFT;
183 unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
184 struct pagerange_state state = {start_pfn, 0, 0};
165 185
166 for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); 186 /*
167 ++page_nr) { 187 * For legacy reasons, physical address range in the legacy ISA
168 /* 188 * region is tracked as non-RAM. This will allow users of
169 * For legacy reasons, physical address range in the legacy ISA 189 * /dev/mem to map portions of legacy ISA region, even when
170 * region is tracked as non-RAM. This will allow users of 190 * some of those portions are listed(or not even listed) with
171 * /dev/mem to map portions of legacy ISA region, even when 191 * different e820 types(RAM/reserved/..)
172 * some of those portions are listed(or not even listed) with 192 */
173 * different e820 types(RAM/reserved/..) 193 if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
174 */ 194 start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
175 if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) && 195
176 page_is_ram(page_nr)) 196 if (start_pfn < end_pfn) {
177 ram_page = 1; 197 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
178 else 198 &state, pagerange_is_ram_callback);
179 not_rampage = 1;
180
181 if (ram_page == not_rampage)
182 return -1;
183 } 199 }
184 200
185 return ram_page; 201 return (ret > 0) ? -1 : (state.ram ? 1 : 0);
186} 202}
187 203
188/* 204/*
@@ -209,9 +225,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
209 page = pfn_to_page(pfn); 225 page = pfn_to_page(pfn);
210 type = get_page_memtype(page); 226 type = get_page_memtype(page);
211 if (type != -1) { 227 if (type != -1) {
212 printk(KERN_INFO "reserve_ram_pages_type failed " 228 printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n",
213 "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", 229 start, end - 1, type, req_type);
214 start, end, type, req_type);
215 if (new_type) 230 if (new_type)
216 *new_type = type; 231 *new_type = type;
217 232
@@ -314,9 +329,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
314 329
315 err = rbt_memtype_check_insert(new, new_type); 330 err = rbt_memtype_check_insert(new, new_type);
316 if (err) { 331 if (err) {
317 printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " 332 printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
318 "track %s, req %s\n", 333 start, end - 1,
319 start, end, cattr_name(new->type), cattr_name(req_type)); 334 cattr_name(new->type), cattr_name(req_type));
320 kfree(new); 335 kfree(new);
321 spin_unlock(&memtype_lock); 336 spin_unlock(&memtype_lock);
322 337
@@ -325,8 +340,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
325 340
326 spin_unlock(&memtype_lock); 341 spin_unlock(&memtype_lock);
327 342
328 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", 343 dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
329 start, end, cattr_name(new->type), cattr_name(req_type), 344 start, end - 1, cattr_name(new->type), cattr_name(req_type),
330 new_type ? cattr_name(*new_type) : "-"); 345 new_type ? cattr_name(*new_type) : "-");
331 346
332 return err; 347 return err;
@@ -360,14 +375,14 @@ int free_memtype(u64 start, u64 end)
360 spin_unlock(&memtype_lock); 375 spin_unlock(&memtype_lock);
361 376
362 if (!entry) { 377 if (!entry) {
363 printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", 378 printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
364 current->comm, current->pid, start, end); 379 current->comm, current->pid, start, end - 1);
365 return -EINVAL; 380 return -EINVAL;
366 } 381 }
367 382
368 kfree(entry); 383 kfree(entry);
369 384
370 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); 385 dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
371 386
372 return 0; 387 return 0;
373} 388}
@@ -491,9 +506,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
491 506
492 while (cursor < to) { 507 while (cursor < to) {
493 if (!devmem_is_allowed(pfn)) { 508 if (!devmem_is_allowed(pfn)) {
494 printk(KERN_INFO 509 printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n",
495 "Program %s tried to access /dev/mem between %Lx->%Lx.\n", 510 current->comm, from, to - 1);
496 current->comm, from, to);
497 return 0; 511 return 0;
498 } 512 }
499 cursor += PAGE_SIZE; 513 cursor += PAGE_SIZE;
@@ -554,12 +568,11 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
554 size; 568 size;
555 569
556 if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { 570 if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
557 printk(KERN_INFO 571 printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
558 "%s:%d ioremap_change_attr failed %s " 572 "for [mem %#010Lx-%#010Lx]\n",
559 "for %Lx-%Lx\n",
560 current->comm, current->pid, 573 current->comm, current->pid,
561 cattr_name(flags), 574 cattr_name(flags),
562 base, (unsigned long long)(base + size)); 575 base, (unsigned long long)(base + size-1));
563 return -EINVAL; 576 return -EINVAL;
564 } 577 }
565 return 0; 578 return 0;
@@ -591,12 +604,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
591 604
592 flags = lookup_memtype(paddr); 605 flags = lookup_memtype(paddr);
593 if (want_flags != flags) { 606 if (want_flags != flags) {
594 printk(KERN_WARNING 607 printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
595 "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
596 current->comm, current->pid, 608 current->comm, current->pid,
597 cattr_name(want_flags), 609 cattr_name(want_flags),
598 (unsigned long long)paddr, 610 (unsigned long long)paddr,
599 (unsigned long long)(paddr + size), 611 (unsigned long long)(paddr + size - 1),
600 cattr_name(flags)); 612 cattr_name(flags));
601 *vma_prot = __pgprot((pgprot_val(*vma_prot) & 613 *vma_prot = __pgprot((pgprot_val(*vma_prot) &
602 (~_PAGE_CACHE_MASK)) | 614 (~_PAGE_CACHE_MASK)) |
@@ -614,11 +626,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
614 !is_new_memtype_allowed(paddr, size, want_flags, flags)) { 626 !is_new_memtype_allowed(paddr, size, want_flags, flags)) {
615 free_memtype(paddr, paddr + size); 627 free_memtype(paddr, paddr + size);
616 printk(KERN_ERR "%s:%d map pfn expected mapping type %s" 628 printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
617 " for %Lx-%Lx, got %s\n", 629 " for [mem %#010Lx-%#010Lx], got %s\n",
618 current->comm, current->pid, 630 current->comm, current->pid,
619 cattr_name(want_flags), 631 cattr_name(want_flags),
620 (unsigned long long)paddr, 632 (unsigned long long)paddr,
621 (unsigned long long)(paddr + size), 633 (unsigned long long)(paddr + size - 1),
622 cattr_name(flags)); 634 cattr_name(flags));
623 return -EINVAL; 635 return -EINVAL;
624 } 636 }
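
The rewrite trades a per-pfn page_is_ram() loop for one walk of the resource
tree: walk_system_ram_range() invokes the callback once per "System RAM"
extent and stops as soon as the callback returns nonzero, which is why
pagerange_is_ram_callback() returns ram && not_ram: a mixed range aborts the
walk early. A usage sketch mirroring the hunk above:

	struct pagerange_state state = { .cur_pfn = start_pfn };
	int stopped;

	stopped = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
					&state, pagerange_is_ram_callback);
	/*
	 * stopped > 0	-> walk aborted: range mixes RAM and non-RAM (-1)
	 * state.ram	-> at least one System RAM extent intersected (1)
	 * neither	-> pure non-RAM range (0)
	 */
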
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index efb5b4b9371..4599c3e8bcb 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -176,8 +176,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
176 return; 176 return;
177 } 177 }
178 178
179 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, 179 node_set(node, numa_nodes_parsed);
180 start, end); 180
181 printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
182 node, pxm,
183 (unsigned long long) start, (unsigned long long) end - 1);
181} 184}
182 185
183void __init acpi_numa_arch_fixup(void) {} 186void __init acpi_numa_arch_fixup(void) {}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 0597f95b6da..33643a8bcbb 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -309,6 +309,10 @@ void bpf_jit_compile(struct sk_filter *fp)
309 else 309 else
310 EMIT1_off32(0x0d, K); /* or imm32,%eax */ 310 EMIT1_off32(0x0d, K); /* or imm32,%eax */
311 break; 311 break;
312 case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */
313 seen |= SEEN_XREG;
314 EMIT2(0x31, 0xd8); /* xor %ebx,%eax */
315 break;
312 case BPF_S_ALU_LSH_X: /* A <<= X; */ 316 case BPF_S_ALU_LSH_X: /* A <<= X; */
313 seen |= SEEN_XREG; 317 seen |= SEEN_XREG;
314 EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */ 318 EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */
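
EMIT2(0x31, 0xd8) is the two-byte encoding of "xor %ebx,%eax", i.e. A ^= X
with A kept in %eax and X in %ebx by this JIT. The matching interpreter arm
added in the same series (net/core/filter.c) is simply:

	case BPF_S_ANC_ALU_XOR_X:
		A ^= X;
		continue;
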
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 303f0863782..b2b94438ff0 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -312,7 +312,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
312 goto fail; 312 goto fail;
313 } 313 }
314 /* both registers must be reserved */ 314 /* both registers must be reserved */
315 if (num_counters == AMD64_NUM_COUNTERS_F15H) { 315 if (num_counters == AMD64_NUM_COUNTERS_CORE) {
316 msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); 316 msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
317 msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); 317 msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
318 } else { 318 } else {
@@ -514,7 +514,7 @@ static int op_amd_init(struct oprofile_operations *ops)
514 ops->create_files = setup_ibs_files; 514 ops->create_files = setup_ibs_files;
515 515
516 if (boot_cpu_data.x86 == 0x15) { 516 if (boot_cpu_data.x86 == 0x15) {
517 num_counters = AMD64_NUM_COUNTERS_F15H; 517 num_counters = AMD64_NUM_COUNTERS_CORE;
518 } else { 518 } else {
519 num_counters = AMD64_NUM_COUNTERS; 519 num_counters = AMD64_NUM_COUNTERS;
520 } 520 }
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index fc09c2754e0..505acdd6d60 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -12,8 +12,13 @@ struct pci_root_info {
12 char name[16]; 12 char name[16];
13 unsigned int res_num; 13 unsigned int res_num;
14 struct resource *res; 14 struct resource *res;
15 int busnum;
16 struct pci_sysdata sd; 15 struct pci_sysdata sd;
16#ifdef CONFIG_PCI_MMCONFIG
17 bool mcfg_added;
18 u16 segment;
19 u8 start_bus;
20 u8 end_bus;
21#endif
17}; 22};
18 23
19static bool pci_use_crs = true; 24static bool pci_use_crs = true;
@@ -120,6 +125,81 @@ void __init pci_acpi_crs_quirks(void)
120 pci_use_crs ? "nocrs" : "use_crs"); 125 pci_use_crs ? "nocrs" : "use_crs");
121} 126}
122 127
128#ifdef CONFIG_PCI_MMCONFIG
129static int __devinit check_segment(u16 seg, struct device *dev, char *estr)
130{
131 if (seg) {
132 dev_err(dev,
133 "%s can't access PCI configuration "
134 "space under this host bridge.\n",
135 estr);
136 return -EIO;
137 }
138
139 /*
140 * Failure in adding MMCFG information is not fatal,
141 * just can't access extended configuration space of
142 * devices under this host bridge.
143 */
144 dev_warn(dev,
145 "%s can't access extended PCI configuration "
146 "space under this bridge.\n",
147 estr);
148
149 return 0;
150}
151
152static int __devinit setup_mcfg_map(struct pci_root_info *info,
153 u16 seg, u8 start, u8 end,
154 phys_addr_t addr)
155{
156 int result;
157 struct device *dev = &info->bridge->dev;
158
159 info->start_bus = start;
160 info->end_bus = end;
161 info->mcfg_added = false;
162
163 /* return success if MMCFG is not in use */
164 if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg)
165 return 0;
166
167 if (!(pci_probe & PCI_PROBE_MMCONF))
168 return check_segment(seg, dev, "MMCONFIG is disabled,");
169
170 result = pci_mmconfig_insert(dev, seg, start, end, addr);
171 if (result == 0) {
172 /* enable MMCFG if it hasn't been enabled yet */
173 if (raw_pci_ext_ops == NULL)
174 raw_pci_ext_ops = &pci_mmcfg;
175 info->mcfg_added = true;
176 } else if (result != -EEXIST)
177 return check_segment(seg, dev,
178 "fail to add MMCONFIG information,");
179
180 return 0;
181}
182
183static void teardown_mcfg_map(struct pci_root_info *info)
184{
185 if (info->mcfg_added) {
186 pci_mmconfig_delete(info->segment, info->start_bus,
187 info->end_bus);
188 info->mcfg_added = false;
189 }
190}
191#else
192static int __devinit setup_mcfg_map(struct pci_root_info *info,
193 u16 seg, u8 start, u8 end,
194 phys_addr_t addr)
195{
196 return 0;
197}
198static void teardown_mcfg_map(struct pci_root_info *info)
199{
200}
201#endif
202
123static acpi_status 203static acpi_status
124resource_to_addr(struct acpi_resource *resource, 204resource_to_addr(struct acpi_resource *resource,
125 struct acpi_resource_address64 *addr) 205 struct acpi_resource_address64 *addr)
@@ -234,13 +314,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
234 } 314 }
235 315
236 info->res_num++; 316 info->res_num++;
237 if (addr.translation_offset)
238 dev_info(&info->bridge->dev, "host bridge window %pR "
239 "(PCI address [%#llx-%#llx])\n",
240 res, res->start - addr.translation_offset,
241 res->end - addr.translation_offset);
242 else
243 dev_info(&info->bridge->dev, "host bridge window %pR\n", res);
244 317
245 return AE_OK; 318 return AE_OK;
246} 319}
@@ -332,8 +405,11 @@ static void __release_pci_root_info(struct pci_root_info *info)
332 405
333 free_pci_root_info_res(info); 406 free_pci_root_info_res(info);
334 407
408 teardown_mcfg_map(info);
409
335 kfree(info); 410 kfree(info);
336} 411}
412
337static void release_pci_root_info(struct pci_host_bridge *bridge) 413static void release_pci_root_info(struct pci_host_bridge *bridge)
338{ 414{
339 struct pci_root_info *info = bridge->release_data; 415 struct pci_root_info *info = bridge->release_data;
@@ -347,7 +423,9 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,
347{ 423{
348 size_t size; 424 size_t size;
349 425
426 sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
350 info->bridge = device; 427 info->bridge = device;
428
351 info->res_num = 0; 429 info->res_num = 0;
352 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, 430 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
353 info); 431 info);
@@ -360,8 +438,6 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,
360 if (!info->res) 438 if (!info->res)
361 return; 439 return;
362 440
363 sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
364
365 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, 441 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
366 info); 442 info);
367} 443}
@@ -373,7 +449,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
373 int domain = root->segment; 449 int domain = root->segment;
374 int busnum = root->secondary.start; 450 int busnum = root->secondary.start;
375 LIST_HEAD(resources); 451 LIST_HEAD(resources);
376 struct pci_bus *bus; 452 struct pci_bus *bus = NULL;
377 struct pci_sysdata *sd; 453 struct pci_sysdata *sd;
378 int node; 454 int node;
379#ifdef CONFIG_ACPI_NUMA 455#ifdef CONFIG_ACPI_NUMA
@@ -426,6 +502,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
426 } else { 502 } else {
427 probe_pci_root_info(info, device, busnum, domain); 503 probe_pci_root_info(info, device, busnum, domain);
428 504
505 /* insert busn res at first */
506 pci_add_resource(&resources, &root->secondary);
429 /* 507 /*
430 * _CRS with no apertures is normal, so only fall back to 508 * _CRS with no apertures is normal, so only fall back to
431 * defaults or native bridge info if we're ignoring _CRS. 509 * defaults or native bridge info if we're ignoring _CRS.
@@ -437,10 +515,13 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
437 x86_pci_root_bus_resources(busnum, &resources); 515 x86_pci_root_bus_resources(busnum, &resources);
438 } 516 }
439 517
440 bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, 518 if (!setup_mcfg_map(info, domain, (u8)root->secondary.start,
441 &resources); 519 (u8)root->secondary.end, root->mcfg_addr))
520 bus = pci_create_root_bus(NULL, busnum, &pci_root_ops,
521 sd, &resources);
522
442 if (bus) { 523 if (bus) {
443 bus->subordinate = pci_scan_child_bus(bus); 524 pci_scan_child_bus(bus);
444 pci_set_host_bridge_release( 525 pci_set_host_bridge_release(
445 to_pci_host_bridge(bus->bridge), 526 to_pci_host_bridge(bus->bridge),
446 release_pci_root_info, info); 527 release_pci_root_info, info);
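
Taken together, the acpi.c additions give every ACPI host bridge its own
MMCONFIG lifecycle: setup_mcfg_map() runs before the root bus is created and
teardown_mcfg_map() runs from the host-bridge release path, so a hot-removed
bridge drops only the region it added. Schematically (calls as in the hunks
above):

	if (!setup_mcfg_map(info, domain, start_bus, end_bus, root->mcfg_addr))
		bus = pci_create_root_bus(NULL, busnum, &pci_root_ops,
					  sd, &resources);
	/* ... bridge lifetime ... */
	teardown_mcfg_map(info);	/* no-op unless info->mcfg_added */
	kfree(info);
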
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 5aed49bff05..e9e6ed5cdf9 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -121,7 +121,6 @@ static int __init early_fill_mp_bus_info(void)
121 link = (reg >> 8) & 0x03; 121 link = (reg >> 8) & 0x03;
122 122
123 info = alloc_pci_root_info(min_bus, max_bus, node, link); 123 info = alloc_pci_root_info(min_bus, max_bus, node, link);
124 sprintf(info->name, "PCI Bus #%02x", min_bus);
125 } 124 }
126 125
127 /* get the default node and link for left over res */ 126 /* get the default node and link for left over res */
@@ -300,9 +299,9 @@ static int __init early_fill_mp_bus_info(void)
300 int busnum; 299 int busnum;
301 struct pci_root_res *root_res; 300 struct pci_root_res *root_res;
302 301
303 busnum = info->bus_min; 302 busnum = info->busn.start;
304 printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n", 303 printk(KERN_DEBUG "bus: %pR on node %x link %x\n",
305 info->bus_min, info->bus_max, info->node, info->link); 304 &info->busn, info->node, info->link);
306 list_for_each_entry(root_res, &info->resources, list) 305 list_for_each_entry(root_res, &info->resources, list)
307 printk(KERN_DEBUG "bus: %02x %pR\n", 306 printk(KERN_DEBUG "bus: %02x %pR\n",
308 busnum, &root_res->res); 307 busnum, &root_res->res);
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 306579f7d0f..d37e2fec97e 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -14,7 +14,7 @@ static struct pci_root_info *x86_find_pci_root_info(int bus)
14 return NULL; 14 return NULL;
15 15
16 list_for_each_entry(info, &pci_root_infos, list) 16 list_for_each_entry(info, &pci_root_infos, list)
17 if (info->bus_min == bus) 17 if (info->busn.start == bus)
18 return info; 18 return info;
19 19
20 return NULL; 20 return NULL;
@@ -24,6 +24,8 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
24{ 24{
25 struct pci_root_info *info = x86_find_pci_root_info(bus); 25 struct pci_root_info *info = x86_find_pci_root_info(bus);
26 struct pci_root_res *root_res; 26 struct pci_root_res *root_res;
27 struct pci_host_bridge_window *window;
28 bool found = false;
27 29
28 if (!info) 30 if (!info)
29 goto default_resources; 31 goto default_resources;
@@ -31,6 +33,16 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
31 printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n", 33 printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",
32 bus); 34 bus);
33 35
36 /* already added by acpi ? */
37 list_for_each_entry(window, resources, list)
38 if (window->res->flags & IORESOURCE_BUS) {
39 found = true;
40 break;
41 }
42
43 if (!found)
44 pci_add_resource(resources, &info->busn);
45
34 list_for_each_entry(root_res, &info->resources, list) { 46 list_for_each_entry(root_res, &info->resources, list) {
35 struct resource *res; 47 struct resource *res;
36 struct resource *root; 48 struct resource *root;
@@ -66,9 +78,13 @@ struct pci_root_info __init *alloc_pci_root_info(int bus_min, int bus_max,
66 if (!info) 78 if (!info)
67 return info; 79 return info;
68 80
81 sprintf(info->name, "PCI Bus #%02x", bus_min);
82
69 INIT_LIST_HEAD(&info->resources); 83 INIT_LIST_HEAD(&info->resources);
70 info->bus_min = bus_min; 84 info->busn.name = info->name;
71 info->bus_max = bus_max; 85 info->busn.start = bus_min;
86 info->busn.end = bus_max;
87 info->busn.flags = IORESOURCE_BUS;
72 info->node = node; 88 info->node = node;
73 info->link = link; 89 info->link = link;
74 90
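
Modelling the bus-number aperture as a struct resource instead of two ints
lets it flow through the same pci_add_resource() path as I/O and memory
windows, and lets the ACPI path above detect an already-added range by flag.
For buses 00-3f, the fields set in alloc_pci_root_info() amount to:

	struct resource busn = {
		.name	= "PCI Bus #00",	/* points at info->name */
		.start	= 0x00,			/* first bus number */
		.end	= 0x3f,			/* last bus number  */
		.flags	= IORESOURCE_BUS,
	};
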
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
index 226a466b2b2..ff8f65b0457 100644
--- a/arch/x86/pci/bus_numa.h
+++ b/arch/x86/pci/bus_numa.h
@@ -13,8 +13,7 @@ struct pci_root_info {
13 struct list_head list; 13 struct list_head list;
14 char name[12]; 14 char name[12];
15 struct list_head resources; 15 struct list_head resources;
16 int bus_min; 16 struct resource busn;
17 int bus_max;
18 int node; 17 int node;
19 int link; 18 int link;
20}; 19};
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 0ad990a20d4..720e973fc34 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -494,7 +494,7 @@ int __init pcibios_init(void)
494 return 0; 494 return 0;
495} 495}
496 496
497char * __devinit pcibios_setup(char *str) 497char * __init pcibios_setup(char *str)
498{ 498{
499 if (!strcmp(str, "off")) { 499 if (!strcmp(str, "off")) {
500 pci_probe = 0; 500 pci_probe = 0;
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 5dd467bd612..af8a224db21 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -6,6 +6,7 @@
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/pci.h> 7#include <linux/pci.h>
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/vgaarb.h>
9#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
10 11
11static void __devinit pci_fixup_i450nx(struct pci_dev *d) 12static void __devinit pci_fixup_i450nx(struct pci_dev *d)
@@ -348,6 +349,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
348 if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { 349 if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
349 pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; 350 pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW;
350 dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n"); 351 dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n");
352 if (!vga_default_device())
353 vga_set_default_device(pdev);
351 } 354 }
352} 355}
353DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, 356DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
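
Recording the boot display with the VGA arbiter saves later consumers from
re-deriving it; a hedged usage sketch (not from this patch):

	/* e.g. in a DRM driver probe path */
	struct pci_dev *boot_vga = vga_default_device();

	if (boot_vga)
		dev_info(&boot_vga->dev, "boot VGA device\n");
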
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 301e325992f..937bcece700 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -17,6 +17,8 @@
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/dmi.h> 18#include <linux/dmi.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/mutex.h>
21#include <linux/rculist.h>
20#include <asm/e820.h> 22#include <asm/e820.h>
21#include <asm/pci_x86.h> 23#include <asm/pci_x86.h>
22#include <asm/acpi.h> 24#include <asm/acpi.h>
@@ -24,7 +26,9 @@
24#define PREFIX "PCI: " 26#define PREFIX "PCI: "
25 27
26/* Indicate if the mmcfg resources have been placed into the resource table. */ 28/* Indicate if the mmcfg resources have been placed into the resource table. */
27static int __initdata pci_mmcfg_resources_inserted; 29static bool pci_mmcfg_running_state;
30static bool pci_mmcfg_arch_init_failed;
31static DEFINE_MUTEX(pci_mmcfg_lock);
28 32
29LIST_HEAD(pci_mmcfg_list); 33LIST_HEAD(pci_mmcfg_list);
30 34
@@ -45,24 +49,25 @@ static __init void free_all_mmcfg(void)
45 pci_mmconfig_remove(cfg); 49 pci_mmconfig_remove(cfg);
46} 50}
47 51
48static __init void list_add_sorted(struct pci_mmcfg_region *new) 52static __devinit void list_add_sorted(struct pci_mmcfg_region *new)
49{ 53{
50 struct pci_mmcfg_region *cfg; 54 struct pci_mmcfg_region *cfg;
51 55
52 /* keep list sorted by segment and starting bus number */ 56 /* keep list sorted by segment and starting bus number */
53 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 57 list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {
54 if (cfg->segment > new->segment || 58 if (cfg->segment > new->segment ||
55 (cfg->segment == new->segment && 59 (cfg->segment == new->segment &&
56 cfg->start_bus >= new->start_bus)) { 60 cfg->start_bus >= new->start_bus)) {
57 list_add_tail(&new->list, &cfg->list); 61 list_add_tail_rcu(&new->list, &cfg->list);
58 return; 62 return;
59 } 63 }
60 } 64 }
61 list_add_tail(&new->list, &pci_mmcfg_list); 65 list_add_tail_rcu(&new->list, &pci_mmcfg_list);
62} 66}
63 67
64static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, 68static __devinit struct pci_mmcfg_region *pci_mmconfig_alloc(int segment,
65 int end, u64 addr) 69 int start,
70 int end, u64 addr)
66{ 71{
67 struct pci_mmcfg_region *new; 72 struct pci_mmcfg_region *new;
68 struct resource *res; 73 struct resource *res;
@@ -79,8 +84,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
79 new->start_bus = start; 84 new->start_bus = start;
80 new->end_bus = end; 85 new->end_bus = end;
81 86
82 list_add_sorted(new);
83
84 res = &new->res; 87 res = &new->res;
85 res->start = addr + PCI_MMCFG_BUS_OFFSET(start); 88 res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
86 res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1; 89 res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
@@ -89,9 +92,25 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
89 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); 92 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
90 res->name = new->name; 93 res->name = new->name;
91 94
92 printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at " 95 return new;
93 "%pR (base %#lx)\n", segment, start, end, &new->res, 96}
94 (unsigned long) addr); 97
98static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
99 int end, u64 addr)
100{
101 struct pci_mmcfg_region *new;
102
103 new = pci_mmconfig_alloc(segment, start, end, addr);
104 if (new) {
105 mutex_lock(&pci_mmcfg_lock);
106 list_add_sorted(new);
107 mutex_unlock(&pci_mmcfg_lock);
108
109 pr_info(PREFIX
110 "MMCONFIG for domain %04x [bus %02x-%02x] at %pR "
111 "(base %#lx)\n",
112 segment, start, end, &new->res, (unsigned long)addr);
113 }
95 114
96 return new; 115 return new;
97} 116}
@@ -100,7 +119,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
100{ 119{
101 struct pci_mmcfg_region *cfg; 120 struct pci_mmcfg_region *cfg;
102 121
103 list_for_each_entry(cfg, &pci_mmcfg_list, list) 122 list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
104 if (cfg->segment == segment && 123 if (cfg->segment == segment &&
105 cfg->start_bus <= bus && bus <= cfg->end_bus) 124 cfg->start_bus <= bus && bus <= cfg->end_bus)
106 return cfg; 125 return cfg;
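
Converting pci_mmcfg_list to an RCU list lets the config-space fast path find
a region without taking pci_mmcfg_lock; only insertion and removal serialize
on the mutex. Readers presumably follow the usual pattern, keeping the region
pinned for the duration of the access (a sketch; cfg->virt is the 64-bit
arch's mapped window):

	static u32 mmcfg_read32(int segment, int bus, int devfn, int reg)
	{
		struct pci_mmcfg_region *cfg;
		u32 val = ~0;

		rcu_read_lock();
		cfg = pci_mmconfig_lookup(segment, bus);
		if (cfg && cfg->virt)
			val = readl(cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) |
						 (devfn << 12)) + reg);
		rcu_read_unlock();
		return val;
	}
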
@@ -343,8 +362,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
343 name = pci_mmcfg_probes[i].probe(); 362 name = pci_mmcfg_probes[i].probe();
344 363
345 if (name) 364 if (name)
346 printk(KERN_INFO PREFIX "%s with MMCONFIG support\n", 365 pr_info(PREFIX "%s with MMCONFIG support\n", name);
347 name);
348 } 366 }
349 367
350 /* some end_bus_number is crazy, fix it */ 368 /* some end_bus_number is crazy, fix it */
@@ -353,19 +371,8 @@ static int __init pci_mmcfg_check_hostbridge(void)
353 return !list_empty(&pci_mmcfg_list); 371 return !list_empty(&pci_mmcfg_list);
354} 372}
355 373
356static void __init pci_mmcfg_insert_resources(void) 374static acpi_status __devinit check_mcfg_resource(struct acpi_resource *res,
357{ 375 void *data)
358 struct pci_mmcfg_region *cfg;
359
360 list_for_each_entry(cfg, &pci_mmcfg_list, list)
361 insert_resource(&iomem_resource, &cfg->res);
362
363 /* Mark that the resources have been inserted. */
364 pci_mmcfg_resources_inserted = 1;
365}
366
367static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
368 void *data)
369{ 376{
370 struct resource *mcfg_res = data; 377 struct resource *mcfg_res = data;
371 struct acpi_resource_address64 address; 378 struct acpi_resource_address64 address;
@@ -401,8 +408,8 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
401 return AE_OK; 408 return AE_OK;
402} 409}
403 410
404static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl, 411static acpi_status __devinit find_mboard_resource(acpi_handle handle, u32 lvl,
405 void *context, void **rv) 412 void *context, void **rv)
406{ 413{
407 struct resource *mcfg_res = context; 414 struct resource *mcfg_res = context;
408 415
@@ -415,7 +422,7 @@ static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl,
415 return AE_OK; 422 return AE_OK;
416} 423}
417 424
418static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) 425static int __devinit is_acpi_reserved(u64 start, u64 end, unsigned not_used)
419{ 426{
420 struct resource mcfg_res; 427 struct resource mcfg_res;
421 428
@@ -434,13 +441,15 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
434 441
435typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); 442typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
436 443
437static int __init is_mmconf_reserved(check_reserved_t is_reserved, 444static int __ref is_mmconf_reserved(check_reserved_t is_reserved,
438 struct pci_mmcfg_region *cfg, int with_e820) 445 struct pci_mmcfg_region *cfg,
446 struct device *dev, int with_e820)
439{ 447{
440 u64 addr = cfg->res.start; 448 u64 addr = cfg->res.start;
441 u64 size = resource_size(&cfg->res); 449 u64 size = resource_size(&cfg->res);
442 u64 old_size = size; 450 u64 old_size = size;
443 int valid = 0, num_buses; 451 int num_buses;
452 char *method = with_e820 ? "E820" : "ACPI motherboard resources";
444 453
445 while (!is_reserved(addr, addr + size, E820_RESERVED)) { 454 while (!is_reserved(addr, addr + size, E820_RESERVED)) {
446 size >>= 1; 455 size >>= 1;
@@ -448,30 +457,76 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
448 break; 457 break;
449 } 458 }
450 459
451 if (size >= (16UL<<20) || size == old_size) { 460 if (size < (16UL<<20) && size != old_size)
452 printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n", 461 return 0;
453 &cfg->res, 462
454 with_e820 ? "E820" : "ACPI motherboard resources"); 463 if (dev)
455 valid = 1; 464 dev_info(dev, "MMCONFIG at %pR reserved in %s\n",
456 465 &cfg->res, method);
457 if (old_size != size) { 466 else
458 /* update end_bus */ 467 pr_info(PREFIX "MMCONFIG at %pR reserved in %s\n",
459 cfg->end_bus = cfg->start_bus + ((size>>20) - 1); 468 &cfg->res, method);
460 num_buses = cfg->end_bus - cfg->start_bus + 1; 469
461 cfg->res.end = cfg->res.start + 470 if (old_size != size) {
462 PCI_MMCFG_BUS_OFFSET(num_buses) - 1; 471 /* update end_bus */
463 snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN, 472 cfg->end_bus = cfg->start_bus + ((size>>20) - 1);
464 "PCI MMCONFIG %04x [bus %02x-%02x]", 473 num_buses = cfg->end_bus - cfg->start_bus + 1;
465 cfg->segment, cfg->start_bus, cfg->end_bus); 474 cfg->res.end = cfg->res.start +
466 printk(KERN_INFO PREFIX 475 PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
467 "MMCONFIG for %04x [bus%02x-%02x] " 476 snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN,
468 "at %pR (base %#lx) (size reduced!)\n", 477 "PCI MMCONFIG %04x [bus %02x-%02x]",
469 cfg->segment, cfg->start_bus, cfg->end_bus, 478 cfg->segment, cfg->start_bus, cfg->end_bus);
470 &cfg->res, (unsigned long) cfg->address); 479
471 } 480 if (dev)
481 dev_info(dev,
482 "MMCONFIG "
483 "at %pR (base %#lx) (size reduced!)\n",
484 &cfg->res, (unsigned long) cfg->address);
485 else
486 pr_info(PREFIX
487 "MMCONFIG for %04x [bus%02x-%02x] "
488 "at %pR (base %#lx) (size reduced!)\n",
489 cfg->segment, cfg->start_bus, cfg->end_bus,
490 &cfg->res, (unsigned long) cfg->address);
472 } 491 }
473 492
474 return valid; 493 return 1;
494}
495
496static int __ref pci_mmcfg_check_reserved(struct device *dev,
497 struct pci_mmcfg_region *cfg, int early)
498{
499 if (!early && !acpi_disabled) {
500 if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0))
501 return 1;
502
503 if (dev)
504 dev_info(dev, FW_INFO
505 "MMCONFIG at %pR not reserved in "
506 "ACPI motherboard resources\n",
507 &cfg->res);
508 else
509 pr_info(FW_INFO PREFIX
510 "MMCONFIG at %pR not reserved in "
511 "ACPI motherboard resources\n",
512 &cfg->res);
513 }
514
515 /*
516 * e820_all_mapped() is marked as __init.
517 * All entries from ACPI MCFG table have been checked at boot time.
518 * For MCFG information constructed from hotpluggable host bridge's
519 * _CBA method, just assume it's reserved.
520 */
521 if (pci_mmcfg_running_state)
522 return 1;
523
524 /* Don't try to do this check unless configuration
525 type 1 is available. How about type 2? */
526 if (raw_pci_ops)
527 return is_mmconf_reserved(e820_all_mapped, cfg, dev, 1);
528
529 return 0;
475} 530}
476 531
477static void __init pci_mmcfg_reject_broken(int early) 532static void __init pci_mmcfg_reject_broken(int early)
@@ -479,38 +534,14 @@ static void __init pci_mmcfg_reject_broken(int early)
479 struct pci_mmcfg_region *cfg; 534 struct pci_mmcfg_region *cfg;
480 535
481 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 536 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
482 int valid = 0; 537 if (pci_mmcfg_check_reserved(NULL, cfg, early) == 0) {
483 538 pr_info(PREFIX "not using MMCONFIG\n");
484 if (!early && !acpi_disabled) { 539 free_all_mmcfg();
485 valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0); 540 return;
486
487 if (valid)
488 continue;
489 else
490 printk(KERN_ERR FW_BUG PREFIX
491 "MMCONFIG at %pR not reserved in "
492 "ACPI motherboard resources\n",
493 &cfg->res);
494 } 541 }
495
496 /* Don't try to do this check unless configuration
497 type 1 is available. how about type 2 ?*/
498 if (raw_pci_ops)
499 valid = is_mmconf_reserved(e820_all_mapped, cfg, 1);
500
501 if (!valid)
502 goto reject;
503 } 542 }
504
505 return;
506
507reject:
508 printk(KERN_INFO PREFIX "not using MMCONFIG\n");
509 free_all_mmcfg();
510} 543}
511 544
512static int __initdata known_bridge;
513
514static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, 545static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
515 struct acpi_mcfg_allocation *cfg) 546 struct acpi_mcfg_allocation *cfg)
516{ 547{
@@ -529,7 +560,7 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
529 return 0; 560 return 0;
530 } 561 }
531 562
532 printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx " 563 pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
533 "is above 4GB, ignored\n", cfg->pci_segment, 564 "is above 4GB, ignored\n", cfg->pci_segment,
534 cfg->start_bus_number, cfg->end_bus_number, cfg->address); 565 cfg->start_bus_number, cfg->end_bus_number, cfg->address);
535 return -EINVAL; 566 return -EINVAL;
@@ -556,7 +587,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
556 i -= sizeof(struct acpi_mcfg_allocation); 587 i -= sizeof(struct acpi_mcfg_allocation);
557 }; 588 };
558 if (entries == 0) { 589 if (entries == 0) {
559 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); 590 pr_err(PREFIX "MMCONFIG has no entries\n");
560 return -ENODEV; 591 return -ENODEV;
561 } 592 }
562 593
@@ -570,8 +601,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
570 601
571 if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number, 602 if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
572 cfg->end_bus_number, cfg->address) == NULL) { 603 cfg->end_bus_number, cfg->address) == NULL) {
573 printk(KERN_WARNING PREFIX 604 pr_warn(PREFIX "no memory for MCFG entries\n");
574 "no memory for MCFG entries\n");
575 free_all_mmcfg(); 605 free_all_mmcfg();
576 return -ENOMEM; 606 return -ENOMEM;
577 } 607 }
@@ -582,28 +612,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
582 612
583static void __init __pci_mmcfg_init(int early) 613static void __init __pci_mmcfg_init(int early)
584{ 614{
585 /* MMCONFIG disabled */
586 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
587 return;
588
589 /* MMCONFIG already enabled */
590 if (!early && !(pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF))
591 return;
592
593 /* for late to exit */
594 if (known_bridge)
595 return;
596
597 if (early) {
598 if (pci_mmcfg_check_hostbridge())
599 known_bridge = 1;
600 }
601
602 if (!known_bridge)
603 acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
604
605 pci_mmcfg_reject_broken(early); 615 pci_mmcfg_reject_broken(early);
606
607 if (list_empty(&pci_mmcfg_list)) 616 if (list_empty(&pci_mmcfg_list))
608 return; 617 return;
609 618
@@ -620,33 +629,48 @@ static void __init __pci_mmcfg_init(int early)
620 if (pci_mmcfg_arch_init()) 629 if (pci_mmcfg_arch_init())
621 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 630 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
622 else { 631 else {
623 /* 632 free_all_mmcfg();
624 * Signal not to attempt to insert mmcfg resources because 633 pci_mmcfg_arch_init_failed = true;
625 * the architecture mmcfg setup could not initialize.
626 */
627 pci_mmcfg_resources_inserted = 1;
628 } 634 }
629} 635}
630 636
637static int __initdata known_bridge;
638
631void __init pci_mmcfg_early_init(void) 639void __init pci_mmcfg_early_init(void)
632{ 640{
633 __pci_mmcfg_init(1); 641 if (pci_probe & PCI_PROBE_MMCONF) {
642 if (pci_mmcfg_check_hostbridge())
643 known_bridge = 1;
644 else
645 acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
646 __pci_mmcfg_init(1);
647 }
634} 648}
635 649
636void __init pci_mmcfg_late_init(void) 650void __init pci_mmcfg_late_init(void)
637{ 651{
638 __pci_mmcfg_init(0); 652 /* MMCONFIG disabled */
653 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
654 return;
655
656 if (known_bridge)
657 return;
658
659 /* MMCONFIG hasn't been enabled yet, try again */
660 if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) {
661 acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
662 __pci_mmcfg_init(0);
663 }
639} 664}
640 665
641static int __init pci_mmcfg_late_insert_resources(void) 666static int __init pci_mmcfg_late_insert_resources(void)
642{ 667{
643 /* 668 struct pci_mmcfg_region *cfg;
644 * If resources are already inserted or we are not using MMCONFIG, 669
645 * don't insert the resources. 670 pci_mmcfg_running_state = true;
646 */ 671
647 if ((pci_mmcfg_resources_inserted == 1) || 672 /* If we are not using MMCONFIG, don't insert the resources. */
648 (pci_probe & PCI_PROBE_MMCONF) == 0 || 673 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
649 list_empty(&pci_mmcfg_list))
650 return 1; 674 return 1;
651 675
652 /* 676 /*
@@ -654,7 +678,9 @@ static int __init pci_mmcfg_late_insert_resources(void)
654 * marked so it won't cause request errors when __request_region is 678 * marked so it won't cause request errors when __request_region is
655 * called. 679 * called.
656 */ 680 */
657 pci_mmcfg_insert_resources(); 681 list_for_each_entry(cfg, &pci_mmcfg_list, list)
682 if (!cfg->res.parent)
683 insert_resource(&iomem_resource, &cfg->res);
658 684
659 return 0; 685 return 0;
660} 686}
@@ -665,3 +691,101 @@ static int __init pci_mmcfg_late_insert_resources(void)
665 * with other system resources. 691 * with other system resources.
666 */ 692 */
667late_initcall(pci_mmcfg_late_insert_resources); 693late_initcall(pci_mmcfg_late_insert_resources);
694
695/* Add MMCFG information for host bridges */
696int __devinit pci_mmconfig_insert(struct device *dev,
697 u16 seg, u8 start, u8 end,
698 phys_addr_t addr)
699{
700 int rc;
701 struct resource *tmp = NULL;
702 struct pci_mmcfg_region *cfg;
703
704 if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed)
705 return -ENODEV;
706
707 if (start > end)
708 return -EINVAL;
709
710 mutex_lock(&pci_mmcfg_lock);
711 cfg = pci_mmconfig_lookup(seg, start);
712 if (cfg) {
713 if (cfg->end_bus < end)
714 dev_info(dev, FW_INFO
715 "MMCONFIG for "
716 "domain %04x [bus %02x-%02x] "
717 "only partially covers this bridge\n",
718 cfg->segment, cfg->start_bus, cfg->end_bus);
719 mutex_unlock(&pci_mmcfg_lock);
720 return -EEXIST;
721 }
722
723 if (!addr) {
724 mutex_unlock(&pci_mmcfg_lock);
725 return -EINVAL;
726 }
727
728 rc = -EBUSY;
729 cfg = pci_mmconfig_alloc(seg, start, end, addr);
730 if (cfg == NULL) {
731 dev_warn(dev, "fail to add MMCONFIG (out of memory)\n");
732 rc = -ENOMEM;
733 } else if (!pci_mmcfg_check_reserved(dev, cfg, 0)) {
734 dev_warn(dev, FW_BUG "MMCONFIG %pR isn't reserved\n",
735 &cfg->res);
736 } else {
737 /* Insert resource if it's not in boot stage */
738 if (pci_mmcfg_running_state)
739 tmp = insert_resource_conflict(&iomem_resource,
740 &cfg->res);
741
742 if (tmp) {
743 dev_warn(dev,
744 "MMCONFIG %pR conflicts with "
745 "%s %pR\n",
746 &cfg->res, tmp->name, tmp);
747 } else if (pci_mmcfg_arch_map(cfg)) {
748 dev_warn(dev, "fail to map MMCONFIG %pR.\n",
749 &cfg->res);
750 } else {
751 list_add_sorted(cfg);
752 dev_info(dev, "MMCONFIG at %pR (base %#lx)\n",
753 &cfg->res, (unsigned long)addr);
754 cfg = NULL;
755 rc = 0;
756 }
757 }
758
759 if (cfg) {
760 if (cfg->res.parent)
761 release_resource(&cfg->res);
762 kfree(cfg);
763 }
764
765 mutex_unlock(&pci_mmcfg_lock);
766
767 return rc;
768}
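pci_mmconfig_insert() above uses a single-exit rollback: rc defaults to failure, and the local cfg pointer is cleared only once the entry has been published, so the common cleanup at the bottom frees exactly the half-constructed cases. A sketch of that idiom; the obj type and helpers are hypothetical:

#include <stdlib.h>

struct obj { int v; };

static struct obj *alloc_obj(void)
{
	struct obj *o = malloc(sizeof(*o));
	if (o)
		o->v = 0;
	return o;
}
static int validate(struct obj *o) { return o->v == 0; }
static int publish(struct obj *o)  { (void)o; return 0; }	/* e.g. list add */

static int insert_obj(void)
{
	struct obj *o = alloc_obj();
	int rc = -1;

	if (o && validate(o) && publish(o) == 0) {
		o = NULL;	/* ownership moved to the list */
		rc = 0;
	}
	free(o);		/* every failure path lands here; free(NULL) is ok */
	return rc;
}

int main(void) { return insert_obj(); }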
769
770/* Delete MMCFG information for host bridges */
771int pci_mmconfig_delete(u16 seg, u8 start, u8 end)
772{
773 struct pci_mmcfg_region *cfg;
774
775 mutex_lock(&pci_mmcfg_lock);
776 list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
777 if (cfg->segment == seg && cfg->start_bus == start &&
778 cfg->end_bus == end) {
779 list_del_rcu(&cfg->list);
780 synchronize_rcu();
781 pci_mmcfg_arch_unmap(cfg);
782 if (cfg->res.parent)
783 release_resource(&cfg->res);
784 mutex_unlock(&pci_mmcfg_lock);
785 kfree(cfg);
786 return 0;
787 }
788 mutex_unlock(&pci_mmcfg_lock);
789
790 return -ENOENT;
791}
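Deletion pairs with the RCU readers added below in mmconfig_32.c and mmconfig_64.c: the entry is unlinked under the mutex with list_del_rcu(), synchronize_rcu() waits out any reader still traversing the list, and only then are the mapping and memory torn down. The core idiom, as a kernel-style sketch (entry and lock names are illustrative):

#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct entry {
	struct list_head list;
};

static void remove_entry(struct mutex *lock, struct entry *e)
{
	mutex_lock(lock);
	list_del_rcu(&e->list);		/* unlink; readers may still see it */
	mutex_unlock(lock);
	synchronize_rcu();		/* wait until no reader can hold it */
	kfree(e);			/* now nothing can reference it */
}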
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 5372e86834c..db63ac23e3d 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/pci.h> 12#include <linux/pci.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/rcupdate.h>
14#include <asm/e820.h> 15#include <asm/e820.h>
15#include <asm/pci_x86.h> 16#include <asm/pci_x86.h>
16#include <acpi/acpi.h> 17#include <acpi/acpi.h>
@@ -60,9 +61,12 @@ err: *value = -1;
60 return -EINVAL; 61 return -EINVAL;
61 } 62 }
62 63
64 rcu_read_lock();
63 base = get_base_addr(seg, bus, devfn); 65 base = get_base_addr(seg, bus, devfn);
64 if (!base) 66 if (!base) {
67 rcu_read_unlock();
65 goto err; 68 goto err;
69 }
66 70
67 raw_spin_lock_irqsave(&pci_config_lock, flags); 71 raw_spin_lock_irqsave(&pci_config_lock, flags);
68 72
@@ -80,6 +84,7 @@ err: *value = -1;
80 break; 84 break;
81 } 85 }
82 raw_spin_unlock_irqrestore(&pci_config_lock, flags); 86 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
87 rcu_read_unlock();
83 88
84 return 0; 89 return 0;
85} 90}
@@ -93,9 +98,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
93 if ((bus > 255) || (devfn > 255) || (reg > 4095)) 98 if ((bus > 255) || (devfn > 255) || (reg > 4095))
94 return -EINVAL; 99 return -EINVAL;
95 100
101 rcu_read_lock();
96 base = get_base_addr(seg, bus, devfn); 102 base = get_base_addr(seg, bus, devfn);
97 if (!base) 103 if (!base) {
104 rcu_read_unlock();
98 return -EINVAL; 105 return -EINVAL;
106 }
99 107
100 raw_spin_lock_irqsave(&pci_config_lock, flags); 108 raw_spin_lock_irqsave(&pci_config_lock, flags);
101 109
@@ -113,11 +121,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
113 break; 121 break;
114 } 122 }
115 raw_spin_unlock_irqrestore(&pci_config_lock, flags); 123 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
124 rcu_read_unlock();
116 125
117 return 0; 126 return 0;
118} 127}
119 128
120static const struct pci_raw_ops pci_mmcfg = { 129const struct pci_raw_ops pci_mmcfg = {
121 .read = pci_mmcfg_read, 130 .read = pci_mmcfg_read,
122 .write = pci_mmcfg_write, 131 .write = pci_mmcfg_write,
123}; 132};
@@ -132,3 +141,18 @@ int __init pci_mmcfg_arch_init(void)
132void __init pci_mmcfg_arch_free(void) 141void __init pci_mmcfg_arch_free(void)
133{ 142{
134} 143}
144
145int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg)
146{
147 return 0;
148}
149
150void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg)
151{
152 unsigned long flags;
153
154 /* Invalidate the cached mmcfg map entry. */
155 raw_spin_lock_irqsave(&pci_config_lock, flags);
156 mmcfg_last_accessed_device = 0;
157 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
158}
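The new rcu_read_lock()/rcu_read_unlock() pairs must cover both the lookup and the config-space access: once the read-side section ends, a concurrent pci_mmconfig_delete() may unmap and free the region. A simplified sketch of the read side, using the names from the file above (the real 32-bit path additionally serializes the fixmap window with pci_config_lock):

static int mmcfg_read_sketch(unsigned int seg, unsigned int bus,
			     unsigned int devfn, int reg, u32 *value)
{
	char __iomem *base;

	rcu_read_lock();			/* pin the region list */
	base = get_base_addr(seg, bus, devfn);	/* RCU-protected lookup */
	if (!base) {
		rcu_read_unlock();
		return -EINVAL;
	}
	*value = mmio_config_readl(base + reg);	/* still under RCU */
	rcu_read_unlock();
	return 0;
}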
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 915a493502c..d4ebd07c306 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -9,6 +9,7 @@
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/acpi.h> 10#include <linux/acpi.h>
11#include <linux/bitmap.h> 11#include <linux/bitmap.h>
12#include <linux/rcupdate.h>
12#include <asm/e820.h> 13#include <asm/e820.h>
13#include <asm/pci_x86.h> 14#include <asm/pci_x86.h>
14 15
@@ -34,9 +35,12 @@ err: *value = -1;
34 return -EINVAL; 35 return -EINVAL;
35 } 36 }
36 37
38 rcu_read_lock();
37 addr = pci_dev_base(seg, bus, devfn); 39 addr = pci_dev_base(seg, bus, devfn);
38 if (!addr) 40 if (!addr) {
41 rcu_read_unlock();
39 goto err; 42 goto err;
43 }
40 44
41 switch (len) { 45 switch (len) {
42 case 1: 46 case 1:
@@ -49,6 +53,7 @@ err: *value = -1;
49 *value = mmio_config_readl(addr + reg); 53 *value = mmio_config_readl(addr + reg);
50 break; 54 break;
51 } 55 }
56 rcu_read_unlock();
52 57
53 return 0; 58 return 0;
54} 59}
@@ -62,9 +67,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
62 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) 67 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
63 return -EINVAL; 68 return -EINVAL;
64 69
70 rcu_read_lock();
65 addr = pci_dev_base(seg, bus, devfn); 71 addr = pci_dev_base(seg, bus, devfn);
66 if (!addr) 72 if (!addr) {
73 rcu_read_unlock();
67 return -EINVAL; 74 return -EINVAL;
75 }
68 76
69 switch (len) { 77 switch (len) {
70 case 1: 78 case 1:
@@ -77,16 +85,17 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
77 mmio_config_writel(addr + reg, value); 85 mmio_config_writel(addr + reg, value);
78 break; 86 break;
79 } 87 }
88 rcu_read_unlock();
80 89
81 return 0; 90 return 0;
82} 91}
83 92
84static const struct pci_raw_ops pci_mmcfg = { 93const struct pci_raw_ops pci_mmcfg = {
85 .read = pci_mmcfg_read, 94 .read = pci_mmcfg_read,
86 .write = pci_mmcfg_write, 95 .write = pci_mmcfg_write,
87}; 96};
88 97
89static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg) 98static void __iomem * __devinit mcfg_ioremap(struct pci_mmcfg_region *cfg)
90{ 99{
91 void __iomem *addr; 100 void __iomem *addr;
92 u64 start, size; 101 u64 start, size;
@@ -105,16 +114,14 @@ int __init pci_mmcfg_arch_init(void)
105{ 114{
106 struct pci_mmcfg_region *cfg; 115 struct pci_mmcfg_region *cfg;
107 116
108 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 117 list_for_each_entry(cfg, &pci_mmcfg_list, list)
109 cfg->virt = mcfg_ioremap(cfg); 118 if (pci_mmcfg_arch_map(cfg)) {
110 if (!cfg->virt) {
111 printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n",
112 &cfg->res);
113 pci_mmcfg_arch_free(); 119 pci_mmcfg_arch_free();
114 return 0; 120 return 0;
115 } 121 }
116 } 122
117 raw_pci_ext_ops = &pci_mmcfg; 123 raw_pci_ext_ops = &pci_mmcfg;
124
118 return 1; 125 return 1;
119} 126}
120 127
@@ -122,10 +129,25 @@ void __init pci_mmcfg_arch_free(void)
122{ 129{
123 struct pci_mmcfg_region *cfg; 130 struct pci_mmcfg_region *cfg;
124 131
125 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 132 list_for_each_entry(cfg, &pci_mmcfg_list, list)
126 if (cfg->virt) { 133 pci_mmcfg_arch_unmap(cfg);
127 iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus)); 134}
128 cfg->virt = NULL; 135
129 } 136int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg)
137{
138 cfg->virt = mcfg_ioremap(cfg);
139 if (!cfg->virt) {
140 pr_err(PREFIX "can't map MMCONFIG at %pR\n", &cfg->res);
141 return -ENOMEM;
142 }
143
144 return 0;
145}
146
147void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg)
148{
149 if (cfg && cfg->virt) {
150 iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
151 cfg->virt = NULL;
130 } 152 }
131} 153}
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 140942f66b3..e14a2ff708b 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -264,7 +264,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup);
264 264
265static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev) 265static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev)
266{ 266{
267 pci_set_power_state(dev, PCI_D3cold); 267 pci_set_power_state(dev, PCI_D3hot);
268} 268}
269DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev); 269DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev);
270DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev); 270DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev);
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 7415aa92791..56ab74989cf 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -64,6 +64,10 @@ static int xen_register_pirq(u32 gsi, int gsi_override, int triggering,
64 int shareable = 0; 64 int shareable = 0;
65 char *name; 65 char *name;
66 66
67 irq = xen_irq_from_gsi(gsi);
68 if (irq > 0)
69 return irq;
70
67 if (set_pirq) 71 if (set_pirq)
68 pirq = gsi; 72 pirq = gsi;
69 73
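The early return makes registration idempotent: if the GSI is already bound to an IRQ, xen_register_pirq() hands back the existing mapping instead of allocating a second pirq for the same GSI. The shape of the pattern, with hypothetical helper names:

static int register_gsi(u32 gsi)
{
	int irq = lookup_irq(gsi);	/* existing binding? */
	if (irq > 0)
		return irq;		/* yes: reuse it */
	return allocate_and_bind(gsi);	/* no: slow path */
}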
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 3c6e328483c..028454f0c3a 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -110,19 +110,16 @@ static struct kmsg_dumper dw_dumper;
110static int dumper_registered; 110static int dumper_registered;
111 111
112static void dw_kmsg_dump(struct kmsg_dumper *dumper, 112static void dw_kmsg_dump(struct kmsg_dumper *dumper,
113 enum kmsg_dump_reason reason, 113 enum kmsg_dump_reason reason)
114 const char *s1, unsigned long l1,
115 const char *s2, unsigned long l2)
116{ 114{
117 int i; 115 static char line[1024];
116 size_t len;
118 117
 119	/* When we get here, we'd better re-init the HW */	118	/* When we get here, we'd better re-init the HW */
120 mrst_early_console_init(); 119 mrst_early_console_init();
121 120
122 for (i = 0; i < l1; i++) 121 while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
123 early_mrst_console.write(&early_mrst_console, s1 + i, 1); 122 early_mrst_console.write(&early_mrst_console, line, len);
124 for (i = 0; i < l2; i++)
125 early_mrst_console.write(&early_mrst_console, s2 + i, 1);
126} 123}
127 124
128/* Set the baud rate to 115200, 8n1, IRQ disabled */ 125
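This hunk follows the kmsg_dump interface change: dumpers no longer receive two raw buffer/length pairs, but pull complete records with kmsg_dump_get_line(), which copies one log line into the caller's buffer and returns false once the log is drained. A minimal dumper in the new style (emit() stands in for the console write):

#include <linux/kmsg_dump.h>

static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
{
	static char line[256];
	size_t len;

	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
		emit(line, len);	/* hypothetical output routine */
}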
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index e31bcd8f2ee..fd41a9262d6 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -782,7 +782,7 @@ BLOCKING_NOTIFIER_HEAD(intel_scu_notifier);
782EXPORT_SYMBOL_GPL(intel_scu_notifier); 782EXPORT_SYMBOL_GPL(intel_scu_notifier);
783 783
784/* Called by IPC driver */ 784/* Called by IPC driver */
785void intel_scu_devices_create(void) 785void __devinit intel_scu_devices_create(void)
786{ 786{
787 int i; 787 int i;
788 788
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 23e5b9d7977..599be499fdf 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -203,7 +203,7 @@ static int xo15_sci_remove(struct acpi_device *device, int type)
203 return 0; 203 return 0;
204} 204}
205 205
206static int xo15_sci_resume(struct acpi_device *device) 206static int xo15_sci_resume(struct device *dev)
207{ 207{
208 /* Enable all EC events */ 208 /* Enable all EC events */
209 olpc_ec_mask_write(EC_SCI_SRC_ALL); 209 olpc_ec_mask_write(EC_SCI_SRC_ALL);
@@ -215,6 +215,8 @@ static int xo15_sci_resume(struct acpi_device *device)
215 return 0; 215 return 0;
216} 216}
217 217
218static SIMPLE_DEV_PM_OPS(xo15_sci_pm, NULL, xo15_sci_resume);
219
218static const struct acpi_device_id xo15_sci_device_ids[] = { 220static const struct acpi_device_id xo15_sci_device_ids[] = {
219 {"XO15EC", 0}, 221 {"XO15EC", 0},
220 {"", 0}, 222 {"", 0},
@@ -227,8 +229,8 @@ static struct acpi_driver xo15_sci_drv = {
227 .ops = { 229 .ops = {
228 .add = xo15_sci_add, 230 .add = xo15_sci_add,
229 .remove = xo15_sci_remove, 231 .remove = xo15_sci_remove,
230 .resume = xo15_sci_resume,
231 }, 232 },
233 .drv.pm = &xo15_sci_pm,
232}; 234};
233 235
234static int __init xo15_sci_init(void) 236static int __init xo15_sci_init(void)
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 3ae0e61abd2..71b5d5a07d7 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * SGI UltraViolet TLB flush routines. 2 * SGI UltraViolet TLB flush routines.
3 * 3 *
4 * (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI. 4 * (c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.
5 * 5 *
6 * This code is released under the GNU General Public License version 2 or 6 * This code is released under the GNU General Public License version 2 or
7 * later. 7 * later.
@@ -38,8 +38,7 @@ static int timeout_base_ns[] = {
38 38
39static int timeout_us; 39static int timeout_us;
40static int nobau; 40static int nobau;
41static int baudisabled; 41static int nobau_perm;
42static spinlock_t disable_lock;
43static cycles_t congested_cycles; 42static cycles_t congested_cycles;
44 43
45/* tunables: */ 44/* tunables: */
@@ -47,12 +46,13 @@ static int max_concurr = MAX_BAU_CONCURRENT;
47static int max_concurr_const = MAX_BAU_CONCURRENT; 46static int max_concurr_const = MAX_BAU_CONCURRENT;
48static int plugged_delay = PLUGGED_DELAY; 47static int plugged_delay = PLUGGED_DELAY;
49static int plugsb4reset = PLUGSB4RESET; 48static int plugsb4reset = PLUGSB4RESET;
49static int giveup_limit = GIVEUP_LIMIT;
50static int timeoutsb4reset = TIMEOUTSB4RESET; 50static int timeoutsb4reset = TIMEOUTSB4RESET;
51static int ipi_reset_limit = IPI_RESET_LIMIT; 51static int ipi_reset_limit = IPI_RESET_LIMIT;
52static int complete_threshold = COMPLETE_THRESHOLD; 52static int complete_threshold = COMPLETE_THRESHOLD;
53static int congested_respns_us = CONGESTED_RESPONSE_US; 53static int congested_respns_us = CONGESTED_RESPONSE_US;
54static int congested_reps = CONGESTED_REPS; 54static int congested_reps = CONGESTED_REPS;
55static int congested_period = CONGESTED_PERIOD; 55static int disabled_period = DISABLED_PERIOD;
56 56
57static struct tunables tunables[] = { 57static struct tunables tunables[] = {
58 {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ 58 {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
@@ -63,7 +63,8 @@ static struct tunables tunables[] = {
63 {&complete_threshold, COMPLETE_THRESHOLD}, 63 {&complete_threshold, COMPLETE_THRESHOLD},
64 {&congested_respns_us, CONGESTED_RESPONSE_US}, 64 {&congested_respns_us, CONGESTED_RESPONSE_US},
65 {&congested_reps, CONGESTED_REPS}, 65 {&congested_reps, CONGESTED_REPS},
66 {&congested_period, CONGESTED_PERIOD} 66 {&disabled_period, DISABLED_PERIOD},
67 {&giveup_limit, GIVEUP_LIMIT}
67}; 68};
68 69
69static struct dentry *tunables_dir; 70static struct dentry *tunables_dir;
@@ -120,6 +121,40 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
120static DEFINE_PER_CPU(struct bau_control, bau_control); 121static DEFINE_PER_CPU(struct bau_control, bau_control);
121static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); 122static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
122 123
124static void
125set_bau_on(void)
126{
127 int cpu;
128 struct bau_control *bcp;
129
130 if (nobau_perm) {
131 pr_info("BAU not initialized; cannot be turned on\n");
132 return;
133 }
134 nobau = 0;
135 for_each_present_cpu(cpu) {
136 bcp = &per_cpu(bau_control, cpu);
137 bcp->nobau = 0;
138 }
139 pr_info("BAU turned on\n");
140 return;
141}
142
143static void
144set_bau_off(void)
145{
146 int cpu;
147 struct bau_control *bcp;
148
149 nobau = 1;
150 for_each_present_cpu(cpu) {
151 bcp = &per_cpu(bau_control, cpu);
152 bcp->nobau = 1;
153 }
154 pr_info("BAU turned off\n");
155 return;
156}
157
123/* 158/*
124 * Determine the first node on a uvhub. 'Nodes' are used for kernel 159 * Determine the first node on a uvhub. 'Nodes' are used for kernel
125 * memory allocation. 160 * memory allocation.
@@ -278,7 +313,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
278 * Both sockets dump their completed count total into 313 * Both sockets dump their completed count total into
279 * the message's count. 314 * the message's count.
280 */ 315 */
281 smaster->socket_acknowledge_count[mdp->msg_slot] = 0; 316 *sp = 0;
282 asp = (struct atomic_short *)&msg->acknowledge_count; 317 asp = (struct atomic_short *)&msg->acknowledge_count;
283 msg_ack_count = atom_asr(socket_ack_count, asp); 318 msg_ack_count = atom_asr(socket_ack_count, asp);
284 319
@@ -491,16 +526,15 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
491} 526}
492 527
493/* 528/*
494 * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. 529 * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
 530 * But it is not currently used.
495 */ 531 */
496static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) 532static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
497{ 533{
498 unsigned long descriptor_status; 534 unsigned long descriptor_status;
499 unsigned long descriptor_status2;
500 535
501 descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); 536 descriptor_status =
502 descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; 537 ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
503 descriptor_status = (descriptor_status << 1) | descriptor_status2;
504 return descriptor_status; 538 return descriptor_status;
505} 539}
506 540
@@ -531,87 +565,11 @@ int normal_busy(struct bau_control *bcp)
531 */ 565 */
532int handle_uv2_busy(struct bau_control *bcp) 566int handle_uv2_busy(struct bau_control *bcp)
533{ 567{
534 int busy_one = bcp->using_desc;
535 int normal = bcp->uvhub_cpu;
536 int selected = -1;
537 int i;
538 unsigned long descriptor_status;
539 unsigned long status;
540 int mmr_offset;
541 struct bau_desc *bau_desc_old;
542 struct bau_desc *bau_desc_new;
543 struct bau_control *hmaster = bcp->uvhub_master;
544 struct ptc_stats *stat = bcp->statp; 568 struct ptc_stats *stat = bcp->statp;
545 cycles_t ttm;
546 569
547 stat->s_uv2_wars++; 570 stat->s_uv2_wars++;
548 spin_lock(&hmaster->uvhub_lock); 571 bcp->busy = 1;
549 /* try for the original first */ 572 return FLUSH_GIVEUP;
550 if (busy_one != normal) {
551 if (!normal_busy(bcp))
552 selected = normal;
553 }
554 if (selected < 0) {
555 /* can't use the normal, select an alternate */
556 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
557 descriptor_status = read_lmmr(mmr_offset);
558
559 /* scan available descriptors 32-63 */
560 for (i = 0; i < UV_CPUS_PER_AS; i++) {
561 if ((hmaster->inuse_map & (1 << i)) == 0) {
562 status = ((descriptor_status >>
563 (i * UV_ACT_STATUS_SIZE)) &
564 UV_ACT_STATUS_MASK) << 1;
565 if (status != UV2H_DESC_BUSY) {
566 selected = i + UV_CPUS_PER_AS;
567 break;
568 }
569 }
570 }
571 }
572
573 if (busy_one != normal)
574 /* mark the busy alternate as not in-use */
575 hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS));
576
577 if (selected >= 0) {
578 /* switch to the selected descriptor */
579 if (selected != normal) {
580 /* set the selected alternate as in-use */
581 hmaster->inuse_map |=
582 (1 << (selected - UV_CPUS_PER_AS));
583 if (selected > stat->s_uv2_wars_hw)
584 stat->s_uv2_wars_hw = selected;
585 }
586 bau_desc_old = bcp->descriptor_base;
587 bau_desc_old += (ITEMS_PER_DESC * busy_one);
588 bcp->using_desc = selected;
589 bau_desc_new = bcp->descriptor_base;
590 bau_desc_new += (ITEMS_PER_DESC * selected);
591 *bau_desc_new = *bau_desc_old;
592 } else {
593 /*
594 * All are busy. Wait for the normal one for this cpu to
595 * free up.
596 */
597 stat->s_uv2_war_waits++;
598 spin_unlock(&hmaster->uvhub_lock);
599 ttm = get_cycles();
600 do {
601 cpu_relax();
602 } while (normal_busy(bcp));
603 spin_lock(&hmaster->uvhub_lock);
604 /* switch to the original descriptor */
605 bcp->using_desc = normal;
606 bau_desc_old = bcp->descriptor_base;
607 bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc);
608 bcp->using_desc = (ITEMS_PER_DESC * normal);
609 bau_desc_new = bcp->descriptor_base;
610 bau_desc_new += (ITEMS_PER_DESC * normal);
611 *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */
612 }
613 spin_unlock(&hmaster->uvhub_lock);
614 return FLUSH_RETRY_BUSYBUG;
615} 573}
616 574
617static int uv2_wait_completion(struct bau_desc *bau_desc, 575static int uv2_wait_completion(struct bau_desc *bau_desc,
@@ -620,7 +578,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
620{ 578{
621 unsigned long descriptor_stat; 579 unsigned long descriptor_stat;
622 cycles_t ttm; 580 cycles_t ttm;
623 int desc = bcp->using_desc; 581 int desc = bcp->uvhub_cpu;
624 long busy_reps = 0; 582 long busy_reps = 0;
625 struct ptc_stats *stat = bcp->statp; 583 struct ptc_stats *stat = bcp->statp;
626 584
@@ -628,24 +586,38 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
628 586
629 /* spin on the status MMR, waiting for it to go idle */ 587 /* spin on the status MMR, waiting for it to go idle */
630 while (descriptor_stat != UV2H_DESC_IDLE) { 588 while (descriptor_stat != UV2H_DESC_IDLE) {
631 /* 589 if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) {
632 * Our software ack messages may be blocked because 590 /*
633 * there are no swack resources available. As long 591 * A h/w bug on the destination side may
634 * as none of them has timed out hardware will NACK 592 * have prevented the message being marked
635 * our message and its state will stay IDLE. 593 * pending, thus it doesn't get replied to
636 */ 594 * and gets continually nacked until it times
637 if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || 595 * out with a SOURCE_TIMEOUT.
638 (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { 596 */
639 stat->s_stimeout++; 597 stat->s_stimeout++;
640 return FLUSH_GIVEUP; 598 return FLUSH_GIVEUP;
641 } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) {
642 stat->s_strongnacks++;
643 bcp->conseccompletes = 0;
644 return FLUSH_GIVEUP;
645 } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { 599 } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
600 ttm = get_cycles();
601
602 /*
603 * Our retries may be blocked by all destination
604 * swack resources being consumed, and a timeout
605 * pending. In that case hardware returns the
606 * ERROR that looks like a destination timeout.
607 * Without using the extended status we have to
608 * deduce from the short time that this was a
609 * strong nack.
610 */
611 if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
612 bcp->conseccompletes = 0;
613 stat->s_plugged++;
614 /* FLUSH_RETRY_PLUGGED causes hang on boot */
615 return FLUSH_GIVEUP;
616 }
646 stat->s_dtimeout++; 617 stat->s_dtimeout++;
647 bcp->conseccompletes = 0; 618 bcp->conseccompletes = 0;
648 return FLUSH_RETRY_TIMEOUT; 619 /* FLUSH_RETRY_TIMEOUT causes hang on boot */
620 return FLUSH_GIVEUP;
649 } else { 621 } else {
650 busy_reps++; 622 busy_reps++;
651 if (busy_reps > 1000000) { 623 if (busy_reps > 1000000) {
@@ -653,9 +625,8 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
653 busy_reps = 0; 625 busy_reps = 0;
654 ttm = get_cycles(); 626 ttm = get_cycles();
655 if ((ttm - bcp->send_message) > 627 if ((ttm - bcp->send_message) >
656 (bcp->clocks_per_100_usec)) { 628 bcp->timeout_interval)
657 return handle_uv2_busy(bcp); 629 return handle_uv2_busy(bcp);
658 }
659 } 630 }
660 /* 631 /*
661 * descriptor_stat is still BUSY 632 * descriptor_stat is still BUSY
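With the extended status bit disabled as a hardware workaround (see the enable_timeouts() hunk below), a DESTINATION_TIMEOUT status is ambiguous, so the code disambiguates by elapsed time: a genuine destination timeout cannot come back faster than the configured timeout period. The decision, reduced to a sketch with illustrative names:

if (stat == DESC_DEST_TIMEOUT) {
	if (cycles_2_us(get_cycles() - send_time) < timeout_us)
		return GIVEUP_PLUGGED;	/* too quick: really a strong nack */
	return GIVEUP_TIMEOUT;		/* a genuine destination timeout */
}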
@@ -679,7 +650,7 @@ static int wait_completion(struct bau_desc *bau_desc,
679{ 650{
680 int right_shift; 651 int right_shift;
681 unsigned long mmr_offset; 652 unsigned long mmr_offset;
682 int desc = bcp->using_desc; 653 int desc = bcp->uvhub_cpu;
683 654
684 if (desc < UV_CPUS_PER_AS) { 655 if (desc < UV_CPUS_PER_AS) {
685 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; 656 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -758,33 +729,31 @@ static void destination_timeout(struct bau_desc *bau_desc,
758} 729}
759 730
760/* 731/*
761 * Completions are taking a very long time due to a congested numalink 732 * Stop all cpus on a uvhub from using the BAU for a period of time.
762 * network. 733 * This is reversed by check_enable.
763 */ 734 */
764static void disable_for_congestion(struct bau_control *bcp, 735static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
765 struct ptc_stats *stat)
766{ 736{
767 /* let only one cpu do this disabling */ 737 int tcpu;
768 spin_lock(&disable_lock); 738 struct bau_control *tbcp;
769 739 struct bau_control *hmaster;
770 if (!baudisabled && bcp->period_requests && 740 cycles_t tm1;
771 ((bcp->period_time / bcp->period_requests) > congested_cycles)) { 741
772 int tcpu; 742 hmaster = bcp->uvhub_master;
773 struct bau_control *tbcp; 743 spin_lock(&hmaster->disable_lock);
774 /* it becomes this cpu's job to turn on the use of the 744 if (!bcp->baudisabled) {
775 BAU again */
776 baudisabled = 1;
777 bcp->set_bau_off = 1;
778 bcp->set_bau_on_time = get_cycles();
779 bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
780 stat->s_bau_disabled++; 745 stat->s_bau_disabled++;
746 tm1 = get_cycles();
781 for_each_present_cpu(tcpu) { 747 for_each_present_cpu(tcpu) {
782 tbcp = &per_cpu(bau_control, tcpu); 748 tbcp = &per_cpu(bau_control, tcpu);
783 tbcp->baudisabled = 1; 749 if (tbcp->uvhub_master == hmaster) {
750 tbcp->baudisabled = 1;
751 tbcp->set_bau_on_time =
752 tm1 + bcp->disabled_period;
753 }
784 } 754 }
785 } 755 }
786 756 spin_unlock(&hmaster->disable_lock);
787 spin_unlock(&disable_lock);
788} 757}
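Disable and re-enable are now symmetric per-uvhub operations under the hub master's disable_lock: disabling stamps every cpu on the hub with the same wake-up deadline, and check_enable() (below) clears the flag once get_cycles() passes it. A standalone model of that lazy, deadline-based backoff:

#include <stdio.h>

static unsigned long deadline;
static int disabled;

static void disable_for(unsigned long now, unsigned long period)
{
	disabled = 1;
	deadline = now + period;
}

static int may_send(unsigned long now)
{
	if (disabled && now >= deadline)
		disabled = 0;		/* lazy re-enable on the next attempt */
	return !disabled;
}

int main(void)
{
	disable_for(100, 50);
	printf("%d %d\n", may_send(120), may_send(160));	/* 0 1 */
	return 0;
}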
789 758
790static void count_max_concurr(int stat, struct bau_control *bcp, 759static void count_max_concurr(int stat, struct bau_control *bcp,
@@ -815,16 +784,30 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
815 bcp->period_requests++; 784 bcp->period_requests++;
816 bcp->period_time += elapsed; 785 bcp->period_time += elapsed;
817 if ((elapsed > congested_cycles) && 786 if ((elapsed > congested_cycles) &&
818 (bcp->period_requests > bcp->cong_reps)) 787 (bcp->period_requests > bcp->cong_reps) &&
819 disable_for_congestion(bcp, stat); 788 ((bcp->period_time / bcp->period_requests) >
789 congested_cycles)) {
790 stat->s_congested++;
791 disable_for_period(bcp, stat);
792 }
820 } 793 }
821 } else 794 } else
822 stat->s_requestor--; 795 stat->s_requestor--;
823 796
824 if (completion_status == FLUSH_COMPLETE && try > 1) 797 if (completion_status == FLUSH_COMPLETE && try > 1)
825 stat->s_retriesok++; 798 stat->s_retriesok++;
826 else if (completion_status == FLUSH_GIVEUP) 799 else if (completion_status == FLUSH_GIVEUP) {
827 stat->s_giveup++; 800 stat->s_giveup++;
801 if (get_cycles() > bcp->period_end)
802 bcp->period_giveups = 0;
803 bcp->period_giveups++;
804 if (bcp->period_giveups == 1)
805 bcp->period_end = get_cycles() + bcp->disabled_period;
806 if (bcp->period_giveups > bcp->giveup_limit) {
807 disable_for_period(bcp, stat);
808 stat->s_giveuplimit++;
809 }
810 }
828} 811}
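The giveup accounting above is a windowed counter: the window opens on the first giveup, the count resets once the window expires, and exceeding giveup_limit within one window trips disable_for_period(). A standalone model of that logic (window and limit values are made up):

#include <stdio.h>

static unsigned long period_end, period_giveups;
static const unsigned long window = 100, limit = 3;

static int note_giveup(unsigned long now)
{
	if (now > period_end)
		period_giveups = 0;	/* window expired: start over */
	period_giveups++;
	if (period_giveups == 1)
		period_end = now + window;	/* first event opens window */
	return period_giveups > limit;	/* 1: trip the breaker */
}

int main(void)
{
	unsigned long t[] = { 0, 10, 20, 30, 250 };
	for (int i = 0; i < 5; i++)
		printf("t=%lu trip=%d\n", t[i], note_giveup(t[i]));
	return 0;
}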
829 812
830/* 813/*
@@ -868,7 +851,8 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
868 * Returns 1 if it gives up entirely and the original cpu mask is to be 851 * Returns 1 if it gives up entirely and the original cpu mask is to be
869 * returned to the kernel. 852 * returned to the kernel.
870 */ 853 */
871int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) 854int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
855 struct bau_desc *bau_desc)
872{ 856{
873 int seq_number = 0; 857 int seq_number = 0;
874 int completion_stat = 0; 858 int completion_stat = 0;
@@ -881,24 +865,23 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
881 struct bau_control *hmaster = bcp->uvhub_master; 865 struct bau_control *hmaster = bcp->uvhub_master;
882 struct uv1_bau_msg_header *uv1_hdr = NULL; 866 struct uv1_bau_msg_header *uv1_hdr = NULL;
883 struct uv2_bau_msg_header *uv2_hdr = NULL; 867 struct uv2_bau_msg_header *uv2_hdr = NULL;
884 struct bau_desc *bau_desc;
885 868
886 if (bcp->uvhub_version == 1) 869 if (bcp->uvhub_version == 1) {
870 uv1 = 1;
887 uv1_throttle(hmaster, stat); 871 uv1_throttle(hmaster, stat);
872 }
888 873
889 while (hmaster->uvhub_quiesce) 874 while (hmaster->uvhub_quiesce)
890 cpu_relax(); 875 cpu_relax();
891 876
892 time1 = get_cycles(); 877 time1 = get_cycles();
878 if (uv1)
879 uv1_hdr = &bau_desc->header.uv1_hdr;
880 else
881 uv2_hdr = &bau_desc->header.uv2_hdr;
882
893 do { 883 do {
894 bau_desc = bcp->descriptor_base; 884 if (try == 0) {
895 bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
896 if (bcp->uvhub_version == 1) {
897 uv1 = 1;
898 uv1_hdr = &bau_desc->header.uv1_hdr;
899 } else
900 uv2_hdr = &bau_desc->header.uv2_hdr;
901 if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) {
902 if (uv1) 885 if (uv1)
903 uv1_hdr->msg_type = MSG_REGULAR; 886 uv1_hdr->msg_type = MSG_REGULAR;
904 else 887 else
@@ -916,25 +899,24 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
916 uv1_hdr->sequence = seq_number; 899 uv1_hdr->sequence = seq_number;
917 else 900 else
918 uv2_hdr->sequence = seq_number; 901 uv2_hdr->sequence = seq_number;
919 index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; 902 index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
920 bcp->send_message = get_cycles(); 903 bcp->send_message = get_cycles();
921 904
922 write_mmr_activation(index); 905 write_mmr_activation(index);
923 906
924 try++; 907 try++;
925 completion_stat = wait_completion(bau_desc, bcp, try); 908 completion_stat = wait_completion(bau_desc, bcp, try);
926 /* UV2: wait_completion() may change the bcp->using_desc */
927 909
928 handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); 910 handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
929 911
930 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { 912 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
931 bcp->ipi_attempts = 0; 913 bcp->ipi_attempts = 0;
914 stat->s_overipilimit++;
932 completion_stat = FLUSH_GIVEUP; 915 completion_stat = FLUSH_GIVEUP;
933 break; 916 break;
934 } 917 }
935 cpu_relax(); 918 cpu_relax();
936 } while ((completion_stat == FLUSH_RETRY_PLUGGED) || 919 } while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
937 (completion_stat == FLUSH_RETRY_BUSYBUG) ||
938 (completion_stat == FLUSH_RETRY_TIMEOUT)); 920 (completion_stat == FLUSH_RETRY_TIMEOUT));
939 921
940 time2 = get_cycles(); 922 time2 = get_cycles();
@@ -955,28 +937,33 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
955} 937}
956 938
957/* 939/*
958 * The BAU is disabled. When the disabled time period has expired, the cpu 940 * The BAU is disabled for this uvhub. When the disabled time period has
 959 * that disabled it must re-enable it. 941 * expired, re-enable it.
960 * Return 0 if it is re-enabled for all cpus. 942 * Return 0 if it is re-enabled for all cpus on this uvhub.
961 */ 943 */
962static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) 944static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
963{ 945{
964 int tcpu; 946 int tcpu;
965 struct bau_control *tbcp; 947 struct bau_control *tbcp;
948 struct bau_control *hmaster;
966 949
967 if (bcp->set_bau_off) { 950 hmaster = bcp->uvhub_master;
968 if (get_cycles() >= bcp->set_bau_on_time) { 951 spin_lock(&hmaster->disable_lock);
969 stat->s_bau_reenabled++; 952 if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
970 baudisabled = 0; 953 stat->s_bau_reenabled++;
971 for_each_present_cpu(tcpu) { 954 for_each_present_cpu(tcpu) {
972 tbcp = &per_cpu(bau_control, tcpu); 955 tbcp = &per_cpu(bau_control, tcpu);
956 if (tbcp->uvhub_master == hmaster) {
973 tbcp->baudisabled = 0; 957 tbcp->baudisabled = 0;
974 tbcp->period_requests = 0; 958 tbcp->period_requests = 0;
975 tbcp->period_time = 0; 959 tbcp->period_time = 0;
960 tbcp->period_giveups = 0;
976 } 961 }
977 return 0;
978 } 962 }
963 spin_unlock(&hmaster->disable_lock);
964 return 0;
979 } 965 }
966 spin_unlock(&hmaster->disable_lock);
980 return -1; 967 return -1;
981} 968}
982 969
@@ -1078,18 +1065,32 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1078 struct cpumask *flush_mask; 1065 struct cpumask *flush_mask;
1079 struct ptc_stats *stat; 1066 struct ptc_stats *stat;
1080 struct bau_control *bcp; 1067 struct bau_control *bcp;
1081 1068 unsigned long descriptor_status;
1082 /* kernel was booted 'nobau' */ 1069 unsigned long status;
1083 if (nobau)
1084 return cpumask;
1085 1070
1086 bcp = &per_cpu(bau_control, cpu); 1071 bcp = &per_cpu(bau_control, cpu);
1087 stat = bcp->statp; 1072 stat = bcp->statp;
1073 stat->s_enters++;
1074
1075 if (bcp->nobau)
1076 return cpumask;
1077
1078 if (bcp->busy) {
1079 descriptor_status =
1080 read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0);
1081 status = ((descriptor_status >> (bcp->uvhub_cpu *
1082 UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1;
1083 if (status == UV2H_DESC_BUSY)
1084 return cpumask;
1085 bcp->busy = 0;
1086 }
1088 1087
1089 /* bau was disabled due to slow response */ 1088 /* bau was disabled due to slow response */
1090 if (bcp->baudisabled) { 1089 if (bcp->baudisabled) {
1091 if (check_enable(bcp, stat)) 1090 if (check_enable(bcp, stat)) {
1091 stat->s_ipifordisabled++;
1092 return cpumask; 1092 return cpumask;
1093 }
1093 } 1094 }
1094 1095
1095 /* 1096 /*
@@ -1105,7 +1106,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1105 stat->s_ntargself++; 1106 stat->s_ntargself++;
1106 1107
1107 bau_desc = bcp->descriptor_base; 1108 bau_desc = bcp->descriptor_base;
1108 bau_desc += (ITEMS_PER_DESC * bcp->using_desc); 1109 bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
1109 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 1110 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
1110 if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) 1111 if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
1111 return NULL; 1112 return NULL;
@@ -1118,25 +1119,27 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1118 * uv_flush_send_and_wait returns 0 if all cpu's were messaged, 1119 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
1119 * or 1 if it gave up and the original cpumask should be returned. 1120 * or 1 if it gave up and the original cpumask should be returned.
1120 */ 1121 */
1121 if (!uv_flush_send_and_wait(flush_mask, bcp)) 1122 if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc))
1122 return NULL; 1123 return NULL;
1123 else 1124 else
1124 return cpumask; 1125 return cpumask;
1125} 1126}
1126 1127
1127/* 1128/*
1128 * Search the message queue for any 'other' message with the same software 1129 * Search the message queue for any 'other' unprocessed message with the
1129 * acknowledge resource bit vector. 1130 * same software acknowledge resource bit vector as the 'msg' message.
1130 */ 1131 */
1131struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, 1132struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
1132 struct bau_control *bcp, unsigned char swack_vec) 1133 struct bau_control *bcp)
1133{ 1134{
1134 struct bau_pq_entry *msg_next = msg + 1; 1135 struct bau_pq_entry *msg_next = msg + 1;
1136 unsigned char swack_vec = msg->swack_vec;
1135 1137
1136 if (msg_next > bcp->queue_last) 1138 if (msg_next > bcp->queue_last)
1137 msg_next = bcp->queue_first; 1139 msg_next = bcp->queue_first;
1138 while ((msg_next->swack_vec != 0) && (msg_next != msg)) { 1140 while (msg_next != msg) {
1139 if (msg_next->swack_vec == swack_vec) 1141 if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) &&
1142 (msg_next->swack_vec == swack_vec))
1140 return msg_next; 1143 return msg_next;
1141 msg_next++; 1144 msg_next++;
1142 if (msg_next > bcp->queue_last) 1145 if (msg_next > bcp->queue_last)
@@ -1165,32 +1168,30 @@ void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
1165 * This message was assigned a swack resource, but no 1168 * This message was assigned a swack resource, but no
 1166 * reserved acknowledgment is pending. 1169 * reserved acknowledgment is pending.
1167 * The bug has prevented this message from setting the MMR. 1170 * The bug has prevented this message from setting the MMR.
1168 * And no other message has used the same sw_ack resource.
1169 * Do the requested shootdown but do not reply to the msg.
1170 * (the 0 means make no acknowledge)
1171 */ 1171 */
1172 bau_process_message(mdp, bcp, 0);
1173 return;
1174 }
1175
1176 /*
1177 * Some message has set the MMR 'pending' bit; it might have been
1178 * another message. Look for that message.
1179 */
1180 other_msg = find_another_by_swack(msg, bcp, msg->swack_vec);
1181 if (other_msg) {
1182 /* There is another. Do not ack the current one. */
1183 bau_process_message(mdp, bcp, 0);
1184 /* 1172 /*
1185 * Let the natural processing of that message acknowledge 1173 * Some message has set the MMR 'pending' bit; it might have
1186 * it. Don't get the processing of sw_ack's out of order. 1174 * been another message. Look for that message.
1187 */ 1175 */
1188 return; 1176 other_msg = find_another_by_swack(msg, bcp);
1177 if (other_msg) {
1178 /*
1179 * There is another. Process this one but do not
1180 * ack it.
1181 */
1182 bau_process_message(mdp, bcp, 0);
1183 /*
1184 * Let the natural processing of that other message
1185 * acknowledge it. Don't get the processing of sw_ack's
1186 * out of order.
1187 */
1188 return;
1189 }
1189 } 1190 }
1190 1191
1191 /* 1192 /*
1192 * There is no other message using this sw_ack, so it is safe to 1193 * Either the MMR shows this one pending a reply or there is no
1193 * acknowledge it. 1194 * other message using this sw_ack, so it is safe to acknowledge it.
1194 */ 1195 */
1195 bau_process_message(mdp, bcp, 1); 1196 bau_process_message(mdp, bcp, 1);
1196 1197
@@ -1295,8 +1296,8 @@ static void __init enable_timeouts(void)
1295 */ 1296 */
1296 mmr_image |= (1L << SOFTACK_MSHIFT); 1297 mmr_image |= (1L << SOFTACK_MSHIFT);
1297 if (is_uv2_hub()) { 1298 if (is_uv2_hub()) {
1298 mmr_image &= ~(1L << UV2_LEG_SHFT); 1299 /* hw bug workaround; do not use extended status */
1299 mmr_image |= (1L << UV2_EXT_SHFT); 1300 mmr_image &= ~(1L << UV2_EXT_SHFT);
1300 } 1301 }
1301 write_mmr_misc_control(pnode, mmr_image); 1302 write_mmr_misc_control(pnode, mmr_image);
1302 } 1303 }
@@ -1339,29 +1340,34 @@ static inline unsigned long long usec_2_cycles(unsigned long microsec)
1339static int ptc_seq_show(struct seq_file *file, void *data) 1340static int ptc_seq_show(struct seq_file *file, void *data)
1340{ 1341{
1341 struct ptc_stats *stat; 1342 struct ptc_stats *stat;
1343 struct bau_control *bcp;
1342 int cpu; 1344 int cpu;
1343 1345
1344 cpu = *(loff_t *)data; 1346 cpu = *(loff_t *)data;
1345 if (!cpu) { 1347 if (!cpu) {
1346 seq_printf(file, 1348 seq_printf(file,
1347 "# cpu sent stime self locals remotes ncpus localhub "); 1349 "# cpu bauoff sent stime self locals remotes ncpus localhub ");
1348 seq_printf(file, 1350 seq_printf(file,
1349 "remotehub numuvhubs numuvhubs16 numuvhubs8 "); 1351 "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
1350 seq_printf(file, 1352 seq_printf(file,
1351 "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); 1353 "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries ");
1354 seq_printf(file,
1355 "rok resetp resett giveup sto bz throt disable ");
1352 seq_printf(file, 1356 seq_printf(file,
1353 "resetp resett giveup sto bz throt swack recv rtime "); 1357 "enable wars warshw warwaits enters ipidis plugged ");
1354 seq_printf(file, 1358 seq_printf(file,
1355 "all one mult none retry canc nocan reset rcan "); 1359 "ipiover glim cong swack recv rtime all one mult ");
1356 seq_printf(file, 1360 seq_printf(file,
1357 "disable enable wars warshw warwaits\n"); 1361 "none retry canc nocan reset rcan\n");
1358 } 1362 }
1359 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 1363 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
1360 stat = &per_cpu(ptcstats, cpu); 1364 bcp = &per_cpu(bau_control, cpu);
1365 stat = bcp->statp;
1361 /* source side statistics */ 1366 /* source side statistics */
1362 seq_printf(file, 1367 seq_printf(file,
1363 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 1368 "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
1364 cpu, stat->s_requestor, cycles_2_us(stat->s_time), 1369 cpu, bcp->nobau, stat->s_requestor,
1370 cycles_2_us(stat->s_time),
1365 stat->s_ntargself, stat->s_ntarglocals, 1371 stat->s_ntargself, stat->s_ntarglocals,
1366 stat->s_ntargremotes, stat->s_ntargcpu, 1372 stat->s_ntargremotes, stat->s_ntargcpu,
1367 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, 1373 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
@@ -1375,20 +1381,23 @@ static int ptc_seq_show(struct seq_file *file, void *data)
1375 stat->s_resets_plug, stat->s_resets_timeout, 1381 stat->s_resets_plug, stat->s_resets_timeout,
1376 stat->s_giveup, stat->s_stimeout, 1382 stat->s_giveup, stat->s_stimeout,
1377 stat->s_busy, stat->s_throttles); 1383 stat->s_busy, stat->s_throttles);
1384 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
1385 stat->s_bau_disabled, stat->s_bau_reenabled,
1386 stat->s_uv2_wars, stat->s_uv2_wars_hw,
1387 stat->s_uv2_war_waits, stat->s_enters,
1388 stat->s_ipifordisabled, stat->s_plugged,
1389 stat->s_overipilimit, stat->s_giveuplimit,
1390 stat->s_congested);
1378 1391
1379 /* destination side statistics */ 1392 /* destination side statistics */
1380 seq_printf(file, 1393 seq_printf(file,
1381 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 1394 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
1382 read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)), 1395 read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
1383 stat->d_requestee, cycles_2_us(stat->d_time), 1396 stat->d_requestee, cycles_2_us(stat->d_time),
1384 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, 1397 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
1385 stat->d_nomsg, stat->d_retries, stat->d_canceled, 1398 stat->d_nomsg, stat->d_retries, stat->d_canceled,
1386 stat->d_nocanceled, stat->d_resets, 1399 stat->d_nocanceled, stat->d_resets,
1387 stat->d_rcanceled); 1400 stat->d_rcanceled);
1388 seq_printf(file, "%ld %ld %ld %ld %ld\n",
1389 stat->s_bau_disabled, stat->s_bau_reenabled,
1390 stat->s_uv2_wars, stat->s_uv2_wars_hw,
1391 stat->s_uv2_war_waits);
1392 } 1401 }
1393 return 0; 1402 return 0;
1394} 1403}
@@ -1402,13 +1411,14 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf,
1402 char *buf; 1411 char *buf;
1403 int ret; 1412 int ret;
1404 1413
1405 buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", 1414 buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n",
1406 "max_concur plugged_delay plugsb4reset", 1415 "max_concur plugged_delay plugsb4reset timeoutsb4reset",
1407 "timeoutsb4reset ipi_reset_limit complete_threshold", 1416 "ipi_reset_limit complete_threshold congested_response_us",
1408 "congested_response_us congested_reps congested_period", 1417 "congested_reps disabled_period giveup_limit",
1409 max_concurr, plugged_delay, plugsb4reset, 1418 max_concurr, plugged_delay, plugsb4reset,
1410 timeoutsb4reset, ipi_reset_limit, complete_threshold, 1419 timeoutsb4reset, ipi_reset_limit, complete_threshold,
1411 congested_respns_us, congested_reps, congested_period); 1420 congested_respns_us, congested_reps, disabled_period,
1421 giveup_limit);
1412 1422
1413 if (!buf) 1423 if (!buf)
1414 return -ENOMEM; 1424 return -ENOMEM;
@@ -1439,6 +1449,14 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user,
1439 return -EFAULT; 1449 return -EFAULT;
1440 optstr[count - 1] = '\0'; 1450 optstr[count - 1] = '\0';
1441 1451
1452 if (!strcmp(optstr, "on")) {
1453 set_bau_on();
1454 return count;
1455 } else if (!strcmp(optstr, "off")) {
1456 set_bau_off();
1457 return count;
1458 }
1459
1442 if (strict_strtol(optstr, 10, &input_arg) < 0) { 1460 if (strict_strtol(optstr, 10, &input_arg) < 0) {
1443 printk(KERN_DEBUG "%s is invalid\n", optstr); 1461 printk(KERN_DEBUG "%s is invalid\n", optstr);
1444 return -EINVAL; 1462 return -EINVAL;
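The proc write handler now accepts the keywords "on" and "off" before falling through to the existing numeric debug levels. The parsing convention, as a standalone sketch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_ctl(const char *s, long *num)
{
	if (!strcmp(s, "on"))  return 1;	/* would call set_bau_on()  */
	if (!strcmp(s, "off")) return 2;	/* would call set_bau_off() */
	*num = strtol(s, NULL, 10);
	return 0;				/* numeric argument */
}

int main(void)
{
	long n = 0;
	printf("%d\n", parse_ctl("on", &n));
	printf("%d %ld\n", parse_ctl("42", &n), n);
	return 0;
}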
@@ -1571,7 +1589,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
1571 bcp->complete_threshold = complete_threshold; 1589 bcp->complete_threshold = complete_threshold;
1572 bcp->cong_response_us = congested_respns_us; 1590 bcp->cong_response_us = congested_respns_us;
1573 bcp->cong_reps = congested_reps; 1591 bcp->cong_reps = congested_reps;
1574 bcp->cong_period = congested_period; 1592 bcp->disabled_period = sec_2_cycles(disabled_period);
1593 bcp->giveup_limit = giveup_limit;
1575 } 1594 }
1576 return count; 1595 return count;
1577} 1596}
@@ -1700,6 +1719,10 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
1700 * fairness chaining multilevel count replied_to 1719 * fairness chaining multilevel count replied_to
1701 */ 1720 */
1702 } else { 1721 } else {
1722 /*
1723 * BIOS uses legacy mode, but UV2 hardware always
1724 * uses native mode for selective broadcasts.
1725 */
1703 uv2_hdr = &bd2->header.uv2_hdr; 1726 uv2_hdr = &bd2->header.uv2_hdr;
1704 uv2_hdr->swack_flag = 1; 1727 uv2_hdr->swack_flag = 1;
1705 uv2_hdr->base_dest_nasid = 1728 uv2_hdr->base_dest_nasid =
@@ -1812,8 +1835,8 @@ static int calculate_destination_timeout(void)
1812 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; 1835 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1813 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); 1836 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1814 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; 1837 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1815 base = timeout_base_ns[index]; 1838 ts_ns = timeout_base_ns[index];
1816 ts_ns = base * mult1 * mult2; 1839 ts_ns *= (mult1 * mult2);
1817 ret = ts_ns / 1000; 1840 ret = ts_ns / 1000;
1818 } else { 1841 } else {
1819 /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ 1842 /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */
@@ -1837,6 +1860,8 @@ static void __init init_per_cpu_tunables(void)
1837 for_each_present_cpu(cpu) { 1860 for_each_present_cpu(cpu) {
1838 bcp = &per_cpu(bau_control, cpu); 1861 bcp = &per_cpu(bau_control, cpu);
1839 bcp->baudisabled = 0; 1862 bcp->baudisabled = 0;
1863 if (nobau)
1864 bcp->nobau = 1;
1840 bcp->statp = &per_cpu(ptcstats, cpu); 1865 bcp->statp = &per_cpu(ptcstats, cpu);
1841 /* time interval to catch a hardware stay-busy bug */ 1866 /* time interval to catch a hardware stay-busy bug */
1842 bcp->timeout_interval = usec_2_cycles(2*timeout_us); 1867 bcp->timeout_interval = usec_2_cycles(2*timeout_us);
@@ -1849,10 +1874,11 @@ static void __init init_per_cpu_tunables(void)
1849 bcp->complete_threshold = complete_threshold; 1874 bcp->complete_threshold = complete_threshold;
1850 bcp->cong_response_us = congested_respns_us; 1875 bcp->cong_response_us = congested_respns_us;
1851 bcp->cong_reps = congested_reps; 1876 bcp->cong_reps = congested_reps;
1852 bcp->cong_period = congested_period; 1877 bcp->disabled_period = sec_2_cycles(disabled_period);
1853 bcp->clocks_per_100_usec = usec_2_cycles(100); 1878 bcp->giveup_limit = giveup_limit;
1854 spin_lock_init(&bcp->queue_lock); 1879 spin_lock_init(&bcp->queue_lock);
1855 spin_lock_init(&bcp->uvhub_lock); 1880 spin_lock_init(&bcp->uvhub_lock);
1881 spin_lock_init(&bcp->disable_lock);
1856 } 1882 }
1857} 1883}
1858 1884
@@ -1973,7 +1999,6 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
1973 } 1999 }
1974 bcp->uvhub_master = *hmasterp; 2000 bcp->uvhub_master = *hmasterp;
1975 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; 2001 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
1976 bcp->using_desc = bcp->uvhub_cpu;
1977 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { 2002 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
1978 printk(KERN_EMERG "%d cpus per uvhub invalid\n", 2003 printk(KERN_EMERG "%d cpus per uvhub invalid\n",
1979 bcp->uvhub_cpu); 2004 bcp->uvhub_cpu);
@@ -2070,16 +2095,12 @@ static int __init uv_bau_init(void)
2070 if (!is_uv_system()) 2095 if (!is_uv_system())
2071 return 0; 2096 return 0;
2072 2097
2073 if (nobau)
2074 return 0;
2075
2076 for_each_possible_cpu(cur_cpu) { 2098 for_each_possible_cpu(cur_cpu) {
2077 mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); 2099 mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
2078 zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); 2100 zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
2079 } 2101 }
2080 2102
2081 nuvhubs = uv_num_possible_blades(); 2103 nuvhubs = uv_num_possible_blades();
2082 spin_lock_init(&disable_lock);
2083 congested_cycles = usec_2_cycles(congested_respns_us); 2104 congested_cycles = usec_2_cycles(congested_respns_us);
2084 2105
2085 uv_base_pnode = 0x7fffffff; 2106 uv_base_pnode = 0x7fffffff;
@@ -2092,7 +2113,8 @@ static int __init uv_bau_init(void)
2092 enable_timeouts(); 2113 enable_timeouts();
2093 2114
2094 if (init_per_cpu(nuvhubs, uv_base_pnode)) { 2115 if (init_per_cpu(nuvhubs, uv_base_pnode)) {
2095 nobau = 1; 2116 set_bau_off();
2117 nobau_perm = 1;
2096 return 0; 2118 return 0;
2097 } 2119 }
2098 2120
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index a22c41656b5..acf7752da95 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -135,6 +135,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
135 unsigned long mmr_value; 135 unsigned long mmr_value;
136 struct uv_IO_APIC_route_entry *entry; 136 struct uv_IO_APIC_route_entry *entry;
137 int mmr_pnode, err; 137 int mmr_pnode, err;
138 unsigned int dest;
138 139
139 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != 140 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
140 sizeof(unsigned long)); 141 sizeof(unsigned long));
@@ -143,6 +144,10 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
143 if (err != 0) 144 if (err != 0)
144 return err; 145 return err;
145 146
147 err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest);
148 if (err != 0)
149 return err;
150
146 if (limit == UV_AFFINITY_CPU) 151 if (limit == UV_AFFINITY_CPU)
147 irq_set_status_flags(irq, IRQ_NO_BALANCING); 152 irq_set_status_flags(irq, IRQ_NO_BALANCING);
148 else 153 else
@@ -159,7 +164,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
159 entry->polarity = 0; 164 entry->polarity = 0;
160 entry->trigger = 0; 165 entry->trigger = 0;
161 entry->mask = 0; 166 entry->mask = 0;
162 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); 167 entry->dest = dest;
163 168
164 mmr_pnode = uv_blade_to_pnode(mmr_blade); 169 mmr_pnode = uv_blade_to_pnode(mmr_blade);
165 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 170 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
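The substantive change in this hunk is the calling convention: cpu_mask_to_apicid_and() reports failure through its return value and hands the resulting APIC id back through a pointer, so arch_enable_uv_irq() can bail out before the destination is baked into the MMR. A minimal, self-contained model of that pattern follows; the function body and the -EINVAL value are illustrative, not the kernel's actual apic driver.

#include <stdio.h>

/*
 * Hedged sketch of the error-returning lookup assumed above: pick the
 * lowest CPU present in both masks, or fail if the intersection is
 * empty. This only models the "check err before touching hardware"
 * calling convention.
 */
static int cpu_mask_to_apicid_and(unsigned long a, unsigned long b,
                                  unsigned int *dest)
{
    unsigned long both = a & b;

    if (!both)
        return -22;                 /* -EINVAL: nothing eligible */
    *dest = __builtin_ctzl(both);   /* lowest set bit as the apicid */
    return 0;
}

int main(void)
{
    unsigned int dest;

    if (cpu_mask_to_apicid_and(0x4, 0x6, &dest) == 0)
        printf("dest apicid = %u\n", dest);   /* prints 2 */
    return 0;
}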
diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
new file mode 100644
index 00000000000..94f7fbe97b0
--- /dev/null
+++ b/arch/x86/realmode/Makefile
@@ -0,0 +1,18 @@
1#
2# arch/x86/realmode/Makefile
3#
4# This file is subject to the terms and conditions of the GNU General Public
5# License. See the file "COPYING" in the main directory of this archive
6# for more details.
7#
8#
9
10subdir- := rm
11
12obj-y += init.o
13obj-y += rmpiggy.o
14
15$(obj)/rmpiggy.o: $(obj)/rm/realmode.bin
16
17$(obj)/rm/realmode.bin: FORCE
18 $(Q)$(MAKE) $(build)=$(obj)/rm $@
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
new file mode 100644
index 00000000000..cbca565af5b
--- /dev/null
+++ b/arch/x86/realmode/init.c
@@ -0,0 +1,115 @@
1#include <linux/io.h>
2#include <linux/memblock.h>
3
4#include <asm/cacheflush.h>
5#include <asm/pgtable.h>
6#include <asm/realmode.h>
7
8struct real_mode_header *real_mode_header;
9u32 *trampoline_cr4_features;
10
11void __init setup_real_mode(void)
12{
13 phys_addr_t mem;
14 u16 real_mode_seg;
15 u32 *rel;
16 u32 count;
17 u32 *ptr;
18 u16 *seg;
19 int i;
20 unsigned char *base;
21 struct trampoline_header *trampoline_header;
22 size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
23#ifdef CONFIG_X86_64
24 u64 *trampoline_pgd;
25 u64 efer;
26#endif
27
28 /* Has to be in very low memory so we can execute real-mode AP code. */
29 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
30 if (!mem)
31 panic("Cannot allocate trampoline\n");
32
33 base = __va(mem);
34 memblock_reserve(mem, size);
35 real_mode_header = (struct real_mode_header *) base;
36 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
37 base, (unsigned long long)mem, size);
38
39 memcpy(base, real_mode_blob, size);
40
41 real_mode_seg = __pa(base) >> 4;
42 rel = (u32 *) real_mode_relocs;
43
44 /* 16-bit segment relocations. */
45 count = rel[0];
46 rel = &rel[1];
47 for (i = 0; i < count; i++) {
48 seg = (u16 *) (base + rel[i]);
49 *seg = real_mode_seg;
50 }
51
52 /* 32-bit linear relocations. */
53 count = rel[i];
54 rel = &rel[i + 1];
55 for (i = 0; i < count; i++) {
56 ptr = (u32 *) (base + rel[i]);
57 *ptr += __pa(base);
58 }
59
60 /* Must be performed *after* relocation. */
61 trampoline_header = (struct trampoline_header *)
62 __va(real_mode_header->trampoline_header);
63
64#ifdef CONFIG_X86_32
65 trampoline_header->start = __pa(startup_32_smp);
66 trampoline_header->gdt_limit = __BOOT_DS + 7;
67 trampoline_header->gdt_base = __pa(boot_gdt);
68#else
69 /*
70 * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR,
71 * so we need to mask it out.
72 */
73 rdmsrl(MSR_EFER, efer);
74 trampoline_header->efer = efer & ~EFER_LMA;
75
76 trampoline_header->start = (u64) secondary_startup_64;
77 trampoline_cr4_features = &trampoline_header->cr4;
78 *trampoline_cr4_features = read_cr4();
79
80 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
81 trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE;
82 trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE;
83#endif
84}
85
86/*
87 * setup_real_mode() runs very early, to guarantee the availability
88 * of low memory. That is before the proper kernel page tables are
89 * set up, so page permissions cannot be set there. Thus,
90 * set_real_mode_permissions() runs as an arch_initcall instead.
91 */
92static int __init set_real_mode_permissions(void)
93{
94 unsigned char *base = (unsigned char *) real_mode_header;
95 size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
96
97 size_t ro_size =
98 PAGE_ALIGN(real_mode_header->ro_end) -
99 __pa(base);
100
101 size_t text_size =
102 PAGE_ALIGN(real_mode_header->ro_end) -
103 real_mode_header->text_start;
104
105 unsigned long text_start =
106 (unsigned long) __va(real_mode_header->text_start);
107
108 set_memory_nx((unsigned long) base, size >> PAGE_SHIFT);
109 set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT);
110 set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
111
112 return 0;
113}
114
115arch_initcall(set_real_mode_permissions);
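The relocation stream walked in setup_real_mode() above is the output of the relocs tool: a u32 count of 16-bit segment fixups followed by their offsets into the blob, then a u32 count of 32-bit linear fixups followed by theirs. A compact userspace model of the same walk, as a sketch of the format rather than the kernel function:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hedged model: apply a [count, offsets...] x 2 relocation stream to
 * a real-mode blob copied to physical address 'phys'. */
static void apply_relocs(uint8_t *base, uint32_t phys, const uint32_t *rel)
{
    uint32_t i, count;
    uint16_t seg = phys >> 4;               /* real-mode segment */

    count = *rel++;
    for (i = 0; i < count; i++) {           /* 16-bit segment relocs */
        uint16_t v = seg;
        memcpy(base + rel[i], &v, 2);
    }
    rel += count;

    count = *rel++;
    for (i = 0; i < count; i++) {           /* 32-bit linear relocs */
        uint32_t v;
        memcpy(&v, base + rel[i], 4);
        v += phys;                          /* rebase to the copy */
        memcpy(base + rel[i], &v, 4);
    }
}

int main(void)
{
    uint8_t blob[8] = { 0 };
    uint32_t relocs[] = { 1, 0,             /* one segment fixup at 0 */
                          1, 4 };           /* one linear fixup at 4 */

    apply_relocs(blob, 0x9000, relocs);
    printf("seg=%02x%02x lin=%02x%02x%02x%02x\n",
           blob[1], blob[0], blob[7], blob[6], blob[5], blob[4]);
    return 0;
}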
diff --git a/arch/x86/realmode/rm/.gitignore b/arch/x86/realmode/rm/.gitignore
new file mode 100644
index 00000000000..b6ed3a2555c
--- /dev/null
+++ b/arch/x86/realmode/rm/.gitignore
@@ -0,0 +1,3 @@
1pasyms.h
2realmode.lds
3realmode.relocs
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
new file mode 100644
index 00000000000..b2d534cab25
--- /dev/null
+++ b/arch/x86/realmode/rm/Makefile
@@ -0,0 +1,82 @@
1#
2# arch/x86/realmode/rm/Makefile
3#
4# This file is subject to the terms and conditions of the GNU General Public
5# License. See the file "COPYING" in the main directory of this archive
6# for more details.
7#
8#
9
10always := realmode.bin realmode.relocs
11
12wakeup-objs := wakeup_asm.o wakemain.o video-mode.o
13wakeup-objs += copy.o bioscall.o regs.o
14# The link order of the video-*.o modules can matter. In particular,
15# video-vga.o *must* be listed first, followed by video-vesa.o.
16# Hardware-specific drivers should follow in the order they should be
17# probed, and video-bios.o should typically be last.
18wakeup-objs += video-vga.o
19wakeup-objs += video-vesa.o
20wakeup-objs += video-bios.o
21
22realmode-y += header.o
23realmode-y += trampoline_$(BITS).o
24realmode-y += stack.o
25realmode-y += reboot.o
26realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs)
27
28targets += $(realmode-y)
29
30REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y))
31
32sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p'
33
34quiet_cmd_pasyms = PASYMS $@
35 cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \
36 sed $(sed-pasyms) | sort | uniq > $@
37
38targets += pasyms.h
39$(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
40 $(call if_changed,pasyms)
41
42targets += realmode.lds
43$(obj)/realmode.lds: $(obj)/pasyms.h
44
45LDFLAGS_realmode.elf := --emit-relocs -T
46CPPFLAGS_realmode.lds += -P -C -I$(obj)
47
48targets += realmode.elf
49$(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE
50 $(call if_changed,ld)
51
52OBJCOPYFLAGS_realmode.bin := -O binary
53
54targets += realmode.bin
55$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs
56 $(call if_changed,objcopy)
57
58quiet_cmd_relocs = RELOCS $@
59 cmd_relocs = arch/x86/tools/relocs --realmode $< > $@
60
61targets += realmode.relocs
62$(obj)/realmode.relocs: $(obj)/realmode.elf FORCE
63 $(call if_changed,relocs)
64
65# ---------------------------------------------------------------------------
66
67# How to compile the 16-bit code. Note we always compile for -march=i386;
68# that way we can complain to the user if the CPU is insufficient.
69KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \
70 -I$(srctree)/arch/x86/boot \
71 -DDISABLE_BRANCH_PROFILING \
72 -Wall -Wstrict-prototypes \
73 -march=i386 -mregparm=3 \
74 -include $(srctree)/$(src)/../../boot/code16gcc.h \
75 -fno-strict-aliasing -fomit-frame-pointer \
76 $(call cc-option, -ffreestanding) \
77 $(call cc-option, -fno-toplevel-reorder,\
78 $(call cc-option, -fno-unit-at-a-time)) \
79 $(call cc-option, -fno-stack-protector) \
80 $(call cc-option, -mpreferred-stack-boundary=2)
81KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
82GCOV_PROFILE := n
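The PASYMS rule above is the glue that makes physical-address symbols work: it runs $(NM) over every realmode object and turns each defined symbol into a pa_-prefixed linker alias, which realmode.lds.S then #includes. Illustratively (the address here is hypothetical), an nm output line such as

    00000120 T trampoline_start

becomes the pasyms.h line

    pa_trampoline_start = trampoline_start;

so the link emits a pa_trampoline_start symbol usable as a physical offset into the blob. This is also why the relocs tool change later in this diff whitelists the ^pa_ pattern as relative.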
diff --git a/arch/x86/realmode/rm/bioscall.S b/arch/x86/realmode/rm/bioscall.S
new file mode 100644
index 00000000000..16162d19791
--- /dev/null
+++ b/arch/x86/realmode/rm/bioscall.S
@@ -0,0 +1 @@
#include "../../boot/bioscall.S"
diff --git a/arch/x86/realmode/rm/copy.S b/arch/x86/realmode/rm/copy.S
new file mode 100644
index 00000000000..b785e6f38fd
--- /dev/null
+++ b/arch/x86/realmode/rm/copy.S
@@ -0,0 +1 @@
#include "../../boot/copy.S"
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
new file mode 100644
index 00000000000..a28221d94e6
--- /dev/null
+++ b/arch/x86/realmode/rm/header.S
@@ -0,0 +1,43 @@
1/*
2 * Real-mode blob header; this should match realmode.h and be
3 * read-only; for mutable data, instead add pointers into the .data
4 * or .bss sections as appropriate.
5 */
6
7#include <linux/linkage.h>
8#include <asm/page_types.h>
9#include <asm/segment.h>
10
11#include "realmode.h"
12
13 .section ".header", "a"
14
15 .balign 16
16GLOBAL(real_mode_header)
17 .long pa_text_start
18 .long pa_ro_end
19 /* SMP trampoline */
20 .long pa_trampoline_start
21 .long pa_trampoline_status
22 .long pa_trampoline_header
23#ifdef CONFIG_X86_64
24 .long pa_trampoline_pgd;
25#endif
26 /* ACPI S3 wakeup */
27#ifdef CONFIG_ACPI_SLEEP
28 .long pa_wakeup_start
29 .long pa_wakeup_header
30#endif
31 /* APM/BIOS reboot */
32 .long pa_machine_real_restart_asm
33#ifdef CONFIG_X86_64
34 .long __KERNEL32_CS
35#endif
36END(real_mode_header)
37
38 /* End signature, used to verify integrity */
39 .section ".signature","a"
40 .balign 4
41GLOBAL(end_signature)
42 .long REALMODE_END_SIGNATURE
43END(end_signature)
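For reference, the C-side view of this header, reconstructed from the fields above (the authoritative definition is the struct real_mode_header that this series adds in asm/realmode.h), is a sequence of 32-bit physical addresses in the same order:

#include <stdint.h>

/* Reconstructed from the assembly above; must stay in sync with it. */
struct real_mode_header {
    uint32_t text_start;
    uint32_t ro_end;
    /* SMP trampoline */
    uint32_t trampoline_start;
    uint32_t trampoline_status;
    uint32_t trampoline_header;
#ifdef CONFIG_X86_64
    uint32_t trampoline_pgd;
#endif
    /* ACPI S3 wakeup */
#ifdef CONFIG_ACPI_SLEEP
    uint32_t wakeup_start;
    uint32_t wakeup_header;
#endif
    /* APM/BIOS reboot */
    uint32_t machine_real_restart_asm;
#ifdef CONFIG_X86_64
    uint32_t machine_real_restart_seg;  /* the __KERNEL32_CS word above */
#endif
};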
diff --git a/arch/x86/realmode/rm/realmode.h b/arch/x86/realmode/rm/realmode.h
new file mode 100644
index 00000000000..d74cff6350e
--- /dev/null
+++ b/arch/x86/realmode/rm/realmode.h
@@ -0,0 +1,21 @@
1#ifndef ARCH_X86_REALMODE_RM_REALMODE_H
2#define ARCH_X86_REALMODE_RM_REALMODE_H
3
4#ifdef __ASSEMBLY__
5
6/*
7 * 16-bit ljmpw to the real_mode_seg
8 *
9 * This must be open-coded since gas will choke on using a
10 * relocatable symbol for the segment portion.
11 */
12#define LJMPW_RM(to) .byte 0xea ; .word (to), real_mode_seg
13
14#endif /* __ASSEMBLY__ */
15
16/*
17 * Signature at the end of the realmode region
18 */
19#define REALMODE_END_SIGNATURE 0x65a22c82
20
21#endif /* ARCH_X86_REALMODE_RM_REALMODE_H */
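The LJMPW_RM macro simply emits the five-byte far-jump encoding by hand; the segment word is a relocatable symbol that the 16-bit segment fixups applied by setup_real_mode() patch at copy time. In C terms, roughly (a sketch, not kernel code):

#include <stdint.h>

/* Sketch: the five bytes LJMPW_RM(to) assembles to. */
static void emit_ljmpw_rm(uint8_t buf[5], uint16_t to, uint16_t seg)
{
    buf[0] = 0xea;          /* far jmp, 16-bit operand size */
    buf[1] = to & 0xff;     /* offset, little-endian */
    buf[2] = to >> 8;
    buf[3] = seg & 0xff;    /* segment: patched when the blob is copied */
    buf[4] = seg >> 8;
}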
diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S
new file mode 100644
index 00000000000..86b2e8d6b1f
--- /dev/null
+++ b/arch/x86/realmode/rm/realmode.lds.S
@@ -0,0 +1,76 @@
1/*
2 * realmode.lds.S
3 *
4 * Linker script for the real-mode code
5 */
6
7#include <asm/page_types.h>
8
9#undef i386
10
11OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
12OUTPUT_ARCH(i386)
13
14SECTIONS
15{
16 real_mode_seg = 0;
17
18 . = 0;
19 .header : {
20 pa_real_mode_base = .;
21 *(.header)
22 }
23
24 . = ALIGN(4);
25 .rodata : {
26 *(.rodata)
27 *(.rodata.*)
28 . = ALIGN(16);
29 video_cards = .;
30 *(.videocards)
31 video_cards_end = .;
32 }
33
34 . = ALIGN(PAGE_SIZE);
35 pa_text_start = .;
36 .text : {
37 *(.text)
38 *(.text.*)
39 }
40
41 .text32 : {
42 *(.text32)
43 *(.text32.*)
44 }
45
46 .text64 : {
47 *(.text64)
48 *(.text64.*)
49 }
50 pa_ro_end = .;
51
52 . = ALIGN(PAGE_SIZE);
53 .data : {
54 *(.data)
55 *(.data.*)
56 }
57
58 . = ALIGN(128);
59 .bss : {
60 *(.bss*)
61 }
62
63 /* End signature for integrity checking */
64 . = ALIGN(4);
65 .signature : {
66 *(.signature)
67 }
68
69 /DISCARD/ : {
70 *(.note*)
71 *(.debug*)
72 *(.eh_frame*)
73 }
74
75#include "pasyms.h"
76}
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/realmode/rm/reboot.S
index 1d5c46df0d7..f932ea61d1c 100644
--- a/arch/x86/kernel/reboot_32.S
+++ b/arch/x86/realmode/rm/reboot.S
@@ -2,6 +2,9 @@
2#include <linux/init.h> 2#include <linux/init.h>
3#include <asm/segment.h> 3#include <asm/segment.h>
4#include <asm/page_types.h> 4#include <asm/page_types.h>
5#include <asm/processor-flags.h>
6#include <asm/msr-index.h>
7#include "realmode.h"
5 8
6/* 9/*
7 * The following code and data reboots the machine by switching to real 10 * The following code and data reboots the machine by switching to real
@@ -11,36 +14,44 @@
11 * doesn't work with at least one type of 486 motherboard. It is easy 14 * doesn't work with at least one type of 486 motherboard. It is easy
12 * to stop this code working; hence the copious comments. 15 * to stop this code working; hence the copious comments.
13 * 16 *
14 * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. 17 * This code is called with the restart type (0 = BIOS, 1 = APM) in
18 * the primary argument register (%eax for 32 bit, %edi for 64 bit).
15 */ 19 */
16 .section ".x86_trampoline","a" 20 .section ".text32", "ax"
17 .balign 16
18 .code32 21 .code32
19ENTRY(machine_real_restart_asm) 22ENTRY(machine_real_restart_asm)
20r_base = .
21 /* Get our own relocated address */
22 call 1f
231: popl %ebx
24 subl $(1b - r_base), %ebx
25
26 /* Compute the equivalent real-mode segment */
27 movl %ebx, %ecx
28 shrl $4, %ecx
29
30 /* Patch post-real-mode segment jump */
31 movw (dispatch_table - r_base)(%ebx,%eax,2),%ax
32 movw %ax, (101f - r_base)(%ebx)
33 movw %cx, (102f - r_base)(%ebx)
34 23
24#ifdef CONFIG_X86_64
25 /* Switch to trampoline GDT as it is guaranteed < 4 GiB */
26 movl $__KERNEL_DS, %eax
27 movl %eax, %ds
28 lgdtl pa_tr_gdt
29
30 /* Disable paging to drop us out of long mode */
31 movl %cr0, %eax
32 andl $~X86_CR0_PG, %eax
33 movl %eax, %cr0
34 ljmpl $__KERNEL32_CS, $pa_machine_real_restart_paging_off
35
36GLOBAL(machine_real_restart_paging_off)
37 xorl %eax, %eax
38 xorl %edx, %edx
39 movl $MSR_EFER, %ecx
40 wrmsr
41
42 movl %edi, %eax
43
44#endif /* CONFIG_X86_64 */
45
35 /* Set up the IDT for real mode. */ 46 /* Set up the IDT for real mode. */
36 lidtl (machine_real_restart_idt - r_base)(%ebx) 47 lidtl pa_machine_real_restart_idt
37 48
38 /* 49 /*
39 * Set up a GDT from which we can load segment descriptors for real 50 * Set up a GDT from which we can load segment descriptors for real
40 * mode. The GDT is not used in real mode; it is just needed here to 51 * mode. The GDT is not used in real mode; it is just needed here to
41 * prepare the descriptors. 52 * prepare the descriptors.
42 */ 53 */
43 lgdtl (machine_real_restart_gdt - r_base)(%ebx) 54 lgdtl pa_machine_real_restart_gdt
44 55
45 /* 56 /*
46 * Load the data segment registers with 16-bit compatible values 57 * Load the data segment registers with 16-bit compatible values
@@ -51,7 +62,7 @@ r_base = .
51 movl %ecx, %fs 62 movl %ecx, %fs
52 movl %ecx, %gs 63 movl %ecx, %gs
53 movl %ecx, %ss 64 movl %ecx, %ss
54 ljmpl $8, $1f - r_base 65 ljmpw $8, $1f
55 66
56/* 67/*
57 * This is 16-bit protected mode code to disable paging and the cache, 68 * This is 16-bit protected mode code to disable paging and the cache,
@@ -76,27 +87,29 @@ r_base = .
76 * 87 *
77 * Most of this work is probably excessive, but it is what is tested. 88 * Most of this work is probably excessive, but it is what is tested.
78 */ 89 */
90 .text
79 .code16 91 .code16
92
93 .balign 16
94machine_real_restart_asm16:
801: 951:
81 xorl %ecx, %ecx 96 xorl %ecx, %ecx
82 movl %cr0, %eax 97 movl %cr0, %edx
83 andl $0x00000011, %eax 98 andl $0x00000011, %edx
84 orl $0x60000000, %eax 99 orl $0x60000000, %edx
85 movl %eax, %cr0 100 movl %edx, %cr0
86 movl %ecx, %cr3 101 movl %ecx, %cr3
87 movl %cr0, %edx 102 movl %cr0, %edx
88 andl $0x60000000, %edx /* If no cache bits -> no wbinvd */ 103 testl $0x60000000, %edx /* If no cache bits -> no wbinvd */
89 jz 2f 104 jz 2f
90 wbinvd 105 wbinvd
912: 1062:
92 andb $0x10, %al 107 andb $0x10, %dl
93 movl %eax, %cr0 108 movl %edx, %cr0
94 .byte 0xea /* ljmpw */ 109 LJMPW_RM(3f)
95101: .word 0 /* Offset */ 1103:
96102: .word 0 /* Segment */ 111 andw %ax, %ax
97 112 jz bios
98bios:
99 ljmpw $0xf000, $0xfff0
100 113
101apm: 114apm:
102 movw $0x1000, %ax 115 movw $0x1000, %ax
@@ -106,26 +119,34 @@ apm:
106 movw $0x0001, %bx 119 movw $0x0001, %bx
107 movw $0x0003, %cx 120 movw $0x0003, %cx
108 int $0x15 121 int $0x15
122 /* This should never return... */
109 123
110END(machine_real_restart_asm) 124bios:
125 ljmpw $0xf000, $0xfff0
111 126
112 .balign 16 127 .section ".rodata", "a"
113 /* These must match <asm/reboot.h */
114dispatch_table:
115 .word bios - r_base
116 .word apm - r_base
117END(dispatch_table)
118 128
119 .balign 16 129 .balign 16
120machine_real_restart_idt: 130GLOBAL(machine_real_restart_idt)
121 .word 0xffff /* Length - real mode default value */ 131 .word 0xffff /* Length - real mode default value */
122 .long 0 /* Base - real mode default value */ 132 .long 0 /* Base - real mode default value */
123END(machine_real_restart_idt) 133END(machine_real_restart_idt)
124 134
125 .balign 16 135 .balign 16
126ENTRY(machine_real_restart_gdt) 136GLOBAL(machine_real_restart_gdt)
127 .quad 0 /* Self-pointer, filled in by PM code */ 137 /* Self-pointer */
128 .quad 0 /* 16-bit code segment, filled in by PM code */ 138 .word 0xffff /* Length - real mode default value */
139 .long pa_machine_real_restart_gdt
140 .word 0
141
142 /*
143 * 16-bit code segment pointing to real_mode_seg
144 * Selector value 8
145 */
146 .word 0xffff /* Limit */
147 .long 0x9b000000 + pa_real_mode_base
148 .word 0
149
129 /* 150 /*
130 * 16-bit data segment with the selector value 16 = 0x10 and 151 * 16-bit data segment with the selector value 16 = 0x10 and
131 * base value 0x100; since this is consistent with real mode 152 * base value 0x100; since this is consistent with real mode
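A note on the .long 0x9b000000 + pa_real_mode_base idiom in the GDT above: it packs base bits 23:0 and the access byte (0x9b: present, DPL 0, code, readable, accessed) into the middle dword of the 8-byte descriptor, which works because the blob sits below 16 MB, so base[31:24] is zero. A sketch of the packing under that assumption:

#include <stdint.h>

/*
 * Sketch: the 8-byte GDT descriptor built by the directives above
 * (.word 0xffff / .long 0x9b000000 + base / .word 0), valid only for
 * a base below 16 MB.
 */
static uint64_t rm_code16_desc(uint32_t base)
{
    uint64_t d = 0xffff;                       /* limit 15:0 */
    d |= (uint64_t)(base & 0xffffff) << 16;    /* base 23:0 */
    d |= (uint64_t)0x9b << 40;                 /* access byte */
    return d;                                  /* bytes 6-7 stay zero */
}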
diff --git a/arch/x86/realmode/rm/regs.c b/arch/x86/realmode/rm/regs.c
new file mode 100644
index 00000000000..fbb15b9f9ca
--- /dev/null
+++ b/arch/x86/realmode/rm/regs.c
@@ -0,0 +1 @@
#include "../../boot/regs.c"
diff --git a/arch/x86/realmode/rm/stack.S b/arch/x86/realmode/rm/stack.S
new file mode 100644
index 00000000000..867ae87adfa
--- /dev/null
+++ b/arch/x86/realmode/rm/stack.S
@@ -0,0 +1,19 @@
1/*
2 * Common heap and stack allocations
3 */
4
5#include <linux/linkage.h>
6
7 .data
8GLOBAL(HEAP)
9 .long rm_heap
10GLOBAL(heap_end)
11 .long rm_stack
12
13 .bss
14 .balign 16
15GLOBAL(rm_heap)
16 .space 2048
17GLOBAL(rm_stack)
18 .space 2048
19GLOBAL(rm_stack_end)
diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
new file mode 100644
index 00000000000..c1b2791183e
--- /dev/null
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -0,0 +1,74 @@
1/*
2 *
3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 *
7 * This is only used for booting secondary CPUs in SMP machines.
8 *
9 * Entry: CS:IP point to the start of our code; we are
10 * in real mode with no stack, but with the rest of the
11 * trampoline page available to make our stack; everything else
12 * is a mystery.
13 *
14 * We jump into arch/x86/kernel/head_32.S.
15 *
16 * On entry to trampoline_start, the processor is in real mode
17 * with 16-bit addressing and 16-bit data. CS has some value
18 * and IP is zero. Thus, we load CS to the physical segment
19 * of the real mode code before doing anything further.
20 */
21
22#include <linux/linkage.h>
23#include <linux/init.h>
24#include <asm/segment.h>
25#include <asm/page_types.h>
26#include "realmode.h"
27
28 .text
29 .code16
30
31 .balign PAGE_SIZE
32ENTRY(trampoline_start)
33 wbinvd # Needed for NUMA-Q; should be harmless for others
34
35 LJMPW_RM(1f)
361:
37 mov %cs, %ax # Code and data in the same place
38 mov %ax, %ds
39
40 cli # We should be safe anyway
41
42 movl tr_start, %eax # where we need to go
43
44 movl $0xA5A5A5A5, trampoline_status
45 # write marker so the master knows we're running
46
47 /*
48 * With the kernel at a non-default location, the GDT tables can be
49 * beyond 16MB, and lgdt cannot load such an address since the
50 * real-mode default operand size is 16 bits. Use lgdtl instead to
51 * force a 32-bit operand size.
52 */
53 lidtl tr_idt # load idt with 0, 0
54 lgdtl tr_gdt # load gdt with whatever is appropriate
55
56 movw $1, %dx # protected mode (PE) bit
57 lmsw %dx # into protected mode
58
59 ljmpl $__BOOT_CS, $pa_startup_32
60
61 .section ".text32","ax"
62 .code32
63ENTRY(startup_32) # note: also used from wakeup_asm.S
64 jmp *%eax
65
66 .bss
67 .balign 8
68GLOBAL(trampoline_header)
69 tr_start: .space 4
70 tr_gdt_pad: .space 2
71 tr_gdt: .space 6
72END(trampoline_header)
73
74#include "trampoline_common.S"
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 09ff51799e9..bb360dc39d2 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -5,12 +5,12 @@
5 * 4 Jan 1997 Michael Chastain: changed to gnu as. 5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 * 15 Sept 2005 Eric Biederman: 64bit PIC support 6 * 15 Sept 2005 Eric Biederman: 64bit PIC support
7 * 7 *
8 * Entry: CS:IP point to the start of our code, we are 8 * Entry: CS:IP point to the start of our code, we are
9 * in real mode with no stack, but the rest of the 9 * in real mode with no stack, but the rest of the
10 * trampoline page to make our stack and everything else 10 * trampoline page to make our stack and everything else
11 * is a mystery. 11 * is a mystery.
12 * 12 *
13 * On entry to trampoline_data, the processor is in real mode 13 * On entry to trampoline_start, the processor is in real mode
14 * with 16-bit addressing and 16-bit data. CS has some value 14 * with 16-bit addressing and 16-bit data. CS has some value
15 * and IP is zero. Thus, data addresses need to be absolute 15 * and IP is zero. Thus, data addresses need to be absolute
16 * (no relocation) and are taken with regard to r_base. 16 * (no relocation) and are taken with regard to r_base.
@@ -31,43 +31,33 @@
31#include <asm/msr.h> 31#include <asm/msr.h>
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34#include "realmode.h"
34 35
35 .section ".x86_trampoline","a" 36 .text
36 .balign PAGE_SIZE
37 .code16 37 .code16
38 38
39ENTRY(trampoline_data) 39 .balign PAGE_SIZE
40r_base = . 40ENTRY(trampoline_start)
41 cli # We should be safe anyway 41 cli # We should be safe anyway
42 wbinvd 42 wbinvd
43
44 LJMPW_RM(1f)
451:
43 mov %cs, %ax # Code and data in the same place 46 mov %cs, %ax # Code and data in the same place
44 mov %ax, %ds 47 mov %ax, %ds
45 mov %ax, %es 48 mov %ax, %es
46 mov %ax, %ss 49 mov %ax, %ss
47 50
51 movl $0xA5A5A5A5, trampoline_status
52 # write marker so the master knows we're running
48 53
49 movl $0xA5A5A5A5, trampoline_status - r_base 54 # Setup stack
50 # write marker for master knows we're running 55 movl $rm_stack_end, %esp
51
52 # Setup stack
53 movw $(trampoline_stack_end - r_base), %sp
54 56
55 call verify_cpu # Verify the cpu supports long mode 57 call verify_cpu # Verify the cpu supports long mode
56 testl %eax, %eax # Check for return code 58 testl %eax, %eax # Check for return code
57 jnz no_longmode 59 jnz no_longmode
58 60
59 mov %cs, %ax
60 movzx %ax, %esi # Find the 32bit trampoline location
61 shll $4, %esi
62
63 # Fixup the absolute vectors
64 leal (startup_32 - r_base)(%esi), %eax
65 movl %eax, startup_32_vector - r_base
66 leal (startup_64 - r_base)(%esi), %eax
67 movl %eax, startup_64_vector - r_base
68 leal (tgdt - r_base)(%esi), %eax
69 movl %eax, (tgdt + 2 - r_base)
70
71 /* 61 /*
72 * GDT tables in non default location kernel can be beyond 16MB and 62 * GDT tables in non default location kernel can be beyond 16MB and
73 * lgdt will not be able to load the address as in real mode default 63 * lgdt will not be able to load the address as in real mode default
@@ -75,36 +65,49 @@ r_base = .
75 * to 32 bit. 65 * to 32 bit.
76 */ 66 */
77 67
78 lidtl tidt - r_base # load idt with 0, 0 68 lidtl tr_idt # load idt with 0, 0
79 lgdtl tgdt - r_base # load gdt with whatever is appropriate 69 lgdtl tr_gdt # load gdt with whatever is appropriate
70
71 movw $__KERNEL_DS, %dx # Data segment descriptor
80 72
81 mov $X86_CR0_PE, %ax # protected mode (PE) bit 73 # Enable protected mode
82 lmsw %ax # into protected mode 74 movl $X86_CR0_PE, %eax # protected mode (PE) bit
75 movl %eax, %cr0 # into protected mode
83 76
84 # flush prefetch and jump to startup_32 77 # flush prefetch and jump to startup_32
85 ljmpl *(startup_32_vector - r_base) 78 ljmpl $__KERNEL32_CS, $pa_startup_32
86 79
80no_longmode:
81 hlt
82 jmp no_longmode
83#include "../kernel/verify_cpu.S"
84
85 .section ".text32","ax"
87 .code32 86 .code32
88 .balign 4 87 .balign 4
89startup_32: 88ENTRY(startup_32)
90 movl $__KERNEL_DS, %eax # Initialize the %ds segment register 89 movl %edx, %ss
91 movl %eax, %ds 90 addl $pa_real_mode_base, %esp
92 91 movl %edx, %ds
93 movl $X86_CR4_PAE, %eax 92 movl %edx, %es
93 movl %edx, %fs
94 movl %edx, %gs
95
96 movl pa_tr_cr4, %eax
94 movl %eax, %cr4 # Enable PAE mode 97 movl %eax, %cr4 # Enable PAE mode
95 98
96 # Setup trampoline 4 level pagetables 99 # Setup trampoline 4 level pagetables
97 leal (trampoline_level4_pgt - r_base)(%esi), %eax 100 movl $pa_trampoline_pgd, %eax
98 movl %eax, %cr3 101 movl %eax, %cr3
99 102
103 # Set up EFER
104 movl pa_tr_efer, %eax
105 movl pa_tr_efer + 4, %edx
100 movl $MSR_EFER, %ecx 106 movl $MSR_EFER, %ecx
101 movl $(1 << _EFER_LME), %eax # Enable Long Mode
102 xorl %edx, %edx
103 wrmsr 107 wrmsr
104 108
105 # Enable paging and in turn activate Long Mode 109 # Enable paging and in turn activate Long Mode
106 # Enable protected mode 110 movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax
107 movl $(X86_CR0_PG | X86_CR0_PE), %eax
108 movl %eax, %cr0 111 movl %eax, %cr0
109 112
110 /* 113 /*
@@ -113,59 +116,38 @@ startup_32:
113 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use 116 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
114 * the new gdt/idt that has __KERNEL_CS with CS.L = 1. 117 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
115 */ 118 */
116 ljmp *(startup_64_vector - r_base)(%esi) 119 ljmpl $__KERNEL_CS, $pa_startup_64
117 120
121 .section ".text64","ax"
118 .code64 122 .code64
119 .balign 4 123 .balign 4
120startup_64: 124ENTRY(startup_64)
121 # Now jump into the kernel using virtual addresses 125 # Now jump into the kernel using virtual addresses
122 movq $secondary_startup_64, %rax 126 jmpq *tr_start(%rip)
123 jmp *%rax
124
125 .code16
126no_longmode:
127 hlt
128 jmp no_longmode
129#include "verify_cpu.S"
130
131 .balign 4
132 # Careful these need to be in the same 64K segment as the above;
133tidt:
134 .word 0 # idt limit = 0
135 .word 0, 0 # idt base = 0L
136 127
128 .section ".rodata","a"
137 # Duplicate the global descriptor table 129 # Duplicate the global descriptor table
138 # so the kernel can live anywhere 130 # so the kernel can live anywhere
139 .balign 4 131 .balign 16
140tgdt: 132 .globl tr_gdt
141 .short tgdt_end - tgdt # gdt limit 133tr_gdt:
142 .long tgdt - r_base 134 .short tr_gdt_end - tr_gdt - 1 # gdt limit
143 .short 0 135 .long pa_tr_gdt
136 .short 0
144 .quad 0x00cf9b000000ffff # __KERNEL32_CS 137 .quad 0x00cf9b000000ffff # __KERNEL32_CS
145 .quad 0x00af9b000000ffff # __KERNEL_CS 138 .quad 0x00af9b000000ffff # __KERNEL_CS
146 .quad 0x00cf93000000ffff # __KERNEL_DS 139 .quad 0x00cf93000000ffff # __KERNEL_DS
147tgdt_end: 140tr_gdt_end:
148 141
149 .balign 4 142 .bss
150startup_32_vector: 143 .balign PAGE_SIZE
151 .long startup_32 - r_base 144GLOBAL(trampoline_pgd) .space PAGE_SIZE
152 .word __KERNEL32_CS, 0
153 145
154 .balign 4 146 .balign 8
155startup_64_vector: 147GLOBAL(trampoline_header)
156 .long startup_64 - r_base 148 tr_start: .space 8
157 .word __KERNEL_CS, 0 149 GLOBAL(tr_efer) .space 8
150 GLOBAL(tr_cr4) .space 4
151END(trampoline_header)
158 152
159 .balign 4 153#include "trampoline_common.S"
160ENTRY(trampoline_status)
161 .long 0
162
163trampoline_stack:
164 .org 0x1000
165trampoline_stack_end:
166ENTRY(trampoline_level4_pgt)
167 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
168 .fill 510,8,0
169 .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
170
171ENTRY(trampoline_end)
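A detail worth pulling out of this hunk: pa_tr_efer is an 8-byte slot that setup_real_mode() fills with the boot CPU's EFER, LMA masked off (per the init.c comment earlier in this diff), and the trampoline reloads it as two 32-bit halves into %eax/%edx for wrmsr. In C terms, the value staged for the trampoline is roughly:

#include <stdint.h>

#define EFER_LMA (1ULL << 10)   /* long-mode-active, hardware-managed */

/*
 * Sketch: the EFER image staged in trampoline_header->efer. LMA is
 * cleared because some AMD CPUs #GP(0) when it is written explicitly;
 * the CPU sets it by itself once paging is enabled in long mode.
 */
static uint64_t staged_trampoline_efer(uint64_t host_efer)
{
    return host_efer & ~EFER_LMA;
}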
diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S
new file mode 100644
index 00000000000..b1ecdb9692a
--- /dev/null
+++ b/arch/x86/realmode/rm/trampoline_common.S
@@ -0,0 +1,7 @@
1 .section ".rodata","a"
2 .balign 16
3tr_idt: .fill 1, 6, 0
4
5 .bss
6 .balign 4
7GLOBAL(trampoline_status) .space 4
diff --git a/arch/x86/realmode/rm/video-bios.c b/arch/x86/realmode/rm/video-bios.c
new file mode 100644
index 00000000000..848b25aaf11
--- /dev/null
+++ b/arch/x86/realmode/rm/video-bios.c
@@ -0,0 +1 @@
#include "../../boot/video-bios.c"
diff --git a/arch/x86/realmode/rm/video-mode.c b/arch/x86/realmode/rm/video-mode.c
new file mode 100644
index 00000000000..2a98b7e2368
--- /dev/null
+++ b/arch/x86/realmode/rm/video-mode.c
@@ -0,0 +1 @@
#include "../../boot/video-mode.c"
diff --git a/arch/x86/realmode/rm/video-vesa.c b/arch/x86/realmode/rm/video-vesa.c
new file mode 100644
index 00000000000..413edddb51e
--- /dev/null
+++ b/arch/x86/realmode/rm/video-vesa.c
@@ -0,0 +1 @@
#include "../../boot/video-vesa.c"
diff --git a/arch/x86/realmode/rm/video-vga.c b/arch/x86/realmode/rm/video-vga.c
new file mode 100644
index 00000000000..3085f5c9d28
--- /dev/null
+++ b/arch/x86/realmode/rm/video-vga.c
@@ -0,0 +1 @@
#include "../../boot/video-vga.c"
diff --git a/arch/x86/kernel/acpi/realmode/wakemain.c b/arch/x86/realmode/rm/wakemain.c
index 883962d9eef..91405d515ec 100644
--- a/arch/x86/kernel/acpi/realmode/wakemain.c
+++ b/arch/x86/realmode/rm/wakemain.c
@@ -65,7 +65,8 @@ void main(void)
65{ 65{
66 /* Kill machine if structures are wrong */ 66 /* Kill machine if structures are wrong */
67 if (wakeup_header.real_magic != 0x12345678) 67 if (wakeup_header.real_magic != 0x12345678)
68 while (1); 68 while (1)
69 ;
69 70
70 if (wakeup_header.realmode_flags & 4) 71 if (wakeup_header.realmode_flags & 4)
71 send_morse("...-"); 72 send_morse("...-");
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/realmode/rm/wakeup.h
index 97a29e1430e..9317e0042f2 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/realmode/rm/wakeup.h
@@ -12,9 +12,8 @@
12/* This must match data at wakeup.S */ 12/* This must match data at wakeup.S */
13struct wakeup_header { 13struct wakeup_header {
14 u16 video_mode; /* Video mode number */ 14 u16 video_mode; /* Video mode number */
15 u16 _jmp1; /* ljmpl opcode, 32-bit only */
16 u32 pmode_entry; /* Protected mode resume point, 32-bit only */ 15 u32 pmode_entry; /* Protected mode resume point, 32-bit only */
17 u16 _jmp2; /* CS value, 32-bit only */ 16 u16 pmode_cs;
18 u32 pmode_cr0; /* Protected mode cr0 */ 17 u32 pmode_cr0; /* Protected mode cr0 */
19 u32 pmode_cr3; /* Protected mode cr3 */ 18 u32 pmode_cr3; /* Protected mode cr3 */
20 u32 pmode_cr4; /* Protected mode cr4 */ 19 u32 pmode_cr4; /* Protected mode cr4 */
@@ -26,12 +25,6 @@ struct wakeup_header {
26 u32 pmode_behavior; /* Wakeup routine behavior flags */ 25 u32 pmode_behavior; /* Wakeup routine behavior flags */
27 u32 realmode_flags; 26 u32 realmode_flags;
28 u32 real_magic; 27 u32 real_magic;
29 u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
30 u8 _pad1;
31 u8 wakeup_jmp;
32 u16 wakeup_jmp_off;
33 u16 wakeup_jmp_seg;
34 u64 wakeup_gdt[3];
35 u32 signature; /* To check we have correct structure */ 28 u32 signature; /* To check we have correct structure */
36} __attribute__((__packed__)); 29} __attribute__((__packed__));
37 30
@@ -40,7 +33,6 @@ extern struct wakeup_header wakeup_header;
40 33
41#define WAKEUP_HEADER_OFFSET 8 34#define WAKEUP_HEADER_OFFSET 8
42#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 35#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
43#define WAKEUP_END_SIGNATURE 0x65a22c82
44 36
45/* Wakeup behavior bits */ 37/* Wakeup behavior bits */
46#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 38#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/realmode/rm/wakeup_asm.S
index b4fd836e405..8905166b0bb 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/realmode/rm/wakeup_asm.S
@@ -1,50 +1,47 @@
1/* 1/*
2 * ACPI wakeup real mode startup stub 2 * ACPI wakeup real mode startup stub
3 */ 3 */
4#include <linux/linkage.h>
4#include <asm/segment.h> 5#include <asm/segment.h>
5#include <asm/msr-index.h> 6#include <asm/msr-index.h>
6#include <asm/page_types.h> 7#include <asm/page_types.h>
7#include <asm/pgtable_types.h> 8#include <asm/pgtable_types.h>
8#include <asm/processor-flags.h> 9#include <asm/processor-flags.h>
10#include "realmode.h"
9#include "wakeup.h" 11#include "wakeup.h"
10 12
11 .code16 13 .code16
12 .section ".jump", "ax"
13 .globl _start
14_start:
15 cli
16 jmp wakeup_code
17 14
18/* This should match the structure in wakeup.h */ 15/* This should match the structure in wakeup.h */
19 .section ".header", "a" 16 .section ".data", "aw"
20 .globl wakeup_header 17
21wakeup_header: 18 .balign 16
22video_mode: .short 0 /* Video mode number */ 19GLOBAL(wakeup_header)
23pmode_return: .byte 0x66, 0xea /* ljmpl */ 20 video_mode: .short 0 /* Video mode number */
24 .long 0 /* offset goes here */ 21 pmode_entry: .long 0
25 .short __KERNEL_CS 22 pmode_cs: .short __KERNEL_CS
26pmode_cr0: .long 0 /* Saved %cr0 */ 23 pmode_cr0: .long 0 /* Saved %cr0 */
27pmode_cr3: .long 0 /* Saved %cr3 */ 24 pmode_cr3: .long 0 /* Saved %cr3 */
28pmode_cr4: .long 0 /* Saved %cr4 */ 25 pmode_cr4: .long 0 /* Saved %cr4 */
29pmode_efer: .quad 0 /* Saved EFER */ 26 pmode_efer: .quad 0 /* Saved EFER */
30pmode_gdt: .quad 0 27 pmode_gdt: .quad 0
31pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ 28 pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */
32pmode_behavior: .long 0 /* Wakeup behavior flags */ 29 pmode_behavior: .long 0 /* Wakeup behavior flags */
33realmode_flags: .long 0 30 realmode_flags: .long 0
34real_magic: .long 0 31 real_magic: .long 0
35trampoline_segment: .word 0 32 signature: .long WAKEUP_HEADER_SIGNATURE
36_pad1: .byte 0 33END(wakeup_header)
37wakeup_jmp: .byte 0xea /* ljmpw */
38wakeup_jmp_off: .word 3f
39wakeup_jmp_seg: .word 0
40wakeup_gdt: .quad 0, 0, 0
41signature: .long WAKEUP_HEADER_SIGNATURE
42 34
43 .text 35 .text
44 .code16 36 .code16
45wakeup_code: 37
38 .balign 16
39ENTRY(wakeup_start)
40 cli
46 cld 41 cld
47 42
43 LJMPW_RM(3f)
443:
48 /* Apparently some dimwit BIOS programmers don't know how to 45 /* Apparently some dimwit BIOS programmers don't know how to
49 program a PM to RM transition, and we might end up here with 46 program a PM to RM transition, and we might end up here with
50 junk in the data segment descriptor registers. The only way 47 junk in the data segment descriptor registers. The only way
@@ -54,8 +51,7 @@ wakeup_code:
54 movl %cr0, %eax 51 movl %cr0, %eax
55 orb $X86_CR0_PE, %al 52 orb $X86_CR0_PE, %al
56 movl %eax, %cr0 53 movl %eax, %cr0
57 jmp 1f 54 ljmpw $8, $2f
581: ljmpw $8, $2f
592: 552:
60 movw %cx, %ds 56 movw %cx, %ds
61 movw %cx, %es 57 movw %cx, %es
@@ -65,16 +61,18 @@ wakeup_code:
65 61
66 andb $~X86_CR0_PE, %al 62 andb $~X86_CR0_PE, %al
67 movl %eax, %cr0 63 movl %eax, %cr0
68 jmp wakeup_jmp 64 LJMPW_RM(3f)
693: 653:
70 /* Set up segments */ 66 /* Set up segments */
71 movw %cs, %ax 67 movw %cs, %ax
68 movw %ax, %ss
69 movl $rm_stack_end, %esp
72 movw %ax, %ds 70 movw %ax, %ds
73 movw %ax, %es 71 movw %ax, %es
74 movw %ax, %ss 72 movw %ax, %fs
75 lidtl wakeup_idt 73 movw %ax, %gs
76 74
77 movl $wakeup_stack_end, %esp 75 lidtl wakeup_idt
78 76
79 /* Clear the EFLAGS */ 77 /* Clear the EFLAGS */
80 pushl $0 78 pushl $0
@@ -87,7 +85,7 @@ wakeup_code:
87 85
88 /* Check we really have everything... */ 86 /* Check we really have everything... */
89 movl end_signature, %eax 87 movl end_signature, %eax
90 cmpl $WAKEUP_END_SIGNATURE, %eax 88 cmpl $REALMODE_END_SIGNATURE, %eax
91 jne bogus_real_magic 89 jne bogus_real_magic
92 90
93 /* Call the C code */ 91 /* Call the C code */
@@ -128,14 +126,13 @@ wakeup_code:
128 lgdtl pmode_gdt 126 lgdtl pmode_gdt
129 127
130 /* This really couldn't... */ 128 /* This really couldn't... */
131 movl pmode_cr0, %eax 129 movl pmode_entry, %eax
132 movl %eax, %cr0 130 movl pmode_cr0, %ecx
133 jmp pmode_return 131 movl %ecx, %cr0
132 ljmpl $__KERNEL_CS, $pa_startup_32
133 /* -> jmp *%eax in trampoline_32.S */
134#else 134#else
135 pushw $0 135 jmp trampoline_start
136 pushw trampoline_segment
137 pushw $0
138 lret
139#endif 136#endif
140 137
141bogus_real_magic: 138bogus_real_magic:
@@ -143,28 +140,38 @@ bogus_real_magic:
143 hlt 140 hlt
144 jmp 1b 141 jmp 1b
145 142
146 .data 143 .section ".rodata","a"
144
145 /*
146 * Set up the wakeup GDT. We set these up as Big Real Mode,
147 * that is, with limits set to 4 GB. At least the Lenovo
148 * Thinkpad X61 is known to need this for the video BIOS
149 * initialization quirk to work; this is likely to also
150 * be the case for other laptops or integrated video devices.
151 */
152
153 .balign 16
154GLOBAL(wakeup_gdt)
155 .word 3*8-1 /* Self-descriptor */
156 .long pa_wakeup_gdt
157 .word 0
158
159 .word 0xffff /* 16-bit code segment @ real_mode_base */
160 .long 0x9b000000 + pa_real_mode_base
161 .word 0x008f /* big real mode */
162
163 .word 0xffff /* 16-bit data segment @ real_mode_base */
164 .long 0x93000000 + pa_real_mode_base
165 .word 0x008f /* big real mode */
166END(wakeup_gdt)
167
168 .section ".rodata","a"
147 .balign 8 169 .balign 8
148 170
149 /* This is the standard real-mode IDT */ 171 /* This is the standard real-mode IDT */
150wakeup_idt: 172 .balign 16
173GLOBAL(wakeup_idt)
151 .word 0xffff /* limit */ 174 .word 0xffff /* limit */
152 .long 0 /* address */ 175 .long 0 /* address */
153 .word 0 176 .word 0
154 177END(wakeup_idt)
155 .globl HEAP, heap_end
156HEAP:
157 .long wakeup_heap
158heap_end:
159 .long wakeup_stack
160
161 .bss
162wakeup_heap:
163 .space 2048
164wakeup_stack:
165 .space 2048
166wakeup_stack_end:
167
168 .section ".signature","a"
169end_signature:
170 .long WAKEUP_END_SIGNATURE
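The 0x008f flag word in the wakeup GDT entries above is what makes them "big real mode": limit bits 19:16 are 0xf and the granularity bit scales the limit in 4 KiB pages, yielding a 4 GiB segment. A quick, self-contained check of that arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t limit_low = 0xffff;             /* .word 0xffff above */
    uint32_t byte6 = 0x8f;                   /* from .word 0x008f */
    uint32_t limit = ((byte6 & 0xf) << 16) | limit_low;  /* 0xfffff */
    uint64_t span = ((uint64_t)limit + 1) << 12;         /* G = 1 */

    printf("segment spans %llu bytes\n", (unsigned long long)span);
    return 0;                                /* 4294967296: full 4 GiB */
}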
diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S
new file mode 100644
index 00000000000..204c6ece0e9
--- /dev/null
+++ b/arch/x86/realmode/rmpiggy.S
@@ -0,0 +1,20 @@
1/*
2 * Wrapper for the realmode binary, carried as a transport object
3 * until it is copied into low memory.
4 */
5#include <linux/linkage.h>
6#include <asm/page_types.h>
7
8 .section ".init.data","aw"
9
10 .balign PAGE_SIZE
11
12GLOBAL(real_mode_blob)
13 .incbin "arch/x86/realmode/rm/realmode.bin"
14END(real_mode_blob)
15
16GLOBAL(real_mode_blob_end);
17
18GLOBAL(real_mode_relocs)
19 .incbin "arch/x86/realmode/rm/realmode.relocs"
20END(real_mode_relocs)
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 29f9f0554f7..7a35a6e71d4 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -355,3 +355,4 @@
355346 i386 setns sys_setns 355346 i386 setns sys_setns
356347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv 356347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
357348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev 357348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
358349 i386 kcmp sys_kcmp
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index dd29a9ea27c..51171aeff0d 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -318,6 +318,8 @@
318309 common getcpu sys_getcpu 318309 common getcpu sys_getcpu
319310 64 process_vm_readv sys_process_vm_readv 319310 64 process_vm_readv sys_process_vm_readv
320311 64 process_vm_writev sys_process_vm_writev 320311 64 process_vm_writev sys_process_vm_writev
321312 64 kcmp sys_kcmp
322
321# 323#
322# x32-specific system call numbers start at 512 to avoid cache impact 324# x32-specific system call numbers start at 512 to avoid cache impact
323# for native 64-bit operation. 325# for native 64-bit operation.
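Both tables gain kcmp (349 on i386, 312 on x86-64), which tells whether two processes share a given kernel resource. A hedged userspace sketch using the raw x86-64 number from the table above; KCMP_FILES is 2 in the uapi enum, and older libcs have no wrapper, hence syscall():

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define KCMP_FILES 2    /* from the kcmp uapi enum */

int main(void)
{
    /* 312 is the x86-64 number added in the table above. */
    long ret = syscall(312, getpid(), getpid(), KCMP_FILES, 0, 0);

    /* 0: same file table; 1/2: ordering; <0: error (e.g. old kernel) */
    printf("kcmp(self, self, KCMP_FILES) = %ld\n", ret);
    return 0;
}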
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index 5f6a5b6c3a1..ddcf39b1a18 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -66,9 +66,10 @@ BEGIN {
66 rex_expr = "^REX(\\.[XRWB]+)*" 66 rex_expr = "^REX(\\.[XRWB]+)*"
67 fpu_expr = "^ESC" # TODO 67 fpu_expr = "^ESC" # TODO
68 68
69 lprefix1_expr = "\\(66\\)" 69 lprefix1_expr = "\\((66|!F3)\\)"
70 lprefix2_expr = "\\(F3\\)" 70 lprefix2_expr = "\\(F3\\)"
71 lprefix3_expr = "\\(F2\\)" 71 lprefix3_expr = "\\((F2|!F3)\\)"
72 lprefix_expr = "\\((66|F2|F3)\\)"
72 max_lprefix = 4 73 max_lprefix = 4
73 74
74 # All opcodes starting with lower-case 'v' or with (v1) superscript 75 # All opcodes starting with lower-case 'v' or with (v1) superscript
@@ -333,13 +334,16 @@ function convert_operands(count,opnd, i,j,imm,mod)
333 if (match(ext, lprefix1_expr)) { 334 if (match(ext, lprefix1_expr)) {
334 lptable1[idx] = add_flags(lptable1[idx],flags) 335 lptable1[idx] = add_flags(lptable1[idx],flags)
335 variant = "INAT_VARIANT" 336 variant = "INAT_VARIANT"
336 } else if (match(ext, lprefix2_expr)) { 337 }
338 if (match(ext, lprefix2_expr)) {
337 lptable2[idx] = add_flags(lptable2[idx],flags) 339 lptable2[idx] = add_flags(lptable2[idx],flags)
338 variant = "INAT_VARIANT" 340 variant = "INAT_VARIANT"
339 } else if (match(ext, lprefix3_expr)) { 341 }
342 if (match(ext, lprefix3_expr)) {
340 lptable3[idx] = add_flags(lptable3[idx],flags) 343 lptable3[idx] = add_flags(lptable3[idx],flags)
341 variant = "INAT_VARIANT" 344 variant = "INAT_VARIANT"
342 } else { 345 }
346 if (!match(ext, lprefix_expr)){
343 table[idx] = add_flags(table[idx],flags) 347 table[idx] = add_flags(table[idx],flags)
344 } 348 }
345 } 349 }
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index b43cfcd9bf4..5a1847d6193 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -60,12 +60,31 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
60 "__x86_cpu_dev_(start|end)|" 60 "__x86_cpu_dev_(start|end)|"
61 "(__parainstructions|__alt_instructions)(|_end)|" 61 "(__parainstructions|__alt_instructions)(|_end)|"
62 "(__iommu_table|__apicdrivers|__smp_locks)(|_end)|" 62 "(__iommu_table|__apicdrivers|__smp_locks)(|_end)|"
63 "__(start|end)_pci_.*|"
64 "__(start|end)_builtin_fw|"
65 "__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
66 "__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
67 "__(start|stop)___param|"
68 "__(start|stop)___modver|"
69 "__(start|stop)___bug_table|"
70 "__tracedata_(start|end)|"
71 "__(start|stop)_notes|"
72 "__end_rodata|"
73 "__initramfs_start|"
74 "(jiffies|jiffies_64)|"
63 "_end)$" 75 "_end)$"
64}; 76};
65 77
66 78
67static const char * const sym_regex_realmode[S_NSYMTYPES] = { 79static const char * const sym_regex_realmode[S_NSYMTYPES] = {
68/* 80/*
81 * These symbols are known to be relative, even if the linker marks them
82 * as absolute (typically defined outside any section in the linker script).
83 */
84 [S_REL] =
85 "^pa_",
86
87/*
69 * These are 16-bit segment symbols when compiling 16-bit code. 88 * These are 16-bit segment symbols when compiling 16-bit code.
70 */ 89 */
71 [S_SEG] = 90 [S_SEG] =
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index bb0fb03b9f8..a508cea1350 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -486,7 +486,6 @@ long sys_sigreturn(struct pt_regs *regs)
486 copy_from_user(&set.sig[1], extramask, sig_size)) 486 copy_from_user(&set.sig[1], extramask, sig_size))
487 goto segfault; 487 goto segfault;
488 488
489 sigdelsetmask(&set, ~_BLOCKABLE);
490 set_current_blocked(&set); 489 set_current_blocked(&set);
491 490
492 if (copy_sc_from_user(&current->thread.regs, sc)) 491 if (copy_sc_from_user(&current->thread.regs, sc))
@@ -600,7 +599,6 @@ long sys_rt_sigreturn(struct pt_regs *regs)
600 if (copy_from_user(&set, &uc->uc_sigmask, sizeof(set))) 599 if (copy_from_user(&set, &uc->uc_sigmask, sizeof(set)))
601 goto segfault; 600 goto segfault;
602 601
603 sigdelsetmask(&set, ~_BLOCKABLE);
604 set_current_blocked(&set); 602 set_current_blocked(&set);
605 603
606 if (copy_sc_from_user(&current->thread.regs, &uc->uc_mcontext)) 604 if (copy_sc_from_user(&current->thread.regs, &uc->uc_mcontext))
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 416bd40c0eb..68d1dc91b37 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -39,9 +39,9 @@
39#undef __SYSCALL_I386 39#undef __SYSCALL_I386
40#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, 40#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym,
41 41
42typedef void (*sys_call_ptr_t)(void); 42typedef asmlinkage void (*sys_call_ptr_t)(void);
43 43
44extern void sys_ni_syscall(void); 44extern asmlinkage void sys_ni_syscall(void);
45 45
46const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { 46const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
47 /* 47 /*
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 66e6d935982..0faad646f5f 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -205,9 +205,9 @@ void syscall32_cpu_init(void)
205{ 205{
206 /* Load these always in case some future AMD CPU supports 206 /* Load these always in case some future AMD CPU supports
207 SYSENTER from compat mode too. */ 207 SYSENTER from compat mode too. */
208 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); 208 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
209 checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); 209 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
210 checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); 210 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
211 211
212 wrmsrl(MSR_CSTAR, ia32_cstar_target); 212 wrmsrl(MSR_CSTAR, ia32_cstar_target);
213} 213}
diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c
index c5ffb6ac870..d5644bbe8cb 100644
--- a/arch/x86/video/fbdev.c
+++ b/arch/x86/video/fbdev.c
@@ -9,24 +9,34 @@
9#include <linux/fb.h> 9#include <linux/fb.h>
10#include <linux/pci.h> 10#include <linux/pci.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/vgaarb.h>
12 13
13int fb_is_primary_device(struct fb_info *info) 14int fb_is_primary_device(struct fb_info *info)
14{ 15{
15 struct device *device = info->device; 16 struct device *device = info->device;
16 struct pci_dev *pci_dev = NULL; 17 struct pci_dev *pci_dev = NULL;
18 struct pci_dev *default_device = vga_default_device();
17 struct resource *res = NULL; 19 struct resource *res = NULL;
18 int retval = 0;
19 20
20 if (device) 21 if (device)
21 pci_dev = to_pci_dev(device); 22 pci_dev = to_pci_dev(device);
22 23
23 if (pci_dev) 24 if (!pci_dev)
24 res = &pci_dev->resource[PCI_ROM_RESOURCE]; 25 return 0;
26
27 if (default_device) {
28 if (pci_dev == default_device)
29 return 1;
30 else
31 return 0;
32 }
33
34 res = &pci_dev->resource[PCI_ROM_RESOURCE];
25 35
26 if (res && res->flags & IORESOURCE_ROM_SHADOW) 36 if (res && res->flags & IORESOURCE_ROM_SHADOW)
27 retval = 1; 37 return 1;
28 38
29 return retval; 39 return 0;
30} 40}
31EXPORT_SYMBOL(fb_is_primary_device); 41EXPORT_SYMBOL(fb_is_primary_device);
32MODULE_LICENSE("GPL"); 42MODULE_LICENSE("GPL");
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index ef1db1900d8..c8377fb26cd 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -19,107 +19,3 @@ struct dentry * __init xen_init_debugfs(void)
19 return d_xen_debug; 19 return d_xen_debug;
20} 20}
21 21
22struct array_data
23{
24 void *array;
25 unsigned elements;
26};
27
28static int u32_array_open(struct inode *inode, struct file *file)
29{
30 file->private_data = NULL;
31 return nonseekable_open(inode, file);
32}
33
34static size_t format_array(char *buf, size_t bufsize, const char *fmt,
35 u32 *array, unsigned array_size)
36{
37 size_t ret = 0;
38 unsigned i;
39
40 for(i = 0; i < array_size; i++) {
41 size_t len;
42
43 len = snprintf(buf, bufsize, fmt, array[i]);
44 len++; /* ' ' or '\n' */
45 ret += len;
46
47 if (buf) {
48 buf += len;
49 bufsize -= len;
50 buf[-1] = (i == array_size-1) ? '\n' : ' ';
51 }
52 }
53
54 ret++; /* \0 */
55 if (buf)
56 *buf = '\0';
57
58 return ret;
59}
60
61static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
62{
63 size_t len = format_array(NULL, 0, fmt, array, array_size);
64 char *ret;
65
66 ret = kmalloc(len, GFP_KERNEL);
67 if (ret == NULL)
68 return NULL;
69
70 format_array(ret, len, fmt, array, array_size);
71 return ret;
72}
73
74static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
75 loff_t *ppos)
76{
77 struct inode *inode = file->f_path.dentry->d_inode;
78 struct array_data *data = inode->i_private;
79 size_t size;
80
81 if (*ppos == 0) {
82 if (file->private_data) {
83 kfree(file->private_data);
84 file->private_data = NULL;
85 }
86
87 file->private_data = format_array_alloc("%u", data->array, data->elements);
88 }
89
90 size = 0;
91 if (file->private_data)
92 size = strlen(file->private_data);
93
94 return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
95}
96
97static int xen_array_release(struct inode *inode, struct file *file)
98{
99 kfree(file->private_data);
100
101 return 0;
102}
103
104static const struct file_operations u32_array_fops = {
105 .owner = THIS_MODULE,
106 .open = u32_array_open,
107 .release= xen_array_release,
108 .read = u32_array_read,
109 .llseek = no_llseek,
110};
111
112struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
113 struct dentry *parent,
114 u32 *array, unsigned elements)
115{
116 struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
117
118 if (data == NULL)
119 return NULL;
120
121 data->array = array;
122 data->elements = elements;
123
124 return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
125}
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
index 78d25499be5..12ebf3325c7 100644
--- a/arch/x86/xen/debugfs.h
+++ b/arch/x86/xen/debugfs.h
@@ -3,8 +3,4 @@
3 3
4struct dentry * __init xen_init_debugfs(void); 4struct dentry * __init xen_init_debugfs(void);
5 5
6struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
7 struct dentry *parent,
8 u32 *array, unsigned elements);
9
10#endif /* _XEN_DEBUGFS_H */ 6#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c0f5facdb10..bf4bda6d3e9 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
31#include <linux/pci.h> 31#include <linux/pci.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/memblock.h> 33#include <linux/memblock.h>
34#include <linux/syscore_ops.h>
34 35
35#include <xen/xen.h> 36#include <xen/xen.h>
36#include <xen/interface/xen.h> 37#include <xen/interface/xen.h>
@@ -38,10 +39,12 @@
38#include <xen/interface/physdev.h> 39#include <xen/interface/physdev.h>
39#include <xen/interface/vcpu.h> 40#include <xen/interface/vcpu.h>
40#include <xen/interface/memory.h> 41#include <xen/interface/memory.h>
42#include <xen/interface/xen-mca.h>
41#include <xen/features.h> 43#include <xen/features.h>
42#include <xen/page.h> 44#include <xen/page.h>
43#include <xen/hvm.h> 45#include <xen/hvm.h>
44#include <xen/hvc-console.h> 46#include <xen/hvc-console.h>
47#include <xen/acpi.h>
45 48
46#include <asm/paravirt.h> 49#include <asm/paravirt.h>
47#include <asm/apic.h> 50#include <asm/apic.h>
@@ -75,6 +78,7 @@
75 78
76#include "xen-ops.h" 79#include "xen-ops.h"
77#include "mmu.h" 80#include "mmu.h"
81#include "smp.h"
78#include "multicalls.h" 82#include "multicalls.h"
79 83
80EXPORT_SYMBOL_GPL(hypercall_page); 84EXPORT_SYMBOL_GPL(hypercall_page);
@@ -105,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
105 * Point at some empty memory to start with. We map the real shared_info 109 * Point at some empty memory to start with. We map the real shared_info
106 * page as soon as fixmap is up and running. 110 * page as soon as fixmap is up and running.
107 */ 111 */
108struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; 112struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
109 113
110/* 114/*
111 * Flag to determine whether vcpu info placement is available on all 115 * Flag to determine whether vcpu info placement is available on all
@@ -122,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
122 */ 126 */
123static int have_vcpu_info_placement = 1; 127static int have_vcpu_info_placement = 1;
124 128
129struct tls_descs {
130 struct desc_struct desc[3];
131};
132
133/*
134 * Updating the 3 TLS descriptors in the GDT on every task switch is
135 * surprisingly expensive, so we avoid updating them if they haven't
136 * changed. Since Xen writes different descriptors than the ones
137 * passed in the update_descriptor hypercall, we keep shadow copies to
138 * compare against.
139 */
140static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
141
125static void clamp_max_cpus(void) 142static void clamp_max_cpus(void)
126{ 143{
127#ifdef CONFIG_SMP 144#ifdef CONFIG_SMP
@@ -207,6 +224,9 @@ static void __init xen_banner(void)
207 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); 224 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
208} 225}
209 226
227#define CPUID_THERM_POWER_LEAF 6
228#define APERFMPERF_PRESENT 0
229
210static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; 230static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
211static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; 231static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
212 232
@@ -240,6 +260,11 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
240 *dx = cpuid_leaf5_edx_val; 260 *dx = cpuid_leaf5_edx_val;
241 return; 261 return;
242 262
263 case CPUID_THERM_POWER_LEAF:
264 /* Disabling APERFMPERF for kernel usage */
265 maskecx = ~(1 << APERFMPERF_PRESENT);
266 break;
267
243 case 0xb: 268 case 0xb:
244 /* Suppress extended topology stuff */ 269 /* Suppress extended topology stuff */
245 maskebx = 0; 270 maskebx = 0;
@@ -331,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
331 unsigned int xsave_mask; 356 unsigned int xsave_mask;
332 357
333 cpuid_leaf1_edx_mask = 358 cpuid_leaf1_edx_mask =
334 ~((1 << X86_FEATURE_MCE) | /* disable MCE */ 359 ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
335 (1 << X86_FEATURE_MCA) | /* disable MCA */
336 (1 << X86_FEATURE_MTRR) | /* disable MTRR */
337 (1 << X86_FEATURE_ACC)); /* thermal monitoring */ 360 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
338 361
339 if (!xen_initial_domain()) 362 if (!xen_initial_domain())
@@ -530,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
530 BUG(); 553 BUG();
531} 554}
532 555
556static inline bool desc_equal(const struct desc_struct *d1,
557 const struct desc_struct *d2)
558{
559 return d1->a == d2->a && d1->b == d2->b;
560}
561
533static void load_TLS_descriptor(struct thread_struct *t, 562static void load_TLS_descriptor(struct thread_struct *t,
534 unsigned int cpu, unsigned int i) 563 unsigned int cpu, unsigned int i)
535{ 564{
536 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 565 struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
537 xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); 566 struct desc_struct *gdt;
538 struct multicall_space mc = __xen_mc_entry(0); 567 xmaddr_t maddr;
568 struct multicall_space mc;
569
570 if (desc_equal(shadow, &t->tls_array[i]))
571 return;
572
573 *shadow = t->tls_array[i];
574
575 gdt = get_cpu_gdt_table(cpu);
576 maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
577 mc = __xen_mc_entry(0);
539 578
540 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); 579 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
541} 580}
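The pattern here is a classic write-through shadow cache: compare against a local copy and skip the hypercall when nothing changed. A self-contained model of the optimisation (the counter and descriptor layout are illustrative):

#include <stdio.h>
#include <string.h>

/* Model: load_tls_desc() runs on every task switch, but the expensive
 * "hypercall" only happens when a descriptor differs from the shadow. */
struct desc { unsigned int a, b; };

static struct desc shadow[3];
static int hypercalls;

static void load_tls_desc(int i, const struct desc *d)
{
    if (memcmp(&shadow[i], d, sizeof(*d)) == 0)
        return;                    /* unchanged: skip the update */
    shadow[i] = *d;
    hypercalls++;                  /* stand-in for the multicall */
}

int main(void)
{
    struct desc d = { 0x12345678, 0x00cf9a00 };

    load_tls_desc(0, &d);          /* first switch: issues the update */
    load_tls_desc(0, &d);          /* same descriptor: served from cache */
    printf("hypercalls = %d\n", hypercalls);   /* prints 1 */
    return 0;
}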
@@ -617,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
617 /* 656 /*
618 * Look for known traps using IST, and substitute them 657 * Look for known traps using IST, and substitute them
619 * appropriately. The debugger ones are the only ones we care 658 * appropriately. The debugger ones are the only ones we care
620 * about. Xen will handle faults like double_fault and 659 * about. Xen will handle faults like double_fault,
621 * machine_check, so we should never see them. Warn if 660 * so we should never see them. Warn if
622 * there's an unexpected IST-using fault handler. 661 * there's an unexpected IST-using fault handler.
623 */ 662 */
624 if (addr == (unsigned long)debug) 663 if (addr == (unsigned long)debug)
@@ -633,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
633 return 0; 672 return 0;
634#ifdef CONFIG_X86_MCE 673#ifdef CONFIG_X86_MCE
635 } else if (addr == (unsigned long)machine_check) { 674 } else if (addr == (unsigned long)machine_check) {
636 return 0; 675 /*
676 * When the Xen hypervisor injects a vMCE into the guest,
677 * use the native MCE handler to handle it.
678 */
679 ;
637#endif 680#endif
638 } else { 681 } else {
639 /* Some other trap using IST? */ 682 /* Some other trap using IST? */
@@ -883,6 +926,14 @@ static void set_xen_basic_apic_ops(void)
883 apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle; 926 apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
884 apic->set_apic_id = xen_set_apic_id; 927 apic->set_apic_id = xen_set_apic_id;
885 apic->get_apic_id = xen_get_apic_id; 928 apic->get_apic_id = xen_get_apic_id;
929
930#ifdef CONFIG_SMP
931 apic->send_IPI_allbutself = xen_send_IPI_allbutself;
932 apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
933 apic->send_IPI_mask = xen_send_IPI_mask;
934 apic->send_IPI_all = xen_send_IPI_all;
935 apic->send_IPI_self = xen_send_IPI_self;
936#endif
886} 937}
887 938
888#endif 939#endif
@@ -1107,6 +1158,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
1107 1158
1108 .read_msr = native_read_msr_safe, 1159 .read_msr = native_read_msr_safe,
1109 .write_msr = xen_write_msr_safe, 1160 .write_msr = xen_write_msr_safe,
1161
1110 .read_tsc = native_read_tsc, 1162 .read_tsc = native_read_tsc,
1111 .read_pmc = native_read_pmc, 1163 .read_pmc = native_read_pmc,
1112 1164
@@ -1340,7 +1392,6 @@ asmlinkage void __init xen_start_kernel(void)
1340 1392
1341 xen_raw_console_write("mapping kernel into physical memory\n"); 1393 xen_raw_console_write("mapping kernel into physical memory\n");
1342 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); 1394 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1343 xen_ident_map_ISA();
1344 1395
1345 /* Allocate and initialize top and mid mfn levels for p2m structure */ 1396 /* Allocate and initialize top and mid mfn levels for p2m structure */
1346 xen_build_mfn_list_list(); 1397 xen_build_mfn_list_list();
@@ -1400,6 +1451,8 @@ asmlinkage void __init xen_start_kernel(void)
1400 1451
1401 /* Make sure ACS will be enabled */ 1452 /* Make sure ACS will be enabled */
1402 pci_request_acs(); 1453 pci_request_acs();
1454
1455 xen_acpi_sleep_register();
1403 } 1456 }
1404#ifdef CONFIG_PCI 1457#ifdef CONFIG_PCI
1405 /* PCI BIOS service won't work from a PV guest. */ 1458 /* PCI BIOS service won't work from a PV guest. */
@@ -1417,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void)
1417#endif 1470#endif
1418} 1471}
1419 1472
1420static int init_hvm_pv_info(int *major, int *minor) 1473#ifdef CONFIG_XEN_PVHVM
1421{ 1474/*
1422 uint32_t eax, ebx, ecx, edx, pages, msr, base; 1475 * The pfn containing the shared_info is located somewhere in RAM. This
1423 u64 pfn; 1476 * will cause trouble if the current kernel is doing a kexec boot into a
1424 1477 * new kernel. The new kernel (and its startup code) cannot know where
1425 base = xen_cpuid_base(); 1478 * the pfn is, so it cannot reserve the page. The hypervisor will
1426 cpuid(base + 1, &eax, &ebx, &ecx, &edx); 1479 * continue to update the pfn, and as a result memory corruption occurs
1427 1480 * in the new kernel.
1428 *major = eax >> 16; 1481 *
1429 *minor = eax & 0xffff; 1482 * One way to work around this issue is to allocate a page in the
1430 printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); 1483 * xen-platform pci device's BAR memory range. But pci init is done very
1431 1484 * late and the shared_info page is already in use very early to read
1432 cpuid(base + 2, &pages, &msr, &ecx, &edx); 1485 * the pvclock. So moving the pfn from RAM to MMIO is racy because some
1433 1486 * code paths on other vcpus could access the pfn during the small
1434 pfn = __pa(hypercall_page); 1487 * window when the old pfn is moved to the new pfn. There is even a
1435 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); 1488 * small window where the old pfn is not backed by an mfn, and during that
1436 1489 * time all reads return -1.
1437 xen_setup_features(); 1490 *
1438 1491 * Because it is not known upfront where the MMIO region is located, it
1439 pv_info.name = "Xen HVM"; 1492 * cannot be used right from the start in xen_hvm_init_shared_info.
1440 1493 *
1441 xen_domain_type = XEN_HVM_DOMAIN; 1494 * To minimise trouble, the move of the pfn is done shortly before kexec.
 1495 * This does not eliminate the race because all vcpus are still online
 1496 * when the syscore_ops are called. But hopefully there is no work
 1497 * pending at this point in time. Also the syscore_op is run last, which
1498 * reduces the risk further.
1499 */
1442 1500
1443 return 0; 1501static struct shared_info *xen_hvm_shared_info;
1444}
1445 1502
1446void __ref xen_hvm_init_shared_info(void) 1503static void xen_hvm_connect_shared_info(unsigned long pfn)
1447{ 1504{
1448 int cpu;
1449 struct xen_add_to_physmap xatp; 1505 struct xen_add_to_physmap xatp;
1450 static struct shared_info *shared_info_page = 0;
1451 1506
1452 if (!shared_info_page)
1453 shared_info_page = (struct shared_info *)
1454 extend_brk(PAGE_SIZE, PAGE_SIZE);
1455 xatp.domid = DOMID_SELF; 1507 xatp.domid = DOMID_SELF;
1456 xatp.idx = 0; 1508 xatp.idx = 0;
1457 xatp.space = XENMAPSPACE_shared_info; 1509 xatp.space = XENMAPSPACE_shared_info;
1458 xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; 1510 xatp.gpfn = pfn;
1459 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) 1511 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
1460 BUG(); 1512 BUG();
1461 1513
1462 HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; 1514}
1515static void xen_hvm_set_shared_info(struct shared_info *sip)
1516{
1517 int cpu;
1518
1519 HYPERVISOR_shared_info = sip;
1463 1520
1464 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info 1521 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
1465 * page, we use it in the event channel upcall and in some pvclock 1522 * page, we use it in the event channel upcall and in some pvclock
1466 * related functions. We don't need the vcpu_info placement 1523 * related functions. We don't need the vcpu_info placement
1467 * optimizations because we don't use any pv_mmu or pv_irq op on 1524 * optimizations because we don't use any pv_mmu or pv_irq op on
1468 * HVM. 1525 * HVM.
1469 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is 1526 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
1470 * online but xen_hvm_init_shared_info is run at resume time too and 1527 * online but xen_hvm_set_shared_info is run at resume time too and
1471 * in that case multiple vcpus might be online. */ 1528 * in that case multiple vcpus might be online. */
1472 for_each_online_cpu(cpu) { 1529 for_each_online_cpu(cpu) {
1473 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1530 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1474 } 1531 }
1475} 1532}
1476 1533
1477#ifdef CONFIG_XEN_PVHVM 1534/* Reconnect the shared_info pfn to a mfn */
1535void xen_hvm_resume_shared_info(void)
1536{
1537 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1538}
1539
1540#ifdef CONFIG_KEXEC
1541static struct shared_info *xen_hvm_shared_info_kexec;
1542static unsigned long xen_hvm_shared_info_pfn_kexec;
1543
1544/* Remember a pfn in MMIO space for kexec reboot */
1545void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
1546{
1547 xen_hvm_shared_info_kexec = sip;
1548 xen_hvm_shared_info_pfn_kexec = pfn;
1549}
1550
1551static void xen_hvm_syscore_shutdown(void)
1552{
1553 struct xen_memory_reservation reservation = {
1554 .domid = DOMID_SELF,
1555 .nr_extents = 1,
1556 };
1557 unsigned long prev_pfn;
1558 int rc;
1559
1560 if (!xen_hvm_shared_info_kexec)
1561 return;
1562
1563 prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
1564 set_xen_guest_handle(reservation.extent_start, &prev_pfn);
1565
1566 /* Move pfn to MMIO, disconnects previous pfn from mfn */
1567 xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
1568
 1569 /* Update pointers; the following hypercall is also a memory barrier */
1570 xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
1571
1572 /* Allocate new mfn for previous pfn */
1573 do {
1574 rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
1575 if (rc == 0)
1576 msleep(123);
1577 } while (rc == 0);
1578
1579 /* Make sure the previous pfn is really connected to a (new) mfn */
1580 BUG_ON(rc != 1);
1581}
1582
1583static struct syscore_ops xen_hvm_syscore_ops = {
1584 .shutdown = xen_hvm_syscore_shutdown,
1585};
1586#endif
1587
 1588/* Use a pfn in RAM; it may move to MMIO before kexec. */
1589static void __init xen_hvm_init_shared_info(void)
1590{
1591 /* Remember pointer for resume */
1592 xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
1593 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1594 xen_hvm_set_shared_info(xen_hvm_shared_info);
1595}
1596
1597static void __init init_hvm_pv_info(void)
1598{
1599 int major, minor;
1600 uint32_t eax, ebx, ecx, edx, pages, msr, base;
1601 u64 pfn;
1602
1603 base = xen_cpuid_base();
1604 cpuid(base + 1, &eax, &ebx, &ecx, &edx);
1605
1606 major = eax >> 16;
1607 minor = eax & 0xffff;
1608 printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
1609
1610 cpuid(base + 2, &pages, &msr, &ecx, &edx);
1611
1612 pfn = __pa(hypercall_page);
1613 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
1614
1615 xen_setup_features();
1616
1617 pv_info.name = "Xen HVM";
1618
1619 xen_domain_type = XEN_HVM_DOMAIN;
1620}
1621
1478static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, 1622static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1479 unsigned long action, void *hcpu) 1623 unsigned long action, void *hcpu)
1480{ 1624{
@@ -1497,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
1497 1641
1498static void __init xen_hvm_guest_init(void) 1642static void __init xen_hvm_guest_init(void)
1499{ 1643{
1500 int r; 1644 init_hvm_pv_info();
1501 int major, minor;
1502
1503 r = init_hvm_pv_info(&major, &minor);
1504 if (r < 0)
1505 return;
1506 1645
1507 xen_hvm_init_shared_info(); 1646 xen_hvm_init_shared_info();
1647#ifdef CONFIG_KEXEC
1648 register_syscore_ops(&xen_hvm_syscore_ops);
1649#endif
1508 1650
1509 if (xen_feature(XENFEAT_hvm_callback_vector)) 1651 if (xen_feature(XENFEAT_hvm_callback_vector))
1510 xen_have_vector_callback = 1; 1652 xen_have_vector_callback = 1;
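
No caller of xen_hvm_prepare_kexec() appears in this hunk; per the comment above, the MMIO page is expected to come from the xen-platform PCI device's BAR. A hedged sketch of what such a caller might look like (the function name, BAR handling and error handling are illustrative only, not the actual platform-pci code):

/* Illustrative only: hand one page of the xen-platform device's MMIO
 * BAR to the kexec machinery; shared_info is moved there by
 * xen_hvm_syscore_shutdown() just before the kexec reboot. */
static void example_platform_pci_setup_kexec(unsigned long bar_phys)
{
        struct shared_info *sip =
                (struct shared_info *)ioremap(bar_phys, PAGE_SIZE);

        if (sip)
                xen_hvm_prepare_kexec(sip, bar_phys >> PAGE_SHIFT);
}
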
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3506cd4f9a4..27336dfcda8 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
308 308
309static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) 309static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
310{ 310{
311 if (!xen_batched_set_pte(ptep, pteval)) 311 if (!xen_batched_set_pte(ptep, pteval)) {
312 native_set_pte(ptep, pteval); 312 /*
313 * Could call native_set_pte() here and trap and
314 * emulate the PTE write but with 32-bit guests this
315 * needs two traps (one for each of the two 32-bit
316 * words in the PTE) so do one hypercall directly
317 * instead.
318 */
319 struct mmu_update u;
320
321 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
322 u.val = pte_val_ma(pteval);
323 HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
324 }
313} 325}
314 326
315static void xen_set_pte(pte_t *ptep, pte_t pteval) 327static void xen_set_pte(pte_t *ptep, pte_t pteval)
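
The hypercall interface used in __xen_set_pte() above also accepts an array of requests, so several PTE writes can share a single trap; a minimal sketch under that assumption (update_ptes() is a hypothetical helper, not part of this patch):

/* Hypothetical helper: write 'count' PTEs with one
 * HYPERVISOR_mmu_update hypercall instead of one trap per
 * 32-bit word of each PTE. */
static void update_ptes(pte_t **pteps, pte_t *ptevals, int count)
{
        struct mmu_update u[8];
        int i;

        BUG_ON(count > ARRAY_SIZE(u));
        for (i = 0; i < count; i++) {
                u[i].ptr = virt_to_machine(pteps[i]).maddr | MMU_NORMAL_PT_UPDATE;
                u[i].val = pte_val_ma(ptevals[i]);
        }
        HYPERVISOR_mmu_update(u, count, NULL, DOMID_SELF);
}
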
@@ -1416,13 +1428,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1416} 1428}
1417#endif /* CONFIG_X86_64 */ 1429#endif /* CONFIG_X86_64 */
1418 1430
1419/* Init-time set_pte while constructing initial pagetables, which 1431/*
1420 doesn't allow RO pagetable pages to be remapped RW */ 1432 * Init-time set_pte while constructing initial pagetables, which
1433 * doesn't allow RO page table pages to be remapped RW.
1434 *
1435 * If there is no MFN for this PFN then this page is initially
1436 * ballooned out so clear the PTE (as in decrease_reservation() in
1437 * drivers/xen/balloon.c).
1438 *
1439 * Many of these PTE updates are done on unpinned and writable pages
1440 * and doing a hypercall for these is unnecessary and expensive. At
1441 * this point it is not possible to tell if a page is pinned or not,
1442 * so always write the PTE directly and rely on Xen trapping and
1443 * emulating any updates as necessary.
1444 */
1421static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) 1445static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1422{ 1446{
1423 pte = mask_rw_pte(ptep, pte); 1447 if (pte_mfn(pte) != INVALID_P2M_ENTRY)
1448 pte = mask_rw_pte(ptep, pte);
1449 else
1450 pte = __pte_ma(0);
1424 1451
1425 xen_set_pte(ptep, pte); 1452 native_set_pte(ptep, pte);
1426} 1453}
1427 1454
1428static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1455static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
@@ -1933,29 +1960,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1933#endif 1960#endif
1934} 1961}
1935 1962
1936void __init xen_ident_map_ISA(void)
1937{
1938 unsigned long pa;
1939
1940 /*
1941 * If we're dom0, then linear map the ISA machine addresses into
1942 * the kernel's address space.
1943 */
1944 if (!xen_initial_domain())
1945 return;
1946
1947 xen_raw_printk("Xen: setup ISA identity maps\n");
1948
1949 for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
1950 pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
1951
1952 if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
1953 BUG();
1954 }
1955
1956 xen_flush_tlb();
1957}
1958
1959static void __init xen_post_allocator_init(void) 1963static void __init xen_post_allocator_init(void)
1960{ 1964{
1961 pv_mmu_ops.set_pte = xen_set_pte; 1965 pv_mmu_ops.set_pte = xen_set_pte;
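
xen_post_allocator_init() re-points pv_mmu_ops.set_pte from the init-time variant to the normal one; a sketch of the two-phase setup this implies (the ordering shown is the intent, not literal code from this patch):

static void example_set_pte_phases(void)
{
        /* Early boot: native PTE writes; Xen traps and emulates
         * updates to pinned/RO page table pages as needed. */
        pv_mmu_ops.set_pte = xen_set_pte_init;

        /* ... initial page tables are constructed ... */

        /* Once the allocator is up, switch to the hypercall path. */
        pv_mmu_ops.set_pte = xen_set_pte;
}
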
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 1b267e75158..64effdc6da9 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -499,16 +499,18 @@ static bool alloc_p2m(unsigned long pfn)
499 return true; 499 return true;
500} 500}
501 501
502static bool __init __early_alloc_p2m(unsigned long pfn) 502static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary)
503{ 503{
504 unsigned topidx, mididx, idx; 504 unsigned topidx, mididx, idx;
505 unsigned long *p2m;
506 unsigned long *mid_mfn_p;
505 507
506 topidx = p2m_top_index(pfn); 508 topidx = p2m_top_index(pfn);
507 mididx = p2m_mid_index(pfn); 509 mididx = p2m_mid_index(pfn);
508 idx = p2m_index(pfn); 510 idx = p2m_index(pfn);
509 511
 510 /* Pfff.. No boundary cross-over, let's get out. */ 512
511 if (!idx) 513 if (!idx && check_boundary)
512 return false; 514 return false;
513 515
514 WARN(p2m_top[topidx][mididx] == p2m_identity, 516 WARN(p2m_top[topidx][mididx] == p2m_identity,
@@ -522,24 +524,66 @@ static bool __init __early_alloc_p2m(unsigned long pfn)
522 return false; 524 return false;
523 525
524 /* Boundary cross-over for the edges: */ 526 /* Boundary cross-over for the edges: */
525 if (idx) { 527 p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
526 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
527 unsigned long *mid_mfn_p;
528 528
529 p2m_init(p2m); 529 p2m_init(p2m);
530 530
531 p2m_top[topidx][mididx] = p2m; 531 p2m_top[topidx][mididx] = p2m;
532 532
 533 /* For save/restore we need the MFN of the P2M saved */ 533
534 534
535 mid_mfn_p = p2m_top_mfn_p[topidx]; 535 mid_mfn_p = p2m_top_mfn_p[topidx];
536 WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), 536 WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
537 "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", 537 "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
538 topidx, mididx); 538 topidx, mididx);
539 mid_mfn_p[mididx] = virt_to_mfn(p2m); 539 mid_mfn_p[mididx] = virt_to_mfn(p2m);
540
541 return true;
542}
543
544static bool __init early_alloc_p2m(unsigned long pfn)
545{
546 unsigned topidx = p2m_top_index(pfn);
547 unsigned long *mid_mfn_p;
548 unsigned long **mid;
549
550 mid = p2m_top[topidx];
551 mid_mfn_p = p2m_top_mfn_p[topidx];
552 if (mid == p2m_mid_missing) {
553 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
554
555 p2m_mid_init(mid);
540 556
557 p2m_top[topidx] = mid;
558
559 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
560 }
561 /* And the save/restore P2M tables.. */
562 if (mid_mfn_p == p2m_mid_missing_mfn) {
563 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
564 p2m_mid_mfn_init(mid_mfn_p);
565
566 p2m_top_mfn_p[topidx] = mid_mfn_p;
567 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
 568 /* Note: we don't set mid_mfn_p[mididx] here,
569 * look in early_alloc_p2m_middle */
541 } 570 }
542 return idx != 0; 571 return true;
572}
573bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
574{
575 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
576 if (!early_alloc_p2m(pfn))
577 return false;
578
 579 if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK! */))
580 return false;
581
582 if (!__set_phys_to_machine(pfn, mfn))
583 return false;
584 }
585
586 return true;
543} 587}
544unsigned long __init set_phys_range_identity(unsigned long pfn_s, 588unsigned long __init set_phys_range_identity(unsigned long pfn_s,
545 unsigned long pfn_e) 589 unsigned long pfn_e)
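
The two-step allocation above mirrors the three-level p2m layout; the index helpers decompose a pfn roughly as in this sketch (512 entries per level is assumed, which holds for x86-64 with 4 KiB pages and 8-byte entries):

static void example_p2m_decompose(unsigned long pfn)
{
        /* p2m_top_index() / p2m_mid_index() / p2m_index() equivalents */
        unsigned topidx = pfn / (512 * 512);
        unsigned mididx = (pfn / 512) % 512;
        unsigned idx = pfn % 512;

        /* The mfn then lives at p2m_top[topidx][mididx][idx];
         * early_alloc_p2m() fills in the mid level and
         * early_alloc_p2m_middle() fills in the leaf page. */
        printk(KERN_DEBUG "pfn %lx -> top %u mid %u idx %u\n",
               pfn, topidx, mididx, idx);
}
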
@@ -559,35 +603,11 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
559 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); 603 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
560 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) 604 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
561 { 605 {
562 unsigned topidx = p2m_top_index(pfn); 606 WARN_ON(!early_alloc_p2m(pfn));
563 unsigned long *mid_mfn_p;
564 unsigned long **mid;
565
566 mid = p2m_top[topidx];
567 mid_mfn_p = p2m_top_mfn_p[topidx];
568 if (mid == p2m_mid_missing) {
569 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
570
571 p2m_mid_init(mid);
572
573 p2m_top[topidx] = mid;
574
575 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
576 }
577 /* And the save/restore P2M tables.. */
578 if (mid_mfn_p == p2m_mid_missing_mfn) {
579 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
580 p2m_mid_mfn_init(mid_mfn_p);
581
582 p2m_top_mfn_p[topidx] = mid_mfn_p;
583 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
584 /* Note: we don't set mid_mfn_p[midix] here,
585 * look in __early_alloc_p2m */
586 }
587 } 607 }
588 608
589 __early_alloc_p2m(pfn_s); 609 early_alloc_p2m_middle(pfn_s, true);
590 __early_alloc_p2m(pfn_e); 610 early_alloc_p2m_middle(pfn_e, true);
591 611
592 for (pfn = pfn_s; pfn < pfn_e; pfn++) 612 for (pfn = pfn_s; pfn < pfn_e; pfn++)
593 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) 613 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
@@ -686,6 +706,7 @@ int m2p_add_override(unsigned long mfn, struct page *page,
686 unsigned long uninitialized_var(address); 706 unsigned long uninitialized_var(address);
687 unsigned level; 707 unsigned level;
688 pte_t *ptep = NULL; 708 pte_t *ptep = NULL;
709 int ret = 0;
689 710
690 pfn = page_to_pfn(page); 711 pfn = page_to_pfn(page);
691 if (!PageHighMem(page)) { 712 if (!PageHighMem(page)) {
@@ -721,6 +742,24 @@ int m2p_add_override(unsigned long mfn, struct page *page,
721 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); 742 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
722 spin_unlock_irqrestore(&m2p_override_lock, flags); 743 spin_unlock_irqrestore(&m2p_override_lock, flags);
723 744
745 /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
746 * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
747 * pfn so that the following mfn_to_pfn(mfn) calls will return the
748 * pfn from the m2p_override (the backend pfn) instead.
749 * We need to do this because the pages shared by the frontend
 750 * (xen-blkfront) can already be locked (lock_page, called by
751 * do_read_cache_page); when the userspace backend tries to use them
752 * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
753 * do_blockdev_direct_IO is going to try to lock the same pages
754 * again resulting in a deadlock.
 755 * As a side effect, get_user_pages_fast might not be safe on the
756 * frontend pages while they are being shared with the backend,
757 * because mfn_to_pfn (that ends up being called by GUPF) will
758 * return the backend pfn rather than the frontend pfn. */
759 ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
760 if (ret == 0 && get_phys_to_machine(pfn) == mfn)
761 set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
762
724 return 0; 763 return 0;
725} 764}
726EXPORT_SYMBOL_GPL(m2p_add_override); 765EXPORT_SYMBOL_GPL(m2p_add_override);
@@ -732,6 +771,7 @@ int m2p_remove_override(struct page *page, bool clear_pte)
732 unsigned long uninitialized_var(address); 771 unsigned long uninitialized_var(address);
733 unsigned level; 772 unsigned level;
734 pte_t *ptep = NULL; 773 pte_t *ptep = NULL;
774 int ret = 0;
735 775
736 pfn = page_to_pfn(page); 776 pfn = page_to_pfn(page);
737 mfn = get_phys_to_machine(pfn); 777 mfn = get_phys_to_machine(pfn);
@@ -801,6 +841,22 @@ int m2p_remove_override(struct page *page, bool clear_pte)
801 } else 841 } else
802 set_phys_to_machine(pfn, page->index); 842 set_phys_to_machine(pfn, page->index);
803 843
844 /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present
845 * somewhere in this domain, even before being added to the
846 * m2p_override (see comment above in m2p_add_override).
847 * If there are no other entries in the m2p_override corresponding
848 * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for
849 * the original pfn (the one shared by the frontend): the backend
850 * cannot do any IO on this page anymore because it has been
851 * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of
852 * the original pfn causes mfn_to_pfn(mfn) to return the frontend
853 * pfn again. */
854 mfn &= ~FOREIGN_FRAME_BIT;
855 ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
856 if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
857 m2p_find_override(mfn) == NULL)
858 set_phys_to_machine(pfn, mfn);
859
804 return 0; 860 return 0;
805} 861}
806EXPORT_SYMBOL_GPL(m2p_remove_override); 862EXPORT_SYMBOL_GPL(m2p_remove_override);
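
Taken together, the two new comments describe the invariant these hunks maintain; a sketch of the lookup they make possible (illustrative only; the real mfn_to_pfn() also guards the m2p read with __get_user, as above):

static unsigned long example_mfn_to_pfn(unsigned long mfn)
{
        unsigned long pfn = machine_to_phys_mapping[mfn];

        /* While an override is installed the p2m holds
         * FOREIGN_FRAME(mfn), so resolve through the override to the
         * backend pfn; after m2p_remove_override() the plain m2p
         * answer (the frontend pfn) is valid again. */
        if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn)) {
                struct page *page = m2p_find_override(mfn);

                if (page)
                        return page_to_pfn(page);
        }
        return pfn;
}
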
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 1ba8dff2675..ead85576d54 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -26,7 +26,6 @@
26#include <xen/interface/memory.h> 26#include <xen/interface/memory.h>
27#include <xen/interface/physdev.h> 27#include <xen/interface/physdev.h>
28#include <xen/features.h> 28#include <xen/features.h>
29
30#include "xen-ops.h" 29#include "xen-ops.h"
31#include "vdso.h" 30#include "vdso.h"
32 31
@@ -84,8 +83,8 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
84 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 83 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
85} 84}
86 85
87static unsigned long __init xen_release_chunk(unsigned long start, 86static unsigned long __init xen_do_chunk(unsigned long start,
88 unsigned long end) 87 unsigned long end, bool release)
89{ 88{
90 struct xen_memory_reservation reservation = { 89 struct xen_memory_reservation reservation = {
91 .address_bits = 0, 90 .address_bits = 0,
@@ -96,30 +95,133 @@ static unsigned long __init xen_release_chunk(unsigned long start,
96 unsigned long pfn; 95 unsigned long pfn;
97 int ret; 96 int ret;
98 97
99 for(pfn = start; pfn < end; pfn++) { 98 for (pfn = start; pfn < end; pfn++) {
99 unsigned long frame;
100 unsigned long mfn = pfn_to_mfn(pfn); 100 unsigned long mfn = pfn_to_mfn(pfn);
101 101
102 /* Make sure pfn exists to start with */ 102 if (release) {
103 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) 103 /* Make sure pfn exists to start with */
104 continue; 104 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
105 105 continue;
106 set_xen_guest_handle(reservation.extent_start, &mfn); 106 frame = mfn;
107 } else {
108 if (mfn != INVALID_P2M_ENTRY)
109 continue;
110 frame = pfn;
111 }
112 set_xen_guest_handle(reservation.extent_start, &frame);
107 reservation.nr_extents = 1; 113 reservation.nr_extents = 1;
108 114
109 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, 115 ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
110 &reservation); 116 &reservation);
111 WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); 117 WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
118 release ? "release" : "populate", pfn, ret);
119
112 if (ret == 1) { 120 if (ret == 1) {
113 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 121 if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
122 if (release)
123 break;
124 set_xen_guest_handle(reservation.extent_start, &frame);
125 reservation.nr_extents = 1;
126 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
127 &reservation);
128 break;
129 }
114 len++; 130 len++;
115 } 131 } else
132 break;
116 } 133 }
117 printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", 134 if (len)
118 start, end, len); 135 printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
136 release ? "Freeing" : "Populating",
137 start, end, len,
138 release ? "freed" : "added");
119 139
120 return len; 140 return len;
121} 141}
122 142
143static unsigned long __init xen_release_chunk(unsigned long start,
144 unsigned long end)
145{
146 return xen_do_chunk(start, end, true);
147}
148
149static unsigned long __init xen_populate_chunk(
150 const struct e820entry *list, size_t map_size,
151 unsigned long max_pfn, unsigned long *last_pfn,
152 unsigned long credits_left)
153{
154 const struct e820entry *entry;
155 unsigned int i;
156 unsigned long done = 0;
157 unsigned long dest_pfn;
158
159 for (i = 0, entry = list; i < map_size; i++, entry++) {
160 unsigned long s_pfn;
161 unsigned long e_pfn;
162 unsigned long pfns;
163 long capacity;
164
165 if (credits_left <= 0)
166 break;
167
168 if (entry->type != E820_RAM)
169 continue;
170
171 e_pfn = PFN_DOWN(entry->addr + entry->size);
172
 173 /* We only care about E820 RAM beyond xen_start_info->nr_pages */
174 if (e_pfn <= max_pfn)
175 continue;
176
177 s_pfn = PFN_UP(entry->addr);
178 /* If the E820 falls within the nr_pages, we want to start
179 * at the nr_pages PFN.
180 * If that would mean going past the E820 entry, skip it
181 */
182 if (s_pfn <= max_pfn) {
183 capacity = e_pfn - max_pfn;
184 dest_pfn = max_pfn;
185 } else {
186 capacity = e_pfn - s_pfn;
187 dest_pfn = s_pfn;
188 }
189
190 if (credits_left < capacity)
191 capacity = credits_left;
192
193 pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
194 done += pfns;
195 *last_pfn = (dest_pfn + pfns);
196 if (pfns < capacity)
197 break;
198 credits_left -= pfns;
199 }
200 return done;
201}
202
203static void __init xen_set_identity_and_release_chunk(
204 unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
205 unsigned long *released, unsigned long *identity)
206{
207 unsigned long pfn;
208
209 /*
210 * If the PFNs are currently mapped, the VA mapping also needs
211 * to be updated to be 1:1.
212 */
213 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
214 (void)HYPERVISOR_update_va_mapping(
215 (unsigned long)__va(pfn << PAGE_SHIFT),
216 mfn_pte(pfn, PAGE_KERNEL_IO), 0);
217
218 if (start_pfn < nr_pages)
219 *released += xen_release_chunk(
220 start_pfn, min(end_pfn, nr_pages));
221
222 *identity += set_phys_range_identity(start_pfn, end_pfn);
223}
224
123static unsigned long __init xen_set_identity_and_release( 225static unsigned long __init xen_set_identity_and_release(
124 const struct e820entry *list, size_t map_size, unsigned long nr_pages) 226 const struct e820entry *list, size_t map_size, unsigned long nr_pages)
125{ 227{
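
The capacity computation in xen_populate_chunk() is easiest to see with numbers; a worked sketch with assumed values:

static void example_populate_window(void)
{
        /* Assumed values: guest booted with nr_pages = 0x40000 and an
         * E820 RAM entry covering pfns 0x30000-0x50000. */
        unsigned long max_pfn = 0x40000;
        unsigned long s_pfn = 0x30000, e_pfn = 0x50000;
        unsigned long dest_pfn;
        long capacity;

        if (s_pfn <= max_pfn) {
                /* Entry starts below nr_pages: populate from nr_pages
                 * upward, capacity = 0x50000 - 0x40000 = 0x10000. */
                capacity = e_pfn - max_pfn;
                dest_pfn = max_pfn;
        } else {
                capacity = e_pfn - s_pfn;
                dest_pfn = s_pfn;
        }
        /* xen_populate_chunk() further clamps capacity to credits_left. */
        printk(KERN_DEBUG "populate %lx..%lx\n", dest_pfn, dest_pfn + capacity);
}
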
@@ -142,7 +244,6 @@ static unsigned long __init xen_set_identity_and_release(
142 */ 244 */
143 for (i = 0, entry = list; i < map_size; i++, entry++) { 245 for (i = 0, entry = list; i < map_size; i++, entry++) {
144 phys_addr_t end = entry->addr + entry->size; 246 phys_addr_t end = entry->addr + entry->size;
145
146 if (entry->type == E820_RAM || i == map_size - 1) { 247 if (entry->type == E820_RAM || i == map_size - 1) {
147 unsigned long start_pfn = PFN_DOWN(start); 248 unsigned long start_pfn = PFN_DOWN(start);
148 unsigned long end_pfn = PFN_UP(end); 249 unsigned long end_pfn = PFN_UP(end);
@@ -150,20 +251,19 @@ static unsigned long __init xen_set_identity_and_release(
150 if (entry->type == E820_RAM) 251 if (entry->type == E820_RAM)
151 end_pfn = PFN_UP(entry->addr); 252 end_pfn = PFN_UP(entry->addr);
152 253
153 if (start_pfn < end_pfn) { 254 if (start_pfn < end_pfn)
154 if (start_pfn < nr_pages) 255 xen_set_identity_and_release_chunk(
155 released += xen_release_chunk( 256 start_pfn, end_pfn, nr_pages,
156 start_pfn, min(end_pfn, nr_pages)); 257 &released, &identity);
157 258
158 identity += set_phys_range_identity(
159 start_pfn, end_pfn);
160 }
161 start = end; 259 start = end;
162 } 260 }
163 } 261 }
164 262
165 printk(KERN_INFO "Released %lu pages of unused memory\n", released); 263 if (released)
166 printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); 264 printk(KERN_INFO "Released %lu pages of unused memory\n", released);
265 if (identity)
266 printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
167 267
168 return released; 268 return released;
169} 269}
@@ -217,7 +317,9 @@ char * __init xen_memory_setup(void)
217 int rc; 317 int rc;
218 struct xen_memory_map memmap; 318 struct xen_memory_map memmap;
219 unsigned long max_pages; 319 unsigned long max_pages;
320 unsigned long last_pfn = 0;
220 unsigned long extra_pages = 0; 321 unsigned long extra_pages = 0;
322 unsigned long populated;
221 int i; 323 int i;
222 int op; 324 int op;
223 325
@@ -257,8 +359,20 @@ char * __init xen_memory_setup(void)
257 */ 359 */
258 xen_released_pages = xen_set_identity_and_release( 360 xen_released_pages = xen_set_identity_and_release(
259 map, memmap.nr_entries, max_pfn); 361 map, memmap.nr_entries, max_pfn);
362
363 /*
364 * Populate back the non-RAM pages and E820 gaps that had been
365 * released. */
366 populated = xen_populate_chunk(map, memmap.nr_entries,
367 max_pfn, &last_pfn, xen_released_pages);
368
369 xen_released_pages -= populated;
260 extra_pages += xen_released_pages; 370 extra_pages += xen_released_pages;
261 371
372 if (last_pfn > max_pfn) {
373 max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
374 mem_end = PFN_PHYS(max_pfn);
375 }
262 /* 376 /*
263 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO 377 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
264 * factor the base size. On non-highmem systems, the base 378 * factor the base size. On non-highmem systems, the base
@@ -272,7 +386,6 @@ char * __init xen_memory_setup(void)
272 */ 386 */
273 extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), 387 extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
274 extra_pages); 388 extra_pages);
275
276 i = 0; 389 i = 0;
277 while (i < memmap.nr_entries) { 390 while (i < memmap.nr_entries) {
278 u64 addr = map[i].addr; 391 u64 addr = map[i].addr;
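
In xen_memory_setup() above, populated pages are debited from the release count before it feeds the extra-memory pool; with assumed numbers the bookkeeping works out as in this sketch:

static void example_extra_pages_accounting(void)
{
        /* Assumed counts: 0x8000 pfns released, 0x3000 populated back. */
        unsigned long released = 0x8000;
        unsigned long populated = 0x3000;
        unsigned long extra_pages = 0;

        released -= populated;          /* 0x5000 remain ballooned out */
        extra_pages += released;        /* only these count as extra memory */
        printk(KERN_DEBUG "extra_pages = %lx\n", extra_pages);
        /* If populating advanced last_pfn past max_pfn, max_pfn grows
         * (capped at MAX_DOMAIN_PAGES) so mem_end covers the new pages. */
}
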
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 3700945ed0d..f58dca7a6e5 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -16,6 +16,7 @@
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/irq_work.h>
19 20
20#include <asm/paravirt.h> 21#include <asm/paravirt.h>
21#include <asm/desc.h> 22#include <asm/desc.h>
@@ -41,10 +42,12 @@ cpumask_var_t xen_cpu_initialized_map;
41static DEFINE_PER_CPU(int, xen_resched_irq); 42static DEFINE_PER_CPU(int, xen_resched_irq);
42static DEFINE_PER_CPU(int, xen_callfunc_irq); 43static DEFINE_PER_CPU(int, xen_callfunc_irq);
43static DEFINE_PER_CPU(int, xen_callfuncsingle_irq); 44static DEFINE_PER_CPU(int, xen_callfuncsingle_irq);
45static DEFINE_PER_CPU(int, xen_irq_work);
44static DEFINE_PER_CPU(int, xen_debug_irq) = -1; 46static DEFINE_PER_CPU(int, xen_debug_irq) = -1;
45 47
46static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 48static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
47static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 49static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
50static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
48 51
49/* 52/*
50 * Reschedule call back. 53 * Reschedule call back.
@@ -77,9 +80,7 @@ static void __cpuinit cpu_bringup(void)
77 80
78 notify_cpu_starting(cpu); 81 notify_cpu_starting(cpu);
79 82
80 ipi_call_lock();
81 set_cpu_online(cpu, true); 83 set_cpu_online(cpu, true);
82 ipi_call_unlock();
83 84
84 this_cpu_write(cpu_state, CPU_ONLINE); 85 this_cpu_write(cpu_state, CPU_ONLINE);
85 86
@@ -143,6 +144,17 @@ static int xen_smp_intr_init(unsigned int cpu)
143 goto fail; 144 goto fail;
144 per_cpu(xen_callfuncsingle_irq, cpu) = rc; 145 per_cpu(xen_callfuncsingle_irq, cpu) = rc;
145 146
147 callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
148 rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
149 cpu,
150 xen_irq_work_interrupt,
151 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
152 callfunc_name,
153 NULL);
154 if (rc < 0)
155 goto fail;
156 per_cpu(xen_irq_work, cpu) = rc;
157
146 return 0; 158 return 0;
147 159
148 fail: 160 fail:
@@ -155,6 +167,8 @@ static int xen_smp_intr_init(unsigned int cpu)
155 if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0) 167 if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0)
156 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), 168 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu),
157 NULL); 169 NULL);
170 if (per_cpu(xen_irq_work, cpu) >= 0)
171 unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
158 172
159 return rc; 173 return rc;
160} 174}
@@ -407,6 +421,7 @@ static void xen_cpu_die(unsigned int cpu)
407 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); 421 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
408 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); 422 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
409 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); 423 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
424 unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
410 xen_uninit_lock_cpu(cpu); 425 xen_uninit_lock_cpu(cpu);
411 xen_teardown_timer(cpu); 426 xen_teardown_timer(cpu);
412 427
@@ -469,8 +484,8 @@ static void xen_smp_send_reschedule(int cpu)
469 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 484 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
470} 485}
471 486
472static void xen_send_IPI_mask(const struct cpumask *mask, 487static void __xen_send_IPI_mask(const struct cpumask *mask,
473 enum ipi_vector vector) 488 int vector)
474{ 489{
475 unsigned cpu; 490 unsigned cpu;
476 491
@@ -482,7 +497,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
482{ 497{
483 int cpu; 498 int cpu;
484 499
485 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 500 __xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
486 501
487 /* Make sure other vcpus get a chance to run if they need to. */ 502 /* Make sure other vcpus get a chance to run if they need to. */
488 for_each_cpu(cpu, mask) { 503 for_each_cpu(cpu, mask) {
@@ -495,10 +510,86 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
495 510
496static void xen_smp_send_call_function_single_ipi(int cpu) 511static void xen_smp_send_call_function_single_ipi(int cpu)
497{ 512{
498 xen_send_IPI_mask(cpumask_of(cpu), 513 __xen_send_IPI_mask(cpumask_of(cpu),
499 XEN_CALL_FUNCTION_SINGLE_VECTOR); 514 XEN_CALL_FUNCTION_SINGLE_VECTOR);
500} 515}
501 516
517static inline int xen_map_vector(int vector)
518{
519 int xen_vector;
520
521 switch (vector) {
522 case RESCHEDULE_VECTOR:
523 xen_vector = XEN_RESCHEDULE_VECTOR;
524 break;
525 case CALL_FUNCTION_VECTOR:
526 xen_vector = XEN_CALL_FUNCTION_VECTOR;
527 break;
528 case CALL_FUNCTION_SINGLE_VECTOR:
529 xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR;
530 break;
531 case IRQ_WORK_VECTOR:
532 xen_vector = XEN_IRQ_WORK_VECTOR;
533 break;
534 default:
535 xen_vector = -1;
536 printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
537 vector);
538 }
539
540 return xen_vector;
541}
542
543void xen_send_IPI_mask(const struct cpumask *mask,
544 int vector)
545{
546 int xen_vector = xen_map_vector(vector);
547
548 if (xen_vector >= 0)
549 __xen_send_IPI_mask(mask, xen_vector);
550}
551
552void xen_send_IPI_all(int vector)
553{
554 int xen_vector = xen_map_vector(vector);
555
556 if (xen_vector >= 0)
557 __xen_send_IPI_mask(cpu_online_mask, xen_vector);
558}
559
560void xen_send_IPI_self(int vector)
561{
562 int xen_vector = xen_map_vector(vector);
563
564 if (xen_vector >= 0)
565 xen_send_IPI_one(smp_processor_id(), xen_vector);
566}
567
568void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
569 int vector)
570{
571 unsigned cpu;
572 unsigned int this_cpu = smp_processor_id();
573
574 if (!(num_online_cpus() > 1))
575 return;
576
577 for_each_cpu_and(cpu, mask, cpu_online_mask) {
578 if (this_cpu == cpu)
579 continue;
580
581 xen_smp_send_call_function_single_ipi(cpu);
582 }
583}
584
585void xen_send_IPI_allbutself(int vector)
586{
587 int xen_vector = xen_map_vector(vector);
588
589 if (xen_vector >= 0)
590 xen_send_IPI_mask_allbutself(cpu_online_mask, xen_vector);
591}
592
502static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) 593static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
503{ 594{
504 irq_enter(); 595 irq_enter();
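
With the apic hooks wired up in enlighten.c earlier in this patch, generic kernel code never sees the Xen vector numbers; a hedged sketch of the path:

/* Sketch: generic code calls through the apic ops installed by
 * set_xen_basic_apic_ops(); xen_map_vector() translates the native
 * vector to its Xen IPI counterpart before the event channel send. */
static void example_kick_all_cpus(void)
{
        /* Ends up in xen_send_IPI_all(CALL_FUNCTION_VECTOR), which maps
         * CALL_FUNCTION_VECTOR -> XEN_CALL_FUNCTION_VECTOR and sends it
         * via __xen_send_IPI_mask(cpu_online_mask, ...). */
        apic->send_IPI_all(CALL_FUNCTION_VECTOR);
}
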
@@ -519,6 +610,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
519 return IRQ_HANDLED; 610 return IRQ_HANDLED;
520} 611}
521 612
613static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
614{
615 irq_enter();
616 irq_work_run();
617 inc_irq_stat(apic_irq_work_irqs);
618 irq_exit();
619
620 return IRQ_HANDLED;
621}
622
522static const struct smp_ops xen_smp_ops __initconst = { 623static const struct smp_ops xen_smp_ops __initconst = {
523 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 624 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
524 .smp_prepare_cpus = xen_smp_prepare_cpus, 625 .smp_prepare_cpus = xen_smp_prepare_cpus,
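
The new handler above is what makes irq_work usable on Xen; a minimal usage sketch (the callback and its names are illustrative):

static struct irq_work example_work;

static void example_work_fn(struct irq_work *work)
{
        pr_info("ran in irq_work context\n");
}

/* Queueing from a context that cannot sleep raises IRQ_WORK_VECTOR,
 * which xen_send_IPI_self() maps to XEN_IRQ_WORK_VECTOR; the bound
 * xen_irq_work_interrupt() then runs the queued work. */
static void example_queue(void)
{
        init_irq_work(&example_work, example_work_fn);
        irq_work_queue(&example_work);
}
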
@@ -565,6 +666,7 @@ static void xen_hvm_cpu_die(unsigned int cpu)
565 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); 666 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
566 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); 667 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
567 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); 668 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
669 unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
568 native_cpu_die(cpu); 670 native_cpu_die(cpu);
569} 671}
570 672
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
new file mode 100644
index 00000000000..8981a76d081
--- /dev/null
+++ b/arch/x86/xen/smp.h
@@ -0,0 +1,13 @@
 1#ifndef _XEN_SMP_H
 2#define _XEN_SMP_H
 3
 4extern void xen_send_IPI_mask(const struct cpumask *mask,
 5 int vector);
 6extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
 7 int vector);
 8extern void xen_send_IPI_allbutself(int vector);
 9extern void physflat_send_IPI_allbutself(int vector);
10extern void xen_send_IPI_all(int vector);
11extern void xen_send_IPI_self(int vector);
12
13#endif
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index d69cc6c3f80..83e866d714c 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -440,12 +440,12 @@ static int __init xen_spinlock_debugfs(void)
440 debugfs_create_u64("time_total", 0444, d_spin_debug, 440 debugfs_create_u64("time_total", 0444, d_spin_debug,
441 &spinlock_stats.time_total); 441 &spinlock_stats.time_total);
442 442
443 xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, 443 debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
444 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); 444 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
445 xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, 445 debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
446 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); 446 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
447 xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, 447 debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
448 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); 448 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
449 449
450 return 0; 450 return 0;
451} 451}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 45329c8c226..ae8a00c39de 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
30{ 30{
31#ifdef CONFIG_XEN_PVHVM 31#ifdef CONFIG_XEN_PVHVM
32 int cpu; 32 int cpu;
33 xen_hvm_init_shared_info(); 33 xen_hvm_resume_shared_info();
34 xen_callback_vector(); 34 xen_callback_vector();
35 xen_unplug_emulated_devices(); 35 xen_unplug_emulated_devices();
36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) { 36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 45c0c0667bd..1e4329e04e0 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -28,7 +28,6 @@ void xen_setup_shared_info(void);
28void xen_build_mfn_list_list(void); 28void xen_build_mfn_list_list(void);
29void xen_setup_machphys_mapping(void); 29void xen_setup_machphys_mapping(void);
30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
31void xen_ident_map_ISA(void);
32void xen_reserve_top(void); 31void xen_reserve_top(void);
33extern unsigned long xen_max_p2m_pfn; 32extern unsigned long xen_max_p2m_pfn;
34 33
@@ -42,7 +41,7 @@ void xen_enable_syscall(void);
42void xen_vcpu_restore(void); 41void xen_vcpu_restore(void);
43 42
44void xen_callback_vector(void); 43void xen_callback_vector(void);
45void xen_hvm_init_shared_info(void); 44void xen_hvm_resume_shared_info(void);
46void xen_unplug_emulated_devices(void); 45void xen_unplug_emulated_devices(void);
47 46
48void __init xen_build_dynamic_phys_to_machine(void); 47void __init xen_build_dynamic_phys_to_machine(void);