Diffstat (limited to 'arch/x86')
 arch/x86/Kconfig | 268
 arch/x86/Kconfig.cpu | 6
 arch/x86/Kconfig.debug | 10
 arch/x86/Makefile | 35
 arch/x86/boot/Makefile | 19
 arch/x86/boot/bioscall.S | 6
 arch/x86/boot/boot.h | 23
 arch/x86/boot/compressed/Makefile | 2
 arch/x86/boot/compressed/aslr.c | 317
 arch/x86/boot/compressed/cmdline.c | 2
 arch/x86/boot/compressed/cpuflags.c | 12
 arch/x86/boot/compressed/eboot.c | 1022
 arch/x86/boot/compressed/eboot.h | 60
 arch/x86/boot/compressed/efi_stub_64.S | 29
 arch/x86/boot/compressed/head_32.S | 70
 arch/x86/boot/compressed/head_64.S | 136
 arch/x86/boot/compressed/misc.c | 69
 arch/x86/boot/compressed/misc.h | 37
 arch/x86/boot/compressed/string.c | 46
 arch/x86/boot/copy.S | 22
 arch/x86/boot/cpucheck.c | 115
 arch/x86/boot/cpuflags.c | 119
 arch/x86/boot/cpuflags.h | 19
 arch/x86/boot/edd.c | 1
 arch/x86/boot/header.S | 34
 arch/x86/boot/main.c | 1
 arch/x86/boot/regs.c | 1
 arch/x86/boot/string.c | 14
 arch/x86/boot/string.h | 21
 arch/x86/boot/tools/build.c | 77
 arch/x86/boot/video-vesa.c | 1
 arch/x86/boot/video.h | 2
 arch/x86/configs/i386_defconfig | 1
 arch/x86/configs/x86_64_defconfig | 1
 arch/x86/crypto/Makefile | 4
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 2811
 arch/x86/crypto/aesni-intel_glue.c | 147
 arch/x86/crypto/blowfish_glue.c | 3
 arch/x86/crypto/cast5_avx_glue.c | 3
 arch/x86/crypto/ghash-clmulni-intel_asm.S | 29
 arch/x86/crypto/ghash-clmulni-intel_glue.c | 14
 arch/x86/crypto/sha1_avx2_x86_64_asm.S | 708
 arch/x86/crypto/sha1_ssse3_glue.c | 53
 arch/x86/include/asm/Kbuild | 3
 arch/x86/include/asm/amd_nb.h | 2
 arch/x86/include/asm/apic.h | 14
 arch/x86/include/asm/archrandom.h | 63
 arch/x86/include/asm/barrier.h | 49
 arch/x86/include/asm/bug.h | 3
 arch/x86/include/asm/clocksource.h | 4
 arch/x86/include/asm/cpufeature.h | 17
 arch/x86/include/asm/cputime.h | 1
 arch/x86/include/asm/dmi.h | 6
 arch/x86/include/asm/efi.h | 122
 arch/x86/include/asm/elf.h | 4
 arch/x86/include/asm/fixmap.h | 75
 arch/x86/include/asm/floppy.h | 4
 arch/x86/include/asm/futex.h | 21
 arch/x86/include/asm/hardirq.h | 3
 arch/x86/include/asm/hash.h | 7
 arch/x86/include/asm/hw_irq.h | 4
 arch/x86/include/asm/intel-mid.h | 48
 arch/x86/include/asm/io.h | 16
 arch/x86/include/asm/iosf_mbi.h | 90
 arch/x86/include/asm/irq.h | 1
 arch/x86/include/asm/kvm_host.h | 21
 arch/x86/include/asm/kvm_para.h | 33
 arch/x86/include/asm/mce.h | 1
 arch/x86/include/asm/microcode.h | 15
 arch/x86/include/asm/microcode_amd.h | 7
 arch/x86/include/asm/mmzone_32.h | 3
 arch/x86/include/asm/mpspec.h | 7
 arch/x86/include/asm/mshyperv.h | 4
 arch/x86/include/asm/msr.h | 2
 arch/x86/include/asm/mwait.h | 43
 arch/x86/include/asm/nmi.h | 3
 arch/x86/include/asm/numaq.h | 171
 arch/x86/include/asm/page.h | 1
 arch/x86/include/asm/page_32.h | 4
 arch/x86/include/asm/page_64_types.h | 15
 arch/x86/include/asm/paravirt.h | 2
 arch/x86/include/asm/paravirt_types.h | 9
 arch/x86/include/asm/pci.h | 10
 arch/x86/include/asm/percpu.h | 98
 arch/x86/include/asm/pgtable-2level.h | 100
 arch/x86/include/asm/pgtable.h | 3
 arch/x86/include/asm/pgtable_64_types.h | 2
 arch/x86/include/asm/pgtable_types.h | 17
 arch/x86/include/asm/preempt.h | 16
 arch/x86/include/asm/processor.h | 51
 arch/x86/include/asm/ptrace.h | 1
 arch/x86/include/asm/setup.h | 9
 arch/x86/include/asm/smp.h | 1
 arch/x86/include/asm/special_insns.h | 8
 arch/x86/include/asm/spinlock.h | 5
 arch/x86/include/asm/thread_info.h | 51
 arch/x86/include/asm/timer.h | 78
 arch/x86/include/asm/tlbflush.h | 6
 arch/x86/include/asm/topology.h | 23
 arch/x86/include/asm/tsc.h | 3
 arch/x86/include/asm/uaccess.h | 124
 arch/x86/include/asm/uaccess_64.h | 4
 arch/x86/include/asm/unistd.h | 3
 arch/x86/include/asm/uv/uv.h | 2
 arch/x86/include/asm/vdso.h | 52
 arch/x86/include/asm/vdso32.h | 11
 arch/x86/include/asm/vgtod.h | 71
 arch/x86/include/asm/visws/cobalt.h | 127
 arch/x86/include/asm/visws/lithium.h | 53
 arch/x86/include/asm/visws/piix4.h | 107
 arch/x86/include/asm/visws/sgivw.h | 5
 arch/x86/include/asm/vmx.h | 5
 arch/x86/include/asm/vvar.h | 29
 arch/x86/include/asm/x86_init.h | 2
 arch/x86/include/asm/xen/page.h | 19
 arch/x86/include/asm/xsave.h | 26
 arch/x86/include/uapi/asm/bootparam.h | 2
 arch/x86/include/uapi/asm/hyperv.h | 13
 arch/x86/include/uapi/asm/msr-index.h | 78
 arch/x86/include/uapi/asm/sembuf.h | 10
 arch/x86/include/uapi/asm/stat.h | 42
 arch/x86/kernel/Makefile | 15
 arch/x86/kernel/acpi/boot.c | 31
 arch/x86/kernel/acpi/cstate.c | 27
 arch/x86/kernel/amd_nb.c | 4
 arch/x86/kernel/aperture_64.c | 20
 arch/x86/kernel/apic/Makefile | 3
 arch/x86/kernel/apic/apic.c | 81
 arch/x86/kernel/apic/apic_flat_64.c | 9
 arch/x86/kernel/apic/apic_noop.c | 4
 arch/x86/kernel/apic/apic_numachip.c | 2
 arch/x86/kernel/apic/bigsmp_32.c | 3
 arch/x86/kernel/apic/es7000_32.c | 746
 arch/x86/kernel/apic/io_apic.c | 23
 arch/x86/kernel/apic/ipi.c | 1
 arch/x86/kernel/apic/numaq_32.c | 525
 arch/x86/kernel/apic/probe_32.c | 3
 arch/x86/kernel/apic/summit_32.c | 552
 arch/x86/kernel/apic/x2apic_cluster.c | 3
 arch/x86/kernel/apic/x2apic_phys.c | 3
 arch/x86/kernel/apic/x2apic_uv_x.c | 3
 arch/x86/kernel/check.c | 2
 arch/x86/kernel/cpu/Makefile | 3
 arch/x86/kernel/cpu/amd.c | 62
 arch/x86/kernel/cpu/centaur.c | 273
 arch/x86/kernel/cpu/common.c | 25
 arch/x86/kernel/cpu/cyrix.c | 1
 arch/x86/kernel/cpu/intel.c | 96
 arch/x86/kernel/cpu/intel_cacheinfo.c | 13
 arch/x86/kernel/cpu/match.c | 42
 arch/x86/kernel/cpu/mcheck/mce-apei.c | 14
 arch/x86/kernel/cpu/mcheck/mce.c | 38
 arch/x86/kernel/cpu/mcheck/mce_intel.c | 20
 arch/x86/kernel/cpu/mcheck/p5.c | 1
 arch/x86/kernel/cpu/mcheck/therm_throt.c | 18
 arch/x86/kernel/cpu/mcheck/winchip.c | 1
 arch/x86/kernel/cpu/microcode/Makefile | 7
 arch/x86/kernel/cpu/microcode/amd.c (renamed from arch/x86/kernel/microcode_amd.c) | 15
 arch/x86/kernel/cpu/microcode/amd_early.c (renamed from arch/x86/kernel/microcode_amd_early.c) | 266
 arch/x86/kernel/cpu/microcode/core.c (renamed from arch/x86/kernel/microcode_core.c) | 0
 arch/x86/kernel/cpu/microcode/core_early.c (renamed from arch/x86/kernel/microcode_core_early.c) | 0
 arch/x86/kernel/cpu/microcode/intel.c (renamed from arch/x86/kernel/microcode_intel.c) | 2
 arch/x86/kernel/cpu/microcode/intel_early.c (renamed from arch/x86/kernel/microcode_intel_early.c) | 10
 arch/x86/kernel/cpu/microcode/intel_lib.c (renamed from arch/x86/kernel/microcode_intel_lib.c) | 0
 arch/x86/kernel/cpu/mshyperv.c | 84
 arch/x86/kernel/cpu/mtrr/generic.c | 4
 arch/x86/kernel/cpu/perf_event.c | 70
 arch/x86/kernel/cpu/perf_event.h | 9
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 6
 arch/x86/kernel/cpu/perf_event_amd_uncore.c | 7
 arch/x86/kernel/cpu/perf_event_intel.c | 11
 arch/x86/kernel/cpu/perf_event_intel_rapl.c | 680
 arch/x86/kernel/cpu/perf_event_intel_uncore.c | 557
 arch/x86/kernel/cpu/perf_event_intel_uncore.h | 5
 arch/x86/kernel/cpu/perf_event_p4.c | 34
 arch/x86/kernel/cpu/perf_event_p6.c | 48
 arch/x86/kernel/cpu/rdrand.c | 14
 arch/x86/kernel/cpu/transmeta.c | 1
 arch/x86/kernel/cpu/umc.c | 1
 arch/x86/kernel/cpuid.c | 15
 arch/x86/kernel/crash.c | 3
 arch/x86/kernel/doublefault.c | 1
 arch/x86/kernel/dumpstack_32.c | 46
 arch/x86/kernel/dumpstack_64.c | 118
 arch/x86/kernel/e820.c | 2
 arch/x86/kernel/early-quirks.c | 226
 arch/x86/kernel/ftrace.c | 138
 arch/x86/kernel/head_32.S | 7
 arch/x86/kernel/head_64.S | 6
 arch/x86/kernel/hpet.c | 10
 arch/x86/kernel/hw_breakpoint.c | 1
 arch/x86/kernel/i387.c | 15
 arch/x86/kernel/iosf_mbi.c | 226
 arch/x86/kernel/irq.c | 102
 arch/x86/kernel/irq_32.c | 83
 arch/x86/kernel/irqinit.c | 4
 arch/x86/kernel/kgdb.c | 1
 arch/x86/kernel/ksysfs.c | 340
 arch/x86/kernel/kvm.c | 35
 arch/x86/kernel/kvmclock.c | 2
 arch/x86/kernel/ldt.c | 11
 arch/x86/kernel/machine_kexec_32.c | 1
 arch/x86/kernel/machine_kexec_64.c | 2
 arch/x86/kernel/module.c | 46
 arch/x86/kernel/msr.c | 16
 arch/x86/kernel/nmi.c | 37
 arch/x86/kernel/pci-calgary_64.c | 31
 arch/x86/kernel/pci-dma.c | 4
 arch/x86/kernel/pci-nommu.c | 1
 arch/x86/kernel/process.c | 5
 arch/x86/kernel/process_32.c | 5
 arch/x86/kernel/ptrace.c | 8
 arch/x86/kernel/quirks.c | 39
 arch/x86/kernel/reboot.c | 15
 arch/x86/kernel/setup.c | 69
 arch/x86/kernel/smpboot.c | 28
 arch/x86/kernel/time.c | 4
 arch/x86/kernel/traps.c | 22
 arch/x86/kernel/tsc.c | 330
 arch/x86/kernel/tsc_msr.c | 127
 arch/x86/kernel/tsc_sync.c | 1
 arch/x86/kernel/vmlinux.lds.S | 8
 arch/x86/kernel/vsmp_64.c | 8
 arch/x86/kernel/vsyscall_64.c | 51
 arch/x86/kernel/vsyscall_gtod.c | 69
 arch/x86/kernel/x86_init.c | 4
 arch/x86/kernel/xsave.c | 10
 arch/x86/kvm/Kconfig | 2
 arch/x86/kvm/cpuid.c | 39
 arch/x86/kvm/cpuid.h | 8
 arch/x86/kvm/emulate.c | 8
 arch/x86/kvm/i8254.c | 18
 arch/x86/kvm/lapic.c | 9
 arch/x86/kvm/lapic.h | 2
 arch/x86/kvm/mmu.c | 15
 arch/x86/kvm/paging_tmpl.h | 15
 arch/x86/kvm/svm.c | 105
 arch/x86/kvm/vmx.c | 654
 arch/x86/kvm/x86.c | 294
 arch/x86/kvm/x86.h | 7
 arch/x86/lguest/boot.c | 12
 arch/x86/lib/Makefile | 2
 arch/x86/lib/copy_user_64.S | 12
 arch/x86/lib/delay.c | 1
 arch/x86/lib/hash.c | 92
 arch/x86/lib/memcpy_32.c | 6
 arch/x86/lib/msr.c | 89
 arch/x86/lib/x86-opcode-map.txt | 4
 arch/x86/math-emu/errors.c | 5
 arch/x86/mm/dump_pagetables.c | 84
 arch/x86/mm/fault.c | 68
 arch/x86/mm/gup.c | 8
 arch/x86/mm/hugetlbpage.c | 9
 arch/x86/mm/init_32.c | 5
 arch/x86/mm/init_64.c | 2
 arch/x86/mm/ioremap.c | 224
 arch/x86/mm/kmemcheck/kmemcheck.c | 8
 arch/x86/mm/kmmio.c | 1
 arch/x86/mm/memtest.c | 2
 arch/x86/mm/numa.c | 69
 arch/x86/mm/numa_32.c | 2
 arch/x86/mm/pageattr-test.c | 1
 arch/x86/mm/pageattr.c | 493
 arch/x86/mm/pgtable_32.c | 2
 arch/x86/mm/srat.c | 29
 arch/x86/mm/tlb.c | 52
 arch/x86/net/bpf_jit.S | 2
 arch/x86/net/bpf_jit_comp.c | 11
 arch/x86/oprofile/nmi_int.c | 15
 arch/x86/pci/Makefile | 3
 arch/x86/pci/acpi.c | 59
 arch/x86/pci/amd_bus.c | 15
 arch/x86/pci/bus_numa.c | 13
 arch/x86/pci/common.c | 133
 arch/x86/pci/fixup.c | 25
 arch/x86/pci/intel_mid_pci.c | 6
 arch/x86/pci/irq.c | 6
 arch/x86/pci/legacy.c | 4
 arch/x86/pci/mmconfig-shared.c | 1
 arch/x86/pci/mmconfig_32.c | 1
 arch/x86/pci/numaq_32.c | 165
 arch/x86/pci/visws.c | 87
 arch/x86/pci/xen.c | 31
 arch/x86/platform/Makefile | 1
 arch/x86/platform/efi/Makefile | 1
 arch/x86/platform/efi/efi-bgrt.c | 12
 arch/x86/platform/efi/efi.c | 657
 arch/x86/platform/efi/efi_32.c | 23
 arch/x86/platform/efi/efi_64.c | 489
 arch/x86/platform/efi/efi_stub_64.S | 220
 arch/x86/platform/efi/efi_thunk_64.S | 65
 arch/x86/platform/intel-mid/Makefile | 4
 arch/x86/platform/intel-mid/device_libs/platform_emc1403.c | 4
 arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c | 2
 arch/x86/platform/intel-mid/device_libs/platform_ipc.h | 5
 arch/x86/platform/intel-mid/device_libs/platform_lis331.c | 4
 arch/x86/platform/intel-mid/device_libs/platform_max7315.c | 2
 arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c | 2
 arch/x86/platform/intel-mid/device_libs/platform_msic.h | 4
 arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c | 2
 arch/x86/platform/intel-mid/device_libs/platform_tca6416.c | 4
 arch/x86/platform/intel-mid/early_printk_intel_mid.c | 1
 arch/x86/platform/intel-mid/intel-mid.c | 64
 arch/x86/platform/intel-mid/intel_mid_weak_decls.h | 19
 arch/x86/platform/intel-mid/mfld.c | 75
 arch/x86/platform/intel-mid/mrfl.c | 103
 arch/x86/platform/intel-mid/sfi.c | 46
 arch/x86/platform/iris/iris.c | 1
 arch/x86/platform/olpc/olpc-xo15-sci.c | 3
 arch/x86/platform/ts5500/ts5500.c | 2
 arch/x86/platform/uv/tlb_uv.c | 66
 arch/x86/platform/uv/uv_nmi.c | 65
 arch/x86/platform/visws/Makefile | 1
 arch/x86/platform/visws/visws_quirks.c | 608
 arch/x86/realmode/init.c | 26
 arch/x86/realmode/rm/Makefile | 17
 arch/x86/realmode/rm/reboot.S | 1
 arch/x86/realmode/rm/trampoline_32.S | 1
 arch/x86/realmode/rm/trampoline_64.S | 1
 arch/x86/syscalls/syscall_32.tbl | 2
 arch/x86/syscalls/syscall_64.tbl | 3
 arch/x86/tools/relocs.c | 52
 arch/x86/tools/relocs.h | 7
 arch/x86/tools/relocs_common.c | 16
 arch/x86/um/asm/barrier.h | 4
 arch/x86/vdso/Makefile | 26
 arch/x86/vdso/vclock_gettime.c | 256
 arch/x86/vdso/vdso-layout.lds.S | 29
 arch/x86/vdso/vdso.S | 23
 arch/x86/vdso/vdso32-setup.c | 301
 arch/x86/vdso/vdso32.S | 21
 arch/x86/vdso/vdso32/vclock_gettime.c | 30
 arch/x86/vdso/vdso32/vdso32.lds.S | 15
 arch/x86/vdso/vdsox32.S | 23
 arch/x86/vdso/vma.c | 20
 arch/x86/xen/Kconfig | 11
 arch/x86/xen/enlighten.c | 138
 arch/x86/xen/grant-table.c | 64
 arch/x86/xen/irq.c | 13
 arch/x86/xen/mmu.c | 183
 arch/x86/xen/p2m.c | 136
 arch/x86/xen/platform-pci-unplug.c | 79
 arch/x86/xen/setup.c | 44
 arch/x86/xen/smp.c | 49
 arch/x86/xen/spinlock.c | 4
 arch/x86/xen/time.c | 1
 arch/x86/xen/xen-head.S | 25
 arch/x86/xen/xen-ops.h | 1
348 files changed, 14999 insertions, 7579 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 56f47caf6fa0..25d2c6f7325e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,6 +23,7 @@ config X86
 	def_bool y
 	select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
 	select ARCH_MIGHT_HAVE_PC_PARPORT
+	select ARCH_MIGHT_HAVE_PC_SERIO
 	select HAVE_AOUT if X86_32
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select ARCH_SUPPORTS_NUMA_BALANCING
@@ -42,6 +43,7 @@ config X86
 	select HAVE_DMA_ATTRS
 	select HAVE_DMA_CONTIGUOUS if !SWIOTLB
 	select HAVE_KRETPROBES
+	select GENERIC_EARLY_IOREMAP
 	select HAVE_OPTPROBES
 	select HAVE_KPROBES_ON_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
@@ -106,9 +108,9 @@ config X86
 	select HAVE_ARCH_SOFT_DIRTY
 	select CLOCKSOURCE_WATCHDOG
 	select GENERIC_CLOCKEVENTS
-	select ARCH_CLOCKSOURCE_DATA if X86_64
+	select ARCH_CLOCKSOURCE_DATA
 	select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
-	select GENERIC_TIME_VSYSCALL if X86_64
+	select GENERIC_TIME_VSYSCALL
 	select KTIME_SCALAR if X86_32
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
@@ -125,6 +127,8 @@ config X86
 	select RTC_LIB
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
+	select HAVE_CC_STACKPROTECTOR
+	select GENERIC_CPU_AUTOPROBE
 	select HAVE_ARCH_AUDITSYSCALL
 
 config INSTRUCTION_DECODER
@@ -194,9 +198,6 @@ config ARCH_HAS_CPU_RELAX
 config ARCH_HAS_CACHE_LINE_SIZE
 	def_bool y
 
-config ARCH_HAS_CPU_AUTOPROBE
-	def_bool y
-
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y
 
@@ -279,13 +280,13 @@ config SMP
 	bool "Symmetric multi-processing support"
 	---help---
 	  This enables support for systems with more than one CPU. If you have
-	  a system with only one CPU, like most personal computers, say N. If
-	  you have a system with more than one CPU, say Y.
+	  a system with only one CPU, say N. If you have a system with more
+	  than one CPU, say Y.
 
-	  If you say N here, the kernel will run on single and multiprocessor
+	  If you say N here, the kernel will run on uni- and multiprocessor
 	  machines, but will use only one CPU of a multiprocessor machine. If
 	  you say Y here, the kernel will run on many, but not all,
-	  singleprocessor machines. On a singleprocessor machine, the kernel
+	  uniprocessor machines. On a uniprocessor machine, the kernel
 	  will run faster if you say N here.
 
 	  Note that if you say Y here and choose architecture "586" or
@@ -345,12 +346,9 @@ config X86_EXTENDED_PLATFORM
 	  for the following (non-PC) 32 bit x86 platforms:
 		Goldfish (Android emulator)
 		AMD Elan
-		NUMAQ (IBM/Sequent)
 		RDC R-321x SoC
 		SGI 320/540 (Visual Workstation)
 		STA2X11-based (e.g. Northville)
-		Summit/EXA (IBM x440)
-		Unisys ES7000 IA32 series
 		Moorestown MID devices
 
 	  If you have one of these systems, or if you want to build a
@@ -439,42 +437,27 @@ config X86_INTEL_CE
 	  This option compiles in support for the CE4100 SOC for settop
 	  boxes and media devices.
 
-config X86_WANT_INTEL_MID
+config X86_INTEL_MID
 	bool "Intel MID platform support"
 	depends on X86_32
 	depends on X86_EXTENDED_PLATFORM
-	---help---
-	  Select to build a kernel capable of supporting Intel MID platform
-	  systems which do not have the PCI legacy interfaces (Moorestown,
-	  Medfield). If you are building for a PC class system say N here.
-
-if X86_WANT_INTEL_MID
-
-config X86_INTEL_MID
-	bool
-
-config X86_MDFLD
-	bool "Medfield MID platform"
+	depends on X86_PLATFORM_DEVICES
 	depends on PCI
 	depends on PCI_GOANY
 	depends on X86_IO_APIC
-	select X86_INTEL_MID
 	select SFI
+	select I2C
 	select DW_APB_TIMER
 	select APB_TIMER
-	select I2C
-	select SPI
 	select INTEL_SCU_IPC
-	select X86_PLATFORM_DEVICES
 	select MFD_INTEL_MSIC
 	---help---
-	  Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
-	  Internet Device(MID) platform.
-	  Unlike standard x86 PCs, Medfield does not have many legacy devices
-	  nor standard legacy replacement devices/features. e.g. Medfield does
-	  not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
+	  Select to build a kernel capable of supporting Intel MID (Mobile
+	  Internet Device) platform systems which do not have the PCI legacy
+	  interfaces. If you are building for a PC class system say N here.
 
-endif
+	  Intel MID platforms are based on an Intel processor and chipset which
+	  consume less power than most of the x86 derivatives.
 
 config X86_INTEL_LPSS
 	bool "Intel Low Power Subsystem Support"
@@ -503,49 +486,22 @@ config X86_32_NON_STANDARD
 	depends on X86_32 && SMP
 	depends on X86_EXTENDED_PLATFORM
 	---help---
-	  This option compiles in the NUMAQ, Summit, bigsmp, ES7000,
-	  STA2X11, default subarchitectures. It is intended for a generic
-	  binary kernel. If you select them all, kernel will probe it
-	  one by one and will fallback to default.
+	  This option compiles in the bigsmp and STA2X11 default
+	  subarchitectures. It is intended for a generic binary
+	  kernel. If you select them all, kernel will probe it one by
+	  one and will fallback to default.
 
 # Alphabetically sorted list of Non standard 32 bit platforms
 
-config X86_NUMAQ
-	bool "NUMAQ (IBM/Sequent)"
-	depends on X86_32_NON_STANDARD
-	depends on PCI
-	select NUMA
-	select X86_MPPARSE
-	---help---
-	  This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
-	  NUMA multiquad box. This changes the way that processors are
-	  bootstrapped, and uses Clustered Logical APIC addressing mode instead
-	  of Flat Logical. You will need a new lynxer.elf file to flash your
-	  firmware with - send email to <Martin.Bligh@us.ibm.com>.
-
 config X86_SUPPORTS_MEMORY_FAILURE
 	def_bool y
 	# MCE code calls memory_failure():
 	depends on X86_MCE
 	# On 32-bit this adds too big of NODES_SHIFT and we run out of page flags:
-	depends on !X86_NUMAQ
 	# On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH:
 	depends on X86_64 || !SPARSEMEM
 	select ARCH_SUPPORTS_MEMORY_FAILURE
 
-config X86_VISWS
-	bool "SGI 320/540 (Visual Workstation)"
-	depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT
-	depends on X86_32_NON_STANDARD
-	---help---
-	  The SGI Visual Workstation series is an IA32-based workstation
-	  based on SGI systems chips with some legacy PC hardware attached.
-
-	  Say Y here to create a kernel to run on the SGI 320 or 540.
-
-	  A kernel compiled for the Visual Workstation will run on general
-	  PCs as well. See <file:Documentation/sgi-visws.txt> for details.
-
 config STA2X11
 	bool "STA2X11 Companion Chip Support"
 	depends on X86_32_NON_STANDARD && PCI
@@ -562,20 +518,6 @@ config STA2X11
 	  option is selected the kernel will still be able to boot on
 	  standard PC machines.
 
-config X86_SUMMIT
-	bool "Summit/EXA (IBM x440)"
-	depends on X86_32_NON_STANDARD
-	---help---
-	  This option is needed for IBM systems that use the Summit/EXA chipset.
-	  In particular, it is needed for the x440.
-
-config X86_ES7000
-	bool "Unisys ES7000 IA32 series"
-	depends on X86_32_NON_STANDARD && X86_BIGSMP
-	---help---
-	  Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
-	  supposed to run on an IA32-based Unisys ES7000 system.
-
 config X86_32_IRIS
 	tristate "Eurobraille/Iris poweroff module"
 	depends on X86_32
@@ -698,14 +640,6 @@ config MEMTEST
 	  memtest=4, mean do 4 test patterns.
 	  If you are unsure how to answer this question, answer N.
 
-config X86_SUMMIT_NUMA
-	def_bool y
-	depends on X86_32 && NUMA && X86_32_NON_STANDARD
-
-config X86_CYCLONE_TIMER
-	def_bool y
-	depends on X86_SUMMIT
-
 source "arch/x86/Kconfig.cpu"
 
 config HPET_TIMER
@@ -747,6 +681,7 @@ config APB_TIMER
 # The code disables itself when not needed.
 config DMI
 	default y
+	select DMI_SCAN_MACHINE_NON_EFI_FALLBACK
 	bool "Enable DMI scanning" if EXPERT
 	---help---
 	  Enabled scanning of DMI to identify machine quirks. Say Y
@@ -833,7 +768,7 @@ config NR_CPUS
 	range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64
 	default "1" if !SMP
 	default "8192" if MAXSMP
-	default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
+	default "32" if SMP && X86_BIGSMP
 	default "8" if SMP
 	---help---
 	  This allows you to specify the maximum number of CPUs which this
@@ -897,10 +832,6 @@ config X86_IO_APIC
 	def_bool y
 	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI
 
-config X86_VISWS_APIC
-	def_bool y
-	depends on X86_32 && X86_VISWS
-
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 	bool "Reroute for broken boot IRQs"
 	depends on X86_IO_APIC
@@ -954,7 +885,7 @@ config X86_ANCIENT_MCE
 	depends on X86_32 && X86_MCE
 	---help---
 	  Include support for machine check handling on old Pentium 5 or WinChip
-	  systems. These typically need to be enabled explicitely on the command
+	  systems. These typically need to be enabled explicitly on the command
 	  line.
 
 config X86_MCE_THRESHOLD
@@ -1065,9 +996,9 @@ config MICROCODE_INTEL
 	  This options enables microcode patch loading support for Intel
 	  processors.
 
-	  For latest news and information on obtaining all the required
-	  Intel ingredients for this driver, check:
-	  <http://www.urbanmyth.org/microcode/>.
+	  For the current Intel microcode data package go to
+	  <https://downloadcenter.intel.com> and search for
+	  'Linux Processor Microcode Data File'.
 
 config MICROCODE_AMD
 	bool "AMD microcode loading support"
@@ -1081,10 +1012,6 @@ config MICROCODE_OLD_INTERFACE
 	def_bool y
 	depends on MICROCODE
 
-config MICROCODE_INTEL_LIB
-	def_bool y
-	depends on MICROCODE_INTEL
-
 config MICROCODE_INTEL_EARLY
 	def_bool n
 
@@ -1122,13 +1049,11 @@ config X86_CPUID
 
 choice
 	prompt "High Memory Support"
-	default HIGHMEM64G if X86_NUMAQ
 	default HIGHMEM4G
 	depends on X86_32
 
 config NOHIGHMEM
 	bool "off"
-	depends on !X86_NUMAQ
 	---help---
 	  Linux can use up to 64 Gigabytes of physical memory on x86 systems.
 	  However, the address space of 32-bit x86 processors is only 4
@@ -1165,7 +1090,6 @@ config NOHIGHMEM
 
 config HIGHMEM4G
 	bool "4GB"
-	depends on !X86_NUMAQ
 	---help---
 	  Select this if you have a 32-bit processor and between 1 and 4
 	  gigabytes of physical RAM.
@@ -1257,8 +1181,8 @@ config DIRECT_GBPAGES
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support"
 	depends on SMP
-	depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI))
-	default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
+	depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP)
+	default y if X86_BIGSMP
 	---help---
 	  Enable NUMA (Non Uniform Memory Access) support.
 
@@ -1269,15 +1193,11 @@ config NUMA
 	  For 64-bit this is recommended if the system is Intel Core i7
 	  (or later), AMD Opteron, or EM64T NUMA.
 
-	  For 32-bit this is only needed on (rare) 32-bit-only platforms
-	  that support NUMA topologies, such as NUMAQ / Summit, or if you
-	  boot a 32-bit kernel on a 64-bit NUMA platform.
+	  For 32-bit this is only needed if you boot a 32-bit
+	  kernel on a 64-bit NUMA platform.
 
 	  Otherwise, you should say N.
 
-comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
-	depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
-
 config AMD_NUMA
 	def_bool y
 	prompt "Old style AMD Opteron NUMA detection"
@@ -1319,7 +1239,6 @@ config NODES_SHIFT
 	range 1 10
 	default "10" if MAXSMP
 	default "6" if X86_64
-	default "4" if X86_NUMAQ
 	default "3"
 	depends on NEED_MULTIPLE_NODES
 	---help---
@@ -1602,6 +1521,20 @@ config EFI_STUB
 
 	  See Documentation/efi-stub.txt for more information.
 
+config EFI_MIXED
+	bool "EFI mixed-mode support"
+	depends on EFI_STUB && X86_64
+	---help---
+	  Enabling this feature allows a 64-bit kernel to be booted
+	  on a 32-bit firmware, provided that your CPU supports 64-bit
+	  mode.
+
+	  Note that it is not possible to boot a mixed-mode enabled
+	  kernel via the EFI boot stub - a bootloader that supports
+	  the EFI handover protocol must be used.
+
+	  If unsure, say N.
+
 config SECCOMP
 	def_bool y
 	prompt "Enable seccomp to safely compute untrusted bytecode"
@@ -1618,22 +1551,6 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
-config CC_STACKPROTECTOR
-	bool "Enable -fstack-protector buffer overflow detection"
-	---help---
-	  This option turns on the -fstack-protector GCC feature. This
-	  feature puts, at the beginning of functions, a canary value on
-	  the stack just before the return address, and validates
-	  the value just before actually returning. Stack based buffer
-	  overflows (that need to overwrite this return address) now also
-	  overwrite the canary, which gets detected and the attack is then
-	  neutralized via a kernel panic.
-
-	  This feature requires gcc version 4.2 or above, or a distribution
-	  gcc with the feature backported. Older versions are automatically
-	  detected and for those versions, this configuration option is
-	  ignored. (and a warning is printed during bootup)
-
 source kernel/Kconfig.hz
 
 config KEXEC
@@ -1729,16 +1646,67 @@ config RELOCATABLE
 
 	  Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address
 	  it has been loaded at and the compile time physical address
-	  (CONFIG_PHYSICAL_START) is ignored.
+	  (CONFIG_PHYSICAL_START) is used as the minimum location.
 
-# Relocation on x86-32 needs some additional build support
+config RANDOMIZE_BASE
+	bool "Randomize the address of the kernel image"
+	depends on RELOCATABLE
+	depends on !HIBERNATION
+	default n
+	---help---
+	  Randomizes the physical and virtual address at which the
+	  kernel image is decompressed, as a security feature that
+	  deters exploit attempts relying on knowledge of the location
+	  of kernel internals.
+
+	  Entropy is generated using the RDRAND instruction if it is
+	  supported. If RDTSC is supported, it is used as well. If
+	  neither RDRAND nor RDTSC are supported, then randomness is
+	  read from the i8254 timer.
+
+	  The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET,
+	  and aligned according to PHYSICAL_ALIGN. Since the kernel is
+	  built using 2GiB addressing, and PHYSICAL_ALIGN must be at a
+	  minimum of 2MiB, only 10 bits of entropy is theoretically
+	  possible. At best, due to page table layouts, 64-bit can use
+	  9 bits of entropy and 32-bit uses 8 bits.
+
+	  If unsure, say N.
+
+config RANDOMIZE_BASE_MAX_OFFSET
+	hex "Maximum kASLR offset allowed" if EXPERT
+	depends on RANDOMIZE_BASE
+	range 0x0 0x20000000 if X86_32
+	default "0x20000000" if X86_32
+	range 0x0 0x40000000 if X86_64
+	default "0x40000000" if X86_64
+	---help---
+	  The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical
+	  memory is used to determine the maximal offset in bytes that will
+	  be applied to the kernel when kernel Address Space Layout
+	  Randomization (kASLR) is active. This must be a multiple of
+	  PHYSICAL_ALIGN.
+
+	  On 32-bit this is limited to 512MiB by page table layouts. The
+	  default is 512MiB.
+
+	  On 64-bit this is limited by how the kernel fixmap page table is
+	  positioned, so this cannot be larger than 1GiB currently. Without
+	  RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel
+	  and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the
+	  modules area will shrink to compensate, up to the current maximum
+	  1GiB to 1GiB split. The default is 1GiB.
+
+	  If unsure, leave at the default value.
+
+# Relocation on x86 needs some additional build support
 config X86_NEED_RELOCS
 	def_bool y
-	depends on X86_32 && RELOCATABLE
+	depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE)
 
 config PHYSICAL_ALIGN
 	hex "Alignment value to which kernel should be aligned"
-	default "0x1000000"
+	default "0x200000"
 	range 0x2000 0x1000000 if X86_32
 	range 0x200000 0x1000000 if X86_64
 	---help---
@@ -1818,17 +1786,29 @@ config DEBUG_HOTPLUG_CPU0
 	  If unsure, say N.
 
 config COMPAT_VDSO
-	def_bool y
-	prompt "Compat VDSO support"
+	def_bool n
+	prompt "Disable the 32-bit vDSO (needed for glibc 2.3.3)"
 	depends on X86_32 || IA32_EMULATION
 	---help---
-	  Map the 32-bit VDSO to the predictable old-style address too.
+	  Certain buggy versions of glibc will crash if they are
+	  presented with a 32-bit vDSO that is not mapped at the address
+	  indicated in its segment table.
 
-	  Say N here if you are running a sufficiently recent glibc
-	  version (2.3.3 or later), to remove the high-mapped
-	  VDSO mapping and to exclusively use the randomized VDSO.
+	  The bug was introduced by f866314b89d56845f55e6f365e18b31ec978ec3a
+	  and fixed by 3b3ddb4f7db98ec9e912ccdf54d35df4aa30e04a and
+	  49ad572a70b8aeb91e57483a11dd1b77e31c4468. Glibc 2.3.3 is
+	  the only released version with the bug, but OpenSUSE 9
	  contains a buggy "glibc 2.3.2".
 
-	  If unsure, say Y.
+	  The symptom of the bug is that everything crashes on startup, saying:
+	  dl_main: Assertion `(void *) ph->p_vaddr == _rtld_local._dl_sysinfo_dso' failed!
+
+	  Saying Y here changes the default value of the vdso32 boot
+	  option from 1 to 0, which turns off the 32-bit vDSO entirely.
+	  This works around the glibc bug but hurts performance.
+
+	  If unsure, say N: if you are compiling your own kernel, you
+	  are unlikely to be using a buggy version of glibc.
 
 config CMDLINE_BOOL
 	bool "Built-in kernel command line"
@@ -2394,6 +2374,14 @@ config X86_DMA_REMAP
 	bool
 	depends on STA2X11
 
+config IOSF_MBI
+	bool
+	depends on PCI
+	---help---
+	  To be selected by modules requiring access to the Intel OnChip System
+	  Fabric (IOSF) Sideband MailBox Interface (MBI). For MBI platforms
+	  enumerable by PCI.
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
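
Editor's note: the entropy figures quoted in the RANDOMIZE_BASE help text above follow directly from the ratio of maximum offset to alignment; there are max_offset / PHYSICAL_ALIGN possible load slots, and the usable entropy is log2 of that count. A minimal C sketch of the arithmetic, as an illustration of the help text rather than anything in the patch (slot_bits() is a made-up helper):

/*
 * Editor's sketch, not part of the patch: check the kASLR entropy
 * figures quoted in the RANDOMIZE_BASE help text.
 */
#include <stdio.h>

static unsigned int slot_bits(unsigned long long max_offset,
			      unsigned long long align)
{
	unsigned long long slots = max_offset / align;
	unsigned int bits = 0;

	/* log2 of the slot count. */
	while (slots > 1) {
		slots >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	/* 2 GiB addressing, 2 MiB minimum alignment: the "10 bits" bound. */
	printf("theoretical: %u bits\n", slot_bits(0x80000000ULL, 0x200000ULL));
	/* 64-bit default RANDOMIZE_BASE_MAX_OFFSET = 1 GiB. */
	printf("x86_64:      %u bits\n", slot_bits(0x40000000ULL, 0x200000ULL));
	/* 32-bit default RANDOMIZE_BASE_MAX_OFFSET = 512 MiB. */
	printf("x86_32:      %u bits\n", slot_bits(0x20000000ULL, 0x200000ULL));
	return 0;
}

Running it prints 10, 9 and 8 bits, matching the three figures in the help text.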
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index c026cca5602c..6983314c8b37 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -341,10 +341,6 @@ config X86_USE_3DNOW
 	def_bool y
 	depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
 
-config X86_OOSTORE
-	def_bool y
-	depends on (MWINCHIP3D || MWINCHIPC6) && MTRR
-
 #
 # P6_NOPs are a relatively minor optimization that require a family >=
 # 6 processor, except that it is broken on certain VIA chips.
@@ -363,7 +359,7 @@ config X86_P6_NOP
 
 config X86_TSC
 	def_bool y
-	depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64
+	depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
 
 config X86_CMPXCHG64
 	def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 0f3621ed1db6..61bd2ad94281 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -81,6 +81,15 @@ config X86_PTDUMP
 	  kernel.
 	  If in doubt, say "N"
 
+config EFI_PGT_DUMP
+	bool "Dump the EFI pagetable"
+	depends on EFI && X86_PTDUMP
+	---help---
+	  Enable this if you want to dump the EFI page table before
+	  enabling virtual mode. This can be used to debug miscellaneous
+	  issues with the mapping of the EFI runtime regions into that
+	  table.
+
 config DEBUG_RODATA
 	bool "Write protect kernel read-only data structures"
 	default y
@@ -184,6 +193,7 @@ config HAVE_MMIOTRACE_SUPPORT
 config X86_DECODER_SELFTEST
 	bool "x86 instruction decoder selftest"
 	depends on DEBUG_KERNEL && KPROBES
+	depends on !COMPILE_TEST
 	---help---
 	  Perform x86 instruction decoder selftests at build time.
 	  This option is useful for checking the sanity of x86 instruction
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 57d021507120..3b9348a0c1a4 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -11,6 +11,28 @@ else
         KBUILD_DEFCONFIG := $(ARCH)_defconfig
 endif
 
+# How to compile the 16-bit code. Note we always compile for -march=i386;
+# that way we can complain to the user if the CPU is insufficient.
+#
+# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For
+# older versions of GCC, we need to play evil and unreliable tricks to
+# attempt to ensure that our asm(".code16gcc") is first in the asm
+# output.
+CODE16GCC_CFLAGS := -m32 -include $(srctree)/arch/x86/boot/code16gcc.h \
+		   $(call cc-option, -fno-toplevel-reorder,\
+		   $(call cc-option, -fno-unit-at-a-time))
+M16_CFLAGS	 := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
+
+REALMODE_CFLAGS	:= $(M16_CFLAGS) -g -Os -D__KERNEL__ \
+		   -DDISABLE_BRANCH_PROFILING \
+		   -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
+		   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
+		   -mno-mmx -mno-sse \
+		   $(call cc-option, -ffreestanding) \
+		   $(call cc-option, -fno-stack-protector) \
+		   $(call cc-option, -mpreferred-stack-boundary=2)
+export REALMODE_CFLAGS
+
 # BITS is used as extension for files which are available in a 32 bit
 # and a 64 bit version to simplify shared Makefiles.
 # e.g.: obj-y += foo_$(BITS).o
@@ -60,8 +82,8 @@ else
         KBUILD_AFLAGS += -m64
         KBUILD_CFLAGS += -m64
 
-        # Don't autogenerate MMX or SSE instructions
-        KBUILD_CFLAGS += -mno-mmx -mno-sse
+        # Don't autogenerate traditional x87, MMX or SSE instructions
+        KBUILD_CFLAGS += -mno-mmx -mno-sse -mno-80387 -mno-fp-ret-in-387
 
         # Use -mpreferred-stack-boundary=3 if supported.
         KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3)
@@ -89,13 +111,11 @@ else
         KBUILD_CFLAGS += -maccumulate-outgoing-args
 endif
 
+# Make sure compiler does not have buggy stack-protector support.
 ifdef CONFIG_CC_STACKPROTECTOR
 	cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
-	ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y)
-		stackp-y := -fstack-protector
-		KBUILD_CFLAGS += $(stackp-y)
-	else
-		$(warning stack protector enabled but no compiler support)
+	ifneq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y)
+		$(warning stack-protector enabled but compiler support broken)
 	endif
 endif
 
@@ -132,6 +152,7 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
 
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
 avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
 
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index d9c11956fce0..abb9eba61b50 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -20,7 +20,7 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
 targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
 subdir- := compressed
 
-setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o
+setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o
 setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o
 setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o
 setup-y += video-mode.o version.o
@@ -51,20 +51,7 @@ $(obj)/cpustr.h: $(obj)/mkcpustr FORCE
 
 # ---------------------------------------------------------------------------
 
-# How to compile the 16-bit code. Note we always compile for -march=i386,
-# that way we can complain to the user if the CPU is insufficient.
-KBUILD_CFLAGS	:= $(USERINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ \
-		   -DDISABLE_BRANCH_PROFILING \
-		   -Wall -Wstrict-prototypes \
-		   -march=i386 -mregparm=3 \
-		   -include $(srctree)/$(src)/code16gcc.h \
-		   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-		   -mno-mmx -mno-sse \
-		   $(call cc-option, -ffreestanding) \
-		   $(call cc-option, -fno-toplevel-reorder,\
-		   $(call cc-option, -fno-unit-at-a-time)) \
-		   $(call cc-option, -fno-stack-protector) \
-		   $(call cc-option, -mpreferred-stack-boundary=2)
+KBUILD_CFLAGS	:= $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP
 KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__
 GCOV_PROFILE := n
 
@@ -93,7 +80,7 @@ targets += voffset.h
 $(obj)/voffset.h: vmlinux FORCE
 	$(call if_changed,voffset)
 
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|startup_64\|efi_pe_entry\|efi_stub_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
 
 quiet_cmd_zoffset = ZOFFSET $@
       cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
index 1dfbf64e52a2..d401b4a262b0 100644
--- a/arch/x86/boot/bioscall.S
+++ b/arch/x86/boot/bioscall.S
@@ -1,6 +1,6 @@
 /* -----------------------------------------------------------------------
  *
- *   Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *   Copyright 2009-2014 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2 or (at your
@@ -13,8 +13,8 @@
  * touching registers they shouldn't be.
  */
 
-	.code16gcc
-	.text
+	.code16
+	.section ".inittext","ax"
 	.globl	intcall
 	.type	intcall, @function
 intcall:
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index ef72baeff484..bd49ec61255c 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -26,9 +26,8 @@
 #include <asm/boot.h>
 #include <asm/setup.h>
 #include "bitops.h"
-#include <asm/cpufeature.h>
-#include <asm/processor-flags.h>
 #include "ctype.h"
+#include "cpuflags.h"
 
 /* Useful macros */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -178,14 +177,6 @@ static inline void wrgs32(u32 v, addr_t addr)
 }
 
 /* Note: these only return true/false, not a signed return value! */
-static inline int memcmp(const void *s1, const void *s2, size_t len)
-{
-	u8 diff;
-	asm("repe; cmpsb; setnz %0"
-	    : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
-	return diff;
-}
-
 static inline int memcmp_fs(const void *s1, addr_t s2, size_t len)
 {
 	u8 diff;
@@ -229,11 +220,6 @@ void copy_to_fs(addr_t dst, void *src, size_t len);
 void *copy_from_fs(void *dst, addr_t src, size_t len);
 void copy_to_gs(addr_t dst, void *src, size_t len);
 void *copy_from_gs(void *dst, addr_t src, size_t len);
-void *memcpy(void *dst, void *src, size_t len);
-void *memset(void *dst, int c, size_t len);
-
-#define memcpy(d,s,l) __builtin_memcpy(d,s,l)
-#define memset(d,c,l) __builtin_memset(d,c,l)
 
 /* a20.c */
 int enable_a20(void);
@@ -307,14 +293,7 @@ static inline int cmdline_find_option_bool(const char *option)
 	return __cmdline_find_option_bool(cmd_line_ptr, option);
 }
 
-
 /* cpu.c, cpucheck.c */
-struct cpu_features {
-	int level;		/* Family, or 64 for x86-64 */
-	int model;
-	u32 flags[NCAPINTS];
-};
-extern struct cpu_features cpu;
 int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
 int validate_cpu(void);
 
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index c8a6792e7842..0fcd9133790c 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -28,7 +28,7 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include
 
 VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
 	$(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
-	$(obj)/piggy.o
+	$(obj)/piggy.o $(obj)/cpuflags.o $(obj)/aslr.o
 
 $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
 
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
new file mode 100644
index 000000000000..4dbf967da50d
--- /dev/null
+++ b/arch/x86/boot/compressed/aslr.c
@@ -0,0 +1,317 @@
+#include "misc.h"
+
+#ifdef CONFIG_RANDOMIZE_BASE
+#include <asm/msr.h>
+#include <asm/archrandom.h>
+#include <asm/e820.h>
+
+#include <generated/compile.h>
+#include <linux/module.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <generated/utsrelease.h>
+
+/* Simplified build-specific string for starting entropy. */
+static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
+
+#define I8254_PORT_CONTROL	0x43
+#define I8254_PORT_COUNTER0	0x40
+#define I8254_CMD_READBACK	0xC0
+#define I8254_SELECT_COUNTER0	0x02
+#define I8254_STATUS_NOTREADY	0x40
+static inline u16 i8254(void)
+{
+	u16 status, timer;
+
+	do {
+		outb(I8254_PORT_CONTROL,
+		     I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
+		status = inb(I8254_PORT_COUNTER0);
+		timer  = inb(I8254_PORT_COUNTER0);
+		timer |= inb(I8254_PORT_COUNTER0) << 8;
+	} while (status & I8254_STATUS_NOTREADY);
+
+	return timer;
+}
+
+static unsigned long rotate_xor(unsigned long hash, const void *area,
+				size_t size)
+{
+	size_t i;
+	unsigned long *ptr = (unsigned long *)area;
+
+	for (i = 0; i < size / sizeof(hash); i++) {
+		/* Rotate by odd number of bits and XOR. */
+		hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
+		hash ^= ptr[i];
+	}
+
+	return hash;
+}
+
+/* Attempt to create a simple but unpredictable starting entropy. */
+static unsigned long get_random_boot(void)
+{
+	unsigned long hash = 0;
+
+	hash = rotate_xor(hash, build_str, sizeof(build_str));
+	hash = rotate_xor(hash, real_mode, sizeof(*real_mode));
+
+	return hash;
+}
+
+static unsigned long get_random_long(void)
+{
+#ifdef CONFIG_X86_64
+	const unsigned long mix_const = 0x5d6008cbf3848dd3UL;
+#else
+	const unsigned long mix_const = 0x3f39e593UL;
+#endif
+	unsigned long raw, random = get_random_boot();
+	bool use_i8254 = true;
+
+	debug_putstr("KASLR using");
+
+	if (has_cpuflag(X86_FEATURE_RDRAND)) {
+		debug_putstr(" RDRAND");
+		if (rdrand_long(&raw)) {
+			random ^= raw;
+			use_i8254 = false;
+		}
+	}
+
+	if (has_cpuflag(X86_FEATURE_TSC)) {
+		debug_putstr(" RDTSC");
+		rdtscll(raw);
+
+		random ^= raw;
+		use_i8254 = false;
+	}
+
+	if (use_i8254) {
+		debug_putstr(" i8254");
+		random ^= i8254();
+	}
+
+	/* Circular multiply for better bit diffusion */
+	asm("mul %3"
+	    : "=a" (random), "=d" (raw)
+	    : "a" (random), "rm" (mix_const));
+	random += raw;
+
+	debug_putstr("...\n");
+
+	return random;
+}
+
+struct mem_vector {
+	unsigned long start;
+	unsigned long size;
+};
+
+#define MEM_AVOID_MAX 5
+static struct mem_vector mem_avoid[MEM_AVOID_MAX];
+
+static bool mem_contains(struct mem_vector *region, struct mem_vector *item)
+{
+	/* Item at least partially before region. */
+	if (item->start < region->start)
+		return false;
+	/* Item at least partially after region. */
+	if (item->start + item->size > region->start + region->size)
+		return false;
+	return true;
+}
+
+static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
+{
+	/* Item one is entirely before item two. */
+	if (one->start + one->size <= two->start)
+		return false;
+	/* Item one is entirely after item two. */
+	if (one->start >= two->start + two->size)
+		return false;
+	return true;
+}
+
+static void mem_avoid_init(unsigned long input, unsigned long input_size,
+			   unsigned long output, unsigned long output_size)
+{
+	u64 initrd_start, initrd_size;
+	u64 cmd_line, cmd_line_size;
+	unsigned long unsafe, unsafe_len;
+	char *ptr;
+
+	/*
+	 * Avoid the region that is unsafe to overlap during
+	 * decompression (see calculations at top of misc.c).
+	 */
+	unsafe_len = (output_size >> 12) + 32768 + 18;
+	unsafe = (unsigned long)input + input_size - unsafe_len;
+	mem_avoid[0].start = unsafe;
+	mem_avoid[0].size = unsafe_len;
+
+	/* Avoid initrd. */
+	initrd_start  = (u64)real_mode->ext_ramdisk_image << 32;
+	initrd_start |= real_mode->hdr.ramdisk_image;
+	initrd_size  = (u64)real_mode->ext_ramdisk_size << 32;
+	initrd_size |= real_mode->hdr.ramdisk_size;
+	mem_avoid[1].start = initrd_start;
+	mem_avoid[1].size = initrd_size;
+
+	/* Avoid kernel command line. */
+	cmd_line  = (u64)real_mode->ext_cmd_line_ptr << 32;
+	cmd_line |= real_mode->hdr.cmd_line_ptr;
+	/* Calculate size of cmd_line. */
+	ptr = (char *)(unsigned long)cmd_line;
+	for (cmd_line_size = 0; ptr[cmd_line_size++]; )
+		;
+	mem_avoid[2].start = cmd_line;
+	mem_avoid[2].size = cmd_line_size;
+
+	/* Avoid heap memory. */
+	mem_avoid[3].start = (unsigned long)free_mem_ptr;
+	mem_avoid[3].size = BOOT_HEAP_SIZE;
+
+	/* Avoid stack memory. */
+	mem_avoid[4].start = (unsigned long)free_mem_end_ptr;
+	mem_avoid[4].size = BOOT_STACK_SIZE;
+}
+
+/* Does this memory vector overlap a known avoided area? */
+static bool mem_avoid_overlap(struct mem_vector *img)
+{
+	int i;
+
+	for (i = 0; i < MEM_AVOID_MAX; i++) {
+		if (mem_overlaps(img, &mem_avoid[i]))
+			return true;
+	}
+
+	return false;
+}
+
+static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
+			   CONFIG_PHYSICAL_ALIGN];
+static unsigned long slot_max;
+
+static void slots_append(unsigned long addr)
+{
+	/* Overflowing the slots list should be impossible. */
+	if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
+			CONFIG_PHYSICAL_ALIGN)
+		return;
+
+	slots[slot_max++] = addr;
+}
+
+static unsigned long slots_fetch_random(void)
+{
+	/* Handle case of no slots stored. */
+	if (slot_max == 0)
+		return 0;
+
+	return slots[get_random_long() % slot_max];
+}
+
+static void process_e820_entry(struct e820entry *entry,
+			       unsigned long minimum,
+			       unsigned long image_size)
+{
+	struct mem_vector region, img;
+
+	/* Skip non-RAM entries. */
+	if (entry->type != E820_RAM)
+		return;
+
+	/* Ignore entries entirely above our maximum. */
+	if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
+		return;
+
+	/* Ignore entries entirely below our minimum. */
+	if (entry->addr + entry->size < minimum)
+		return;
+
+	region.start = entry->addr;
+	region.size = entry->size;
+
+	/* Potentially raise address to minimum location. */
+	if (region.start < minimum)
+		region.start = minimum;
+
+	/* Potentially raise address to meet alignment requirements. */
+	region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
+
+	/* Did we raise the address above the bounds of this e820 region? */
+	if (region.start > entry->addr + entry->size)
+		return;
+
+	/* Reduce size by any delta from the original address. */
+	region.size -= region.start - entry->addr;
+
+	/* Reduce maximum size to fit end of image within maximum limit. */
+	if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
+		region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start;
257 /* Walk each aligned slot and check for avoided areas. */
258 for (img.start = region.start, img.size = image_size ;
259 mem_contains(&region, &img) ;
260 img.start += CONFIG_PHYSICAL_ALIGN) {
261 if (mem_avoid_overlap(&img))
262 continue;
263 slots_append(img.start);
264 }
265}
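
In other words: each E820 RAM entry is clipped to [minimum, CONFIG_RANDOMIZE_BASE_MAX_OFFSET), its start rounded up to CONFIG_PHYSICAL_ALIGN, and every aligned start at which the whole image still fits (and which misses the avoid list) becomes a candidate slot. A toy enumeration with made-up numbers (2 MiB alignment, 20 MiB image, RAM at [16 MiB, 64 MiB)) prints slots at 16, 18, ..., 44 MiB:

	#include <stdio.h>

	#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

	int main(void)
	{
		unsigned long region_start = 16UL << 20, region_size = 48UL << 20;
		unsigned long align = 2UL << 20, image = 20UL << 20;
		unsigned long addr;

		for (addr = ALIGN_UP(region_start, align);
		     addr + image <= region_start + region_size;
		     addr += align)
			printf("slot: %lu MiB\n", addr >> 20);
		return 0;
	}
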
266
267static unsigned long find_random_addr(unsigned long minimum,
268 unsigned long size)
269{
270 int i;
271 unsigned long addr;
272
273 /* Make sure minimum is aligned. */
274 minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
275
276 /* Verify potential e820 positions, appending to slots list. */
277 for (i = 0; i < real_mode->e820_entries; i++) {
278 process_e820_entry(&real_mode->e820_map[i], minimum, size);
279 }
280
281 return slots_fetch_random();
282}
283
284unsigned char *choose_kernel_location(unsigned char *input,
285 unsigned long input_size,
286 unsigned char *output,
287 unsigned long output_size)
288{
289 unsigned long choice = (unsigned long)output;
290 unsigned long random;
291
292 if (cmdline_find_option_bool("nokaslr")) {
293 debug_putstr("KASLR disabled...\n");
294 goto out;
295 }
296
297 /* Record the various known unsafe memory ranges. */
298 mem_avoid_init((unsigned long)input, input_size,
299 (unsigned long)output, output_size);
300
301 /* Walk e820 and find a random address. */
302 random = find_random_addr(choice, output_size);
303 if (!random) {
304 debug_putstr("KASLR could not find suitable E820 region...\n");
305 goto out;
306 }
307
308 /* Always enforce the minimum. */
309 if (random < choice)
310 goto out;
311
312 choice = random;
313out:
314 return (unsigned char *)choice;
315}
316
317#endif /* CONFIG_RANDOMIZE_BASE */
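
For context, the decompressor is the only caller: misc.c (modified elsewhere in this series) passes the compressed-image location and the bootloader-chosen output address, and substitutes the returned pointer before decompressing. A sketch of that call site, with the surrounding misc.c code assumed rather than quoted:

	/* In decompress_kernel(), before decompression starts (assumed shape): */
	output = choose_kernel_location(input_data, input_len,
					output, output_len);
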
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index bffd73b45b1f..b68e3033e6b9 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,6 +1,6 @@
1#include "misc.h" 1#include "misc.h"
2 2
3#ifdef CONFIG_EARLY_PRINTK 3#if defined(CONFIG_EARLY_PRINTK) || defined(CONFIG_RANDOMIZE_BASE)
4 4
5static unsigned long fs; 5static unsigned long fs;
6static inline void set_fs(unsigned long seg) 6static inline void set_fs(unsigned long seg)
diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c
new file mode 100644
index 000000000000..aa313466118b
--- /dev/null
+++ b/arch/x86/boot/compressed/cpuflags.c
@@ -0,0 +1,12 @@
1#ifdef CONFIG_RANDOMIZE_BASE
2
3#include "../cpuflags.c"
4
5bool has_cpuflag(int flag)
6{
7 get_cpuflags();
8
9 return test_bit(flag, cpu.flags);
10}
11
12#endif
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index a7677babf946..4703a6c4b8e3 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -19,10 +19,272 @@
19 19
20static efi_system_table_t *sys_table; 20static efi_system_table_t *sys_table;
21 21
22static struct efi_config *efi_early;
23
24#define efi_call_early(f, ...) \
25	efi_early->call(efi_early->f, __VA_ARGS__)
26
27#define BOOT_SERVICES(bits) \
28static void setup_boot_services##bits(struct efi_config *c) \
29{ \
30 efi_system_table_##bits##_t *table; \
31 efi_boot_services_##bits##_t *bt; \
32 \
33 table = (typeof(table))sys_table; \
34 \
35 c->text_output = table->con_out; \
36 \
37 bt = (typeof(bt))(unsigned long)(table->boottime); \
38 \
39 c->allocate_pool = bt->allocate_pool; \
40 c->allocate_pages = bt->allocate_pages; \
41 c->get_memory_map = bt->get_memory_map; \
42 c->free_pool = bt->free_pool; \
43 c->free_pages = bt->free_pages; \
44 c->locate_handle = bt->locate_handle; \
45 c->handle_protocol = bt->handle_protocol; \
46 c->exit_boot_services = bt->exit_boot_services; \
47}
48BOOT_SERVICES(32);
49BOOT_SERVICES(64);
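
For one concrete width, BOOT_SERVICES(32) expands to the function below; the 64-bit instantiation is identical except for the _64_t table types. Shown expanded only for readability:

	static void setup_boot_services32(struct efi_config *c)
	{
		efi_system_table_32_t *table;
		efi_boot_services_32_t *bt;

		table = (typeof(table))sys_table;

		c->text_output = table->con_out;

		bt = (typeof(bt))(unsigned long)(table->boottime);

		c->allocate_pool      = bt->allocate_pool;
		c->allocate_pages     = bt->allocate_pages;
		c->get_memory_map     = bt->get_memory_map;
		c->free_pool          = bt->free_pool;
		c->free_pages         = bt->free_pages;
		c->locate_handle      = bt->locate_handle;
		c->handle_protocol    = bt->handle_protocol;
		c->exit_boot_services = bt->exit_boot_services;
	}
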
22 50
23#include "../../../../drivers/firmware/efi/efi-stub-helper.c" 51static void efi_printk(efi_system_table_t *, char *);
52static void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
53
54static efi_status_t
55__file_size32(void *__fh, efi_char16_t *filename_16,
56 void **handle, u64 *file_sz)
57{
58 efi_file_handle_32_t *h, *fh = __fh;
59 efi_file_info_t *info;
60 efi_status_t status;
61 efi_guid_t info_guid = EFI_FILE_INFO_ID;
62 u32 info_sz;
63
64 status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16,
65 EFI_FILE_MODE_READ, (u64)0);
66 if (status != EFI_SUCCESS) {
67 efi_printk(sys_table, "Failed to open file: ");
68 efi_char16_printk(sys_table, filename_16);
69 efi_printk(sys_table, "\n");
70 return status;
71 }
72
73 *handle = h;
74
75 info_sz = 0;
76 status = efi_early->call((unsigned long)h->get_info, h, &info_guid,
77 &info_sz, NULL);
78 if (status != EFI_BUFFER_TOO_SMALL) {
79 efi_printk(sys_table, "Failed to get file info size\n");
80 return status;
81 }
82
83grow:
84 status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
85 info_sz, (void **)&info);
86 if (status != EFI_SUCCESS) {
87 efi_printk(sys_table, "Failed to alloc mem for file info\n");
88 return status;
89 }
90
91 status = efi_early->call((unsigned long)h->get_info, h, &info_guid,
92 &info_sz, info);
93 if (status == EFI_BUFFER_TOO_SMALL) {
94 efi_call_early(free_pool, info);
95 goto grow;
96 }
97
98 *file_sz = info->file_size;
99 efi_call_early(free_pool, info);
100
101 if (status != EFI_SUCCESS)
102 efi_printk(sys_table, "Failed to get initrd info\n");
103
104 return status;
105}
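
The grow: loop is the standard UEFI size-negotiation idiom: a service called with a zero-sized buffer fails with EFI_BUFFER_TOO_SMALL and writes back the size it needs, the caller allocates and retries, and a second EFI_BUFFER_TOO_SMALL (the object grew in the meantime) frees and loops. Reduced to its skeleton, with get_info() standing in for any such service (a sketch, not the exact code above):

	sz = 0;
	status = get_info(h, &guid, &sz, NULL);		/* ask for the size */
	if (status != EFI_BUFFER_TOO_SMALL)
		return status;				/* real failure */
grow:
	status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
				sz, (void **)&buf);
	if (status != EFI_SUCCESS)
		return status;
	status = get_info(h, &guid, &sz, buf);		/* now with a buffer */
	if (status == EFI_BUFFER_TOO_SMALL) {		/* it grew; start over */
		efi_call_early(free_pool, buf);
		goto grow;
	}
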
106
107static efi_status_t
108__file_size64(void *__fh, efi_char16_t *filename_16,
109 void **handle, u64 *file_sz)
110{
111 efi_file_handle_64_t *h, *fh = __fh;
112 efi_file_info_t *info;
113 efi_status_t status;
114 efi_guid_t info_guid = EFI_FILE_INFO_ID;
115 u64 info_sz;
24 116
117 status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16,
118 EFI_FILE_MODE_READ, (u64)0);
119 if (status != EFI_SUCCESS) {
120 efi_printk(sys_table, "Failed to open file: ");
121 efi_char16_printk(sys_table, filename_16);
122 efi_printk(sys_table, "\n");
123 return status;
124 }
25 125
126 *handle = h;
127
128 info_sz = 0;
129 status = efi_early->call((unsigned long)h->get_info, h, &info_guid,
130 &info_sz, NULL);
131 if (status != EFI_BUFFER_TOO_SMALL) {
132 efi_printk(sys_table, "Failed to get file info size\n");
133 return status;
134 }
135
136grow:
137 status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
138 info_sz, (void **)&info);
139 if (status != EFI_SUCCESS) {
140 efi_printk(sys_table, "Failed to alloc mem for file info\n");
141 return status;
142 }
143
144 status = efi_early->call((unsigned long)h->get_info, h, &info_guid,
145 &info_sz, info);
146 if (status == EFI_BUFFER_TOO_SMALL) {
147 efi_call_early(free_pool, info);
148 goto grow;
149 }
150
151 *file_sz = info->file_size;
152 efi_call_early(free_pool, info);
153
154 if (status != EFI_SUCCESS)
155 efi_printk(sys_table, "Failed to get initrd info\n");
156
157 return status;
158}
159static efi_status_t
160efi_file_size(efi_system_table_t *sys_table, void *__fh,
161 efi_char16_t *filename_16, void **handle, u64 *file_sz)
162{
163 if (efi_early->is64)
164 return __file_size64(__fh, filename_16, handle, file_sz);
165
166 return __file_size32(__fh, filename_16, handle, file_sz);
167}
168
169static inline efi_status_t
170efi_file_read(void *handle, unsigned long *size, void *addr)
171{
172 unsigned long func;
173
174 if (efi_early->is64) {
175 efi_file_handle_64_t *fh = handle;
176
177 func = (unsigned long)fh->read;
178 return efi_early->call(func, handle, size, addr);
179 } else {
180 efi_file_handle_32_t *fh = handle;
181
182 func = (unsigned long)fh->read;
183 return efi_early->call(func, handle, size, addr);
184 }
185}
186
187static inline efi_status_t efi_file_close(void *handle)
188{
189 if (efi_early->is64) {
190 efi_file_handle_64_t *fh = handle;
191
192 return efi_early->call((unsigned long)fh->close, handle);
193 } else {
194 efi_file_handle_32_t *fh = handle;
195
196 return efi_early->call((unsigned long)fh->close, handle);
197 }
198}
199
200static inline efi_status_t __open_volume32(void *__image, void **__fh)
201{
202 efi_file_io_interface_t *io;
203 efi_loaded_image_32_t *image = __image;
204 efi_file_handle_32_t *fh;
205 efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
206 efi_status_t status;
207 void *handle = (void *)(unsigned long)image->device_handle;
208 unsigned long func;
209
210 status = efi_call_early(handle_protocol, handle,
211 &fs_proto, (void **)&io);
212 if (status != EFI_SUCCESS) {
213 efi_printk(sys_table, "Failed to handle fs_proto\n");
214 return status;
215 }
216
217 func = (unsigned long)io->open_volume;
218 status = efi_early->call(func, io, &fh);
219 if (status != EFI_SUCCESS)
220 efi_printk(sys_table, "Failed to open volume\n");
221
222 *__fh = fh;
223 return status;
224}
225
226static inline efi_status_t __open_volume64(void *__image, void **__fh)
227{
228 efi_file_io_interface_t *io;
229 efi_loaded_image_64_t *image = __image;
230 efi_file_handle_64_t *fh;
231 efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
232 efi_status_t status;
233 void *handle = (void *)(unsigned long)image->device_handle;
234 unsigned long func;
235
236 status = efi_call_early(handle_protocol, handle,
237 &fs_proto, (void **)&io);
238 if (status != EFI_SUCCESS) {
239 efi_printk(sys_table, "Failed to handle fs_proto\n");
240 return status;
241 }
242
243 func = (unsigned long)io->open_volume;
244 status = efi_early->call(func, io, &fh);
245 if (status != EFI_SUCCESS)
246 efi_printk(sys_table, "Failed to open volume\n");
247
248 *__fh = fh;
249 return status;
250}
251
252static inline efi_status_t
253efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh)
254{
255 if (efi_early->is64)
256 return __open_volume64(__image, __fh);
257
258 return __open_volume32(__image, __fh);
259}
260
261static void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
262{
263 unsigned long output_string;
264 size_t offset;
265
266 if (efi_early->is64) {
267 struct efi_simple_text_output_protocol_64 *out;
268 u64 *func;
269
270 offset = offsetof(typeof(*out), output_string);
271 output_string = efi_early->text_output + offset;
272 func = (u64 *)output_string;
273
274 efi_early->call(*func, efi_early->text_output, str);
275 } else {
276 struct efi_simple_text_output_protocol_32 *out;
277 u32 *func;
278
279 offset = offsetof(typeof(*out), output_string);
280 output_string = efi_early->text_output + offset;
281 func = (u32 *)output_string;
282
283 efi_early->call(*func, efi_early->text_output, str);
284 }
285}
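
efi_char16_printk() demonstrates the mixed-mode call pattern used throughout this file: firmware protocol methods are stored as u32 or u64 slots, so the code computes the slot's address from the protocol base plus offsetof(), loads the pointer at its native width, and passes the value to efi_early->call(), which thunks to the right calling convention. The pattern in miniature, with an illustrative protocol type:

	#include <stddef.h>
	#include <stdint.h>

	struct proto64 {		/* illustrative, not a real UEFI type */
		uint64_t reset;
		uint64_t output_string;	/* the method slot we want */
	};

	static uint64_t method_ptr(unsigned long proto_base)
	{
		unsigned long slot;

		slot = proto_base + offsetof(struct proto64, output_string);
		return *(uint64_t *)slot;	/* value written by firmware */
	}
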
286
287#include "../../../../drivers/firmware/efi/efi-stub-helper.c"
26 288
27static void find_bits(unsigned long mask, u8 *pos, u8 *size) 289static void find_bits(unsigned long mask, u8 *pos, u8 *size)
28{ 290{
@@ -47,105 +309,97 @@ static void find_bits(unsigned long mask, u8 *pos, u8 *size)
47 *size = len; 309 *size = len;
48} 310}
49 311
50static efi_status_t setup_efi_pci(struct boot_params *params) 312static efi_status_t
313__setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
51{ 314{
52 efi_pci_io_protocol *pci; 315 struct pci_setup_rom *rom = NULL;
53 efi_status_t status; 316 efi_status_t status;
54 void **pci_handle; 317 unsigned long size;
55 efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; 318 uint64_t attributes;
56 unsigned long nr_pci, size = 0;
57 int i;
58 struct setup_data *data;
59
60 data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
61 319
62 while (data && data->next) 320 status = efi_early->call(pci->attributes, pci,
63 data = (struct setup_data *)(unsigned long)data->next; 321 EfiPciIoAttributeOperationGet, 0, 0,
322 &attributes);
323 if (status != EFI_SUCCESS)
324 return status;
64 325
65 status = efi_call_phys5(sys_table->boottime->locate_handle, 326 if (!pci->romimage || !pci->romsize)
66 EFI_LOCATE_BY_PROTOCOL, &pci_proto, 327 return EFI_INVALID_PARAMETER;
67 NULL, &size, pci_handle);
68 328
69 if (status == EFI_BUFFER_TOO_SMALL) { 329 size = pci->romsize + sizeof(*rom);
70 status = efi_call_phys3(sys_table->boottime->allocate_pool,
71 EFI_LOADER_DATA, size, &pci_handle);
72 330
73 if (status != EFI_SUCCESS) 331 status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
74 return status; 332 if (status != EFI_SUCCESS)
333 return status;
75 334
76 status = efi_call_phys5(sys_table->boottime->locate_handle, 335 memset(rom, 0, sizeof(*rom));
77 EFI_LOCATE_BY_PROTOCOL, &pci_proto,
78 NULL, &size, pci_handle);
79 }
80 336
81 if (status != EFI_SUCCESS) 337 rom->data.type = SETUP_PCI;
82 goto free_handle; 338 rom->data.len = size - sizeof(struct setup_data);
339 rom->data.next = 0;
340 rom->pcilen = pci->romsize;
341 *__rom = rom;
83 342
84 nr_pci = size / sizeof(void *); 343 status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
85 for (i = 0; i < nr_pci; i++) { 344 PCI_VENDOR_ID, 1, &(rom->vendor));
86 void *h = pci_handle[i];
87 uint64_t attributes;
88 struct pci_setup_rom *rom;
89 345
90 status = efi_call_phys3(sys_table->boottime->handle_protocol, 346 if (status != EFI_SUCCESS)
91 h, &pci_proto, &pci); 347 goto free_struct;
92 348
93 if (status != EFI_SUCCESS) 349 status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
94 continue; 350 PCI_DEVICE_ID, 1, &(rom->devid));
95 351
96 if (!pci) 352 if (status != EFI_SUCCESS)
97 continue; 353 goto free_struct;
98 354
99#ifdef CONFIG_X86_64 355 status = efi_early->call(pci->get_location, pci, &(rom->segment),
100 status = efi_call_phys4(pci->attributes, pci, 356 &(rom->bus), &(rom->device), &(rom->function));
101 EfiPciIoAttributeOperationGet, 0,
102 &attributes);
103#else
104 status = efi_call_phys5(pci->attributes, pci,
105 EfiPciIoAttributeOperationGet, 0, 0,
106 &attributes);
107#endif
108 if (status != EFI_SUCCESS)
109 continue;
110 357
111 if (!pci->romimage || !pci->romsize) 358 if (status != EFI_SUCCESS)
112 continue; 359 goto free_struct;
113 360
114 size = pci->romsize + sizeof(*rom); 361 memcpy(rom->romdata, pci->romimage, pci->romsize);
362 return status;
115 363
116 status = efi_call_phys3(sys_table->boottime->allocate_pool, 364free_struct:
117 EFI_LOADER_DATA, size, &rom); 365 efi_call_early(free_pool, rom);
366 return status;
367}
118 368
119 if (status != EFI_SUCCESS) 369static efi_status_t
120 continue; 370setup_efi_pci32(struct boot_params *params, void **pci_handle,
371 unsigned long size)
372{
373 efi_pci_io_protocol_32 *pci = NULL;
374 efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
375 u32 *handles = (u32 *)(unsigned long)pci_handle;
376 efi_status_t status;
377 unsigned long nr_pci;
378 struct setup_data *data;
379 int i;
121 380
122 rom->data.type = SETUP_PCI; 381 data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
123 rom->data.len = size - sizeof(struct setup_data);
124 rom->data.next = 0;
125 rom->pcilen = pci->romsize;
126 382
127 status = efi_call_phys5(pci->pci.read, pci, 383 while (data && data->next)
128 EfiPciIoWidthUint16, PCI_VENDOR_ID, 384 data = (struct setup_data *)(unsigned long)data->next;
129 1, &(rom->vendor));
130 385
131 if (status != EFI_SUCCESS) 386 nr_pci = size / sizeof(u32);
132 goto free_struct; 387 for (i = 0; i < nr_pci; i++) {
388 struct pci_setup_rom *rom = NULL;
389 u32 h = handles[i];
133 390
134 status = efi_call_phys5(pci->pci.read, pci, 391 status = efi_call_early(handle_protocol, h,
135 EfiPciIoWidthUint16, PCI_DEVICE_ID, 392 &pci_proto, (void **)&pci);
136 1, &(rom->devid));
137 393
138 if (status != EFI_SUCCESS) 394 if (status != EFI_SUCCESS)
139 goto free_struct; 395 continue;
140 396
141 status = efi_call_phys5(pci->get_location, pci, 397 if (!pci)
142 &(rom->segment), &(rom->bus), 398 continue;
143 &(rom->device), &(rom->function));
144 399
400 status = __setup_efi_pci32(pci, &rom);
145 if (status != EFI_SUCCESS) 401 if (status != EFI_SUCCESS)
146 goto free_struct; 402 continue;
147
148 memcpy(rom->romdata, pci->romimage, pci->romsize);
149 403
150 if (data) 404 if (data)
151 data->next = (unsigned long)rom; 405 data->next = (unsigned long)rom;
@@ -154,105 +408,155 @@ static efi_status_t setup_efi_pci(struct boot_params *params)
154 408
155 data = (struct setup_data *)rom; 409 data = (struct setup_data *)rom;
156 410
157 continue;
158 free_struct:
159 efi_call_phys1(sys_table->boottime->free_pool, rom);
160 } 411 }
161 412
162free_handle:
163 efi_call_phys1(sys_table->boottime->free_pool, pci_handle);
164 return status; 413 return status;
165} 414}
166 415
167/* 416static efi_status_t
168 * See if we have Graphics Output Protocol 417__setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom)
169 */
170static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
171 unsigned long size)
172{ 418{
173 struct efi_graphics_output_protocol *gop, *first_gop; 419 struct pci_setup_rom *rom;
174 struct efi_pixel_bitmask pixel_info;
175 unsigned long nr_gops;
176 efi_status_t status; 420 efi_status_t status;
177 void **gop_handle; 421 unsigned long size;
178 u16 width, height; 422 uint64_t attributes;
179 u32 fb_base, fb_size;
180 u32 pixels_per_scan_line;
181 int pixel_format;
182 int i;
183 423
184 status = efi_call_phys3(sys_table->boottime->allocate_pool, 424 status = efi_early->call(pci->attributes, pci,
185 EFI_LOADER_DATA, size, &gop_handle); 425 EfiPciIoAttributeOperationGet, 0,
426 &attributes);
186 if (status != EFI_SUCCESS) 427 if (status != EFI_SUCCESS)
187 return status; 428 return status;
188 429
189 status = efi_call_phys5(sys_table->boottime->locate_handle, 430 if (!pci->romimage || !pci->romsize)
190 EFI_LOCATE_BY_PROTOCOL, proto, 431 return EFI_INVALID_PARAMETER;
191 NULL, &size, gop_handle); 432
433 size = pci->romsize + sizeof(*rom);
434
435 status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
192 if (status != EFI_SUCCESS) 436 if (status != EFI_SUCCESS)
193 goto free_handle; 437 return status;
194 438
195 first_gop = NULL; 439 rom->data.type = SETUP_PCI;
440 rom->data.len = size - sizeof(struct setup_data);
441 rom->data.next = 0;
442 rom->pcilen = pci->romsize;
443 *__rom = rom;
196 444
197 nr_gops = size / sizeof(void *); 445 status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
198 for (i = 0; i < nr_gops; i++) { 446 PCI_VENDOR_ID, 1, &(rom->vendor));
199 struct efi_graphics_output_mode_info *info; 447
200 efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; 448 if (status != EFI_SUCCESS)
201 bool conout_found = false; 449 goto free_struct;
202 void *dummy; 450
203 void *h = gop_handle[i]; 451 status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
452 PCI_DEVICE_ID, 1, &(rom->devid));
453
454 if (status != EFI_SUCCESS)
455 goto free_struct;
456
457 status = efi_early->call(pci->get_location, pci, &(rom->segment),
458 &(rom->bus), &(rom->device), &(rom->function));
459
460 if (status != EFI_SUCCESS)
461 goto free_struct;
462
463 memcpy(rom->romdata, pci->romimage, pci->romsize);
464 return status;
465
466free_struct:
467 efi_call_early(free_pool, rom);
468 return status;
469
470}
471
472static efi_status_t
473setup_efi_pci64(struct boot_params *params, void **pci_handle,
474 unsigned long size)
475{
476 efi_pci_io_protocol_64 *pci = NULL;
477 efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
478 u64 *handles = (u64 *)(unsigned long)pci_handle;
479 efi_status_t status;
480 unsigned long nr_pci;
481 struct setup_data *data;
482 int i;
483
484 data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
485
486 while (data && data->next)
487 data = (struct setup_data *)(unsigned long)data->next;
488
489 nr_pci = size / sizeof(u64);
490 for (i = 0; i < nr_pci; i++) {
491 struct pci_setup_rom *rom = NULL;
492 u64 h = handles[i];
493
494 status = efi_call_early(handle_protocol, h,
495 &pci_proto, (void **)&pci);
204 496
205 status = efi_call_phys3(sys_table->boottime->handle_protocol,
206 h, proto, &gop);
207 if (status != EFI_SUCCESS) 497 if (status != EFI_SUCCESS)
208 continue; 498 continue;
209 499
210 status = efi_call_phys3(sys_table->boottime->handle_protocol, 500 if (!pci)
211 h, &conout_proto, &dummy); 501 continue;
212 502
213 if (status == EFI_SUCCESS) 503 status = __setup_efi_pci64(pci, &rom);
214 conout_found = true; 504 if (status != EFI_SUCCESS)
505 continue;
215 506
216 status = efi_call_phys4(gop->query_mode, gop, 507 if (data)
217 gop->mode->mode, &size, &info); 508 data->next = (unsigned long)rom;
218 if (status == EFI_SUCCESS && (!first_gop || conout_found)) { 509 else
219 /* 510 params->hdr.setup_data = (unsigned long)rom;
220 * Systems that use the UEFI Console Splitter may 511
221 * provide multiple GOP devices, not all of which are 512 data = (struct setup_data *)rom;
222 * backed by real hardware. The workaround is to search
223 * for a GOP implementing the ConOut protocol, and if
224 * one isn't found, to just fall back to the first GOP.
225 */
226 width = info->horizontal_resolution;
227 height = info->vertical_resolution;
228 fb_base = gop->mode->frame_buffer_base;
229 fb_size = gop->mode->frame_buffer_size;
230 pixel_format = info->pixel_format;
231 pixel_info = info->pixel_information;
232 pixels_per_scan_line = info->pixels_per_scan_line;
233 513
234 /*
235 * Once we've found a GOP supporting ConOut,
236 * don't bother looking any further.
237 */
238 first_gop = gop;
239 if (conout_found)
240 break;
241 }
242 } 514 }
243 515
244 /* Did we find any GOPs? */ 516 return status;
245 if (!first_gop) 517}
518
519static efi_status_t setup_efi_pci(struct boot_params *params)
520{
521 efi_status_t status;
522 void **pci_handle = NULL;
523 efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
524 unsigned long size = 0;
525
526 status = efi_call_early(locate_handle,
527 EFI_LOCATE_BY_PROTOCOL,
528 &pci_proto, NULL, &size, pci_handle);
529
530 if (status == EFI_BUFFER_TOO_SMALL) {
531 status = efi_call_early(allocate_pool,
532 EFI_LOADER_DATA,
533 size, (void **)&pci_handle);
534
535 if (status != EFI_SUCCESS)
536 return status;
537
538 status = efi_call_early(locate_handle,
539 EFI_LOCATE_BY_PROTOCOL, &pci_proto,
540 NULL, &size, pci_handle);
541 }
542
543 if (status != EFI_SUCCESS)
246 goto free_handle; 544 goto free_handle;
247 545
248 /* EFI framebuffer */ 546 if (efi_early->is64)
249 si->orig_video_isVGA = VIDEO_TYPE_EFI; 547 status = setup_efi_pci64(params, pci_handle, size);
548 else
549 status = setup_efi_pci32(params, pci_handle, size);
250 550
251 si->lfb_width = width; 551free_handle:
252 si->lfb_height = height; 552 efi_call_early(free_pool, pci_handle);
253 si->lfb_base = fb_base; 553 return status;
254 si->pages = 1; 554}
255 555
556static void
557setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line,
558 struct efi_pixel_bitmask pixel_info, int pixel_format)
559{
256 if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { 560 if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
257 si->lfb_depth = 32; 561 si->lfb_depth = 32;
258 si->lfb_linelength = pixels_per_scan_line * 4; 562 si->lfb_linelength = pixels_per_scan_line * 4;
@@ -297,62 +601,319 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
297 si->rsvd_size = 0; 601 si->rsvd_size = 0;
298 si->rsvd_pos = 0; 602 si->rsvd_pos = 0;
299 } 603 }
604}
605
606static efi_status_t
607__gop_query32(struct efi_graphics_output_protocol_32 *gop32,
608 struct efi_graphics_output_mode_info **info,
609 unsigned long *size, u32 *fb_base)
610{
611 struct efi_graphics_output_protocol_mode_32 *mode;
612 efi_status_t status;
613 unsigned long m;
614
615 m = gop32->mode;
616 mode = (struct efi_graphics_output_protocol_mode_32 *)m;
617
618 status = efi_early->call(gop32->query_mode, gop32,
619 mode->mode, size, info);
620 if (status != EFI_SUCCESS)
621 return status;
622
623 *fb_base = mode->frame_buffer_base;
624 return status;
625}
626
627static efi_status_t
628setup_gop32(struct screen_info *si, efi_guid_t *proto,
629 unsigned long size, void **gop_handle)
630{
631 struct efi_graphics_output_protocol_32 *gop32, *first_gop;
632 unsigned long nr_gops;
633 u16 width, height;
634 u32 pixels_per_scan_line;
635 u32 fb_base;
636 struct efi_pixel_bitmask pixel_info;
637 int pixel_format;
638 efi_status_t status;
639 u32 *handles = (u32 *)(unsigned long)gop_handle;
640 int i;
641
642 first_gop = NULL;
643 gop32 = NULL;
644
645 nr_gops = size / sizeof(u32);
646 for (i = 0; i < nr_gops; i++) {
647 struct efi_graphics_output_mode_info *info = NULL;
648 efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID;
649 bool conout_found = false;
650 void *dummy = NULL;
651 u32 h = handles[i];
652
653 status = efi_call_early(handle_protocol, h,
654 proto, (void **)&gop32);
655 if (status != EFI_SUCCESS)
656 continue;
657
658 status = efi_call_early(handle_protocol, h,
659 &conout_proto, &dummy);
660 if (status == EFI_SUCCESS)
661 conout_found = true;
662
663 status = __gop_query32(gop32, &info, &size, &fb_base);
664 if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
665 /*
666 * Systems that use the UEFI Console Splitter may
667 * provide multiple GOP devices, not all of which are
668 * backed by real hardware. The workaround is to search
669 * for a GOP implementing the ConOut protocol, and if
670 * one isn't found, to just fall back to the first GOP.
671 */
672 width = info->horizontal_resolution;
673 height = info->vertical_resolution;
674 pixel_format = info->pixel_format;
675 pixel_info = info->pixel_information;
676 pixels_per_scan_line = info->pixels_per_scan_line;
677
678 /*
679 * Once we've found a GOP supporting ConOut,
680 * don't bother looking any further.
681 */
682 first_gop = gop32;
683 if (conout_found)
684 break;
685 }
686 }
687
688 /* Did we find any GOPs? */
689 if (!first_gop)
690 goto out;
691
692 /* EFI framebuffer */
693 si->orig_video_isVGA = VIDEO_TYPE_EFI;
694
695 si->lfb_width = width;
696 si->lfb_height = height;
697 si->lfb_base = fb_base;
698 si->pages = 1;
699
700 setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format);
300 701
301 si->lfb_size = si->lfb_linelength * si->lfb_height; 702 si->lfb_size = si->lfb_linelength * si->lfb_height;
302 703
303 si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; 704 si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
705out:
706 return status;
707}
304 708
305free_handle: 709static efi_status_t
306 efi_call_phys1(sys_table->boottime->free_pool, gop_handle); 710__gop_query64(struct efi_graphics_output_protocol_64 *gop64,
711 struct efi_graphics_output_mode_info **info,
712 unsigned long *size, u32 *fb_base)
713{
714 struct efi_graphics_output_protocol_mode_64 *mode;
715 efi_status_t status;
716 unsigned long m;
717
718 m = gop64->mode;
719 mode = (struct efi_graphics_output_protocol_mode_64 *)m;
720
721 status = efi_early->call(gop64->query_mode, gop64,
722 mode->mode, size, info);
723 if (status != EFI_SUCCESS)
724 return status;
725
726 *fb_base = mode->frame_buffer_base;
727 return status;
728}
729
730static efi_status_t
731setup_gop64(struct screen_info *si, efi_guid_t *proto,
732 unsigned long size, void **gop_handle)
733{
734 struct efi_graphics_output_protocol_64 *gop64, *first_gop;
735 unsigned long nr_gops;
736 u16 width, height;
737 u32 pixels_per_scan_line;
738 u32 fb_base;
739 struct efi_pixel_bitmask pixel_info;
740 int pixel_format;
741 efi_status_t status;
742 u64 *handles = (u64 *)(unsigned long)gop_handle;
743 int i;
744
745 first_gop = NULL;
746 gop64 = NULL;
747
748 nr_gops = size / sizeof(u64);
749 for (i = 0; i < nr_gops; i++) {
750 struct efi_graphics_output_mode_info *info = NULL;
751 efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID;
752 bool conout_found = false;
753 void *dummy = NULL;
754 u64 h = handles[i];
755
756 status = efi_call_early(handle_protocol, h,
757 proto, (void **)&gop64);
758 if (status != EFI_SUCCESS)
759 continue;
760
761 status = efi_call_early(handle_protocol, h,
762 &conout_proto, &dummy);
763 if (status == EFI_SUCCESS)
764 conout_found = true;
765
766 status = __gop_query64(gop64, &info, &size, &fb_base);
767 if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
768 /*
769 * Systems that use the UEFI Console Splitter may
770 * provide multiple GOP devices, not all of which are
771 * backed by real hardware. The workaround is to search
772 * for a GOP implementing the ConOut protocol, and if
773 * one isn't found, to just fall back to the first GOP.
774 */
775 width = info->horizontal_resolution;
776 height = info->vertical_resolution;
777 pixel_format = info->pixel_format;
778 pixel_info = info->pixel_information;
779 pixels_per_scan_line = info->pixels_per_scan_line;
780
781 /*
782 * Once we've found a GOP supporting ConOut,
783 * don't bother looking any further.
784 */
785 first_gop = gop64;
786 if (conout_found)
787 break;
788 }
789 }
790
791 /* Did we find any GOPs? */
792 if (!first_gop)
793 goto out;
794
795 /* EFI framebuffer */
796 si->orig_video_isVGA = VIDEO_TYPE_EFI;
797
798 si->lfb_width = width;
799 si->lfb_height = height;
800 si->lfb_base = fb_base;
801 si->pages = 1;
802
803 setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format);
804
805 si->lfb_size = si->lfb_linelength * si->lfb_height;
806
807 si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
808out:
307 return status; 809 return status;
308} 810}
309 811
310/* 812/*
311 * See if we have Universal Graphics Adapter (UGA) protocol 813 * See if we have Graphics Output Protocol
312 */ 814 */
313static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto, 815static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
314 unsigned long size) 816 unsigned long size)
315{ 817{
316 struct efi_uga_draw_protocol *uga, *first_uga;
317 unsigned long nr_ugas;
318 efi_status_t status; 818 efi_status_t status;
319 u32 width, height; 819 void **gop_handle = NULL;
320 void **uga_handle = NULL;
321 int i;
322 820
323 status = efi_call_phys3(sys_table->boottime->allocate_pool, 821 status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
324 EFI_LOADER_DATA, size, &uga_handle); 822 size, (void **)&gop_handle);
325 if (status != EFI_SUCCESS) 823 if (status != EFI_SUCCESS)
326 return status; 824 return status;
327 825
328 status = efi_call_phys5(sys_table->boottime->locate_handle, 826 status = efi_call_early(locate_handle,
329 EFI_LOCATE_BY_PROTOCOL, uga_proto, 827 EFI_LOCATE_BY_PROTOCOL,
330 NULL, &size, uga_handle); 828 proto, NULL, &size, gop_handle);
331 if (status != EFI_SUCCESS) 829 if (status != EFI_SUCCESS)
332 goto free_handle; 830 goto free_handle;
333 831
832 if (efi_early->is64)
833 status = setup_gop64(si, proto, size, gop_handle);
834 else
835 status = setup_gop32(si, proto, size, gop_handle);
836
837free_handle:
838 efi_call_early(free_pool, gop_handle);
839 return status;
840}
841
842static efi_status_t
843setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height)
844{
845 struct efi_uga_draw_protocol *uga = NULL, *first_uga;
846 efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
847 unsigned long nr_ugas;
848	u32 *handles = (u32 *)uga_handle;
849 efi_status_t status;
850 int i;
851
334 first_uga = NULL; 852 first_uga = NULL;
853 nr_ugas = size / sizeof(u32);
854 for (i = 0; i < nr_ugas; i++) {
855 efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
856 u32 w, h, depth, refresh;
857 void *pciio;
858 u32 handle = handles[i];
335 859
336 nr_ugas = size / sizeof(void *); 860 status = efi_call_early(handle_protocol, handle,
861 &uga_proto, (void **)&uga);
862 if (status != EFI_SUCCESS)
863 continue;
864
865 efi_call_early(handle_protocol, handle, &pciio_proto, &pciio);
866
867 status = efi_early->call((unsigned long)uga->get_mode, uga,
868 &w, &h, &depth, &refresh);
869 if (status == EFI_SUCCESS && (!first_uga || pciio)) {
870 *width = w;
871 *height = h;
872
873 /*
874 * Once we've found a UGA supporting PCIIO,
875 * don't bother looking any further.
876 */
877 if (pciio)
878 break;
879
880 first_uga = uga;
881 }
882 }
883
884 return status;
885}
886
887static efi_status_t
888setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height)
889{
890 struct efi_uga_draw_protocol *uga = NULL, *first_uga;
891 efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
892 unsigned long nr_ugas;
893	u64 *handles = (u64 *)uga_handle;
894 efi_status_t status;
895 int i;
896
897 first_uga = NULL;
898 nr_ugas = size / sizeof(u64);
337 for (i = 0; i < nr_ugas; i++) { 899 for (i = 0; i < nr_ugas; i++) {
338 efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; 900 efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
339 void *handle = uga_handle[i];
340 u32 w, h, depth, refresh; 901 u32 w, h, depth, refresh;
341 void *pciio; 902 void *pciio;
903 u64 handle = handles[i];
342 904
343 status = efi_call_phys3(sys_table->boottime->handle_protocol, 905 status = efi_call_early(handle_protocol, handle,
344 handle, uga_proto, &uga); 906 &uga_proto, (void **)&uga);
345 if (status != EFI_SUCCESS) 907 if (status != EFI_SUCCESS)
346 continue; 908 continue;
347 909
348 efi_call_phys3(sys_table->boottime->handle_protocol, 910 efi_call_early(handle_protocol, handle, &pciio_proto, &pciio);
349 handle, &pciio_proto, &pciio);
350 911
351 status = efi_call_phys5(uga->get_mode, uga, &w, &h, 912 status = efi_early->call((unsigned long)uga->get_mode, uga,
352 &depth, &refresh); 913 &w, &h, &depth, &refresh);
353 if (status == EFI_SUCCESS && (!first_uga || pciio)) { 914 if (status == EFI_SUCCESS && (!first_uga || pciio)) {
354 width = w; 915 *width = w;
355 height = h; 916 *height = h;
356 917
357 /* 918 /*
358 * Once we've found a UGA supporting PCIIO, 919 * Once we've found a UGA supporting PCIIO,
@@ -365,7 +926,39 @@ static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
365 } 926 }
366 } 927 }
367 928
368 if (!first_uga) 929 return status;
930}
931
932/*
933 * See if we have Universal Graphics Adapter (UGA) protocol
934 */
935static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
936 unsigned long size)
937{
938 efi_status_t status;
939 u32 width, height;
940 void **uga_handle = NULL;
941
942 status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
943 size, (void **)&uga_handle);
944 if (status != EFI_SUCCESS)
945 return status;
946
947 status = efi_call_early(locate_handle,
948 EFI_LOCATE_BY_PROTOCOL,
949 uga_proto, NULL, &size, uga_handle);
950 if (status != EFI_SUCCESS)
951 goto free_handle;
952
953 height = 0;
954 width = 0;
955
956 if (efi_early->is64)
957 status = setup_uga64(uga_handle, size, &width, &height);
958 else
959 status = setup_uga32(uga_handle, size, &width, &height);
960
961 if (!width && !height)
369 goto free_handle; 962 goto free_handle;
370 963
371 /* EFI framebuffer */ 964 /* EFI framebuffer */
@@ -384,9 +977,8 @@ static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
384 si->rsvd_size = 8; 977 si->rsvd_size = 8;
385 si->rsvd_pos = 24; 978 si->rsvd_pos = 24;
386 979
387
388free_handle: 980free_handle:
389 efi_call_phys1(sys_table->boottime->free_pool, uga_handle); 981 efi_call_early(free_pool, uga_handle);
390 return status; 982 return status;
391} 983}
392 984
@@ -404,29 +996,31 @@ void setup_graphics(struct boot_params *boot_params)
404 memset(si, 0, sizeof(*si)); 996 memset(si, 0, sizeof(*si));
405 997
406 size = 0; 998 size = 0;
407 status = efi_call_phys5(sys_table->boottime->locate_handle, 999 status = efi_call_early(locate_handle,
408 EFI_LOCATE_BY_PROTOCOL, &graphics_proto, 1000 EFI_LOCATE_BY_PROTOCOL,
409 NULL, &size, gop_handle); 1001 &graphics_proto, NULL, &size, gop_handle);
410 if (status == EFI_BUFFER_TOO_SMALL) 1002 if (status == EFI_BUFFER_TOO_SMALL)
411 status = setup_gop(si, &graphics_proto, size); 1003 status = setup_gop(si, &graphics_proto, size);
412 1004
413 if (status != EFI_SUCCESS) { 1005 if (status != EFI_SUCCESS) {
414 size = 0; 1006 size = 0;
415 status = efi_call_phys5(sys_table->boottime->locate_handle, 1007 status = efi_call_early(locate_handle,
416 EFI_LOCATE_BY_PROTOCOL, &uga_proto, 1008 EFI_LOCATE_BY_PROTOCOL,
417 NULL, &size, uga_handle); 1009 &uga_proto, NULL, &size, uga_handle);
418 if (status == EFI_BUFFER_TOO_SMALL) 1010 if (status == EFI_BUFFER_TOO_SMALL)
419 setup_uga(si, &uga_proto, size); 1011 setup_uga(si, &uga_proto, size);
420 } 1012 }
421} 1013}
422 1014
423
424/* 1015/*
425 * Because the x86 boot code expects to be passed a boot_params we 1016 * Because the x86 boot code expects to be passed a boot_params we
426 * need to create one ourselves (usually the bootloader would create 1017 * need to create one ourselves (usually the bootloader would create
427 * one for us). 1018 * one for us).
1019 *
1020 * The caller is responsible for filling out ->code32_start in the
1021 * returned boot_params.
428 */ 1022 */
429struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table) 1023struct boot_params *make_boot_params(struct efi_config *c)
430{ 1024{
431 struct boot_params *boot_params; 1025 struct boot_params *boot_params;
432 struct sys_desc_table *sdt; 1026 struct sys_desc_table *sdt;
@@ -434,7 +1028,7 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
434 struct setup_header *hdr; 1028 struct setup_header *hdr;
435 struct efi_info *efi; 1029 struct efi_info *efi;
436 efi_loaded_image_t *image; 1030 efi_loaded_image_t *image;
437 void *options; 1031 void *options, *handle;
438 efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; 1032 efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
439 int options_size = 0; 1033 int options_size = 0;
440 efi_status_t status; 1034 efi_status_t status;
@@ -445,14 +1039,21 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
445 unsigned long ramdisk_addr; 1039 unsigned long ramdisk_addr;
446 unsigned long ramdisk_size; 1040 unsigned long ramdisk_size;
447 1041
448 sys_table = _table; 1042 efi_early = c;
1043 sys_table = (efi_system_table_t *)(unsigned long)efi_early->table;
1044 handle = (void *)(unsigned long)efi_early->image_handle;
449 1045
450 /* Check if we were booted by the EFI firmware */ 1046 /* Check if we were booted by the EFI firmware */
451 if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) 1047 if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
452 return NULL; 1048 return NULL;
453 1049
454 status = efi_call_phys3(sys_table->boottime->handle_protocol, 1050 if (efi_early->is64)
455 handle, &proto, (void *)&image); 1051 setup_boot_services64(efi_early);
1052 else
1053 setup_boot_services32(efi_early);
1054
1055 status = efi_call_early(handle_protocol, handle,
1056 &proto, (void *)&image);
456 if (status != EFI_SUCCESS) { 1057 if (status != EFI_SUCCESS) {
457 efi_printk(sys_table, "Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); 1058 efi_printk(sys_table, "Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
458 return NULL; 1059 return NULL;
@@ -483,8 +1084,6 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
483 hdr->vid_mode = 0xffff; 1084 hdr->vid_mode = 0xffff;
484 hdr->boot_flag = 0xAA55; 1085 hdr->boot_flag = 0xAA55;
485 1086
486 hdr->code32_start = (__u64)(unsigned long)image->image_base;
487
488 hdr->type_of_loader = 0x21; 1087 hdr->type_of_loader = 0x21;
489 1088
490 /* Convert unicode cmdline to ascii */ 1089 /* Convert unicode cmdline to ascii */
@@ -641,14 +1240,13 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext,
641 sizeof(struct e820entry) * nr_desc; 1240 sizeof(struct e820entry) * nr_desc;
642 1241
643 if (*e820ext) { 1242 if (*e820ext) {
644 efi_call_phys1(sys_table->boottime->free_pool, *e820ext); 1243 efi_call_early(free_pool, *e820ext);
645 *e820ext = NULL; 1244 *e820ext = NULL;
646 *e820ext_size = 0; 1245 *e820ext_size = 0;
647 } 1246 }
648 1247
649 status = efi_call_phys3(sys_table->boottime->allocate_pool, 1248 status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
650 EFI_LOADER_DATA, size, e820ext); 1249 size, (void **)e820ext);
651
652 if (status == EFI_SUCCESS) 1250 if (status == EFI_SUCCESS)
653 *e820ext_size = size; 1251 *e820ext_size = size;
654 1252
@@ -656,12 +1254,13 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext,
656} 1254}
657 1255
658static efi_status_t exit_boot(struct boot_params *boot_params, 1256static efi_status_t exit_boot(struct boot_params *boot_params,
659 void *handle) 1257 void *handle, bool is64)
660{ 1258{
661 struct efi_info *efi = &boot_params->efi_info; 1259 struct efi_info *efi = &boot_params->efi_info;
662 unsigned long map_sz, key, desc_size; 1260 unsigned long map_sz, key, desc_size;
663 efi_memory_desc_t *mem_map; 1261 efi_memory_desc_t *mem_map;
664 struct setup_data *e820ext; 1262 struct setup_data *e820ext;
1263 const char *signature;
665 __u32 e820ext_size; 1264 __u32 e820ext_size;
666 __u32 nr_desc, prev_nr_desc; 1265 __u32 nr_desc, prev_nr_desc;
667 efi_status_t status; 1266 efi_status_t status;
@@ -691,11 +1290,13 @@ get_map:
691 if (status != EFI_SUCCESS) 1290 if (status != EFI_SUCCESS)
692 goto free_mem_map; 1291 goto free_mem_map;
693 1292
694 efi_call_phys1(sys_table->boottime->free_pool, mem_map); 1293 efi_call_early(free_pool, mem_map);
695 goto get_map; /* Allocated memory, get map again */ 1294 goto get_map; /* Allocated memory, get map again */
696 } 1295 }
697 1296
698 memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32)); 1297 signature = is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE;
1298 memcpy(&efi->efi_loader_signature, signature, sizeof(__u32));
1299
699 efi->efi_systab = (unsigned long)sys_table; 1300 efi->efi_systab = (unsigned long)sys_table;
700 efi->efi_memdesc_size = desc_size; 1301 efi->efi_memdesc_size = desc_size;
701 efi->efi_memdesc_version = desc_version; 1302 efi->efi_memdesc_version = desc_version;
@@ -708,8 +1309,7 @@ get_map:
708#endif 1309#endif
709 1310
710 /* Might as well exit boot services now */ 1311 /* Might as well exit boot services now */
711 status = efi_call_phys2(sys_table->boottime->exit_boot_services, 1312 status = efi_call_early(exit_boot_services, handle, key);
712 handle, key);
713 if (status != EFI_SUCCESS) { 1313 if (status != EFI_SUCCESS) {
714 /* 1314 /*
715 * ExitBootServices() will fail if any of the event 1315 * ExitBootServices() will fail if any of the event
@@ -722,7 +1322,7 @@ get_map:
722 goto free_mem_map; 1322 goto free_mem_map;
723 1323
724 called_exit = true; 1324 called_exit = true;
725 efi_call_phys1(sys_table->boottime->free_pool, mem_map); 1325 efi_call_early(free_pool, mem_map);
726 goto get_map; 1326 goto get_map;
727 } 1327 }
728 1328
@@ -736,23 +1336,31 @@ get_map:
736 return EFI_SUCCESS; 1336 return EFI_SUCCESS;
737 1337
738free_mem_map: 1338free_mem_map:
739 efi_call_phys1(sys_table->boottime->free_pool, mem_map); 1339 efi_call_early(free_pool, mem_map);
740 return status; 1340 return status;
741} 1341}
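
The control flow worth noting in exit_boot(): ExitBootServices() must be passed the key of the current memory map, and anything that allocates (including growing the e820 extension) invalidates that key, so on failure the map is freed, re-fetched, and the exit retried exactly once. Its skeleton, with names taken from the function above (a sketch of the flow, not a verbatim quote):

get_map:
	/* ... fetch memory map into mem_map, obtaining key ... */
	status = efi_call_early(exit_boot_services, handle, key);
	if (status != EFI_SUCCESS && !called_exit) {
		called_exit = true;
		efi_call_early(free_pool, mem_map);
		goto get_map;		/* key went stale; try once more */
	}
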
742 1342
743
744/* 1343/*
745 * On success we return a pointer to a boot_params structure, and NULL 1344 * On success we return a pointer to a boot_params structure, and NULL
746 * on failure. 1345 * on failure.
747 */ 1346 */
748struct boot_params *efi_main(void *handle, efi_system_table_t *_table, 1347struct boot_params *efi_main(struct efi_config *c,
749 struct boot_params *boot_params) 1348 struct boot_params *boot_params)
750{ 1349{
751 struct desc_ptr *gdt; 1350 struct desc_ptr *gdt = NULL;
752 efi_loaded_image_t *image; 1351 efi_loaded_image_t *image;
753 struct setup_header *hdr = &boot_params->hdr; 1352 struct setup_header *hdr = &boot_params->hdr;
754 efi_status_t status; 1353 efi_status_t status;
755 struct desc_struct *desc; 1354 struct desc_struct *desc;
1355 void *handle;
1356 efi_system_table_t *_table;
1357 bool is64;
1358
1359 efi_early = c;
1360
1361 _table = (efi_system_table_t *)(unsigned long)efi_early->table;
1362 handle = (void *)(unsigned long)efi_early->image_handle;
1363 is64 = efi_early->is64;
756 1364
757 sys_table = _table; 1365 sys_table = _table;
758 1366
@@ -760,13 +1368,17 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
760 if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) 1368 if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
761 goto fail; 1369 goto fail;
762 1370
1371 if (is64)
1372 setup_boot_services64(efi_early);
1373 else
1374 setup_boot_services32(efi_early);
1375
763 setup_graphics(boot_params); 1376 setup_graphics(boot_params);
764 1377
765 setup_efi_pci(boot_params); 1378 setup_efi_pci(boot_params);
766 1379
767 status = efi_call_phys3(sys_table->boottime->allocate_pool, 1380 status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
768 EFI_LOADER_DATA, sizeof(*gdt), 1381 sizeof(*gdt), (void **)&gdt);
769 (void **)&gdt);
770 if (status != EFI_SUCCESS) { 1382 if (status != EFI_SUCCESS) {
771 efi_printk(sys_table, "Failed to alloc mem for gdt structure\n"); 1383 efi_printk(sys_table, "Failed to alloc mem for gdt structure\n");
772 goto fail; 1384 goto fail;
@@ -797,7 +1409,7 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
797 hdr->code32_start = bzimage_addr; 1409 hdr->code32_start = bzimage_addr;
798 } 1410 }
799 1411
800 status = exit_boot(boot_params, handle); 1412 status = exit_boot(boot_params, handle, is64);
801 if (status != EFI_SUCCESS) 1413 if (status != EFI_SUCCESS)
802 goto fail; 1414 goto fail;
803 1415
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index 81b6b652b46a..c88c31ecad12 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -37,6 +37,24 @@ struct efi_graphics_output_mode_info {
37 u32 pixels_per_scan_line; 37 u32 pixels_per_scan_line;
38} __packed; 38} __packed;
39 39
40struct efi_graphics_output_protocol_mode_32 {
41 u32 max_mode;
42 u32 mode;
43 u32 info;
44 u32 size_of_info;
45 u64 frame_buffer_base;
46 u32 frame_buffer_size;
47} __packed;
48
49struct efi_graphics_output_protocol_mode_64 {
50 u32 max_mode;
51 u32 mode;
52 u64 info;
53 u64 size_of_info;
54 u64 frame_buffer_base;
55 u64 frame_buffer_size;
56} __packed;
57
40struct efi_graphics_output_protocol_mode { 58struct efi_graphics_output_protocol_mode {
41 u32 max_mode; 59 u32 max_mode;
42 u32 mode; 60 u32 mode;
@@ -46,6 +64,20 @@ struct efi_graphics_output_protocol_mode {
46 unsigned long frame_buffer_size; 64 unsigned long frame_buffer_size;
47} __packed; 65} __packed;
48 66
67struct efi_graphics_output_protocol_32 {
68 u32 query_mode;
69 u32 set_mode;
70 u32 blt;
71 u32 mode;
72};
73
74struct efi_graphics_output_protocol_64 {
75 u64 query_mode;
76 u64 set_mode;
77 u64 blt;
78 u64 mode;
79};
80
49struct efi_graphics_output_protocol { 81struct efi_graphics_output_protocol {
50 void *query_mode; 82 void *query_mode;
51 unsigned long set_mode; 83 unsigned long set_mode;
@@ -53,10 +85,38 @@ struct efi_graphics_output_protocol {
53 struct efi_graphics_output_protocol_mode *mode; 85 struct efi_graphics_output_protocol_mode *mode;
54}; 86};
55 87
88struct efi_uga_draw_protocol_32 {
89 u32 get_mode;
90 u32 set_mode;
91 u32 blt;
92};
93
94struct efi_uga_draw_protocol_64 {
95 u64 get_mode;
96 u64 set_mode;
97 u64 blt;
98};
99
56struct efi_uga_draw_protocol { 100struct efi_uga_draw_protocol {
57 void *get_mode; 101 void *get_mode;
58 void *set_mode; 102 void *set_mode;
59 void *blt; 103 void *blt;
60}; 104};
61 105
106struct efi_config {
107 u64 image_handle;
108 u64 table;
109 u64 allocate_pool;
110 u64 allocate_pages;
111 u64 get_memory_map;
112 u64 free_pool;
113 u64 free_pages;
114 u64 locate_handle;
115 u64 handle_protocol;
116 u64 exit_boot_services;
117 u64 text_output;
118 efi_status_t (*call)(unsigned long, ...);
119 bool is64;
120} __packed;
121
62#endif /* BOOT_COMPRESSED_EBOOT_H */ 122#endif /* BOOT_COMPRESSED_EBOOT_H */
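
One detail worth calling out: struct efi_config is __packed because the assembly entry stubs patch it by hard-coded byte offset (head_32.S and head_64.S relocate ->call with "add %reg, 88(%base)"); the eleven u64 fields occupy exactly 88 bytes, putting ->call at offset 88. A compile-time check one could add to make that contract explicit (illustrative, not part of the patch):

	#include <stddef.h>

	#define EFI_CONFIG_BUILD_BUG_ON(cond) \
		((void)sizeof(char[1 - 2 * !!(cond)]))

	static inline void efi_config_layout_check(void)
	{
		/* The .S stubs assume the thunk lives 88 bytes in. */
		EFI_CONFIG_BUILD_BUG_ON(offsetof(struct efi_config, call) != 88);
	}
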
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
index cedc60de86eb..7ff3632806b1 100644
--- a/arch/x86/boot/compressed/efi_stub_64.S
+++ b/arch/x86/boot/compressed/efi_stub_64.S
@@ -1 +1,30 @@
1#include <asm/segment.h>
2#include <asm/msr.h>
3#include <asm/processor-flags.h>
4
1#include "../../platform/efi/efi_stub_64.S" 5#include "../../platform/efi/efi_stub_64.S"
6
7#ifdef CONFIG_EFI_MIXED
8 .code64
9 .text
10ENTRY(efi64_thunk)
11 push %rbp
12 push %rbx
13
14 subq $16, %rsp
15 leaq efi_exit32(%rip), %rax
16 movl %eax, 8(%rsp)
17 leaq efi_gdt64(%rip), %rax
18 movl %eax, 4(%rsp)
19 movl %eax, 2(%rax) /* Fixup the gdt base address */
20 leaq efi32_boot_gdt(%rip), %rax
21 movl %eax, (%rsp)
22
23 call __efi64_thunk
24
25 addq $16, %rsp
26 pop %rbx
27 pop %rbp
28 ret
29ENDPROC(efi64_thunk)
30#endif /* CONFIG_EFI_MIXED */
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 5d6f6891b188..cbed1407a5cd 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -42,33 +42,56 @@ ENTRY(startup_32)
42ENTRY(efi_pe_entry) 42ENTRY(efi_pe_entry)
43 add $0x4, %esp 43 add $0x4, %esp
44 44
45 call 1f
461: popl %esi
47 subl $1b, %esi
48
49 popl %ecx
50 movl %ecx, efi32_config(%esi) /* Handle */
51 popl %ecx
52 movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */
53
54 /* Relocate efi_config->call() */
55 leal efi32_config(%esi), %eax
56 add %esi, 88(%eax)
57 pushl %eax
58
45 call make_boot_params 59 call make_boot_params
46 cmpl $0, %eax 60 cmpl $0, %eax
47 je 1f 61 je fail
48 movl 0x4(%esp), %esi 62 movl %esi, BP_code32_start(%eax)
49 movl (%esp), %ecx 63 popl %ecx
50 pushl %eax 64 pushl %eax
51 pushl %esi
52 pushl %ecx 65 pushl %ecx
53 sub $0x4, %esp 66 jmp 2f /* Skip efi_config initialization */
54 67
55ENTRY(efi_stub_entry) 68ENTRY(efi32_stub_entry)
56 add $0x4, %esp 69 add $0x4, %esp
70 popl %ecx
71 popl %edx
72
73 call 1f
741: popl %esi
75 subl $1b, %esi
76
77 movl %ecx, efi32_config(%esi) /* Handle */
78 movl %edx, efi32_config+8(%esi) /* EFI System table pointer */
79
80 /* Relocate efi_config->call() */
81 leal efi32_config(%esi), %eax
82 add %esi, 88(%eax)
83 pushl %eax
842:
57 call efi_main 85 call efi_main
58 cmpl $0, %eax 86 cmpl $0, %eax
59 movl %eax, %esi 87 movl %eax, %esi
60 jne 2f 88 jne 2f
611: 89fail:
62 /* EFI init failed, so hang. */ 90 /* EFI init failed, so hang. */
63 hlt 91 hlt
64 jmp 1b 92 jmp fail
652: 932:
66 call 3f 94 movl BP_code32_start(%esi), %eax
673:
68 popl %eax
69 subl $3b, %eax
70 subl BP_pref_address(%esi), %eax
71 add BP_code32_start(%esi), %eax
72 leal preferred_addr(%eax), %eax 95 leal preferred_addr(%eax), %eax
73 jmp *%eax 96 jmp *%eax
74 97
@@ -117,9 +140,11 @@ preferred_addr:
117 addl %eax, %ebx 140 addl %eax, %ebx
118 notl %eax 141 notl %eax
119 andl %eax, %ebx 142 andl %eax, %ebx
120#else 143 cmpl $LOAD_PHYSICAL_ADDR, %ebx
121 movl $LOAD_PHYSICAL_ADDR, %ebx 144 jge 1f
122#endif 145#endif
146 movl $LOAD_PHYSICAL_ADDR, %ebx
1471:
123 148
124 /* Target address to relocate to for decompression */ 149 /* Target address to relocate to for decompression */
125 addl $z_extract_offset, %ebx 150 addl $z_extract_offset, %ebx
@@ -191,14 +216,23 @@ relocated:
191 leal boot_heap(%ebx), %eax 216 leal boot_heap(%ebx), %eax
192 pushl %eax /* heap area */ 217 pushl %eax /* heap area */
193 pushl %esi /* real mode pointer */ 218 pushl %esi /* real mode pointer */
194 call decompress_kernel 219 call decompress_kernel /* returns kernel location in %eax */
195 addl $24, %esp 220 addl $24, %esp
196 221
197/* 222/*
198 * Jump to the decompressed kernel. 223 * Jump to the decompressed kernel.
199 */ 224 */
200 xorl %ebx, %ebx 225 xorl %ebx, %ebx
201 jmp *%ebp 226 jmp *%eax
227
228#ifdef CONFIG_EFI_STUB
229 .data
230efi32_config:
231 .fill 11,8,0
232 .long efi_call_phys
233 .long 0
234 .byte 0
235#endif
202 236
203/* 237/*
204 * Stack and heap for uncompression 238 * Stack and heap for uncompression
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index c337422b575d..0d558ee899ae 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -94,9 +94,11 @@ ENTRY(startup_32)
94 addl %eax, %ebx 94 addl %eax, %ebx
95 notl %eax 95 notl %eax
96 andl %eax, %ebx 96 andl %eax, %ebx
97#else 97 cmpl $LOAD_PHYSICAL_ADDR, %ebx
98 movl $LOAD_PHYSICAL_ADDR, %ebx 98 jge 1f
99#endif 99#endif
100 movl $LOAD_PHYSICAL_ADDR, %ebx
1011:
100 102
101 /* Target address to relocate to for decompression */ 103 /* Target address to relocate to for decompression */
102 addl $z_extract_offset, %ebx 104 addl $z_extract_offset, %ebx
@@ -111,7 +113,8 @@ ENTRY(startup_32)
111 lgdt gdt(%ebp) 113 lgdt gdt(%ebp)
112 114
113 /* Enable PAE mode */ 115 /* Enable PAE mode */
114 movl $(X86_CR4_PAE), %eax 116 movl %cr4, %eax
117 orl $X86_CR4_PAE, %eax
115 movl %eax, %cr4 118 movl %eax, %cr4
116 119
117 /* 120 /*
@@ -176,6 +179,13 @@ ENTRY(startup_32)
176 */ 179 */
177 pushl $__KERNEL_CS 180 pushl $__KERNEL_CS
178 leal startup_64(%ebp), %eax 181 leal startup_64(%ebp), %eax
182#ifdef CONFIG_EFI_MIXED
183 movl efi32_config(%ebp), %ebx
184 cmp $0, %ebx
185 jz 1f
186 leal handover_entry(%ebp), %eax
1871:
188#endif
179 pushl %eax 189 pushl %eax
180 190
181 /* Enter paged protected Mode, activating Long Mode */ 191 /* Enter paged protected Mode, activating Long Mode */
@@ -186,6 +196,30 @@ ENTRY(startup_32)
186 lret 196 lret
187ENDPROC(startup_32) 197ENDPROC(startup_32)
188 198
199#ifdef CONFIG_EFI_MIXED
200 .org 0x190
201ENTRY(efi32_stub_entry)
202 add $0x4, %esp /* Discard return address */
203 popl %ecx
204 popl %edx
205 popl %esi
206
207 leal (BP_scratch+4)(%esi), %esp
208 call 1f
2091: pop %ebp
210 subl $1b, %ebp
211
212 movl %ecx, efi32_config(%ebp)
213 movl %edx, efi32_config+8(%ebp)
214 sgdtl efi32_boot_gdt(%ebp)
215
216 leal efi32_config(%ebp), %eax
217 movl %eax, efi_config(%ebp)
218
219 jmp startup_32
220ENDPROC(efi32_stub_entry)
221#endif
222
189 .code64 223 .code64
190 .org 0x200 224 .org 0x200
191ENTRY(startup_64) 225ENTRY(startup_64)
@@ -207,33 +241,52 @@ ENTRY(startup_64)
207 jmp preferred_addr 241 jmp preferred_addr
208 242
209ENTRY(efi_pe_entry) 243ENTRY(efi_pe_entry)
210 mov %rcx, %rdi 244 movq %rcx, efi64_config(%rip) /* Handle */
211 mov %rdx, %rsi 245 movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */
212 pushq %rdi 246
213 pushq %rsi 247 leaq efi64_config(%rip), %rax
248 movq %rax, efi_config(%rip)
249
250 call 1f
2511: popq %rbp
252 subq $1b, %rbp
253
254 /*
255 * Relocate efi_config->call().
256 */
257 addq %rbp, efi64_config+88(%rip)
258
259 movq %rax, %rdi
214 call make_boot_params 260 call make_boot_params
215 cmpq $0,%rax 261 cmpq $0,%rax
216 je 1f 262 je fail
217 mov %rax, %rdx 263 mov %rax, %rsi
218 popq %rsi 264 leaq startup_32(%rip), %rax
219 popq %rdi 265 movl %eax, BP_code32_start(%rsi)
266 jmp 2f /* Skip the relocation */
267
268handover_entry:
269 call 1f
2701: popq %rbp
271 subq $1b, %rbp
220 272
221ENTRY(efi_stub_entry) 273 /*
274 * Relocate efi_config->call().
275 */
276 movq efi_config(%rip), %rax
277 addq %rbp, 88(%rax)
2782:
279 movq efi_config(%rip), %rdi
222 call efi_main 280 call efi_main
223 movq %rax,%rsi 281 movq %rax,%rsi
224 cmpq $0,%rax 282 cmpq $0,%rax
225 jne 2f 283 jne 2f
2261: 284fail:
227 /* EFI init failed, so hang. */ 285 /* EFI init failed, so hang. */
228 hlt 286 hlt
229 jmp 1b 287 jmp fail
2302: 2882:
231 call 3f 289 movl BP_code32_start(%esi), %eax
2323:
233 popq %rax
234 subq $3b, %rax
235 subq BP_pref_address(%rsi), %rax
236 add BP_code32_start(%esi), %eax
237 leaq preferred_addr(%rax), %rax 290 leaq preferred_addr(%rax), %rax
238 jmp *%rax 291 jmp *%rax
239 292
@@ -269,9 +322,11 @@ preferred_addr:
269 addq %rax, %rbp 322 addq %rax, %rbp
270 notq %rax 323 notq %rax
271 andq %rax, %rbp 324 andq %rax, %rbp
272#else 325 cmpq $LOAD_PHYSICAL_ADDR, %rbp
273 movq $LOAD_PHYSICAL_ADDR, %rbp 326 jge 1f
274#endif 327#endif
328 movq $LOAD_PHYSICAL_ADDR, %rbp
3291:
275 330
276 /* Target address to relocate to for decompression */ 331 /* Target address to relocate to for decompression */
277 leaq z_extract_offset(%rbp), %rbx 332 leaq z_extract_offset(%rbp), %rbx
@@ -303,6 +358,20 @@ preferred_addr:
303 leaq relocated(%rbx), %rax 358 leaq relocated(%rbx), %rax
304 jmp *%rax 359 jmp *%rax
305 360
361#ifdef CONFIG_EFI_STUB
362 .org 0x390
363ENTRY(efi64_stub_entry)
364 movq %rdi, efi64_config(%rip) /* Handle */
365 movq %rsi, efi64_config+8(%rip) /* EFI System table pointer */
366
367 leaq efi64_config(%rip), %rax
368 movq %rax, efi_config(%rip)
369
370 movq %rdx, %rsi
371 jmp handover_entry
372ENDPROC(efi64_stub_entry)
373#endif
374
306 .text 375 .text
307relocated: 376relocated:
308 377
@@ -339,13 +408,13 @@ relocated:
339 movl $z_input_len, %ecx /* input_len */ 408 movl $z_input_len, %ecx /* input_len */
340 movq %rbp, %r8 /* output target address */ 409 movq %rbp, %r8 /* output target address */
341 movq $z_output_len, %r9 /* decompressed length */ 410 movq $z_output_len, %r9 /* decompressed length */
342 call decompress_kernel 411 call decompress_kernel /* returns kernel location in %rax */
343 popq %rsi 412 popq %rsi
344 413
345/* 414/*
346 * Jump to the decompressed kernel. 415 * Jump to the decompressed kernel.
347 */ 416 */
348 jmp *%rbp 417 jmp *%rax
349 418
350 .code32 419 .code32
351no_longmode: 420no_longmode:
@@ -368,6 +437,25 @@ gdt:
368 .quad 0x0000000000000000 /* TS continued */ 437 .quad 0x0000000000000000 /* TS continued */
369gdt_end: 438gdt_end:
370 439
440#ifdef CONFIG_EFI_STUB
441efi_config:
442 .quad 0
443
444#ifdef CONFIG_EFI_MIXED
445 .global efi32_config
446efi32_config:
447 .fill 11,8,0
448 .quad efi64_thunk
449 .byte 0
450#endif
451
452 .global efi64_config
453efi64_config:
454 .fill 11,8,0
455 .quad efi_call6
456 .byte 1
457#endif /* CONFIG_EFI_STUB */
458
371/* 459/*
372 * Stack and heap for uncompression 460 * Stack and heap for uncompression
373 */ 461 */
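
Note: the efi_config blocks above are 11 eight-byte slots, a function pointer
at byte offset 88 (which is why handover_entry relocates efi_config->call with
"addq %rbp, 88(%rax)"), and a trailing flag byte for 64-bit firmware. A hedged
C view of that layout (field names are assumptions inferred from this file,
not quoted from eboot.h):

	struct efi_config {
		u64 image_handle;	/* stored by efi32/efi64_stub_entry */
		u64 table;		/* EFI system table pointer */
		u64 reserved[9];	/* rest of the .fill 11,8,0 region */
		u64 call;		/* offset 88: efi_call_phys, efi64_thunk
					 * or efi_call6, relocated at runtime */
		u8  is64;		/* .byte 0 (32-bit) or 1 (64-bit) */
	} __attribute__((packed));
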
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 434f077d2c4d..17684615374b 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include "misc.h" 12#include "misc.h"
13#include "../string.h"
13 14
14/* WARNING!! 15/* WARNING!!
15 * This code is compiled with -fPIC and it is relocated dynamically 16 * This code is compiled with -fPIC and it is relocated dynamically
@@ -97,8 +98,14 @@
97 */ 98 */
98#define STATIC static 99#define STATIC static
99 100
100#undef memset
101#undef memcpy 101#undef memcpy
102
103/*
 104 * Use the normal definition of memset() from string.c. Header files
 105 * already included here expect a real definition of memset(), so by the
 106 * time we could define a memset macro it would be too late.
107 */
108#undef memset
102#define memzero(s, n) memset((s), 0, (n)) 109#define memzero(s, n) memset((s), 0, (n))
103 110
104 111
@@ -109,17 +116,8 @@ static void error(char *m);
109 */ 116 */
110struct boot_params *real_mode; /* Pointer to real-mode data */ 117struct boot_params *real_mode; /* Pointer to real-mode data */
111 118
112void *memset(void *s, int c, size_t n); 119memptr free_mem_ptr;
113void *memcpy(void *dest, const void *src, size_t n); 120memptr free_mem_end_ptr;
114
115#ifdef CONFIG_X86_64
116#define memptr long
117#else
118#define memptr unsigned
119#endif
120
121static memptr free_mem_ptr;
122static memptr free_mem_end_ptr;
123 121
124static char *vidmem; 122static char *vidmem;
125static int vidport; 123static int vidport;
@@ -222,45 +220,6 @@ void __putstr(const char *s)
222 outb(0xff & (pos >> 1), vidport+1); 220 outb(0xff & (pos >> 1), vidport+1);
223} 221}
224 222
225void *memset(void *s, int c, size_t n)
226{
227 int i;
228 char *ss = s;
229
230 for (i = 0; i < n; i++)
231 ss[i] = c;
232 return s;
233}
234#ifdef CONFIG_X86_32
235void *memcpy(void *dest, const void *src, size_t n)
236{
237 int d0, d1, d2;
238 asm volatile(
239 "rep ; movsl\n\t"
240 "movl %4,%%ecx\n\t"
241 "rep ; movsb\n\t"
242 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
243 : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
244 : "memory");
245
246 return dest;
247}
248#else
249void *memcpy(void *dest, const void *src, size_t n)
250{
251 long d0, d1, d2;
252 asm volatile(
253 "rep ; movsq\n\t"
254 "movq %4,%%rcx\n\t"
255 "rep ; movsb\n\t"
256 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
257 : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
258 : "memory");
259
260 return dest;
261}
262#endif
263
264static void error(char *x) 223static void error(char *x)
265{ 224{
266 error_putstr("\n\n"); 225 error_putstr("\n\n");
@@ -395,7 +354,7 @@ static void parse_elf(void *output)
395 free(phdrs); 354 free(phdrs);
396} 355}
397 356
398asmlinkage void decompress_kernel(void *rmode, memptr heap, 357asmlinkage void *decompress_kernel(void *rmode, memptr heap,
399 unsigned char *input_data, 358 unsigned char *input_data,
400 unsigned long input_len, 359 unsigned long input_len,
401 unsigned char *output, 360 unsigned char *output,
@@ -422,6 +381,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
422 free_mem_ptr = heap; /* Heap */ 381 free_mem_ptr = heap; /* Heap */
423 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 382 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
424 383
384 output = choose_kernel_location(input_data, input_len,
385 output, output_len);
386
387 /* Validate memory location choices. */
425 if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) 388 if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
426 error("Destination address inappropriately aligned"); 389 error("Destination address inappropriately aligned");
427#ifdef CONFIG_X86_64 390#ifdef CONFIG_X86_64
@@ -441,5 +404,5 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
441 parse_elf(output); 404 parse_elf(output);
442 handle_relocations(output, output_len); 405 handle_relocations(output, output_len);
443 debug_putstr("done.\nBooting the kernel.\n"); 406 debug_putstr("done.\nBooting the kernel.\n");
444 return; 407 return output;
445} 408}
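
Note: decompress_kernel() now reports the final kernel location to its caller:
choose_kernel_location() may move "output" (the KASLR hook), the alignment and
bounds checks then validate the choice, and the chosen address is returned so
head_32.S/head_64.S can "jmp *%eax" / "jmp *%rax" to it. The caller-visible
contract, as a sketch:

	/* returns where the kernel was actually placed */
	void *loc = decompress_kernel(rmode, heap, input_data, input_len,
				      output, output_len);
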
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 674019d8e235..24e3e569a13c 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -23,7 +23,15 @@
23#define BOOT_BOOT_H 23#define BOOT_BOOT_H
24#include "../ctype.h" 24#include "../ctype.h"
25 25
26#ifdef CONFIG_X86_64
27#define memptr long
28#else
29#define memptr unsigned
30#endif
31
26/* misc.c */ 32/* misc.c */
33extern memptr free_mem_ptr;
34extern memptr free_mem_end_ptr;
27extern struct boot_params *real_mode; /* Pointer to real-mode data */ 35extern struct boot_params *real_mode; /* Pointer to real-mode data */
28void __putstr(const char *s); 36void __putstr(const char *s);
29#define error_putstr(__x) __putstr(__x) 37#define error_putstr(__x) __putstr(__x)
@@ -39,23 +47,40 @@ static inline void debug_putstr(const char *s)
39 47
40#endif 48#endif
41 49
42#ifdef CONFIG_EARLY_PRINTK 50#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
43
44/* cmdline.c */ 51/* cmdline.c */
45int cmdline_find_option(const char *option, char *buffer, int bufsize); 52int cmdline_find_option(const char *option, char *buffer, int bufsize);
46int cmdline_find_option_bool(const char *option); 53int cmdline_find_option_bool(const char *option);
54#endif
47 55
48/* early_serial_console.c */
49extern int early_serial_base;
50void console_init(void);
51 56
57#if CONFIG_RANDOMIZE_BASE
58/* aslr.c */
59unsigned char *choose_kernel_location(unsigned char *input,
60 unsigned long input_size,
61 unsigned char *output,
62 unsigned long output_size);
63/* cpuflags.c */
64bool has_cpuflag(int flag);
52#else 65#else
66static inline
67unsigned char *choose_kernel_location(unsigned char *input,
68 unsigned long input_size,
69 unsigned char *output,
70 unsigned long output_size)
71{
72 return output;
73}
74#endif
53 75
76#ifdef CONFIG_EARLY_PRINTK
54/* early_serial_console.c */ 77/* early_serial_console.c */
78extern int early_serial_base;
79void console_init(void);
80#else
55static const int early_serial_base; 81static const int early_serial_base;
56static inline void console_init(void) 82static inline void console_init(void)
57{ } 83{ }
58
59#endif 84#endif
60 85
61#endif 86#endif
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index ffb9c5c9d748..f3c57e341402 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,11 +1,45 @@
1#include "misc.h" 1#include "misc.h"
2#include "../string.c"
3
4/* misc.h might pull in string_32.h which has a macro for memcpy. undef that */
5#undef memcpy
2 6
3int memcmp(const void *s1, const void *s2, size_t len) 7#ifdef CONFIG_X86_32
8void *memcpy(void *dest, const void *src, size_t n)
4{ 9{
5 u8 diff; 10 int d0, d1, d2;
6 asm("repe; cmpsb; setnz %0" 11 asm volatile(
7 : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); 12 "rep ; movsl\n\t"
8 return diff; 13 "movl %4,%%ecx\n\t"
14 "rep ; movsb\n\t"
15 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
16 : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
17 : "memory");
18
19 return dest;
9} 20}
21#else
22void *memcpy(void *dest, const void *src, size_t n)
23{
24 long d0, d1, d2;
25 asm volatile(
26 "rep ; movsq\n\t"
27 "movq %4,%%rcx\n\t"
28 "rep ; movsb\n\t"
29 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
30 : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
31 : "memory");
10 32
11#include "../string.c" 33 return dest;
34}
35#endif
36
37void *memset(void *s, int c, size_t n)
38{
39 int i;
40 char *ss = s;
41
42 for (i = 0; i < n; i++)
43 ss[i] = c;
44 return s;
45}
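
Note: both memcpy() variants copy in machine words first and mop up the
remainder one byte at a time. A C equivalent of the 32-bit version
(illustrative only; the kernel keeps the inline-asm form above):

	void *memcpy_c(void *dest, const void *src, size_t n)
	{
		unsigned char *d = dest;
		const unsigned char *s = src;
		size_t i;

		for (i = 0; i < (n >> 2); i++) {	/* rep movsl */
			*(unsigned int *)d = *(const unsigned int *)s;
			d += 4;
			s += 4;
		}
		for (i = 0; i < (n & 3); i++)		/* rep movsb */
			d[i] = s[i];
		return dest;
	}
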
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
index 11f272c6f5e9..1eb7d298b47d 100644
--- a/arch/x86/boot/copy.S
+++ b/arch/x86/boot/copy.S
@@ -14,7 +14,7 @@
14 * Memory copy routines 14 * Memory copy routines
15 */ 15 */
16 16
17 .code16gcc 17 .code16
18 .text 18 .text
19 19
20GLOBAL(memcpy) 20GLOBAL(memcpy)
@@ -30,7 +30,7 @@ GLOBAL(memcpy)
30 rep; movsb 30 rep; movsb
31 popw %di 31 popw %di
32 popw %si 32 popw %si
33 ret 33 retl
34ENDPROC(memcpy) 34ENDPROC(memcpy)
35 35
36GLOBAL(memset) 36GLOBAL(memset)
@@ -45,25 +45,25 @@ GLOBAL(memset)
45 andw $3, %cx 45 andw $3, %cx
46 rep; stosb 46 rep; stosb
47 popw %di 47 popw %di
48 ret 48 retl
49ENDPROC(memset) 49ENDPROC(memset)
50 50
51GLOBAL(copy_from_fs) 51GLOBAL(copy_from_fs)
52 pushw %ds 52 pushw %ds
53 pushw %fs 53 pushw %fs
54 popw %ds 54 popw %ds
55 call memcpy 55 calll memcpy
56 popw %ds 56 popw %ds
57 ret 57 retl
58ENDPROC(copy_from_fs) 58ENDPROC(copy_from_fs)
59 59
60GLOBAL(copy_to_fs) 60GLOBAL(copy_to_fs)
61 pushw %es 61 pushw %es
62 pushw %fs 62 pushw %fs
63 popw %es 63 popw %es
64 call memcpy 64 calll memcpy
65 popw %es 65 popw %es
66 ret 66 retl
67ENDPROC(copy_to_fs) 67ENDPROC(copy_to_fs)
68 68
69#if 0 /* Not currently used, but can be enabled as needed */ 69#if 0 /* Not currently used, but can be enabled as needed */
@@ -71,17 +71,17 @@ GLOBAL(copy_from_gs)
71 pushw %ds 71 pushw %ds
72 pushw %gs 72 pushw %gs
73 popw %ds 73 popw %ds
74 call memcpy 74 calll memcpy
75 popw %ds 75 popw %ds
76 ret 76 retl
77ENDPROC(copy_from_gs) 77ENDPROC(copy_from_gs)
78 78
79GLOBAL(copy_to_gs) 79GLOBAL(copy_to_gs)
80 pushw %es 80 pushw %es
81 pushw %gs 81 pushw %gs
82 popw %es 82 popw %es
83 call memcpy 83 calll memcpy
84 popw %es 84 popw %es
85 ret 85 retl
86ENDPROC(copy_to_gs) 86ENDPROC(copy_to_gs)
87#endif 87#endif
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 4d3ff037201f..1fd7d575092e 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -27,9 +27,8 @@
27#include <asm/processor-flags.h> 27#include <asm/processor-flags.h>
28#include <asm/required-features.h> 28#include <asm/required-features.h>
29#include <asm/msr-index.h> 29#include <asm/msr-index.h>
30#include "string.h"
30 31
31struct cpu_features cpu;
32static u32 cpu_vendor[3];
33static u32 err_flags[NCAPINTS]; 32static u32 err_flags[NCAPINTS];
34 33
35static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY; 34static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY;
@@ -69,92 +68,15 @@ static int is_transmeta(void)
69 cpu_vendor[2] == A32('M', 'x', '8', '6'); 68 cpu_vendor[2] == A32('M', 'x', '8', '6');
70} 69}
71 70
72static int has_fpu(void) 71static int is_intel(void)
73{ 72{
74 u16 fcw = -1, fsw = -1; 73 return cpu_vendor[0] == A32('G', 'e', 'n', 'u') &&
75 u32 cr0; 74 cpu_vendor[1] == A32('i', 'n', 'e', 'I') &&
76 75 cpu_vendor[2] == A32('n', 't', 'e', 'l');
77 asm("movl %%cr0,%0" : "=r" (cr0));
78 if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
79 cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
80 asm volatile("movl %0,%%cr0" : : "r" (cr0));
81 }
82
83 asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
84 : "+m" (fsw), "+m" (fcw));
85
86 return fsw == 0 && (fcw & 0x103f) == 0x003f;
87}
88
89static int has_eflag(u32 mask)
90{
91 u32 f0, f1;
92
93 asm("pushfl ; "
94 "pushfl ; "
95 "popl %0 ; "
96 "movl %0,%1 ; "
97 "xorl %2,%1 ; "
98 "pushl %1 ; "
99 "popfl ; "
100 "pushfl ; "
101 "popl %1 ; "
102 "popfl"
103 : "=&r" (f0), "=&r" (f1)
104 : "ri" (mask));
105
106 return !!((f0^f1) & mask);
107}
108
109static void get_flags(void)
110{
111 u32 max_intel_level, max_amd_level;
112 u32 tfms;
113
114 if (has_fpu())
115 set_bit(X86_FEATURE_FPU, cpu.flags);
116
117 if (has_eflag(X86_EFLAGS_ID)) {
118 asm("cpuid"
119 : "=a" (max_intel_level),
120 "=b" (cpu_vendor[0]),
121 "=d" (cpu_vendor[1]),
122 "=c" (cpu_vendor[2])
123 : "a" (0));
124
125 if (max_intel_level >= 0x00000001 &&
126 max_intel_level <= 0x0000ffff) {
127 asm("cpuid"
128 : "=a" (tfms),
129 "=c" (cpu.flags[4]),
130 "=d" (cpu.flags[0])
131 : "a" (0x00000001)
132 : "ebx");
133 cpu.level = (tfms >> 8) & 15;
134 cpu.model = (tfms >> 4) & 15;
135 if (cpu.level >= 6)
136 cpu.model += ((tfms >> 16) & 0xf) << 4;
137 }
138
139 asm("cpuid"
140 : "=a" (max_amd_level)
141 : "a" (0x80000000)
142 : "ebx", "ecx", "edx");
143
144 if (max_amd_level >= 0x80000001 &&
145 max_amd_level <= 0x8000ffff) {
146 u32 eax = 0x80000001;
147 asm("cpuid"
148 : "+a" (eax),
149 "=c" (cpu.flags[6]),
150 "=d" (cpu.flags[1])
151 : : "ebx");
152 }
153 }
154} 76}
155 77
156/* Returns a bitmask of which words we have error bits in */ 78/* Returns a bitmask of which words we have error bits in */
157static int check_flags(void) 79static int check_cpuflags(void)
158{ 80{
159 u32 err; 81 u32 err;
160 int i; 82 int i;
@@ -187,8 +109,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
187 if (has_eflag(X86_EFLAGS_AC)) 109 if (has_eflag(X86_EFLAGS_AC))
188 cpu.level = 4; 110 cpu.level = 4;
189 111
190 get_flags(); 112 get_cpuflags();
191 err = check_flags(); 113 err = check_cpuflags();
192 114
193 if (test_bit(X86_FEATURE_LM, cpu.flags)) 115 if (test_bit(X86_FEATURE_LM, cpu.flags))
194 cpu.level = 64; 116 cpu.level = 64;
@@ -207,8 +129,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
207 eax &= ~(1 << 15); 129 eax &= ~(1 << 15);
208 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); 130 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
209 131
210 get_flags(); /* Make sure it really did something */ 132 get_cpuflags(); /* Make sure it really did something */
211 err = check_flags(); 133 err = check_cpuflags();
212 } else if (err == 0x01 && 134 } else if (err == 0x01 &&
213 !(err_flags[0] & ~(1 << X86_FEATURE_CX8)) && 135 !(err_flags[0] & ~(1 << X86_FEATURE_CX8)) &&
214 is_centaur() && cpu.model >= 6) { 136 is_centaur() && cpu.model >= 6) {
@@ -223,7 +145,7 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
223 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); 145 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
224 146
225 set_bit(X86_FEATURE_CX8, cpu.flags); 147 set_bit(X86_FEATURE_CX8, cpu.flags);
226 err = check_flags(); 148 err = check_cpuflags();
227 } else if (err == 0x01 && is_transmeta()) { 149 } else if (err == 0x01 && is_transmeta()) {
228 /* Transmeta might have masked feature bits in word 0 */ 150 /* Transmeta might have masked feature bits in word 0 */
229 151
@@ -238,7 +160,20 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
238 : : "ecx", "ebx"); 160 : : "ecx", "ebx");
239 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); 161 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
240 162
241 err = check_flags(); 163 err = check_cpuflags();
164 } else if (err == 0x01 &&
165 !(err_flags[0] & ~(1 << X86_FEATURE_PAE)) &&
166 is_intel() && cpu.level == 6 &&
167 (cpu.model == 9 || cpu.model == 13)) {
168 /* PAE is disabled on this Pentium M but can be forced */
169 if (cmdline_find_option_bool("forcepae")) {
170 puts("WARNING: Forcing PAE in CPU flags\n");
171 set_bit(X86_FEATURE_PAE, cpu.flags);
172 err = check_cpuflags();
173 }
174 else {
175 puts("WARNING: PAE disabled. Use parameter 'forcepae' to enable at your own risk!\n");
176 }
242 } 177 }
243 178
244 if (err_flags_ptr) 179 if (err_flags_ptr)
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
new file mode 100644
index 000000000000..431fa5f84537
--- /dev/null
+++ b/arch/x86/boot/cpuflags.c
@@ -0,0 +1,119 @@
1#include <linux/types.h>
2#include "bitops.h"
3
4#include <asm/processor-flags.h>
5#include <asm/required-features.h>
6#include <asm/msr-index.h>
7#include "cpuflags.h"
8
9struct cpu_features cpu;
10u32 cpu_vendor[3];
11
12static bool loaded_flags;
13
14static int has_fpu(void)
15{
16 u16 fcw = -1, fsw = -1;
17 unsigned long cr0;
18
19 asm volatile("mov %%cr0,%0" : "=r" (cr0));
20 if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
21 cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
22 asm volatile("mov %0,%%cr0" : : "r" (cr0));
23 }
24
25 asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
26 : "+m" (fsw), "+m" (fcw));
27
28 return fsw == 0 && (fcw & 0x103f) == 0x003f;
29}
30
31/*
32 * For building the 16-bit code we want to explicitly specify 32-bit
33 * push/pop operations, rather than just saying 'pushf' or 'popf' and
34 * letting the compiler choose. But this is also included from the
35 * compressed/ directory where it may be 64-bit code, and thus needs
36 * to be 'pushfq' or 'popfq' in that case.
37 */
38#ifdef __x86_64__
39#define PUSHF "pushfq"
40#define POPF "popfq"
41#else
42#define PUSHF "pushfl"
43#define POPF "popfl"
44#endif
45
46int has_eflag(unsigned long mask)
47{
48 unsigned long f0, f1;
49
50 asm volatile(PUSHF " \n\t"
51 PUSHF " \n\t"
52 "pop %0 \n\t"
53 "mov %0,%1 \n\t"
54 "xor %2,%1 \n\t"
55 "push %1 \n\t"
56 POPF " \n\t"
57 PUSHF " \n\t"
58 "pop %1 \n\t"
59 POPF
60 : "=&r" (f0), "=&r" (f1)
61 : "ri" (mask));
62
63 return !!((f0^f1) & mask);
64}
65
66/* Handle x86_32 PIC using ebx. */
67#if defined(__i386__) && defined(__PIC__)
68# define EBX_REG "=r"
69#else
70# define EBX_REG "=b"
71#endif
72
73static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d)
74{
75 asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
76 "cpuid \n\t"
77 ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t"
78 : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
79 : "a" (id)
80 );
81}
82
83void get_cpuflags(void)
84{
85 u32 max_intel_level, max_amd_level;
86 u32 tfms;
87 u32 ignored;
88
89 if (loaded_flags)
90 return;
91 loaded_flags = true;
92
93 if (has_fpu())
94 set_bit(X86_FEATURE_FPU, cpu.flags);
95
96 if (has_eflag(X86_EFLAGS_ID)) {
97 cpuid(0x0, &max_intel_level, &cpu_vendor[0], &cpu_vendor[2],
98 &cpu_vendor[1]);
99
100 if (max_intel_level >= 0x00000001 &&
101 max_intel_level <= 0x0000ffff) {
102 cpuid(0x1, &tfms, &ignored, &cpu.flags[4],
103 &cpu.flags[0]);
104 cpu.level = (tfms >> 8) & 15;
105 cpu.model = (tfms >> 4) & 15;
106 if (cpu.level >= 6)
107 cpu.model += ((tfms >> 16) & 0xf) << 4;
108 }
109
110 cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
111 &ignored);
112
113 if (max_amd_level >= 0x80000001 &&
114 max_amd_level <= 0x8000ffff) {
115 cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6],
116 &cpu.flags[1]);
117 }
118 }
119}
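
Note: the cpuid() wrapper spills %ebx through a scratch register when built as
32-bit PIC code, because %ebx then holds the GOT pointer and cannot be named
as an asm output. Callers use it the way get_cpuflags() does above; for
instance (a hypothetical fragment):

	u32 eax, ebx, ecx, edx;

	cpuid(0x0, &eax, &ebx, &ecx, &edx);
	/* eax = highest standard leaf; ebx,edx,ecx = vendor string */
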
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
new file mode 100644
index 000000000000..ea97697e51e4
--- /dev/null
+++ b/arch/x86/boot/cpuflags.h
@@ -0,0 +1,19 @@
1#ifndef BOOT_CPUFLAGS_H
2#define BOOT_CPUFLAGS_H
3
4#include <asm/cpufeature.h>
5#include <asm/processor-flags.h>
6
7struct cpu_features {
8 int level; /* Family, or 64 for x86-64 */
9 int model;
10 u32 flags[NCAPINTS];
11};
12
13extern struct cpu_features cpu;
14extern u32 cpu_vendor[3];
15
16int has_eflag(unsigned long mask);
17void get_cpuflags(void);
18
19#endif
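
Note: has_eflag() flips the requested bit in EFLAGS and reports whether the
change sticks, which is the classic probe for CPUID support (get_cpuflags()
performs this check internally before issuing CPUID):

	if (has_eflag(X86_EFLAGS_ID)) {
		/* EFLAGS.ID is toggleable, so the CPUID instruction exists */
	}
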
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index c501a5b466f8..223e42527077 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -15,6 +15,7 @@
15 15
16#include "boot.h" 16#include "boot.h"
17#include <linux/edd.h> 17#include <linux/edd.h>
18#include "string.h"
18 19
19#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 20#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
20 21
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 9ec06a1f6d61..0ca9a5c362bc 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -283,7 +283,7 @@ _start:
283 # Part 2 of the header, from the old setup.S 283 # Part 2 of the header, from the old setup.S
284 284
285 .ascii "HdrS" # header signature 285 .ascii "HdrS" # header signature
286 .word 0x020c # header version number (>= 0x0105) 286 .word 0x020d # header version number (>= 0x0105)
287 # or else old loadlin-1.5 will fail) 287 # or else old loadlin-1.5 will fail)
288 .globl realmode_swtch 288 .globl realmode_swtch
289realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 289realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -350,7 +350,7 @@ cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
350 # can be located anywhere in 350 # can be located anywhere in
351 # low memory 0x10000 or higher. 351 # low memory 0x10000 or higher.
352 352
353ramdisk_max: .long 0x7fffffff 353initrd_addr_max: .long 0x7fffffff
354 # (Header version 0x0203 or later) 354 # (Header version 0x0203 or later)
355 # The highest safe address for 355 # The highest safe address for
356 # the contents of an initrd 356 # the contents of an initrd
@@ -375,7 +375,8 @@ xloadflags:
375# define XLF0 0 375# define XLF0 0
376#endif 376#endif
377 377
378#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64) 378#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64) && \
379 !defined(CONFIG_EFI_MIXED)
379 /* kernel/boot_param/ramdisk could be loaded above 4g */ 380 /* kernel/boot_param/ramdisk could be loaded above 4g */
380# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G 381# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G
381#else 382#else
@@ -383,15 +384,26 @@ xloadflags:
383#endif 384#endif
384 385
385#ifdef CONFIG_EFI_STUB 386#ifdef CONFIG_EFI_STUB
386# ifdef CONFIG_X86_64 387# ifdef CONFIG_EFI_MIXED
387# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */ 388# define XLF23 (XLF_EFI_HANDOVER_32|XLF_EFI_HANDOVER_64)
388# else 389# else
389# define XLF23 XLF_EFI_HANDOVER_32 /* 32-bit EFI handover ok */ 390# ifdef CONFIG_X86_64
391# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */
392# else
393# define XLF23 XLF_EFI_HANDOVER_32 /* 32-bit EFI handover ok */
394# endif
390# endif 395# endif
391#else 396#else
392# define XLF23 0 397# define XLF23 0
393#endif 398#endif
394 .word XLF0 | XLF1 | XLF23 399
400#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
401# define XLF4 XLF_EFI_KEXEC
402#else
403# define XLF4 0
404#endif
405
406 .word XLF0 | XLF1 | XLF23 | XLF4
395 407
396cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, 408cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
397 #added with boot protocol 409 #added with boot protocol
@@ -419,13 +431,7 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
419#define INIT_SIZE VO_INIT_SIZE 431#define INIT_SIZE VO_INIT_SIZE
420#endif 432#endif
421init_size: .long INIT_SIZE # kernel initialization size 433init_size: .long INIT_SIZE # kernel initialization size
422handover_offset: 434handover_offset: .long 0 # Filled in by build.c
423#ifdef CONFIG_EFI_STUB
424 .long 0x30 # offset to the handover
425 # protocol entry point
426#else
427 .long 0
428#endif
429 435
430# End of setup header ##################################################### 436# End of setup header #####################################################
431 437
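
Note: the xloadflags word is a bitmask built from the XLF_* constants in
asm/bootparam.h; for reference, those values are:

	#define XLF_KERNEL_64			(1<<0)
	#define XLF_CAN_BE_LOADED_ABOVE_4G	(1<<1)
	#define XLF_EFI_HANDOVER_32		(1<<2)
	#define XLF_EFI_HANDOVER_64		(1<<3)
	#define XLF_EFI_KEXEC			(1<<4)
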
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index cf6083d444f4..fd6c9f236996 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -14,6 +14,7 @@
14 */ 14 */
15 15
16#include "boot.h" 16#include "boot.h"
17#include "string.h"
17 18
18struct boot_params boot_params __attribute__((aligned(16))); 19struct boot_params boot_params __attribute__((aligned(16)));
19 20
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c
index 958019b1cfa5..c0fb356a3092 100644
--- a/arch/x86/boot/regs.c
+++ b/arch/x86/boot/regs.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include "boot.h" 19#include "boot.h"
20#include "string.h"
20 21
21void initregs(struct biosregs *reg) 22void initregs(struct biosregs *reg)
22{ 23{
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 574dedfe2890..5339040ef86e 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -14,6 +14,20 @@
14 14
15#include "boot.h" 15#include "boot.h"
16 16
17/*
18 * This file gets included in compressed/string.c which might pull in
19 * string_32.h and which in turn maps memcmp to __builtin_memcmp(). Undo
20 * that first.
21 */
22#undef memcmp
23int memcmp(const void *s1, const void *s2, size_t len)
24{
25 u8 diff;
26 asm("repe; cmpsb; setnz %0"
27 : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
28 return diff;
29}
30
17int strcmp(const char *str1, const char *str2) 31int strcmp(const char *str1, const char *str2)
18{ 32{
19 const unsigned char *s1 = (const unsigned char *)str1; 33 const unsigned char *s1 = (const unsigned char *)str1;
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h
new file mode 100644
index 000000000000..725e820602b1
--- /dev/null
+++ b/arch/x86/boot/string.h
@@ -0,0 +1,21 @@
1#ifndef BOOT_STRING_H
2#define BOOT_STRING_H
3
4/* Undef any of these macros coming from string_32.h. */
5#undef memcpy
6#undef memset
7#undef memcmp
8
9void *memcpy(void *dst, const void *src, size_t len);
10void *memset(void *dst, int c, size_t len);
11int memcmp(const void *s1, const void *s2, size_t len);
12
13/*
 14 * Access the builtin versions by default. If a file needs the optimized
 15 * versions, it should "#undef memcpy" and link against the right string.c.
16 */
17#define memcpy(d,s,l) __builtin_memcpy(d,s,l)
18#define memset(d,c,l) __builtin_memset(d,c,l)
19#define memcmp __builtin_memcmp
20
21#endif /* BOOT_STRING_H */
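
Note: a file that wants the asm-backed routines from string.c instead of the
builtins follows the pattern the comment above describes (hypothetical
fragment; "a" and "b" are placeholder buffers):

	#include "string.h"

	#undef memcmp			/* drop the builtin mapping */

	/* this call now resolves to the asm memcmp() in string.c */
	int same = !memcmp(a, b, 16);
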
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 8e15b22391fc..1a2f2121cada 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -53,7 +53,8 @@ int is_big_kernel;
53 53
54#define PECOFF_RELOC_RESERVE 0x20 54#define PECOFF_RELOC_RESERVE 0x20
55 55
56unsigned long efi_stub_entry; 56unsigned long efi32_stub_entry;
57unsigned long efi64_stub_entry;
57unsigned long efi_pe_entry; 58unsigned long efi_pe_entry;
58unsigned long startup_64; 59unsigned long startup_64;
59 60
@@ -219,6 +220,52 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
219 update_pecoff_section_header(".text", text_start, text_sz); 220 update_pecoff_section_header(".text", text_start, text_sz);
220} 221}
221 222
223static int reserve_pecoff_reloc_section(int c)
224{
225 /* Reserve 0x20 bytes for .reloc section */
226 memset(buf+c, 0, PECOFF_RELOC_RESERVE);
227 return PECOFF_RELOC_RESERVE;
228}
229
230static void efi_stub_defaults(void)
231{
232 /* Defaults for old kernel */
233#ifdef CONFIG_X86_32
234 efi_pe_entry = 0x10;
235#else
236 efi_pe_entry = 0x210;
237 startup_64 = 0x200;
238#endif
239}
240
241static void efi_stub_entry_update(void)
242{
243 unsigned long addr = efi32_stub_entry;
244
245#ifdef CONFIG_X86_64
246 /* Yes, this is really how we defined it :( */
247 addr = efi64_stub_entry - 0x200;
248#endif
249
250#ifdef CONFIG_EFI_MIXED
251 if (efi32_stub_entry != addr)
252 die("32-bit and 64-bit EFI entry points do not match\n");
253#endif
254 put_unaligned_le32(addr, &buf[0x264]);
255}
256
257#else
258
259static inline void update_pecoff_setup_and_reloc(unsigned int size) {}
260static inline void update_pecoff_text(unsigned int text_start,
261 unsigned int file_sz) {}
262static inline void efi_stub_defaults(void) {}
263static inline void efi_stub_entry_update(void) {}
264
265static inline int reserve_pecoff_reloc_section(int c)
266{
267 return 0;
268}
222#endif /* CONFIG_EFI_STUB */ 269#endif /* CONFIG_EFI_STUB */
223 270
224 271
@@ -250,7 +297,8 @@ static void parse_zoffset(char *fname)
250 p = (char *)buf; 297 p = (char *)buf;
251 298
252 while (p && *p) { 299 while (p && *p) {
253 PARSE_ZOFS(p, efi_stub_entry); 300 PARSE_ZOFS(p, efi32_stub_entry);
301 PARSE_ZOFS(p, efi64_stub_entry);
254 PARSE_ZOFS(p, efi_pe_entry); 302 PARSE_ZOFS(p, efi_pe_entry);
255 PARSE_ZOFS(p, startup_64); 303 PARSE_ZOFS(p, startup_64);
256 304
@@ -271,15 +319,7 @@ int main(int argc, char ** argv)
271 void *kernel; 319 void *kernel;
272 u32 crc = 0xffffffffUL; 320 u32 crc = 0xffffffffUL;
273 321
274 /* Defaults for old kernel */ 322 efi_stub_defaults();
275#ifdef CONFIG_X86_32
276 efi_pe_entry = 0x10;
277 efi_stub_entry = 0x30;
278#else
279 efi_pe_entry = 0x210;
280 efi_stub_entry = 0x230;
281 startup_64 = 0x200;
282#endif
283 323
284 if (argc != 5) 324 if (argc != 5)
285 usage(); 325 usage();
@@ -302,11 +342,7 @@ int main(int argc, char ** argv)
302 die("Boot block hasn't got boot flag (0xAA55)"); 342 die("Boot block hasn't got boot flag (0xAA55)");
303 fclose(file); 343 fclose(file);
304 344
305#ifdef CONFIG_EFI_STUB 345 c += reserve_pecoff_reloc_section(c);
306 /* Reserve 0x20 bytes for .reloc section */
307 memset(buf+c, 0, PECOFF_RELOC_RESERVE);
308 c += PECOFF_RELOC_RESERVE;
309#endif
310 346
311 /* Pad unused space with zeros */ 347 /* Pad unused space with zeros */
312 setup_sectors = (c + 511) / 512; 348 setup_sectors = (c + 511) / 512;
@@ -315,9 +351,7 @@ int main(int argc, char ** argv)
315 i = setup_sectors*512; 351 i = setup_sectors*512;
316 memset(buf+c, 0, i-c); 352 memset(buf+c, 0, i-c);
317 353
318#ifdef CONFIG_EFI_STUB
319 update_pecoff_setup_and_reloc(i); 354 update_pecoff_setup_and_reloc(i);
320#endif
321 355
322 /* Set the default root device */ 356 /* Set the default root device */
323 put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]); 357 put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]);
@@ -342,14 +376,9 @@ int main(int argc, char ** argv)
342 buf[0x1f1] = setup_sectors-1; 376 buf[0x1f1] = setup_sectors-1;
343 put_unaligned_le32(sys_size, &buf[0x1f4]); 377 put_unaligned_le32(sys_size, &buf[0x1f4]);
344 378
345#ifdef CONFIG_EFI_STUB
346 update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz)); 379 update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz));
347 380
348#ifdef CONFIG_X86_64 /* Yes, this is really how we defined it :( */ 381 efi_stub_entry_update();
349 efi_stub_entry -= 0x200;
350#endif
351 put_unaligned_le32(efi_stub_entry, &buf[0x264]);
352#endif
353 382
354 crc = partial_crc32(buf, i, crc); 383 crc = partial_crc32(buf, i, crc);
355 if (fwrite(buf, 1, i, dest) != i) 384 if (fwrite(buf, 1, i, dest) != i)
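
Note: efi_stub_entry_update() patches the handover entry into the setup header
at file offset 0x264, the handover_offset field that header.S now leaves
zeroed. The 64-bit entry is biased by -0x200 because an EFI loader adds 0x200
to handover_offset when starting a 64-bit kernel, so both stubs resolve to the
same stored value (0x190, matching the .org directives in head_64.S). In
condensed form, with explanatory comments added:

	static void efi_stub_entry_update(void)
	{
		unsigned long addr = efi32_stub_entry;	/* .org 0x190 */
	#ifdef CONFIG_X86_64
		/* loaders add 0x200 for 64-bit: 0x390 - 0x200 = 0x190 */
		addr = efi64_stub_entry - 0x200;
	#endif
		put_unaligned_le32(addr, &buf[0x264]);	/* handover_offset */
	}
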
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 11e8c6eb80a1..ba3e100654db 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -16,6 +16,7 @@
16#include "boot.h" 16#include "boot.h"
17#include "video.h" 17#include "video.h"
18#include "vesa.h" 18#include "vesa.h"
19#include "string.h"
19 20
20/* VESA information */ 21/* VESA information */
21static struct vesa_general_info vginfo; 22static struct vesa_general_info vginfo;
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index ff339c5db311..0bb25491262d 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -80,7 +80,7 @@ struct card_info {
80 u16 xmode_n; /* Size of unprobed mode range */ 80 u16 xmode_n; /* Size of unprobed mode range */
81}; 81};
82 82
83#define __videocard struct card_info __attribute__((section(".videocards"))) 83#define __videocard struct card_info __attribute__((used,section(".videocards")))
84extern struct card_info video_cards[], video_cards_end[]; 84extern struct card_info video_cards[], video_cards_end[];
85 85
86int mode_defined(u16 mode); /* video.c */ 86int mode_defined(u16 mode); /* video.c */
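
Note: the "used" attribute keeps each card_info object in the emitted
.videocards section even though nothing references it by name; the probe code
walks the section between video_cards[] and video_cards_end[]. Definition-site
pattern (illustrative sketch; the real entries and their fields live in the
video-*.c drivers):

	static __videocard video_vga = {
		.card_name	= "VGA",
		.probe		= vga_probe,
		.set_mode	= vga_set_mode,
	};
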
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index a7fef2621cc9..619e7f7426c6 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -60,7 +60,6 @@ CONFIG_CRASH_DUMP=y
60CONFIG_HIBERNATION=y 60CONFIG_HIBERNATION=y
61CONFIG_PM_DEBUG=y 61CONFIG_PM_DEBUG=y
62CONFIG_PM_TRACE_RTC=y 62CONFIG_PM_TRACE_RTC=y
63CONFIG_ACPI_PROCFS=y
64CONFIG_ACPI_DOCK=y 63CONFIG_ACPI_DOCK=y
65CONFIG_CPU_FREQ=y 64CONFIG_CPU_FREQ=y
66# CONFIG_CPU_FREQ_STAT is not set 65# CONFIG_CPU_FREQ_STAT is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index c1119d4c1281..6181c69b786b 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -58,7 +58,6 @@ CONFIG_CRASH_DUMP=y
58CONFIG_HIBERNATION=y 58CONFIG_HIBERNATION=y
59CONFIG_PM_DEBUG=y 59CONFIG_PM_DEBUG=y
60CONFIG_PM_TRACE_RTC=y 60CONFIG_PM_TRACE_RTC=y
61CONFIG_ACPI_PROCFS=y
62CONFIG_ACPI_DOCK=y 61CONFIG_ACPI_DOCK=y
63CONFIG_CPU_FREQ=y 62CONFIG_CPU_FREQ=y
64# CONFIG_CPU_FREQ_STAT is not set 63# CONFIG_CPU_FREQ_STAT is not set
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e0fc24db234a..61d6e281898b 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -76,8 +76,12 @@ ifeq ($(avx2_supported),yes)
76endif 76endif
77 77
78aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o 78aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
79aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o
79ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o 80ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
80sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o 81sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
82ifeq ($(avx2_supported),yes)
83sha1-ssse3-y += sha1_avx2_x86_64_asm.o
84endif
81crc32c-intel-y := crc32c-intel_glue.o 85crc32c-intel-y := crc32c-intel_glue.o
82crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o 86crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
83crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o 87crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
new file mode 100644
index 000000000000..522ab68d1c88
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -0,0 +1,2811 @@
1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses. You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15# notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18# notice, this list of conditions and the following disclaimer in the
19# documentation and/or other materials provided with the
20# distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23# contributors may be used to endorse or promote products derived from
24# this software without specific prior written permission.
25#
26#
27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41## Erdinc Ozturk <erdinc.ozturk@intel.com>
42## Vinodh Gopal <vinodh.gopal@intel.com>
43## James Guilford <james.guilford@intel.com>
44## Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
 47## This code was derived and highly optimized from the code described in the paper:
 48## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
 49## on Intel Architecture Processors. August, 2010
 50## The details of the implementation are explained in:
 51## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
 52## on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59## 0 1 2 3
60## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62## | Salt (From the SA) |
63## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64## | Initialization Vector |
65## | (This is the sequence number from IPSec header) |
66## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67## | 0x1 |
68## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69##
70##
71##
72## AAD:
73## AAD padded to 128 bits with 0
74## for example, assume AAD is a u32 vector
75##
76## if AAD is 8 bytes:
77## AAD[3] = {A0, A1}#
78## padded AAD in xmm register = {A1 A0 0 0}
79##
80## 0 1 2 3
81## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83## | SPI (A1) |
84## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85## | 32-bit Sequence Number (A0) |
86## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87## | 0x0 |
88## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90## AAD Format with 32-bit Sequence Number
91##
92## if AAD is 12 bytes:
93## AAD[3] = {A0, A1, A2}#
94## padded AAD in xmm register = {A2 A1 A0 0}
95##
96## 0 1 2 3
97## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99## | SPI (A2) |
100## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101## | 64-bit Extended Sequence Number {A1,A0} |
102## | |
103## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104## | 0x0 |
105## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107## AAD Format with 64-bit Extended Sequence Number
108##
109##
110## aadLen:
111## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112## The code additionally supports aadLen of length 16 bytes.
113##
114## TLen:
115## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
 118## throughout the code, one-tab and two-tab indentation is used. one tab
 119## is for the GHASH part, two tabs for the AES part.
120##
121
122#include <linux/linkage.h>
123#include <asm/inst.h>
124
125.data
126.align 16
127
128POLY: .octa 0xC2000000000000000000000000000001
129POLY2: .octa 0xC20000000000000000000001C2000000
130TWOONE: .octa 0x00000001000000000000000000000001
131
132# order of these constants should not change.
133# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
134
135SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
136SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
137ALL_F: .octa 0xffffffffffffffffffffffffffffffff
138ZERO: .octa 0x00000000000000000000000000000000
139ONE: .octa 0x00000000000000000000000000000001
140ONEf: .octa 0x01000000000000000000000000000000
141
142.text
143
144
145##define the fields of the gcm aes context
146#{
147# u8 expanded_keys[16*11] store expanded keys
148# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
149# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
150# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
151# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
152# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
153# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
154# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
155# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
156# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
157# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
158# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
159# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
160# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
161# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
162# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
163# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
164#} gcm_ctx#
165
166HashKey = 16*11 # store HashKey <<1 mod poly here
167HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
168HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
169HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
170HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
171HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
172HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
173HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
174HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
175HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
176HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
177HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
178HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
179HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
180HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
181HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
182
183#define arg1 %rdi
184#define arg2 %rsi
185#define arg3 %rdx
186#define arg4 %rcx
187#define arg5 %r8
188#define arg6 %r9
189#define arg7 STACK_OFFSET+8*1(%r14)
190#define arg8 STACK_OFFSET+8*2(%r14)
191#define arg9 STACK_OFFSET+8*3(%r14)
192
193i = 0
194j = 0
195
196out_order = 0
197in_order = 1
198DEC = 0
199ENC = 1
200
201.macro define_reg r n
202reg_\r = %xmm\n
203.endm
204
205.macro setreg
206.altmacro
207define_reg i %i
208define_reg j %j
209.noaltmacro
210.endm
211
 212# 4 registers are pushed onto the stack on entry; stack-passed args sit above them
213STACK_OFFSET = 8*4
214
215TMP1 = 16*0 # Temporary storage for AAD
216TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
217TMP3 = 16*2 # Temporary storage for AES State 3
218TMP4 = 16*3 # Temporary storage for AES State 4
219TMP5 = 16*4 # Temporary storage for AES State 5
220TMP6 = 16*5 # Temporary storage for AES State 6
221TMP7 = 16*6 # Temporary storage for AES State 7
222TMP8 = 16*7 # Temporary storage for AES State 8
223
224VARIABLE_OFFSET = 16*8
225
226################################
227# Utility Macros
228################################
229
230# Encryption of a single block
231.macro ENCRYPT_SINGLE_BLOCK XMM0
232 vpxor (arg1), \XMM0, \XMM0
233 i = 1
234 setreg
235.rep 9
236 vaesenc 16*i(arg1), \XMM0, \XMM0
237 i = (i+1)
238 setreg
239.endr
240 vaesenclast 16*10(arg1), \XMM0, \XMM0
241.endm
242
243#ifdef CONFIG_AS_AVX
244###############################################################################
245# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
246# Input: A and B (128-bits each, bit-reflected)
247# Output: C = A*B*x mod poly, (i.e. >>1 )
248# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
249# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
250###############################################################################
251.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
252
253 vpshufd $0b01001110, \GH, \T2
254 vpshufd $0b01001110, \HK, \T3
255 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
256 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
257
258 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
259 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
260 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
261 vpxor \GH, \T2,\T2
262 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
263
264 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
265 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
266 vpxor \T3, \GH, \GH
267 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
268
269 #first phase of the reduction
270 vpslld $31, \GH, \T2 # packed right shifting << 31
271 vpslld $30, \GH, \T3 # packed right shifting shift << 30
272 vpslld $25, \GH, \T4 # packed right shifting shift << 25
273
274 vpxor \T3, \T2, \T2 # xor the shifted versions
275 vpxor \T4, \T2, \T2
276
277 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
278
279 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
280 vpxor \T2, \GH, \GH # first phase of the reduction complete
281
282 #second phase of the reduction
283
284 vpsrld $1,\GH, \T2 # packed left shifting >> 1
285 vpsrld $2,\GH, \T3 # packed left shifting >> 2
286 vpsrld $7,\GH, \T4 # packed left shifting >> 7
287 vpxor \T3, \T2, \T2 # xor the shifted versions
288 vpxor \T4, \T2, \T2
289
290 vpxor \T5, \T2, \T2
291 vpxor \T2, \GH, \GH
292 vpxor \T1, \GH, \GH # the result is in GH
293
294
295.endm
296
297.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
298
 299 # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
300 vmovdqa \HK, \T5
301
302 vpshufd $0b01001110, \T5, \T1
303 vpxor \T5, \T1, \T1
304 vmovdqa \T1, HashKey_k(arg1)
305
306 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
307 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
308 vpshufd $0b01001110, \T5, \T1
309 vpxor \T5, \T1, \T1
310 vmovdqa \T1, HashKey_2_k(arg1)
311
312 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
313 vmovdqa \T5, HashKey_3(arg1)
314 vpshufd $0b01001110, \T5, \T1
315 vpxor \T5, \T1, \T1
316 vmovdqa \T1, HashKey_3_k(arg1)
317
318 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
319 vmovdqa \T5, HashKey_4(arg1)
320 vpshufd $0b01001110, \T5, \T1
321 vpxor \T5, \T1, \T1
322 vmovdqa \T1, HashKey_4_k(arg1)
323
324 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
325 vmovdqa \T5, HashKey_5(arg1)
326 vpshufd $0b01001110, \T5, \T1
327 vpxor \T5, \T1, \T1
328 vmovdqa \T1, HashKey_5_k(arg1)
329
330 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
331 vmovdqa \T5, HashKey_6(arg1)
332 vpshufd $0b01001110, \T5, \T1
333 vpxor \T5, \T1, \T1
334 vmovdqa \T1, HashKey_6_k(arg1)
335
336 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
337 vmovdqa \T5, HashKey_7(arg1)
338 vpshufd $0b01001110, \T5, \T1
339 vpxor \T5, \T1, \T1
340 vmovdqa \T1, HashKey_7_k(arg1)
341
342 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
343 vmovdqa \T5, HashKey_8(arg1)
344 vpshufd $0b01001110, \T5, \T1
345 vpxor \T5, \T1, \T1
346 vmovdqa \T1, HashKey_8_k(arg1)
347
348.endm
349
350## if a = number of total plaintext bytes
351## b = floor(a/16)
352## num_initial_blocks = b mod 4#
353## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
354## r10, r11, r12, rax are clobbered
355## arg1, arg2, arg3, r14 are used as a pointer only, not modified
356
357.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
358 i = (8-\num_initial_blocks)
359 setreg
360
361 mov arg6, %r10 # r10 = AAD
362 mov arg7, %r12 # r12 = aadLen
363
364
365 mov %r12, %r11
366
367 vpxor reg_i, reg_i, reg_i
368_get_AAD_loop\@:
369 vmovd (%r10), \T1
370 vpslldq $12, \T1, \T1
371 vpsrldq $4, reg_i, reg_i
372 vpxor \T1, reg_i, reg_i
373
374 add $4, %r10
375 sub $4, %r12
376 jg _get_AAD_loop\@
377
378
379 cmp $16, %r11
380 je _get_AAD_loop2_done\@
381 mov $16, %r12
382
383_get_AAD_loop2\@:
384 vpsrldq $4, reg_i, reg_i
385 sub $4, %r12
386 cmp %r11, %r12
387 jg _get_AAD_loop2\@
388
389_get_AAD_loop2_done\@:
390
391 #byte-reflect the AAD data
392 vpshufb SHUF_MASK(%rip), reg_i, reg_i
393
394 # initialize the data pointer offset as zero
395 xor %r11, %r11
396
397 # start AES for num_initial_blocks blocks
398 mov arg5, %rax # rax = *Y0
399 vmovdqu (%rax), \CTR # CTR = Y0
400 vpshufb SHUF_MASK(%rip), \CTR, \CTR
401
402
403 i = (9-\num_initial_blocks)
404 setreg
405.rep \num_initial_blocks
406 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
407 vmovdqa \CTR, reg_i
408 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
409 i = (i+1)
410 setreg
411.endr
412
413 vmovdqa (arg1), \T_key
414 i = (9-\num_initial_blocks)
415 setreg
416.rep \num_initial_blocks
417 vpxor \T_key, reg_i, reg_i
418 i = (i+1)
419 setreg
420.endr
421
422 j = 1
423 setreg
424.rep 9
425 vmovdqa 16*j(arg1), \T_key
426 i = (9-\num_initial_blocks)
427 setreg
428.rep \num_initial_blocks
429 vaesenc \T_key, reg_i, reg_i
430 i = (i+1)
431 setreg
432.endr
433
434 j = (j+1)
435 setreg
436.endr
437
438
439 vmovdqa 16*10(arg1), \T_key
440 i = (9-\num_initial_blocks)
441 setreg
442.rep \num_initial_blocks
443 vaesenclast \T_key, reg_i, reg_i
444 i = (i+1)
445 setreg
446.endr
447
448 i = (9-\num_initial_blocks)
449 setreg
450.rep \num_initial_blocks
451 vmovdqu (arg3, %r11), \T1
452 vpxor \T1, reg_i, reg_i
453 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
454 add $16, %r11
455.if \ENC_DEC == DEC
456 vmovdqa \T1, reg_i
457.endif
458 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
459 i = (i+1)
460 setreg
461.endr
462
463
464 i = (8-\num_initial_blocks)
465 j = (9-\num_initial_blocks)
466 setreg
467 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
468
469.rep \num_initial_blocks
470 vpxor reg_i, reg_j, reg_j
471 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
472 i = (i+1)
473 j = (j+1)
474 setreg
475.endr
476 # XMM8 has the combined result here
477
478 vmovdqa \XMM8, TMP1(%rsp)
479 vmovdqa \XMM8, \T3
480
481 cmp $128, %r13
482 jl _initial_blocks_done\@ # no need for precomputed constants
483
484###############################################################################
 485# HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
486 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
487 vmovdqa \CTR, \XMM1
488 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
489
490 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
491 vmovdqa \CTR, \XMM2
492 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
493
494 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
495 vmovdqa \CTR, \XMM3
496 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
497
498 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
499 vmovdqa \CTR, \XMM4
500 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
501
502 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
503 vmovdqa \CTR, \XMM5
504 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
505
506 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
507 vmovdqa \CTR, \XMM6
508 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
509
510 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
511 vmovdqa \CTR, \XMM7
512 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
513
514 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
515 vmovdqa \CTR, \XMM8
516 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
517
518 vmovdqa (arg1), \T_key
519 vpxor \T_key, \XMM1, \XMM1
520 vpxor \T_key, \XMM2, \XMM2
521 vpxor \T_key, \XMM3, \XMM3
522 vpxor \T_key, \XMM4, \XMM4
523 vpxor \T_key, \XMM5, \XMM5
524 vpxor \T_key, \XMM6, \XMM6
525 vpxor \T_key, \XMM7, \XMM7
526 vpxor \T_key, \XMM8, \XMM8
527
528 i = 1
529 setreg
530.rep 9 # do 9 rounds
531 vmovdqa 16*i(arg1), \T_key
532 vaesenc \T_key, \XMM1, \XMM1
533 vaesenc \T_key, \XMM2, \XMM2
534 vaesenc \T_key, \XMM3, \XMM3
535 vaesenc \T_key, \XMM4, \XMM4
536 vaesenc \T_key, \XMM5, \XMM5
537 vaesenc \T_key, \XMM6, \XMM6
538 vaesenc \T_key, \XMM7, \XMM7
539 vaesenc \T_key, \XMM8, \XMM8
540 i = (i+1)
541 setreg
542.endr
543
544
545 vmovdqa 16*i(arg1), \T_key
546 vaesenclast \T_key, \XMM1, \XMM1
547 vaesenclast \T_key, \XMM2, \XMM2
548 vaesenclast \T_key, \XMM3, \XMM3
549 vaesenclast \T_key, \XMM4, \XMM4
550 vaesenclast \T_key, \XMM5, \XMM5
551 vaesenclast \T_key, \XMM6, \XMM6
552 vaesenclast \T_key, \XMM7, \XMM7
553 vaesenclast \T_key, \XMM8, \XMM8
554
555 vmovdqu (arg3, %r11), \T1
556 vpxor \T1, \XMM1, \XMM1
557 vmovdqu \XMM1, (arg2 , %r11)
558 .if \ENC_DEC == DEC
559 vmovdqa \T1, \XMM1
560 .endif
561
562 vmovdqu 16*1(arg3, %r11), \T1
563 vpxor \T1, \XMM2, \XMM2
564 vmovdqu \XMM2, 16*1(arg2 , %r11)
565 .if \ENC_DEC == DEC
566 vmovdqa \T1, \XMM2
567 .endif
568
569 vmovdqu 16*2(arg3, %r11), \T1
570 vpxor \T1, \XMM3, \XMM3
571 vmovdqu \XMM3, 16*2(arg2 , %r11)
572 .if \ENC_DEC == DEC
573 vmovdqa \T1, \XMM3
574 .endif
575
576 vmovdqu 16*3(arg3, %r11), \T1
577 vpxor \T1, \XMM4, \XMM4
578 vmovdqu \XMM4, 16*3(arg2 , %r11)
579 .if \ENC_DEC == DEC
580 vmovdqa \T1, \XMM4
581 .endif
582
583 vmovdqu 16*4(arg3, %r11), \T1
584 vpxor \T1, \XMM5, \XMM5
585 vmovdqu \XMM5, 16*4(arg2 , %r11)
586 .if \ENC_DEC == DEC
587 vmovdqa \T1, \XMM5
588 .endif
589
590 vmovdqu 16*5(arg3, %r11), \T1
591 vpxor \T1, \XMM6, \XMM6
592 vmovdqu \XMM6, 16*5(arg2 , %r11)
593 .if \ENC_DEC == DEC
594 vmovdqa \T1, \XMM6
595 .endif
596
597 vmovdqu 16*6(arg3, %r11), \T1
598 vpxor \T1, \XMM7, \XMM7
599 vmovdqu \XMM7, 16*6(arg2 , %r11)
600 .if \ENC_DEC == DEC
601 vmovdqa \T1, \XMM7
602 .endif
603
604 vmovdqu 16*7(arg3, %r11), \T1
605 vpxor \T1, \XMM8, \XMM8
606 vmovdqu \XMM8, 16*7(arg2 , %r11)
607 .if \ENC_DEC == DEC
608 vmovdqa \T1, \XMM8
609 .endif
610
611 add $128, %r11
612
613 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
614 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
615 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
616 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
617 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
618 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
619 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
620 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
621 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
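        # byte-reflect the ciphertext blocks into GHASH bit order and fold
        # the hash accumulated so far (saved in TMP1) into the first block,
        # so the parallel loop can hash all eight blocks at once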
622
623###############################################################################
624
625_initial_blocks_done\@:
626
627.endm
628
629# encrypt 8 blocks at a time
630# ghash the 8 previously encrypted ciphertext blocks
631# arg1, arg2, arg3 are used as pointers only, not modified
632# r11 is the data offset value
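# To hide instruction latency, the GHASH of the previous eight ciphertext
# blocks (stashed in TMP2..TMP8 and \T2 below) is interleaved with the AES
# rounds of the next eight counter blocks.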
633.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
634
635 vmovdqa \XMM1, \T2
636 vmovdqa \XMM2, TMP2(%rsp)
637 vmovdqa \XMM3, TMP3(%rsp)
638 vmovdqa \XMM4, TMP4(%rsp)
639 vmovdqa \XMM5, TMP5(%rsp)
640 vmovdqa \XMM6, TMP6(%rsp)
641 vmovdqa \XMM7, TMP7(%rsp)
642 vmovdqa \XMM8, TMP8(%rsp)
643
644.if \loop_idx == in_order
645 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
646 vpaddd ONE(%rip), \XMM1, \XMM2
647 vpaddd ONE(%rip), \XMM2, \XMM3
648 vpaddd ONE(%rip), \XMM3, \XMM4
649 vpaddd ONE(%rip), \XMM4, \XMM5
650 vpaddd ONE(%rip), \XMM5, \XMM6
651 vpaddd ONE(%rip), \XMM6, \XMM7
652 vpaddd ONE(%rip), \XMM7, \XMM8
653 vmovdqa \XMM8, \CTR
654
655 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
656 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
657 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
658 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
659 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
660 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
661 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
662 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
663.else
664 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
665 vpaddd ONEf(%rip), \XMM1, \XMM2
666 vpaddd ONEf(%rip), \XMM2, \XMM3
667 vpaddd ONEf(%rip), \XMM3, \XMM4
668 vpaddd ONEf(%rip), \XMM4, \XMM5
669 vpaddd ONEf(%rip), \XMM5, \XMM6
670 vpaddd ONEf(%rip), \XMM6, \XMM7
671 vpaddd ONEf(%rip), \XMM7, \XMM8
672 vmovdqa \XMM8, \CTR
673.endif
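        # assumption worth noting: out_order keeps \CTR byte-reflected and
        # ONEf (defined earlier in this file) bumps the counter byte in
        # place, which is only safe while the low counter byte cannot wrap;
        # the in_order path works on the little-endian form with ONE so the
        # carry propagates, then byte-swaps the eight blocks above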
674
675
676 #######################################################################
677
678 vmovdqu (arg1), \T1
679 vpxor \T1, \XMM1, \XMM1
680 vpxor \T1, \XMM2, \XMM2
681 vpxor \T1, \XMM3, \XMM3
682 vpxor \T1, \XMM4, \XMM4
683 vpxor \T1, \XMM5, \XMM5
684 vpxor \T1, \XMM6, \XMM6
685 vpxor \T1, \XMM7, \XMM7
686 vpxor \T1, \XMM8, \XMM8
687
688 #######################################################################
689
690
691
692
693
694 vmovdqu 16*1(arg1), \T1
695 vaesenc \T1, \XMM1, \XMM1
696 vaesenc \T1, \XMM2, \XMM2
697 vaesenc \T1, \XMM3, \XMM3
698 vaesenc \T1, \XMM4, \XMM4
699 vaesenc \T1, \XMM5, \XMM5
700 vaesenc \T1, \XMM6, \XMM6
701 vaesenc \T1, \XMM7, \XMM7
702 vaesenc \T1, \XMM8, \XMM8
703
704 vmovdqu 16*2(arg1), \T1
705 vaesenc \T1, \XMM1, \XMM1
706 vaesenc \T1, \XMM2, \XMM2
707 vaesenc \T1, \XMM3, \XMM3
708 vaesenc \T1, \XMM4, \XMM4
709 vaesenc \T1, \XMM5, \XMM5
710 vaesenc \T1, \XMM6, \XMM6
711 vaesenc \T1, \XMM7, \XMM7
712 vaesenc \T1, \XMM8, \XMM8
713
714
715 #######################################################################
716
717 vmovdqa HashKey_8(arg1), \T5
718 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
719 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
720
721 vpshufd $0b01001110, \T2, \T6
722 vpxor \T2, \T6, \T6
723
724 vmovdqa HashKey_8_k(arg1), \T5
725 vpclmulqdq $0x00, \T5, \T6, \T6
726
727 vmovdqu 16*3(arg1), \T1
728 vaesenc \T1, \XMM1, \XMM1
729 vaesenc \T1, \XMM2, \XMM2
730 vaesenc \T1, \XMM3, \XMM3
731 vaesenc \T1, \XMM4, \XMM4
732 vaesenc \T1, \XMM5, \XMM5
733 vaesenc \T1, \XMM6, \XMM6
734 vaesenc \T1, \XMM7, \XMM7
735 vaesenc \T1, \XMM8, \XMM8
736
737 vmovdqa TMP2(%rsp), \T1
738 vmovdqa HashKey_7(arg1), \T5
739 vpclmulqdq $0x11, \T5, \T1, \T3
740 vpxor \T3, \T4, \T4
741 vpclmulqdq $0x00, \T5, \T1, \T3
742 vpxor \T3, \T7, \T7
743
744 vpshufd $0b01001110, \T1, \T3
745 vpxor \T1, \T3, \T3
746 vmovdqa HashKey_7_k(arg1), \T5
747 vpclmulqdq $0x10, \T5, \T3, \T3
748 vpxor \T3, \T6, \T6
749
750 vmovdqu 16*4(arg1), \T1
751 vaesenc \T1, \XMM1, \XMM1
752 vaesenc \T1, \XMM2, \XMM2
753 vaesenc \T1, \XMM3, \XMM3
754 vaesenc \T1, \XMM4, \XMM4
755 vaesenc \T1, \XMM5, \XMM5
756 vaesenc \T1, \XMM6, \XMM6
757 vaesenc \T1, \XMM7, \XMM7
758 vaesenc \T1, \XMM8, \XMM8
759
760 #######################################################################
761
762 vmovdqa TMP3(%rsp), \T1
763 vmovdqa HashKey_6(arg1), \T5
764 vpclmulqdq $0x11, \T5, \T1, \T3
765 vpxor \T3, \T4, \T4
766 vpclmulqdq $0x00, \T5, \T1, \T3
767 vpxor \T3, \T7, \T7
768
769 vpshufd $0b01001110, \T1, \T3
770 vpxor \T1, \T3, \T3
771 vmovdqa HashKey_6_k(arg1), \T5
772 vpclmulqdq $0x10, \T5, \T3, \T3
773 vpxor \T3, \T6, \T6
774
775 vmovdqu 16*5(arg1), \T1
776 vaesenc \T1, \XMM1, \XMM1
777 vaesenc \T1, \XMM2, \XMM2
778 vaesenc \T1, \XMM3, \XMM3
779 vaesenc \T1, \XMM4, \XMM4
780 vaesenc \T1, \XMM5, \XMM5
781 vaesenc \T1, \XMM6, \XMM6
782 vaesenc \T1, \XMM7, \XMM7
783 vaesenc \T1, \XMM8, \XMM8
784
785 vmovdqa TMP4(%rsp), \T1
786 vmovdqa HashKey_5(arg1), \T5
787 vpclmulqdq $0x11, \T5, \T1, \T3
788 vpxor \T3, \T4, \T4
789 vpclmulqdq $0x00, \T5, \T1, \T3
790 vpxor \T3, \T7, \T7
791
792 vpshufd $0b01001110, \T1, \T3
793 vpxor \T1, \T3, \T3
794 vmovdqa HashKey_5_k(arg1), \T5
795 vpclmulqdq $0x10, \T5, \T3, \T3
796 vpxor \T3, \T6, \T6
797
798 vmovdqu 16*6(arg1), \T1
799 vaesenc \T1, \XMM1, \XMM1
800 vaesenc \T1, \XMM2, \XMM2
801 vaesenc \T1, \XMM3, \XMM3
802 vaesenc \T1, \XMM4, \XMM4
803 vaesenc \T1, \XMM5, \XMM5
804 vaesenc \T1, \XMM6, \XMM6
805 vaesenc \T1, \XMM7, \XMM7
806 vaesenc \T1, \XMM8, \XMM8
807
808
809 vmovdqa TMP5(%rsp), \T1
810 vmovdqa HashKey_4(arg1), \T5
811 vpclmulqdq $0x11, \T5, \T1, \T3
812 vpxor \T3, \T4, \T4
813 vpclmulqdq $0x00, \T5, \T1, \T3
814 vpxor \T3, \T7, \T7
815
816 vpshufd $0b01001110, \T1, \T3
817 vpxor \T1, \T3, \T3
818 vmovdqa HashKey_4_k(arg1), \T5
819 vpclmulqdq $0x10, \T5, \T3, \T3
820 vpxor \T3, \T6, \T6
821
822 vmovdqu 16*7(arg1), \T1
823 vaesenc \T1, \XMM1, \XMM1
824 vaesenc \T1, \XMM2, \XMM2
825 vaesenc \T1, \XMM3, \XMM3
826 vaesenc \T1, \XMM4, \XMM4
827 vaesenc \T1, \XMM5, \XMM5
828 vaesenc \T1, \XMM6, \XMM6
829 vaesenc \T1, \XMM7, \XMM7
830 vaesenc \T1, \XMM8, \XMM8
831
832 vmovdqa TMP6(%rsp), \T1
833 vmovdqa HashKey_3(arg1), \T5
834 vpclmulqdq $0x11, \T5, \T1, \T3
835 vpxor \T3, \T4, \T4
836 vpclmulqdq $0x00, \T5, \T1, \T3
837 vpxor \T3, \T7, \T7
838
839 vpshufd $0b01001110, \T1, \T3
840 vpxor \T1, \T3, \T3
841 vmovdqa HashKey_3_k(arg1), \T5
842 vpclmulqdq $0x10, \T5, \T3, \T3
843 vpxor \T3, \T6, \T6
844
845
846 vmovdqu 16*8(arg1), \T1
847 vaesenc \T1, \XMM1, \XMM1
848 vaesenc \T1, \XMM2, \XMM2
849 vaesenc \T1, \XMM3, \XMM3
850 vaesenc \T1, \XMM4, \XMM4
851 vaesenc \T1, \XMM5, \XMM5
852 vaesenc \T1, \XMM6, \XMM6
853 vaesenc \T1, \XMM7, \XMM7
854 vaesenc \T1, \XMM8, \XMM8
855
856 vmovdqa TMP7(%rsp), \T1
857 vmovdqa HashKey_2(arg1), \T5
858 vpclmulqdq $0x11, \T5, \T1, \T3
859 vpxor \T3, \T4, \T4
860 vpclmulqdq $0x00, \T5, \T1, \T3
861 vpxor \T3, \T7, \T7
862
863 vpshufd $0b01001110, \T1, \T3
864 vpxor \T1, \T3, \T3
865 vmovdqa HashKey_2_k(arg1), \T5
866 vpclmulqdq $0x10, \T5, \T3, \T3
867 vpxor \T3, \T6, \T6
868
869 #######################################################################
870
871 vmovdqu 16*9(arg1), \T5
872 vaesenc \T5, \XMM1, \XMM1
873 vaesenc \T5, \XMM2, \XMM2
874 vaesenc \T5, \XMM3, \XMM3
875 vaesenc \T5, \XMM4, \XMM4
876 vaesenc \T5, \XMM5, \XMM5
877 vaesenc \T5, \XMM6, \XMM6
878 vaesenc \T5, \XMM7, \XMM7
879 vaesenc \T5, \XMM8, \XMM8
880
881 vmovdqa TMP8(%rsp), \T1
882 vmovdqa HashKey(arg1), \T5
883 vpclmulqdq $0x11, \T5, \T1, \T3
884 vpxor \T3, \T4, \T4
885 vpclmulqdq $0x00, \T5, \T1, \T3
886 vpxor \T3, \T7, \T7
887
888 vpshufd $0b01001110, \T1, \T3
889 vpxor \T1, \T3, \T3
890 vmovdqa HashKey_k(arg1), \T5
891 vpclmulqdq $0x10, \T5, \T3, \T3
892 vpxor \T3, \T6, \T6
893
894 vpxor \T4, \T6, \T6
895 vpxor \T7, \T6, \T6
896
897 vmovdqu 16*10(arg1), \T5
898
899 i = 0
900 j = 1
901 setreg
902.rep 8
903 vpxor 16*i(arg3, %r11), \T5, \T2
904 .if \ENC_DEC == ENC
905 vaesenclast \T2, reg_j, reg_j
906 .else
907 vaesenclast \T2, reg_j, \T3
908 vmovdqu 16*i(arg3, %r11), reg_j
909 vmovdqu \T3, 16*i(arg2, %r11)
910 .endif
911 i = (i+1)
912 j = (j+1)
913 setreg
914.endr
915 #######################################################################
916
917
        vpslldq $8, \T6, \T3                           # shift-L T6 2 DWs (low half into T3)
        vpsrldq $8, \T6, \T6                           # shift-R T6 2 DWs
920 vpxor \T3, \T7, \T7
921 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
922
923
924
925 #######################################################################
926 #first phase of the reduction
927 #######################################################################
        vpslld  $31, \T7, \T2                          # packed left shift << 31
        vpslld  $30, \T7, \T3                          # packed left shift << 30
        vpslld  $25, \T7, \T4                          # packed left shift << 25
931
932 vpxor \T3, \T2, \T2 # xor the shifted versions
933 vpxor \T4, \T2, \T2
934
        vpsrldq $4, \T2, \T1                           # shift-R T2 1 DW into T1
936
937 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
938 vpxor \T2, \T7, \T7 # first phase of the reduction complete
939 #######################################################################
940 .if \ENC_DEC == ENC
941 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
942 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
943 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
944 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
945 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
946 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
947 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
948 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
949 .endif
950
951 #######################################################################
952 #second phase of the reduction
        vpsrld  $1, \T7, \T2                           # packed right shift >> 1
        vpsrld  $2, \T7, \T3                           # packed right shift >> 2
        vpsrld  $7, \T7, \T4                           # packed right shift >> 7
956 vpxor \T3, \T2, \T2 # xor the shifted versions
957 vpxor \T4, \T2, \T2
958
959 vpxor \T1, \T2, \T2
960 vpxor \T2, \T7, \T7
961 vpxor \T7, \T6, \T6 # the result is in T6
962 #######################################################################
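        # reduction sketch: the polynomial is the bit-reflected GCM modulus
        # (128,127,126,121,0) named in the GHASH_MUL headers, so folding the
        # low 128 bits back in costs the three left shifts (31, 30, 25) plus
        # the three right shifts (1, 2, 7) that mirror the x^7 + x^2 + x
        # terms of g(x) = x^128 + x^7 + x^2 + x + 1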
963
964 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
965 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
966 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
967 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
968 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
969 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
970 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
971 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
972
973
974 vpxor \T6, \XMM1, \XMM1
975
976
977
978.endm
979
980
# GHASH the last 8 ciphertext blocks.
982.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
983
984 ## Karatsuba Method
985
986
987 vpshufd $0b01001110, \XMM1, \T2
988 vpxor \XMM1, \T2, \T2
989 vmovdqa HashKey_8(arg1), \T5
990 vpclmulqdq $0x11, \T5, \XMM1, \T6
991 vpclmulqdq $0x00, \T5, \XMM1, \T7
992
993 vmovdqa HashKey_8_k(arg1), \T3
994 vpclmulqdq $0x00, \T3, \T2, \XMM1
995
996 ######################
997
998 vpshufd $0b01001110, \XMM2, \T2
999 vpxor \XMM2, \T2, \T2
1000 vmovdqa HashKey_7(arg1), \T5
1001 vpclmulqdq $0x11, \T5, \XMM2, \T4
1002 vpxor \T4, \T6, \T6
1003
1004 vpclmulqdq $0x00, \T5, \XMM2, \T4
1005 vpxor \T4, \T7, \T7
1006
1007 vmovdqa HashKey_7_k(arg1), \T3
1008 vpclmulqdq $0x00, \T3, \T2, \T2
1009 vpxor \T2, \XMM1, \XMM1
1010
1011 ######################
1012
1013 vpshufd $0b01001110, \XMM3, \T2
1014 vpxor \XMM3, \T2, \T2
1015 vmovdqa HashKey_6(arg1), \T5
1016 vpclmulqdq $0x11, \T5, \XMM3, \T4
1017 vpxor \T4, \T6, \T6
1018
1019 vpclmulqdq $0x00, \T5, \XMM3, \T4
1020 vpxor \T4, \T7, \T7
1021
1022 vmovdqa HashKey_6_k(arg1), \T3
1023 vpclmulqdq $0x00, \T3, \T2, \T2
1024 vpxor \T2, \XMM1, \XMM1
1025
1026 ######################
1027
1028 vpshufd $0b01001110, \XMM4, \T2
1029 vpxor \XMM4, \T2, \T2
1030 vmovdqa HashKey_5(arg1), \T5
1031 vpclmulqdq $0x11, \T5, \XMM4, \T4
1032 vpxor \T4, \T6, \T6
1033
1034 vpclmulqdq $0x00, \T5, \XMM4, \T4
1035 vpxor \T4, \T7, \T7
1036
1037 vmovdqa HashKey_5_k(arg1), \T3
1038 vpclmulqdq $0x00, \T3, \T2, \T2
1039 vpxor \T2, \XMM1, \XMM1
1040
1041 ######################
1042
1043 vpshufd $0b01001110, \XMM5, \T2
1044 vpxor \XMM5, \T2, \T2
1045 vmovdqa HashKey_4(arg1), \T5
1046 vpclmulqdq $0x11, \T5, \XMM5, \T4
1047 vpxor \T4, \T6, \T6
1048
1049 vpclmulqdq $0x00, \T5, \XMM5, \T4
1050 vpxor \T4, \T7, \T7
1051
1052 vmovdqa HashKey_4_k(arg1), \T3
1053 vpclmulqdq $0x00, \T3, \T2, \T2
1054 vpxor \T2, \XMM1, \XMM1
1055
1056 ######################
1057
1058 vpshufd $0b01001110, \XMM6, \T2
1059 vpxor \XMM6, \T2, \T2
1060 vmovdqa HashKey_3(arg1), \T5
1061 vpclmulqdq $0x11, \T5, \XMM6, \T4
1062 vpxor \T4, \T6, \T6
1063
1064 vpclmulqdq $0x00, \T5, \XMM6, \T4
1065 vpxor \T4, \T7, \T7
1066
1067 vmovdqa HashKey_3_k(arg1), \T3
1068 vpclmulqdq $0x00, \T3, \T2, \T2
1069 vpxor \T2, \XMM1, \XMM1
1070
1071 ######################
1072
1073 vpshufd $0b01001110, \XMM7, \T2
1074 vpxor \XMM7, \T2, \T2
1075 vmovdqa HashKey_2(arg1), \T5
1076 vpclmulqdq $0x11, \T5, \XMM7, \T4
1077 vpxor \T4, \T6, \T6
1078
1079 vpclmulqdq $0x00, \T5, \XMM7, \T4
1080 vpxor \T4, \T7, \T7
1081
1082 vmovdqa HashKey_2_k(arg1), \T3
1083 vpclmulqdq $0x00, \T3, \T2, \T2
1084 vpxor \T2, \XMM1, \XMM1
1085
1086 ######################
1087
1088 vpshufd $0b01001110, \XMM8, \T2
1089 vpxor \XMM8, \T2, \T2
1090 vmovdqa HashKey(arg1), \T5
1091 vpclmulqdq $0x11, \T5, \XMM8, \T4
1092 vpxor \T4, \T6, \T6
1093
1094 vpclmulqdq $0x00, \T5, \XMM8, \T4
1095 vpxor \T4, \T7, \T7
1096
1097 vmovdqa HashKey_k(arg1), \T3
1098 vpclmulqdq $0x00, \T3, \T2, \T2
1099
1100 vpxor \T2, \XMM1, \XMM1
1101 vpxor \T6, \XMM1, \XMM1
1102 vpxor \T7, \XMM1, \T2
1103
1104
1105
1106
1107 vpslldq $8, \T2, \T4
1108 vpsrldq $8, \T2, \T2
1109
1110 vpxor \T4, \T7, \T7
1111 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1112 # the accumulated carry-less multiplications
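
        # Karatsuba recap: each block needs only three carry-less products,
        # a1*b1, a0*b0 and (a1^a0)*(b1^b0) with the precomputed
        # HashKey_i_k = b1^b0; XORing the first two into the third (the
        # vpxor pair above) recovers the middle term before the T6:T7 split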
1113
1114 #######################################################################
1115 #first phase of the reduction
        vpslld  $31, \T7, \T2                          # packed left shift << 31
        vpslld  $30, \T7, \T3                          # packed left shift << 30
        vpslld  $25, \T7, \T4                          # packed left shift << 25
1119
1120 vpxor \T3, \T2, \T2 # xor the shifted versions
1121 vpxor \T4, \T2, \T2
1122
        vpsrldq $4, \T2, \T1                           # shift-R T2 1 DW into T1
1124
1125 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1126 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1127 #######################################################################
1128
1129
1130 #second phase of the reduction
        vpsrld  $1, \T7, \T2                           # packed right shift >> 1
        vpsrld  $2, \T7, \T3                           # packed right shift >> 2
        vpsrld  $7, \T7, \T4                           # packed right shift >> 7
1134 vpxor \T3, \T2, \T2 # xor the shifted versions
1135 vpxor \T4, \T2, \T2
1136
1137 vpxor \T1, \T2, \T2
1138 vpxor \T2, \T7, \T7
1139 vpxor \T7, \T6, \T6 # the result is in T6
1140
1141.endm
1142
1143
1144# combined for GCM encrypt and decrypt functions
1145# clobbering all xmm registers
1146# clobbering r10, r11, r12, r13, r14, r15
1147.macro GCM_ENC_DEC_AVX ENC_DEC
1148
1149 #the number of pushes must equal STACK_OFFSET
1150 push %r12
1151 push %r13
1152 push %r14
1153 push %r15
1154
1155 mov %rsp, %r14
1156
1157
1158
1159
1160 sub $VARIABLE_OFFSET, %rsp
1161 and $~63, %rsp # align rsp to 64 bytes
1162
1163
1164 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
1165
1166 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
1167 and $-16, %r13 # r13 = r13 - (r13 mod 16)
1168
1169 mov %r13, %r12
1170 shr $4, %r12
1171 and $7, %r12
1172 jz _initial_num_blocks_is_0\@
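        # handle (len/16) mod 8 blocks up front so the byte count left in
        # r13 is an exact multiple of 128 for the 8-way parallel loop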
1173
1174 cmp $7, %r12
1175 je _initial_num_blocks_is_7\@
1176 cmp $6, %r12
1177 je _initial_num_blocks_is_6\@
1178 cmp $5, %r12
1179 je _initial_num_blocks_is_5\@
1180 cmp $4, %r12
1181 je _initial_num_blocks_is_4\@
1182 cmp $3, %r12
1183 je _initial_num_blocks_is_3\@
1184 cmp $2, %r12
1185 je _initial_num_blocks_is_2\@
1186
1187 jmp _initial_num_blocks_is_1\@
1188
1189_initial_num_blocks_is_7\@:
1190 INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1191 sub $16*7, %r13
1192 jmp _initial_blocks_encrypted\@
1193
1194_initial_num_blocks_is_6\@:
1195 INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1196 sub $16*6, %r13
1197 jmp _initial_blocks_encrypted\@
1198
1199_initial_num_blocks_is_5\@:
1200 INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1201 sub $16*5, %r13
1202 jmp _initial_blocks_encrypted\@
1203
1204_initial_num_blocks_is_4\@:
1205 INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1206 sub $16*4, %r13
1207 jmp _initial_blocks_encrypted\@
1208
1209_initial_num_blocks_is_3\@:
1210 INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1211 sub $16*3, %r13
1212 jmp _initial_blocks_encrypted\@
1213
1214_initial_num_blocks_is_2\@:
1215 INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1216 sub $16*2, %r13
1217 jmp _initial_blocks_encrypted\@
1218
1219_initial_num_blocks_is_1\@:
1220 INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1221 sub $16*1, %r13
1222 jmp _initial_blocks_encrypted\@
1223
1224_initial_num_blocks_is_0\@:
1225 INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1226
1227
1228_initial_blocks_encrypted\@:
1229 cmp $0, %r13
1230 je _zero_cipher_left\@
1231
1232 sub $128, %r13
1233 je _eight_cipher_left\@
1234
1235
1236
1237
1238 vmovd %xmm9, %r15d
1239 and $255, %r15d
1240 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1241
1242
1243_encrypt_by_8_new\@:
1244 cmp $(255-8), %r15d
1245 jg _encrypt_by_8\@
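        # r15d caches the low byte of the counter; if adding 8 more would
        # carry out of that byte, take the in_order path below so the carry
        # propagates through the full 32-bit counter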
1246
1247
1248
1249 add $8, %r15b
1250 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1251 add $128, %r11
1252 sub $128, %r13
1253 jne _encrypt_by_8_new\@
1254
1255 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1256 jmp _eight_cipher_left\@
1257
1258_encrypt_by_8\@:
1259 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1260 add $8, %r15b
1261 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1262 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1263 add $128, %r11
1264 sub $128, %r13
1265 jne _encrypt_by_8_new\@
1266
1267 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1268
1269
1270
1271
1272_eight_cipher_left\@:
1273 GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1274
1275
1276_zero_cipher_left\@:
1277 cmp $16, arg4
1278 jl _only_less_than_16\@
1279
1280 mov arg4, %r13
1281 and $15, %r13 # r13 = (arg4 mod 16)
1282
1283 je _multiple_of_16_bytes\@
1284
        # handle the last <16 Byte block separately
1286
1287
1288 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1289 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1290 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1291
1292 sub $16, %r11
1293 add %r13, %r11
1294 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
1295
1296 lea SHIFT_MASK+16(%rip), %r12
1297 sub %r13, %r12 # adjust the shuffle mask pointer to be
1298 # able to shift 16-r13 bytes (r13 is the
1299 # number of bytes in plaintext mod 16)
1300 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
1301 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
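        # the SHIFT_MASK/ALL_F tables work as a pair: indexing SHIFT_MASK at
        # +16-r13 yields a vpshufb control that right-aligns the r13 valid
        # bytes, and ALL_F-SHIFT_MASK(%r12) below yields a byte mask that
        # zeroes the unused top 16-r13 bytes of E(K,Yn) before GHASH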
1302 jmp _final_ghash_mul\@
1303
1304_only_less_than_16\@:
1305 # check for 0 length
1306 mov arg4, %r13
1307 and $15, %r13 # r13 = (arg4 mod 16)
1308
1309 je _multiple_of_16_bytes\@
1310
        # handle the last <16 Byte block separately
1312
1313
1314 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1315 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1316 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1317
1318
1319 lea SHIFT_MASK+16(%rip), %r12
1320 sub %r13, %r12 # adjust the shuffle mask pointer to be
1321 # able to shift 16-r13 bytes (r13 is the
1322 # number of bytes in plaintext mod 16)
1323
1324_get_last_16_byte_loop\@:
1325 movb (arg3, %r11), %al
1326 movb %al, TMP1 (%rsp , %r11)
1327 add $1, %r11
1328 cmp %r13, %r11
1329 jne _get_last_16_byte_loop\@
1330
1331 vmovdqu TMP1(%rsp), %xmm1
1332
1333 sub $16, %r11
1334
1335_final_ghash_mul\@:
1336 .if \ENC_DEC == DEC
1337 vmovdqa %xmm1, %xmm2
1338 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1339 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1340 # mask out top 16-r13 bytes of xmm9
1341 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1342 vpand %xmm1, %xmm2, %xmm2
1343 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1344 vpxor %xmm2, %xmm14, %xmm14
1345 #GHASH computation for the last <16 Byte block
1346 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1347 sub %r13, %r11
1348 add $16, %r11
1349 .else
1350 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1351 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1352 # mask out top 16-r13 bytes of xmm9
1353 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1354 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1355 vpxor %xmm9, %xmm14, %xmm14
1356 #GHASH computation for the last <16 Byte block
1357 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1358 sub %r13, %r11
1359 add $16, %r11
1360 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
1361 .endif
1362
1363
1364 #############################
1365 # output r13 Bytes
1366 vmovq %xmm9, %rax
1367 cmp $8, %r13
1368 jle _less_than_8_bytes_left\@
1369
1370 mov %rax, (arg2 , %r11)
1371 add $8, %r11
1372 vpsrldq $8, %xmm9, %xmm9
1373 vmovq %xmm9, %rax
1374 sub $8, %r13
1375
1376_less_than_8_bytes_left\@:
1377 movb %al, (arg2 , %r11)
1378 add $1, %r11
1379 shr $8, %rax
1380 sub $1, %r13
1381 jne _less_than_8_bytes_left\@
1382 #############################
1383
1384_multiple_of_16_bytes\@:
1385 mov arg7, %r12 # r12 = aadLen (number of bytes)
1386 shl $3, %r12 # convert into number of bits
1387 vmovd %r12d, %xmm15 # len(A) in xmm15
1388
        shl     $3, arg4                               # len(C) in bits (*8)
1390 vmovq arg4, %xmm1
1391 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
1392 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
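        # per the GCM spec, the final GHASH input is the 128-bit block
        # [len(AAD) in bits (64) || len(C) in bits (64)]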
1393
1394 vpxor %xmm15, %xmm14, %xmm14
1395 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
1396 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
1397
1398 mov arg5, %rax # rax = *Y0
1399 vmovdqu (%rax), %xmm9 # xmm9 = Y0
1400
1401 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
1402
1403 vpxor %xmm14, %xmm9, %xmm9
1404
1405
1406
1407_return_T\@:
1408 mov arg8, %r10 # r10 = authTag
1409 mov arg9, %r11 # r11 = auth_tag_len
1410
1411 cmp $16, %r11
1412 je _T_16\@
1413
1414 cmp $12, %r11
1415 je _T_12\@
1416
1417_T_8\@:
1418 vmovq %xmm9, %rax
1419 mov %rax, (%r10)
1420 jmp _return_T_done\@
1421_T_12\@:
1422 vmovq %xmm9, %rax
1423 mov %rax, (%r10)
1424 vpsrldq $8, %xmm9, %xmm9
1425 vmovd %xmm9, %eax
1426 mov %eax, 8(%r10)
1427 jmp _return_T_done\@
1428
1429_T_16\@:
1430 vmovdqu %xmm9, (%r10)
1431
1432_return_T_done\@:
1433 mov %r14, %rsp
1434
1435 pop %r15
1436 pop %r14
1437 pop %r13
1438 pop %r12
1439.endm
1440
1441
1442#############################################################
1443#void aesni_gcm_precomp_avx_gen2
1444# (gcm_data *my_ctx_data,
1445# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1446#############################################################
1447ENTRY(aesni_gcm_precomp_avx_gen2)
1448 #the number of pushes must equal STACK_OFFSET
1449 push %r12
1450 push %r13
1451 push %r14
1452 push %r15
1453
1454 mov %rsp, %r14
1455
1456
1457
1458 sub $VARIABLE_OFFSET, %rsp
1459 and $~63, %rsp # align rsp to 64 bytes
1460
1461 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
1462
1463 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
1464 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1465 vmovdqa %xmm6, %xmm2
1466 vpsllq $1, %xmm6, %xmm6
1467 vpsrlq $63, %xmm2, %xmm2
1468 vmovdqa %xmm2, %xmm1
1469 vpslldq $8, %xmm2, %xmm2
1470 vpsrldq $8, %xmm1, %xmm1
1471 vpor %xmm2, %xmm6, %xmm6
1472 #reduction
1473 vpshufd $0b00100100, %xmm1, %xmm2
1474 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1475 vpand POLY(%rip), %xmm2, %xmm2
1476 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
1477 #######################################################################
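        # multiplying H by x once here matches the bit-reflected operand
        # convention of vpclmulqdq: GHASH_MUL computes A*B*x mod poly, so
        # feeding it HashKey<<1 mod poly yields the correctly aligned product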
1478 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
1479
1480
1481 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1482
1483 mov %r14, %rsp
1484
1485 pop %r15
1486 pop %r14
1487 pop %r13
1488 pop %r12
1489 ret
1490ENDPROC(aesni_gcm_precomp_avx_gen2)
1491
1492###############################################################################
1493#void aesni_gcm_enc_avx_gen2(
1494# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1495# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1496# const u8 *in, /* Plaintext input */
1497# u64 plaintext_len, /* Length of data in Bytes for encryption. */
1498# u8 *iv, /* Pre-counter block j0: 4 byte salt
1499# (from Security Association) concatenated with 8 byte
1500# Initialisation Vector (from IPSec ESP Payload)
1501# concatenated with 0x00000001. 16-byte aligned pointer. */
1502# const u8 *aad, /* Additional Authentication Data (AAD)*/
1503# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1504# u8 *auth_tag, /* Authenticated Tag output. */
1505# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
1506# Valid values are 16 (most likely), 12 or 8. */
1507###############################################################################
1508ENTRY(aesni_gcm_enc_avx_gen2)
1509 GCM_ENC_DEC_AVX ENC
1510 ret
1511ENDPROC(aesni_gcm_enc_avx_gen2)
1512
1513###############################################################################
1514#void aesni_gcm_dec_avx_gen2(
1515# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1516# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1517# const u8 *in, /* Ciphertext input */
#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
1519# u8 *iv, /* Pre-counter block j0: 4 byte salt
1520# (from Security Association) concatenated with 8 byte
1521# Initialisation Vector (from IPSec ESP Payload)
1522# concatenated with 0x00000001. 16-byte aligned pointer. */
1523# const u8 *aad, /* Additional Authentication Data (AAD)*/
1524# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1525# u8 *auth_tag, /* Authenticated Tag output. */
1526# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
1527# Valid values are 16 (most likely), 12 or 8. */
1528###############################################################################
1529ENTRY(aesni_gcm_dec_avx_gen2)
1530 GCM_ENC_DEC_AVX DEC
1531 ret
1532ENDPROC(aesni_gcm_dec_avx_gen2)
1533#endif /* CONFIG_AS_AVX */
1534
1535#ifdef CONFIG_AS_AVX2
1536###############################################################################
1537# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1538# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly (i.e. >>1)
1540# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1541# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1542###############################################################################
1543.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1544
1545 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1546 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1547 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1548 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1549 vpxor \T3, \GH, \GH
1550
1551
1552 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1553 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1554
1555 vpxor \T3, \T1, \T1
1556 vpxor \T2, \GH, \GH
1557
1558 #######################################################################
1559 #first phase of the reduction
1560 vmovdqa POLY2(%rip), \T3
1561
1562 vpclmulqdq $0x01, \GH, \T3, \T2
1563 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1564
1565 vpxor \T2, \GH, \GH # first phase of the reduction complete
1566 #######################################################################
1567 #second phase of the reduction
1568 vpclmulqdq $0x00, \GH, \T3, \T2
1569 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1570
1571 vpclmulqdq $0x10, \GH, \T3, \GH
1572 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1573
1574 vpxor \T2, \GH, \GH # second phase of the reduction complete
1575 #######################################################################
1576 vpxor \T1, \GH, \GH # the result is in GH
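
        # unlike the gen2 shift-based reduction, gen4 folds with two
        # vpclmulqdq operations against the POLY2 constant, trading the
        # shift/xor chains for carry-less multiplies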
1577
1578
1579.endm
1580
1581.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1582
        # precompute HashKey^i<<1 mod poly for i = 2..8; unlike gen2, this
        # path does not store the Karatsuba HashKey_i_k values
1584 vmovdqa \HK, \T5
1585 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1586 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
1587
1588 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1589 vmovdqa \T5, HashKey_3(arg1)
1590
1591 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1592 vmovdqa \T5, HashKey_4(arg1)
1593
1594 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1595 vmovdqa \T5, HashKey_5(arg1)
1596
1597 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1598 vmovdqa \T5, HashKey_6(arg1)
1599
1600 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1601 vmovdqa \T5, HashKey_7(arg1)
1602
1603 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1604 vmovdqa \T5, HashKey_8(arg1)
1605
1606.endm
1607
1608
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as pointers only, not modified
1615
1616.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1617 i = (8-\num_initial_blocks)
1618 setreg
1619
1620 mov arg6, %r10 # r10 = AAD
1621 mov arg7, %r12 # r12 = aadLen
1622
1623
1624 mov %r12, %r11
1625
1626 vpxor reg_i, reg_i, reg_i
1627_get_AAD_loop\@:
1628 vmovd (%r10), \T1
1629 vpslldq $12, \T1, \T1
1630 vpsrldq $4, reg_i, reg_i
1631 vpxor \T1, reg_i, reg_i
1632
1633 add $4, %r10
1634 sub $4, %r12
1635 jg _get_AAD_loop\@
1636
1637
1638 cmp $16, %r11
1639 je _get_AAD_loop2_done\@
1640 mov $16, %r12
1641
1642_get_AAD_loop2\@:
1643 vpsrldq $4, reg_i, reg_i
1644 sub $4, %r12
1645 cmp %r11, %r12
1646 jg _get_AAD_loop2\@
1647
1648_get_AAD_loop2_done\@:
1649
1650 #byte-reflect the AAD data
1651 vpshufb SHUF_MASK(%rip), reg_i, reg_i
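        # the loops above insert the AAD 4 bytes at a time at the top of
        # reg_i, so at most 16 bytes are retained; the prototypes in this
        # file note aad_len is 8 or 12 bytes for RFC4106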
1652
1653 # initialize the data pointer offset as zero
1654 xor %r11, %r11
1655
1656 # start AES for num_initial_blocks blocks
1657 mov arg5, %rax # rax = *Y0
1658 vmovdqu (%rax), \CTR # CTR = Y0
1659 vpshufb SHUF_MASK(%rip), \CTR, \CTR
1660
1661
1662 i = (9-\num_initial_blocks)
1663 setreg
1664.rep \num_initial_blocks
1665 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1666 vmovdqa \CTR, reg_i
1667 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1668 i = (i+1)
1669 setreg
1670.endr
1671
1672 vmovdqa (arg1), \T_key
1673 i = (9-\num_initial_blocks)
1674 setreg
1675.rep \num_initial_blocks
1676 vpxor \T_key, reg_i, reg_i
1677 i = (i+1)
1678 setreg
1679.endr
1680
1681 j = 1
1682 setreg
1683.rep 9
1684 vmovdqa 16*j(arg1), \T_key
1685 i = (9-\num_initial_blocks)
1686 setreg
1687.rep \num_initial_blocks
1688 vaesenc \T_key, reg_i, reg_i
1689 i = (i+1)
1690 setreg
1691.endr
1692
1693 j = (j+1)
1694 setreg
1695.endr
1696
1697
1698 vmovdqa 16*10(arg1), \T_key
1699 i = (9-\num_initial_blocks)
1700 setreg
1701.rep \num_initial_blocks
1702 vaesenclast \T_key, reg_i, reg_i
1703 i = (i+1)
1704 setreg
1705.endr
1706
1707 i = (9-\num_initial_blocks)
1708 setreg
1709.rep \num_initial_blocks
1710 vmovdqu (arg3, %r11), \T1
1711 vpxor \T1, reg_i, reg_i
1712 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
1713 # num_initial_blocks blocks
1714 add $16, %r11
1715.if \ENC_DEC == DEC
1716 vmovdqa \T1, reg_i
1717.endif
1718 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1719 i = (i+1)
1720 setreg
1721.endr
1722
1723
1724 i = (8-\num_initial_blocks)
1725 j = (9-\num_initial_blocks)
1726 setreg
1727 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1728
1729.rep \num_initial_blocks
1730 vpxor reg_i, reg_j, reg_j
1731 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1732 i = (i+1)
1733 j = (j+1)
1734 setreg
1735.endr
1736 # XMM8 has the combined result here
1737
1738 vmovdqa \XMM8, TMP1(%rsp)
1739 vmovdqa \XMM8, \T3
1740
1741 cmp $128, %r13
1742 jl _initial_blocks_done\@ # no need for precomputed constants
1743
1744###############################################################################
# prepare eight counter blocks ahead of time for the 8-way parallel loop
1746 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1747 vmovdqa \CTR, \XMM1
1748 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1749
1750 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1751 vmovdqa \CTR, \XMM2
1752 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1753
1754 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1755 vmovdqa \CTR, \XMM3
1756 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1757
1758 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1759 vmovdqa \CTR, \XMM4
1760 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1761
1762 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1763 vmovdqa \CTR, \XMM5
1764 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1765
1766 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1767 vmovdqa \CTR, \XMM6
1768 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1769
1770 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1771 vmovdqa \CTR, \XMM7
1772 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1773
1774 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1775 vmovdqa \CTR, \XMM8
1776 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1777
1778 vmovdqa (arg1), \T_key
1779 vpxor \T_key, \XMM1, \XMM1
1780 vpxor \T_key, \XMM2, \XMM2
1781 vpxor \T_key, \XMM3, \XMM3
1782 vpxor \T_key, \XMM4, \XMM4
1783 vpxor \T_key, \XMM5, \XMM5
1784 vpxor \T_key, \XMM6, \XMM6
1785 vpxor \T_key, \XMM7, \XMM7
1786 vpxor \T_key, \XMM8, \XMM8
1787
1788 i = 1
1789 setreg
1790.rep 9 # do 9 rounds
1791 vmovdqa 16*i(arg1), \T_key
1792 vaesenc \T_key, \XMM1, \XMM1
1793 vaesenc \T_key, \XMM2, \XMM2
1794 vaesenc \T_key, \XMM3, \XMM3
1795 vaesenc \T_key, \XMM4, \XMM4
1796 vaesenc \T_key, \XMM5, \XMM5
1797 vaesenc \T_key, \XMM6, \XMM6
1798 vaesenc \T_key, \XMM7, \XMM7
1799 vaesenc \T_key, \XMM8, \XMM8
1800 i = (i+1)
1801 setreg
1802.endr
1803
1804
1805 vmovdqa 16*i(arg1), \T_key
1806 vaesenclast \T_key, \XMM1, \XMM1
1807 vaesenclast \T_key, \XMM2, \XMM2
1808 vaesenclast \T_key, \XMM3, \XMM3
1809 vaesenclast \T_key, \XMM4, \XMM4
1810 vaesenclast \T_key, \XMM5, \XMM5
1811 vaesenclast \T_key, \XMM6, \XMM6
1812 vaesenclast \T_key, \XMM7, \XMM7
1813 vaesenclast \T_key, \XMM8, \XMM8
1814
1815 vmovdqu (arg3, %r11), \T1
1816 vpxor \T1, \XMM1, \XMM1
1817 vmovdqu \XMM1, (arg2 , %r11)
1818 .if \ENC_DEC == DEC
1819 vmovdqa \T1, \XMM1
1820 .endif
1821
1822 vmovdqu 16*1(arg3, %r11), \T1
1823 vpxor \T1, \XMM2, \XMM2
1824 vmovdqu \XMM2, 16*1(arg2 , %r11)
1825 .if \ENC_DEC == DEC
1826 vmovdqa \T1, \XMM2
1827 .endif
1828
1829 vmovdqu 16*2(arg3, %r11), \T1
1830 vpxor \T1, \XMM3, \XMM3
1831 vmovdqu \XMM3, 16*2(arg2 , %r11)
1832 .if \ENC_DEC == DEC
1833 vmovdqa \T1, \XMM3
1834 .endif
1835
1836 vmovdqu 16*3(arg3, %r11), \T1
1837 vpxor \T1, \XMM4, \XMM4
1838 vmovdqu \XMM4, 16*3(arg2 , %r11)
1839 .if \ENC_DEC == DEC
1840 vmovdqa \T1, \XMM4
1841 .endif
1842
1843 vmovdqu 16*4(arg3, %r11), \T1
1844 vpxor \T1, \XMM5, \XMM5
1845 vmovdqu \XMM5, 16*4(arg2 , %r11)
1846 .if \ENC_DEC == DEC
1847 vmovdqa \T1, \XMM5
1848 .endif
1849
1850 vmovdqu 16*5(arg3, %r11), \T1
1851 vpxor \T1, \XMM6, \XMM6
1852 vmovdqu \XMM6, 16*5(arg2 , %r11)
1853 .if \ENC_DEC == DEC
1854 vmovdqa \T1, \XMM6
1855 .endif
1856
1857 vmovdqu 16*6(arg3, %r11), \T1
1858 vpxor \T1, \XMM7, \XMM7
1859 vmovdqu \XMM7, 16*6(arg2 , %r11)
1860 .if \ENC_DEC == DEC
1861 vmovdqa \T1, \XMM7
1862 .endif
1863
1864 vmovdqu 16*7(arg3, %r11), \T1
1865 vpxor \T1, \XMM8, \XMM8
1866 vmovdqu \XMM8, 16*7(arg2 , %r11)
1867 .if \ENC_DEC == DEC
1868 vmovdqa \T1, \XMM8
1869 .endif
1870
1871 add $128, %r11
1872
1873 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1874 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
1875 # the corresponding ciphertext
1876 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1877 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1878 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1879 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1880 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1881 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1882 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1883
1884###############################################################################
1885
1886_initial_blocks_done\@:
1887
1888
1889.endm
1890
1891
1892
1893# encrypt 8 blocks at a time
1894# ghash the 8 previously encrypted ciphertext blocks
1895# arg1, arg2, arg3 are used as pointers only, not modified
1896# r11 is the data offset value
1897.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1898
1899 vmovdqa \XMM1, \T2
1900 vmovdqa \XMM2, TMP2(%rsp)
1901 vmovdqa \XMM3, TMP3(%rsp)
1902 vmovdqa \XMM4, TMP4(%rsp)
1903 vmovdqa \XMM5, TMP5(%rsp)
1904 vmovdqa \XMM6, TMP6(%rsp)
1905 vmovdqa \XMM7, TMP7(%rsp)
1906 vmovdqa \XMM8, TMP8(%rsp)
1907
1908.if \loop_idx == in_order
1909 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1910 vpaddd ONE(%rip), \XMM1, \XMM2
1911 vpaddd ONE(%rip), \XMM2, \XMM3
1912 vpaddd ONE(%rip), \XMM3, \XMM4
1913 vpaddd ONE(%rip), \XMM4, \XMM5
1914 vpaddd ONE(%rip), \XMM5, \XMM6
1915 vpaddd ONE(%rip), \XMM6, \XMM7
1916 vpaddd ONE(%rip), \XMM7, \XMM8
1917 vmovdqa \XMM8, \CTR
1918
1919 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1920 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1921 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1922 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1923 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1924 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1925 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1926 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1927.else
1928 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1929 vpaddd ONEf(%rip), \XMM1, \XMM2
1930 vpaddd ONEf(%rip), \XMM2, \XMM3
1931 vpaddd ONEf(%rip), \XMM3, \XMM4
1932 vpaddd ONEf(%rip), \XMM4, \XMM5
1933 vpaddd ONEf(%rip), \XMM5, \XMM6
1934 vpaddd ONEf(%rip), \XMM6, \XMM7
1935 vpaddd ONEf(%rip), \XMM7, \XMM8
1936 vmovdqa \XMM8, \CTR
1937.endif
1938
1939
1940 #######################################################################
1941
1942 vmovdqu (arg1), \T1
1943 vpxor \T1, \XMM1, \XMM1
1944 vpxor \T1, \XMM2, \XMM2
1945 vpxor \T1, \XMM3, \XMM3
1946 vpxor \T1, \XMM4, \XMM4
1947 vpxor \T1, \XMM5, \XMM5
1948 vpxor \T1, \XMM6, \XMM6
1949 vpxor \T1, \XMM7, \XMM7
1950 vpxor \T1, \XMM8, \XMM8
1951
1952 #######################################################################
1953
1954
1955
1956
1957
1958 vmovdqu 16*1(arg1), \T1
1959 vaesenc \T1, \XMM1, \XMM1
1960 vaesenc \T1, \XMM2, \XMM2
1961 vaesenc \T1, \XMM3, \XMM3
1962 vaesenc \T1, \XMM4, \XMM4
1963 vaesenc \T1, \XMM5, \XMM5
1964 vaesenc \T1, \XMM6, \XMM6
1965 vaesenc \T1, \XMM7, \XMM7
1966 vaesenc \T1, \XMM8, \XMM8
1967
1968 vmovdqu 16*2(arg1), \T1
1969 vaesenc \T1, \XMM1, \XMM1
1970 vaesenc \T1, \XMM2, \XMM2
1971 vaesenc \T1, \XMM3, \XMM3
1972 vaesenc \T1, \XMM4, \XMM4
1973 vaesenc \T1, \XMM5, \XMM5
1974 vaesenc \T1, \XMM6, \XMM6
1975 vaesenc \T1, \XMM7, \XMM7
1976 vaesenc \T1, \XMM8, \XMM8
1977
1978
1979 #######################################################################
1980
1981 vmovdqa HashKey_8(arg1), \T5
1982 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1983 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1984 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
1985 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
1986 vpxor \T5, \T6, \T6
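
        # gen4 computes all four partial products (a1*b1, a0*b0, a1*b0,
        # a0*b1) per block instead of the gen2 Karatsuba form, avoiding the
        # HashKey_i_k table loads at the cost of one extra vpclmulqdq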
1987
1988 vmovdqu 16*3(arg1), \T1
1989 vaesenc \T1, \XMM1, \XMM1
1990 vaesenc \T1, \XMM2, \XMM2
1991 vaesenc \T1, \XMM3, \XMM3
1992 vaesenc \T1, \XMM4, \XMM4
1993 vaesenc \T1, \XMM5, \XMM5
1994 vaesenc \T1, \XMM6, \XMM6
1995 vaesenc \T1, \XMM7, \XMM7
1996 vaesenc \T1, \XMM8, \XMM8
1997
1998 vmovdqa TMP2(%rsp), \T1
1999 vmovdqa HashKey_7(arg1), \T5
2000 vpclmulqdq $0x11, \T5, \T1, \T3
2001 vpxor \T3, \T4, \T4
2002
2003 vpclmulqdq $0x00, \T5, \T1, \T3
2004 vpxor \T3, \T7, \T7
2005
2006 vpclmulqdq $0x01, \T5, \T1, \T3
2007 vpxor \T3, \T6, \T6
2008
2009 vpclmulqdq $0x10, \T5, \T1, \T3
2010 vpxor \T3, \T6, \T6
2011
2012 vmovdqu 16*4(arg1), \T1
2013 vaesenc \T1, \XMM1, \XMM1
2014 vaesenc \T1, \XMM2, \XMM2
2015 vaesenc \T1, \XMM3, \XMM3
2016 vaesenc \T1, \XMM4, \XMM4
2017 vaesenc \T1, \XMM5, \XMM5
2018 vaesenc \T1, \XMM6, \XMM6
2019 vaesenc \T1, \XMM7, \XMM7
2020 vaesenc \T1, \XMM8, \XMM8
2021
2022 #######################################################################
2023
2024 vmovdqa TMP3(%rsp), \T1
2025 vmovdqa HashKey_6(arg1), \T5
2026 vpclmulqdq $0x11, \T5, \T1, \T3
2027 vpxor \T3, \T4, \T4
2028
2029 vpclmulqdq $0x00, \T5, \T1, \T3
2030 vpxor \T3, \T7, \T7
2031
2032 vpclmulqdq $0x01, \T5, \T1, \T3
2033 vpxor \T3, \T6, \T6
2034
2035 vpclmulqdq $0x10, \T5, \T1, \T3
2036 vpxor \T3, \T6, \T6
2037
2038 vmovdqu 16*5(arg1), \T1
2039 vaesenc \T1, \XMM1, \XMM1
2040 vaesenc \T1, \XMM2, \XMM2
2041 vaesenc \T1, \XMM3, \XMM3
2042 vaesenc \T1, \XMM4, \XMM4
2043 vaesenc \T1, \XMM5, \XMM5
2044 vaesenc \T1, \XMM6, \XMM6
2045 vaesenc \T1, \XMM7, \XMM7
2046 vaesenc \T1, \XMM8, \XMM8
2047
2048 vmovdqa TMP4(%rsp), \T1
2049 vmovdqa HashKey_5(arg1), \T5
2050 vpclmulqdq $0x11, \T5, \T1, \T3
2051 vpxor \T3, \T4, \T4
2052
2053 vpclmulqdq $0x00, \T5, \T1, \T3
2054 vpxor \T3, \T7, \T7
2055
2056 vpclmulqdq $0x01, \T5, \T1, \T3
2057 vpxor \T3, \T6, \T6
2058
2059 vpclmulqdq $0x10, \T5, \T1, \T3
2060 vpxor \T3, \T6, \T6
2061
2062 vmovdqu 16*6(arg1), \T1
2063 vaesenc \T1, \XMM1, \XMM1
2064 vaesenc \T1, \XMM2, \XMM2
2065 vaesenc \T1, \XMM3, \XMM3
2066 vaesenc \T1, \XMM4, \XMM4
2067 vaesenc \T1, \XMM5, \XMM5
2068 vaesenc \T1, \XMM6, \XMM6
2069 vaesenc \T1, \XMM7, \XMM7
2070 vaesenc \T1, \XMM8, \XMM8
2071
2072
2073 vmovdqa TMP5(%rsp), \T1
2074 vmovdqa HashKey_4(arg1), \T5
2075 vpclmulqdq $0x11, \T5, \T1, \T3
2076 vpxor \T3, \T4, \T4
2077
2078 vpclmulqdq $0x00, \T5, \T1, \T3
2079 vpxor \T3, \T7, \T7
2080
2081 vpclmulqdq $0x01, \T5, \T1, \T3
2082 vpxor \T3, \T6, \T6
2083
2084 vpclmulqdq $0x10, \T5, \T1, \T3
2085 vpxor \T3, \T6, \T6
2086
2087 vmovdqu 16*7(arg1), \T1
2088 vaesenc \T1, \XMM1, \XMM1
2089 vaesenc \T1, \XMM2, \XMM2
2090 vaesenc \T1, \XMM3, \XMM3
2091 vaesenc \T1, \XMM4, \XMM4
2092 vaesenc \T1, \XMM5, \XMM5
2093 vaesenc \T1, \XMM6, \XMM6
2094 vaesenc \T1, \XMM7, \XMM7
2095 vaesenc \T1, \XMM8, \XMM8
2096
2097 vmovdqa TMP6(%rsp), \T1
2098 vmovdqa HashKey_3(arg1), \T5
2099 vpclmulqdq $0x11, \T5, \T1, \T3
2100 vpxor \T3, \T4, \T4
2101
2102 vpclmulqdq $0x00, \T5, \T1, \T3
2103 vpxor \T3, \T7, \T7
2104
2105 vpclmulqdq $0x01, \T5, \T1, \T3
2106 vpxor \T3, \T6, \T6
2107
2108 vpclmulqdq $0x10, \T5, \T1, \T3
2109 vpxor \T3, \T6, \T6
2110
2111 vmovdqu 16*8(arg1), \T1
2112 vaesenc \T1, \XMM1, \XMM1
2113 vaesenc \T1, \XMM2, \XMM2
2114 vaesenc \T1, \XMM3, \XMM3
2115 vaesenc \T1, \XMM4, \XMM4
2116 vaesenc \T1, \XMM5, \XMM5
2117 vaesenc \T1, \XMM6, \XMM6
2118 vaesenc \T1, \XMM7, \XMM7
2119 vaesenc \T1, \XMM8, \XMM8
2120
2121 vmovdqa TMP7(%rsp), \T1
2122 vmovdqa HashKey_2(arg1), \T5
2123 vpclmulqdq $0x11, \T5, \T1, \T3
2124 vpxor \T3, \T4, \T4
2125
2126 vpclmulqdq $0x00, \T5, \T1, \T3
2127 vpxor \T3, \T7, \T7
2128
2129 vpclmulqdq $0x01, \T5, \T1, \T3
2130 vpxor \T3, \T6, \T6
2131
2132 vpclmulqdq $0x10, \T5, \T1, \T3
2133 vpxor \T3, \T6, \T6
2134
2135
2136 #######################################################################
2137
2138 vmovdqu 16*9(arg1), \T5
2139 vaesenc \T5, \XMM1, \XMM1
2140 vaesenc \T5, \XMM2, \XMM2
2141 vaesenc \T5, \XMM3, \XMM3
2142 vaesenc \T5, \XMM4, \XMM4
2143 vaesenc \T5, \XMM5, \XMM5
2144 vaesenc \T5, \XMM6, \XMM6
2145 vaesenc \T5, \XMM7, \XMM7
2146 vaesenc \T5, \XMM8, \XMM8
2147
2148 vmovdqa TMP8(%rsp), \T1
2149 vmovdqa HashKey(arg1), \T5
2150
2151 vpclmulqdq $0x00, \T5, \T1, \T3
2152 vpxor \T3, \T7, \T7
2153
2154 vpclmulqdq $0x01, \T5, \T1, \T3
2155 vpxor \T3, \T6, \T6
2156
2157 vpclmulqdq $0x10, \T5, \T1, \T3
2158 vpxor \T3, \T6, \T6
2159
2160 vpclmulqdq $0x11, \T5, \T1, \T3
2161 vpxor \T3, \T4, \T1
2162
2163
2164 vmovdqu 16*10(arg1), \T5
2165
2166 i = 0
2167 j = 1
2168 setreg
2169.rep 8
2170 vpxor 16*i(arg3, %r11), \T5, \T2
2171 .if \ENC_DEC == ENC
2172 vaesenclast \T2, reg_j, reg_j
2173 .else
2174 vaesenclast \T2, reg_j, \T3
2175 vmovdqu 16*i(arg3, %r11), reg_j
2176 vmovdqu \T3, 16*i(arg2, %r11)
2177 .endif
2178 i = (i+1)
2179 j = (j+1)
2180 setreg
2181.endr
2182 #######################################################################
2183
2184
        vpslldq $8, \T6, \T3                           # shift-L T6 2 DWs (low half into T3)
        vpsrldq $8, \T6, \T6                           # shift-R T6 2 DWs
2187 vpxor \T3, \T7, \T7
2188 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2189
2190
2191
2192 #######################################################################
2193 #first phase of the reduction
2194 vmovdqa POLY2(%rip), \T3
2195
        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                           # shift-L T2 2 DWs
2198
2199 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2200 #######################################################################
2201 .if \ENC_DEC == ENC
2202 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
2203 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
2204 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
2205 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
2206 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
2207 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
2208 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
2209 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
2210 .endif
2211
2212 #######################################################################
2213 #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                           # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2219
2220 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2221 #######################################################################
2222 vpxor \T4, \T1, \T1 # the result is in T1
2223
2224 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2225 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2226 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2227 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2228 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2229 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2230 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2231 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2232
2233
2234 vpxor \T1, \XMM1, \XMM1
2235
2236
2237
2238.endm
2239
2240
# GHASH the last 8 ciphertext blocks.
2242.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2243
2244 ## Karatsuba Method
2245
2246 vmovdqa HashKey_8(arg1), \T5
2247
2248 vpshufd $0b01001110, \XMM1, \T2
2249 vpshufd $0b01001110, \T5, \T3
2250 vpxor \XMM1, \T2, \T2
2251 vpxor \T5, \T3, \T3
2252
2253 vpclmulqdq $0x11, \T5, \XMM1, \T6
2254 vpclmulqdq $0x00, \T5, \XMM1, \T7
2255
2256 vpclmulqdq $0x00, \T3, \T2, \XMM1
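
        # here both Karatsuba terms (a1^a0) and (b1^b0) are formed on the
        # fly with vpshufd/vpxor instead of loading precomputed HashKey_i_k
        # values as the gen2 path does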
2257
2258 ######################
2259
2260 vmovdqa HashKey_7(arg1), \T5
2261 vpshufd $0b01001110, \XMM2, \T2
2262 vpshufd $0b01001110, \T5, \T3
2263 vpxor \XMM2, \T2, \T2
2264 vpxor \T5, \T3, \T3
2265
2266 vpclmulqdq $0x11, \T5, \XMM2, \T4
2267 vpxor \T4, \T6, \T6
2268
2269 vpclmulqdq $0x00, \T5, \XMM2, \T4
2270 vpxor \T4, \T7, \T7
2271
2272 vpclmulqdq $0x00, \T3, \T2, \T2
2273
2274 vpxor \T2, \XMM1, \XMM1
2275
2276 ######################
2277
2278 vmovdqa HashKey_6(arg1), \T5
2279 vpshufd $0b01001110, \XMM3, \T2
2280 vpshufd $0b01001110, \T5, \T3
2281 vpxor \XMM3, \T2, \T2
2282 vpxor \T5, \T3, \T3
2283
2284 vpclmulqdq $0x11, \T5, \XMM3, \T4
2285 vpxor \T4, \T6, \T6
2286
2287 vpclmulqdq $0x00, \T5, \XMM3, \T4
2288 vpxor \T4, \T7, \T7
2289
2290 vpclmulqdq $0x00, \T3, \T2, \T2
2291
2292 vpxor \T2, \XMM1, \XMM1
2293
2294 ######################
2295
2296 vmovdqa HashKey_5(arg1), \T5
2297 vpshufd $0b01001110, \XMM4, \T2
2298 vpshufd $0b01001110, \T5, \T3
2299 vpxor \XMM4, \T2, \T2
2300 vpxor \T5, \T3, \T3
2301
2302 vpclmulqdq $0x11, \T5, \XMM4, \T4
2303 vpxor \T4, \T6, \T6
2304
2305 vpclmulqdq $0x00, \T5, \XMM4, \T4
2306 vpxor \T4, \T7, \T7
2307
2308 vpclmulqdq $0x00, \T3, \T2, \T2
2309
2310 vpxor \T2, \XMM1, \XMM1
2311
2312 ######################
2313
2314 vmovdqa HashKey_4(arg1), \T5
2315 vpshufd $0b01001110, \XMM5, \T2
2316 vpshufd $0b01001110, \T5, \T3
2317 vpxor \XMM5, \T2, \T2
2318 vpxor \T5, \T3, \T3
2319
2320 vpclmulqdq $0x11, \T5, \XMM5, \T4
2321 vpxor \T4, \T6, \T6
2322
2323 vpclmulqdq $0x00, \T5, \XMM5, \T4
2324 vpxor \T4, \T7, \T7
2325
2326 vpclmulqdq $0x00, \T3, \T2, \T2
2327
2328 vpxor \T2, \XMM1, \XMM1
2329
2330 ######################
2331
2332 vmovdqa HashKey_3(arg1), \T5
2333 vpshufd $0b01001110, \XMM6, \T2
2334 vpshufd $0b01001110, \T5, \T3
2335 vpxor \XMM6, \T2, \T2
2336 vpxor \T5, \T3, \T3
2337
2338 vpclmulqdq $0x11, \T5, \XMM6, \T4
2339 vpxor \T4, \T6, \T6
2340
2341 vpclmulqdq $0x00, \T5, \XMM6, \T4
2342 vpxor \T4, \T7, \T7
2343
2344 vpclmulqdq $0x00, \T3, \T2, \T2
2345
2346 vpxor \T2, \XMM1, \XMM1
2347
2348 ######################
2349
2350 vmovdqa HashKey_2(arg1), \T5
2351 vpshufd $0b01001110, \XMM7, \T2
2352 vpshufd $0b01001110, \T5, \T3
2353 vpxor \XMM7, \T2, \T2
2354 vpxor \T5, \T3, \T3
2355
2356 vpclmulqdq $0x11, \T5, \XMM7, \T4
2357 vpxor \T4, \T6, \T6
2358
2359 vpclmulqdq $0x00, \T5, \XMM7, \T4
2360 vpxor \T4, \T7, \T7
2361
2362 vpclmulqdq $0x00, \T3, \T2, \T2
2363
2364 vpxor \T2, \XMM1, \XMM1
2365
2366 ######################
2367
2368 vmovdqa HashKey(arg1), \T5
2369 vpshufd $0b01001110, \XMM8, \T2
2370 vpshufd $0b01001110, \T5, \T3
2371 vpxor \XMM8, \T2, \T2
2372 vpxor \T5, \T3, \T3
2373
2374 vpclmulqdq $0x11, \T5, \XMM8, \T4
2375 vpxor \T4, \T6, \T6
2376
2377 vpclmulqdq $0x00, \T5, \XMM8, \T4
2378 vpxor \T4, \T7, \T7
2379
2380 vpclmulqdq $0x00, \T3, \T2, \T2
2381
2382 vpxor \T2, \XMM1, \XMM1
2383 vpxor \T6, \XMM1, \XMM1
2384 vpxor \T7, \XMM1, \T2
2385
2386
2387
2388
2389 vpslldq $8, \T2, \T4
2390 vpsrldq $8, \T2, \T2
2391
2392 vpxor \T4, \T7, \T7
2393 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
2394 # accumulated carry-less multiplications
2395
2396 #######################################################################
2397 #first phase of the reduction
2398 vmovdqa POLY2(%rip), \T3
2399
        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                           # shift-L T2 2 DWs
2402
2403 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2404 #######################################################################
2405
2406
2407 #second phase of the reduction
2408 vpclmulqdq $0x00, \T7, \T3, \T2
2409 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2410
2411 vpclmulqdq $0x10, \T7, \T3, \T4
2412 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2413
2414 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2415 #######################################################################
2416 vpxor \T4, \T6, \T6 # the result is in T6
2417.endm
2418
2419
2420
2421# combined for GCM encrypt and decrypt functions
2422# clobbering all xmm registers
2423# clobbering r10, r11, r12, r13, r14, r15
2424.macro GCM_ENC_DEC_AVX2 ENC_DEC
2425
2426 #the number of pushes must equal STACK_OFFSET
2427 push %r12
2428 push %r13
2429 push %r14
2430 push %r15
2431
2432 mov %rsp, %r14
2433
2434
2435
2436
2437 sub $VARIABLE_OFFSET, %rsp
2438 and $~63, %rsp # align rsp to 64 bytes
2439
2440
2441 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
2442
2443 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
2444 and $-16, %r13 # r13 = r13 - (r13 mod 16)
2445
2446 mov %r13, %r12
2447 shr $4, %r12
2448 and $7, %r12
2449 jz _initial_num_blocks_is_0\@
2450
2451 cmp $7, %r12
2452 je _initial_num_blocks_is_7\@
2453 cmp $6, %r12
2454 je _initial_num_blocks_is_6\@
2455 cmp $5, %r12
2456 je _initial_num_blocks_is_5\@
2457 cmp $4, %r12
2458 je _initial_num_blocks_is_4\@
2459 cmp $3, %r12
2460 je _initial_num_blocks_is_3\@
2461 cmp $2, %r12
2462 je _initial_num_blocks_is_2\@
2463
2464 jmp _initial_num_blocks_is_1\@
2465
2466_initial_num_blocks_is_7\@:
2467 INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2468 sub $16*7, %r13
2469 jmp _initial_blocks_encrypted\@
2470
2471_initial_num_blocks_is_6\@:
2472 INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2473 sub $16*6, %r13
2474 jmp _initial_blocks_encrypted\@
2475
2476_initial_num_blocks_is_5\@:
2477 INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2478 sub $16*5, %r13
2479 jmp _initial_blocks_encrypted\@
2480
2481_initial_num_blocks_is_4\@:
2482 INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2483 sub $16*4, %r13
2484 jmp _initial_blocks_encrypted\@
2485
2486_initial_num_blocks_is_3\@:
2487 INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2488 sub $16*3, %r13
2489 jmp _initial_blocks_encrypted\@
2490
2491_initial_num_blocks_is_2\@:
2492 INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2493 sub $16*2, %r13
2494 jmp _initial_blocks_encrypted\@
2495
2496_initial_num_blocks_is_1\@:
2497 INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2498 sub $16*1, %r13
2499 jmp _initial_blocks_encrypted\@
2500
2501_initial_num_blocks_is_0\@:
2502 INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2503
2504
2505_initial_blocks_encrypted\@:
2506 cmp $0, %r13
2507 je _zero_cipher_left\@
2508
2509 sub $128, %r13
2510 je _eight_cipher_left\@
2511
2512
2513
2514
2515 vmovd %xmm9, %r15d
2516 and $255, %r15d
2517 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2518
2519
2520_encrypt_by_8_new\@:
2521 cmp $(255-8), %r15d
2522 jg _encrypt_by_8\@
2523
2524
2525
2526 add $8, %r15b
2527 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2528 add $128, %r11
2529 sub $128, %r13
2530 jne _encrypt_by_8_new\@
2531
2532 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2533 jmp _eight_cipher_left\@
2534
2535_encrypt_by_8\@:
2536 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2537 add $8, %r15b
2538 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2539 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2540 add $128, %r11
2541 sub $128, %r13
2542 jne _encrypt_by_8_new\@
2543
2544 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2545
2546
2547
2548
2549_eight_cipher_left\@:
2550 GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2551
2552
2553_zero_cipher_left\@:
2554 cmp $16, arg4
2555 jl _only_less_than_16\@
2556
2557 mov arg4, %r13
2558 and $15, %r13 # r13 = (arg4 mod 16)
2559
2560 je _multiple_of_16_bytes\@
2561
2562        # handle the last <16 Byte block separately
2563
2564
2565 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
2566 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2567 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
2568
2569 sub $16, %r11
2570 add %r13, %r11
2571        vmovdqu (arg3, %r11), %xmm1          # load the last <16 Byte block
2572
2573 lea SHIFT_MASK+16(%rip), %r12
2574 sub %r13, %r12 # adjust the shuffle mask pointer
2575 # to be able to shift 16-r13 bytes
2576 # (r13 is the number of bytes in plaintext mod 16)
2577 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
2578 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
2579 jmp _final_ghash_mul\@
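# The SHIFT_MASK trick above: loading the vpshufb control from
# SHIFT_MASK + (16 - r13) yields a mask whose first r13 entries select the
# top r13 input bytes and whose remaining entries have the high bit set
# (zeroing). A scalar sketch of the resulting byte-wise right shift
# (illustrative only):
#
#	for (i = 0; i < 16; i++)
#		out[i] = (i < r13) ? in[i + (16 - r13)] : 0;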
2580
2581_only_less_than_16\@:
2582 # check for 0 length
2583 mov arg4, %r13
2584 and $15, %r13 # r13 = (arg4 mod 16)
2585
2586 je _multiple_of_16_bytes\@
2587
2588        # handle the last <16 Byte block separately
2589
2590
2591 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
2592 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2593 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
2594
2595
2596 lea SHIFT_MASK+16(%rip), %r12
2597 sub %r13, %r12 # adjust the shuffle mask pointer to be
2598 # able to shift 16-r13 bytes (r13 is the
2599 # number of bytes in plaintext mod 16)
2600
2601_get_last_16_byte_loop\@:
2602 movb (arg3, %r11), %al
2603 movb %al, TMP1 (%rsp , %r11)
2604 add $1, %r11
2605 cmp %r13, %r11
2606 jne _get_last_16_byte_loop\@
2607
2608 vmovdqu TMP1(%rsp), %xmm1
2609
2610 sub $16, %r11
2611
2612_final_ghash_mul\@:
2613 .if \ENC_DEC == DEC
2614 vmovdqa %xmm1, %xmm2
2615 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
2616 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2617 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
2618 vpand %xmm1, %xmm2, %xmm2
2619 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2620 vpxor %xmm2, %xmm14, %xmm14
2621 #GHASH computation for the last <16 Byte block
2622 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2623 sub %r13, %r11
2624 add $16, %r11
2625 .else
2626 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
2627 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2628 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
2629 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2630 vpxor %xmm9, %xmm14, %xmm14
2631 #GHASH computation for the last <16 Byte block
2632 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2633 sub %r13, %r11
2634 add $16, %r11
2635 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
2636 .endif
2637
2638
2639 #############################
2640 # output r13 Bytes
2641 vmovq %xmm9, %rax
2642 cmp $8, %r13
2643 jle _less_than_8_bytes_left\@
2644
2645 mov %rax, (arg2 , %r11)
2646 add $8, %r11
2647 vpsrldq $8, %xmm9, %xmm9
2648 vmovq %xmm9, %rax
2649 sub $8, %r13
2650
2651_less_than_8_bytes_left\@:
2652 movb %al, (arg2 , %r11)
2653 add $1, %r11
2654 shr $8, %rax
2655 sub $1, %r13
2656 jne _less_than_8_bytes_left\@
2657 #############################
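# A C sketch of the r13-byte tail store above (lo64/hi64/store64 are
# hypothetical helpers, for illustration only):
#
#	u64 q = lo64(xmm9);
#	if (r13 > 8) {
#		store64(out + r11, q);		/* first 8 bytes */
#		r11 += 8; r13 -= 8;
#		q = hi64(xmm9);
#	}
#	while (r13--) {				/* remaining 1..8 bytes */
#		out[r11++] = q & 0xff;
#		q >>= 8;
#	}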
2658
2659_multiple_of_16_bytes\@:
2660 mov arg7, %r12 # r12 = aadLen (number of bytes)
2661 shl $3, %r12 # convert into number of bits
2662 vmovd %r12d, %xmm15 # len(A) in xmm15
2663
2664        shl     $3, arg4                             # len(C) in bits (*8)
2665 vmovq arg4, %xmm1
2666        vpslldq $8, %xmm15, %xmm15              # xmm15 = len(A) || 0x0000000000000000
2667 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
2668
2669 vpxor %xmm15, %xmm14, %xmm14
2670 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
2671 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
2672
2673 mov arg5, %rax # rax = *Y0
2674 vmovdqu (%rax), %xmm9 # xmm9 = Y0
2675
2676 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
2677
2678 vpxor %xmm14, %xmm9, %xmm9
2679
2680
2681
2682_return_T\@:
2683 mov arg8, %r10 # r10 = authTag
2684 mov arg9, %r11 # r11 = auth_tag_len
2685
2686 cmp $16, %r11
2687 je _T_16\@
2688
2689 cmp $12, %r11
2690 je _T_12\@
2691
2692_T_8\@:
2693 vmovq %xmm9, %rax
2694 mov %rax, (%r10)
2695 jmp _return_T_done\@
2696_T_12\@:
2697 vmovq %xmm9, %rax
2698 mov %rax, (%r10)
2699 vpsrldq $8, %xmm9, %xmm9
2700 vmovd %xmm9, %eax
2701 mov %eax, 8(%r10)
2702 jmp _return_T_done\@
2703
2704_T_16\@:
2705 vmovdqu %xmm9, (%r10)
2706
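# The three tails above only truncate the 16-byte GCM tag in xmm9 to the
# requested auth_tag_len; the C equivalent is memcpy(auth_tag, tag, len)
# for len in {8, 12, 16}.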
2707_return_T_done\@:
2708 mov %r14, %rsp
2709
2710 pop %r15
2711 pop %r14
2712 pop %r13
2713 pop %r12
2714.endm
2715
2716
2717#############################################################
2718#void aesni_gcm_precomp_avx_gen4
2719# (gcm_data *my_ctx_data,
2720# u8 *hash_subkey)# /* H, the Hash sub key input.
2721# Data starts on a 16-byte boundary. */
2722#############################################################
2723ENTRY(aesni_gcm_precomp_avx_gen4)
2724 #the number of pushes must equal STACK_OFFSET
2725 push %r12
2726 push %r13
2727 push %r14
2728 push %r15
2729
2730 mov %rsp, %r14
2731
2732
2733
2734 sub $VARIABLE_OFFSET, %rsp
2735 and $~63, %rsp # align rsp to 64 bytes
2736
2737 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
2738
2739 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
2740 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
2741 vmovdqa %xmm6, %xmm2
2742 vpsllq $1, %xmm6, %xmm6
2743 vpsrlq $63, %xmm2, %xmm2
2744 vmovdqa %xmm2, %xmm1
2745 vpslldq $8, %xmm2, %xmm2
2746 vpsrldq $8, %xmm1, %xmm1
2747 vpor %xmm2, %xmm6, %xmm6
2748 #reduction
2749 vpshufd $0b00100100, %xmm1, %xmm2
2750 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
2751 vpand POLY(%rip), %xmm2, %xmm2
2752 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
2753 #######################################################################
2754 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
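# The bit-twiddling above computes H*x in GF(2^128): a one-bit left shift of
# the byte-swapped hash key, with a conditional xor of the POLY constant when
# the shifted-out bit was set. The ghash_setkey() change later in this series
# open-codes the same operation in C:
#
#	a = be64_to_cpu(x->a);
#	b = be64_to_cpu(x->b);
#	ctx->shash.a = (__be64)((b << 1) | (a >> 63));
#	ctx->shash.b = (__be64)((a << 1) | (b >> 63));
#	if (a >> 63)
#		ctx->shash.b ^= cpu_to_be64(0xc2);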
2755
2756
2757 PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
2758
2759 mov %r14, %rsp
2760
2761 pop %r15
2762 pop %r14
2763 pop %r13
2764 pop %r12
2765 ret
2766ENDPROC(aesni_gcm_precomp_avx_gen4)
2767
2768
2769###############################################################################
2770#void aesni_gcm_enc_avx_gen4(
2771# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2772# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2773# const u8 *in, /* Plaintext input */
2774# u64 plaintext_len, /* Length of data in Bytes for encryption. */
2775# u8 *iv, /* Pre-counter block j0: 4 byte salt
2776# (from Security Association) concatenated with 8 byte
2777# Initialisation Vector (from IPSec ESP Payload)
2778# concatenated with 0x00000001. 16-byte aligned pointer. */
2779# const u8 *aad, /* Additional Authentication Data (AAD)*/
2780# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2781# u8 *auth_tag, /* Authenticated Tag output. */
2782# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
2783# Valid values are 16 (most likely), 12 or 8. */
2784###############################################################################
2785ENTRY(aesni_gcm_enc_avx_gen4)
2786 GCM_ENC_DEC_AVX2 ENC
2787 ret
2788ENDPROC(aesni_gcm_enc_avx_gen4)
2789
2790###############################################################################
2791#void aesni_gcm_dec_avx_gen4(
2792# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2793# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
2794# const u8 *in, /* Ciphertext input */
2795# u64 plaintext_len, /* Length of data in Bytes for encryption. */
2796# u8 *iv, /* Pre-counter block j0: 4 byte salt
2797# (from Security Association) concatenated with 8 byte
2798# Initialisation Vector (from IPSec ESP Payload)
2799# concatenated with 0x00000001. 16-byte aligned pointer. */
2800# const u8 *aad, /* Additional Authentication Data (AAD)*/
2801# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2802# u8 *auth_tag, /* Authenticated Tag output. */
2803# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
2804# Valid values are 16 (most likely), 12 or 8. */
2805###############################################################################
2806ENTRY(aesni_gcm_dec_avx_gen4)
2807 GCM_ENC_DEC_AVX2 DEC
2808 ret
2809ENDPROC(aesni_gcm_dec_avx_gen4)
2810
2811#endif /* CONFIG_AS_AVX2 */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 835488b745ee..948ad0e77741 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -101,6 +101,9 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
101int crypto_fpu_init(void); 101int crypto_fpu_init(void);
102void crypto_fpu_exit(void); 102void crypto_fpu_exit(void);
103 103
104#define AVX_GEN2_OPTSIZE 640
105#define AVX_GEN4_OPTSIZE 4096
106
104#ifdef CONFIG_X86_64 107#ifdef CONFIG_X86_64
105asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 108asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
106 const u8 *in, unsigned int len, u8 *iv); 109 const u8 *in, unsigned int len, u8 *iv);
@@ -150,6 +153,123 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
150 u8 *hash_subkey, const u8 *aad, unsigned long aad_len, 153 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
151 u8 *auth_tag, unsigned long auth_tag_len); 154 u8 *auth_tag, unsigned long auth_tag_len);
152 155
156
157#ifdef CONFIG_AS_AVX
158/*
159 * asmlinkage void aesni_gcm_precomp_avx_gen2()
160 * gcm_data *my_ctx_data, context data
161 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
162 */
163asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
164
165asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
166 const u8 *in, unsigned long plaintext_len, u8 *iv,
167 const u8 *aad, unsigned long aad_len,
168 u8 *auth_tag, unsigned long auth_tag_len);
169
170asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
171 const u8 *in, unsigned long ciphertext_len, u8 *iv,
172 const u8 *aad, unsigned long aad_len,
173 u8 *auth_tag, unsigned long auth_tag_len);
174
175static void aesni_gcm_enc_avx(void *ctx, u8 *out,
176 const u8 *in, unsigned long plaintext_len, u8 *iv,
177 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
178 u8 *auth_tag, unsigned long auth_tag_len)
179{
180 if (plaintext_len < AVX_GEN2_OPTSIZE) {
181 aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
182 aad_len, auth_tag, auth_tag_len);
183 } else {
184 aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
185 aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
186 aad_len, auth_tag, auth_tag_len);
187 }
188}
189
190static void aesni_gcm_dec_avx(void *ctx, u8 *out,
191 const u8 *in, unsigned long ciphertext_len, u8 *iv,
192 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
193 u8 *auth_tag, unsigned long auth_tag_len)
194{
195 if (ciphertext_len < AVX_GEN2_OPTSIZE) {
196 aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
197 aad_len, auth_tag, auth_tag_len);
198 } else {
199 aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
200 aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
201 aad_len, auth_tag, auth_tag_len);
202 }
203}
204#endif
205
206#ifdef CONFIG_AS_AVX2
207/*
208 * asmlinkage void aesni_gcm_precomp_avx_gen4()
209 * gcm_data *my_ctx_data, context data
210 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
211 */
212asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
213
214asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
215 const u8 *in, unsigned long plaintext_len, u8 *iv,
216 const u8 *aad, unsigned long aad_len,
217 u8 *auth_tag, unsigned long auth_tag_len);
218
219asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
220 const u8 *in, unsigned long ciphertext_len, u8 *iv,
221 const u8 *aad, unsigned long aad_len,
222 u8 *auth_tag, unsigned long auth_tag_len);
223
224static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
225 const u8 *in, unsigned long plaintext_len, u8 *iv,
226 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
227 u8 *auth_tag, unsigned long auth_tag_len)
228{
229 if (plaintext_len < AVX_GEN2_OPTSIZE) {
230 aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
231 aad_len, auth_tag, auth_tag_len);
232 } else if (plaintext_len < AVX_GEN4_OPTSIZE) {
233 aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
234 aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
235 aad_len, auth_tag, auth_tag_len);
236 } else {
237 aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
238 aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
239 aad_len, auth_tag, auth_tag_len);
240 }
241}
242
243static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
244 const u8 *in, unsigned long ciphertext_len, u8 *iv,
245 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
246 u8 *auth_tag, unsigned long auth_tag_len)
247{
248 if (ciphertext_len < AVX_GEN2_OPTSIZE) {
249 aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
250 aad, aad_len, auth_tag, auth_tag_len);
251 } else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
252 aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
253 aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
254 aad_len, auth_tag, auth_tag_len);
255 } else {
256 aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
257 aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
258 aad_len, auth_tag, auth_tag_len);
259 }
260}
261#endif
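/*
 * Dispatch summary for the two helpers above, from AVX_GEN2_OPTSIZE (640)
 * and AVX_GEN4_OPTSIZE (4096):
 *
 *	len < 640:		aesni_gcm_enc/dec (SSE)
 *	640 <= len < 4096:	aesni_gcm_enc/dec_avx_gen2 (AVX)
 *	4096 <= len:		aesni_gcm_enc/dec_avx_gen4 (AVX2)
 *
 * A plausible reading of the thresholds: the wider kernels redo the
 * hash-subkey precomputation on every call, so they only pay off on
 * larger buffers.
 */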
262
263static void (*aesni_gcm_enc_tfm)(void *ctx, u8 *out,
264 const u8 *in, unsigned long plaintext_len, u8 *iv,
265 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
266 u8 *auth_tag, unsigned long auth_tag_len);
267
268static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out,
269 const u8 *in, unsigned long ciphertext_len, u8 *iv,
270 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
271 u8 *auth_tag, unsigned long auth_tag_len);
272
153static inline struct 273static inline struct
154aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) 274aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
155{ 275{
@@ -915,7 +1035,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
915 dst = src; 1035 dst = src;
916 } 1036 }
917 1037
918 aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, 1038 aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
919 ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst 1039 ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
920 + ((unsigned long)req->cryptlen), auth_tag_len); 1040 + ((unsigned long)req->cryptlen), auth_tag_len);
921 1041
@@ -996,12 +1116,12 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
996 dst = src; 1116 dst = src;
997 } 1117 }
998 1118
999 aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv, 1119 aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv,
1000 ctx->hash_subkey, assoc, (unsigned long)req->assoclen, 1120 ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
1001 authTag, auth_tag_len); 1121 authTag, auth_tag_len);
1002 1122
1003 /* Compare generated tag with passed in tag. */ 1123 /* Compare generated tag with passed in tag. */
1004 retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ? 1124 retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ?
1005 -EBADMSG : 0; 1125 -EBADMSG : 0;
1006 1126
1007 if (one_entry_in_sg) { 1127 if (one_entry_in_sg) {
@@ -1353,6 +1473,27 @@ static int __init aesni_init(void)
1353 1473
1354 if (!x86_match_cpu(aesni_cpu_id)) 1474 if (!x86_match_cpu(aesni_cpu_id))
1355 return -ENODEV; 1475 return -ENODEV;
1476#ifdef CONFIG_X86_64
1477#ifdef CONFIG_AS_AVX2
1478 if (boot_cpu_has(X86_FEATURE_AVX2)) {
1479 pr_info("AVX2 version of gcm_enc/dec engaged.\n");
1480 aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
1481 aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
1482 } else
1483#endif
1484#ifdef CONFIG_AS_AVX
1485 if (boot_cpu_has(X86_FEATURE_AVX)) {
1486 pr_info("AVX version of gcm_enc/dec engaged.\n");
1487 aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
1488 aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
1489 } else
1490#endif
1491 {
1492 pr_info("SSE version of gcm_enc/dec engaged.\n");
1493 aesni_gcm_enc_tfm = aesni_gcm_enc;
1494 aesni_gcm_dec_tfm = aesni_gcm_dec;
1495 }
1496#endif
1356 1497
1357 err = crypto_fpu_init(); 1498 err = crypto_fpu_init();
1358 if (err) 1499 if (err)
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 50ec333b70e6..8af519ed73d1 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -223,9 +223,6 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
223 src -= 1; 223 src -= 1;
224 dst -= 1; 224 dst -= 1;
225 } while (nbytes >= bsize * 4); 225 } while (nbytes >= bsize * 4);
226
227 if (nbytes < bsize)
228 goto done;
229 } 226 }
230 227
231 /* Handle leftovers */ 228 /* Handle leftovers */
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index e6a3700489b9..e57e20ab5e0b 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -203,9 +203,6 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
203 src -= 1; 203 src -= 1;
204 dst -= 1; 204 dst -= 1;
205 } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); 205 } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
206
207 if (nbytes < bsize)
208 goto done;
209 } 206 }
210 207
211 /* Handle leftovers */ 208 /* Handle leftovers */
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 586f41aac361..185fad49d86f 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -24,10 +24,6 @@
24.align 16 24.align 16
25.Lbswap_mask: 25.Lbswap_mask:
26 .octa 0x000102030405060708090a0b0c0d0e0f 26 .octa 0x000102030405060708090a0b0c0d0e0f
27.Lpoly:
28 .octa 0xc2000000000000000000000000000001
29.Ltwo_one:
30 .octa 0x00000001000000000000000000000001
31 27
32#define DATA %xmm0 28#define DATA %xmm0
33#define SHASH %xmm1 29#define SHASH %xmm1
@@ -134,28 +130,3 @@ ENTRY(clmul_ghash_update)
134.Lupdate_just_ret: 130.Lupdate_just_ret:
135 ret 131 ret
136ENDPROC(clmul_ghash_update) 132ENDPROC(clmul_ghash_update)
137
138/*
139 * void clmul_ghash_setkey(be128 *shash, const u8 *key);
140 *
141 * Calculate hash_key << 1 mod poly
142 */
143ENTRY(clmul_ghash_setkey)
144 movaps .Lbswap_mask, BSWAP
145 movups (%rsi), %xmm0
146 PSHUFB_XMM BSWAP %xmm0
147 movaps %xmm0, %xmm1
148 psllq $1, %xmm0
149 psrlq $63, %xmm1
150 movaps %xmm1, %xmm2
151 pslldq $8, %xmm1
152 psrldq $8, %xmm2
153 por %xmm1, %xmm0
154 # reduction
155 pshufd $0b00100100, %xmm2, %xmm1
156 pcmpeqd .Ltwo_one, %xmm1
157 pand .Lpoly, %xmm1
158 pxor %xmm1, %xmm0
159 movups %xmm0, (%rdi)
160 ret
161ENDPROC(clmul_ghash_setkey)
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 6759dd1135be..d785cf2c529c 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -30,8 +30,6 @@ void clmul_ghash_mul(char *dst, const be128 *shash);
30void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, 30void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
31 const be128 *shash); 31 const be128 *shash);
32 32
33void clmul_ghash_setkey(be128 *shash, const u8 *key);
34
35struct ghash_async_ctx { 33struct ghash_async_ctx {
36 struct cryptd_ahash *cryptd_tfm; 34 struct cryptd_ahash *cryptd_tfm;
37}; 35};
@@ -58,13 +56,23 @@ static int ghash_setkey(struct crypto_shash *tfm,
58 const u8 *key, unsigned int keylen) 56 const u8 *key, unsigned int keylen)
59{ 57{
60 struct ghash_ctx *ctx = crypto_shash_ctx(tfm); 58 struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
59 be128 *x = (be128 *)key;
60 u64 a, b;
61 61
62 if (keylen != GHASH_BLOCK_SIZE) { 62 if (keylen != GHASH_BLOCK_SIZE) {
63 crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); 63 crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
64 return -EINVAL; 64 return -EINVAL;
65 } 65 }
66 66
67 clmul_ghash_setkey(&ctx->shash, key); 67 /* perform multiplication by 'x' in GF(2^128) */
68 a = be64_to_cpu(x->a);
69 b = be64_to_cpu(x->b);
70
71 ctx->shash.a = (__be64)((b << 1) | (a >> 63));
72 ctx->shash.b = (__be64)((a << 1) | (b >> 63));
73
74 if (a >> 63)
75 ctx->shash.b ^= cpu_to_be64(0xc2);
68 76
69 return 0; 77 return 0;
70} 78}
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
new file mode 100644
index 000000000000..1cd792db15ef
--- /dev/null
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -0,0 +1,708 @@
1/*
2 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2014 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * Contact Information:
21 * Ilya Albrekht <ilya.albrekht@intel.com>
22 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
23 * Ronen Zohar <ronen.zohar@intel.com>
24 * Chandramouli Narayanan <mouli@linux.intel.com>
25 *
26 * BSD LICENSE
27 *
28 * Copyright(c) 2014 Intel Corporation.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 *
34 * Redistributions of source code must retain the above copyright
35 * notice, this list of conditions and the following disclaimer.
36 * Redistributions in binary form must reproduce the above copyright
37 * notice, this list of conditions and the following disclaimer in
38 * the documentation and/or other materials provided with the
39 * distribution.
40 * Neither the name of Intel Corporation nor the names of its
41 * contributors may be used to endorse or promote products derived
42 * from this software without specific prior written permission.
43 *
44 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
45 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
46 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
47 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
48 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
49 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
50 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
51 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
52 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
53 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
54 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
55 *
56 */
57
58/*
59 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
60 *
58/*
59 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
60 *
61 * This implementation is based on the previous SSSE3 release:
62 * http://software.intel.com/en-us/articles/
63 * improving-the-performance-of-the-secure-hash-algorithm-1/
64 *
65 * Updates the 20-byte SHA-1 record in 'hash' for an even number
66 * ('num_blocks') of consecutive 64-byte blocks.
67 *
68 * extern "C" void sha1_transform_avx2(
69 *	int *hash, const char *input, size_t num_blocks);
70 */
71
72#include <linux/linkage.h>
73
74#define CTX %rdi /* arg1 */
75#define BUF %rsi /* arg2 */
76#define CNT %rdx /* arg3 */
77
78#define REG_A %ecx
79#define REG_B %esi
80#define REG_C %edi
81#define REG_D %eax
82#define REG_E %edx
83#define REG_TB %ebx
84#define REG_TA %r12d
85#define REG_RA %rcx
86#define REG_RB %rsi
87#define REG_RC %rdi
88#define REG_RD %rax
89#define REG_RE %rdx
90#define REG_RTA %r12
91#define REG_RTB %rbx
92#define REG_T1 %ebp
93#define xmm_mov vmovups
94#define avx2_zeroupper vzeroupper
95#define RND_F1 1
96#define RND_F2 2
97#define RND_F3 3
98
99.macro REGALLOC
100 .set A, REG_A
101 .set B, REG_B
102 .set C, REG_C
103 .set D, REG_D
104 .set E, REG_E
105 .set TB, REG_TB
106 .set TA, REG_TA
107
108 .set RA, REG_RA
109 .set RB, REG_RB
110 .set RC, REG_RC
111 .set RD, REG_RD
112 .set RE, REG_RE
113
114 .set RTA, REG_RTA
115 .set RTB, REG_RTB
116
117 .set T1, REG_T1
118.endm
119
120#define K_BASE %r8
121#define HASH_PTR %r9
122#define BUFFER_PTR %r10
123#define BUFFER_PTR2 %r13
124#define BUFFER_END %r11
125
126#define PRECALC_BUF %r14
127#define WK_BUF %r15
128
129#define W_TMP %xmm0
130#define WY_TMP %ymm0
131#define WY_TMP2 %ymm9
132
133# AVX2 variables
134#define WY0 %ymm3
135#define WY4 %ymm5
136#define WY08 %ymm7
137#define WY12 %ymm8
138#define WY16 %ymm12
139#define WY20 %ymm13
140#define WY24 %ymm14
141#define WY28 %ymm15
142
143#define YMM_SHUFB_BSWAP %ymm10
144
145/*
146 * Keep 2 iterations precalculated at a time:
147 * - 80 DWORDs per iteration * 2
148 */
149#define W_SIZE (80*2*2 +16)
150
151#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
152#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
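/*
 * A plausible reading of the WK(t) indexing above: every 4-round group owns
 * 32 bytes -- one YMM store of K+w[] covering both software-pipelined
 * blocks, 16 bytes each. ((t % 80)/4)*32 picks the group, ((t % 4))*4 the
 * dword inside it, and ((t)/80)*16 selects the first or second block's half.
 */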
153
154
155.macro UPDATE_HASH hash, val
156 add \hash, \val
157 mov \val, \hash
158.endm
159
160.macro PRECALC_RESET_WY
161 .set WY_00, WY0
162 .set WY_04, WY4
163 .set WY_08, WY08
164 .set WY_12, WY12
165 .set WY_16, WY16
166 .set WY_20, WY20
167 .set WY_24, WY24
168 .set WY_28, WY28
169 .set WY_32, WY_00
170.endm
171
172.macro PRECALC_ROTATE_WY
173 /* Rotate macros */
174 .set WY_32, WY_28
175 .set WY_28, WY_24
176 .set WY_24, WY_20
177 .set WY_20, WY_16
178 .set WY_16, WY_12
179 .set WY_12, WY_08
180 .set WY_08, WY_04
181 .set WY_04, WY_00
182 .set WY_00, WY_32
183
184 /* Define register aliases */
185 .set WY, WY_00
186 .set WY_minus_04, WY_04
187 .set WY_minus_08, WY_08
188 .set WY_minus_12, WY_12
189 .set WY_minus_16, WY_16
190 .set WY_minus_20, WY_20
191 .set WY_minus_24, WY_24
192 .set WY_minus_28, WY_28
193 .set WY_minus_32, WY
194.endm
195
196.macro PRECALC_00_15
197 .if (i == 0) # Initialize and rotate registers
198 PRECALC_RESET_WY
199 PRECALC_ROTATE_WY
200 .endif
201
202 /* message scheduling pre-compute for rounds 0-15 */
203 .if ((i & 7) == 0)
204 /*
205 * blended AVX2 and ALU instruction scheduling
206 * 1 vector iteration per 8 rounds
207 */
208 vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
209 .elseif ((i & 7) == 1)
210 vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
211 WY_TMP, WY_TMP
212 .elseif ((i & 7) == 2)
213 vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
214 .elseif ((i & 7) == 4)
215 vpaddd K_XMM(K_BASE), WY, WY_TMP
216 .elseif ((i & 7) == 7)
217 vmovdqu WY_TMP, PRECALC_WK(i&~7)
218
219 PRECALC_ROTATE_WY
220 .endif
221.endm
222
223.macro PRECALC_16_31
224 /*
225 * message scheduling pre-compute for rounds 16-31
226 * calculating last 32 w[i] values in 8 XMM registers
227 * pre-calculate K+w[i] values and store to mem
228 * for later load by ALU add instruction
229 *
230 * "brute force" vectorization for rounds 16-31 only
231 * due to w[i]->w[i-3] dependency
232 */
233 .if ((i & 7) == 0)
234 /*
235 * blended AVX2 and ALU instruction scheduling
236 * 1 vector iteration per 8 rounds
237 */
238 /* w[i-14] */
239 vpalignr $8, WY_minus_16, WY_minus_12, WY
240 vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
241 .elseif ((i & 7) == 1)
242 vpxor WY_minus_08, WY, WY
243 vpxor WY_minus_16, WY_TMP, WY_TMP
244 .elseif ((i & 7) == 2)
245 vpxor WY_TMP, WY, WY
246 vpslldq $12, WY, WY_TMP2
247 .elseif ((i & 7) == 3)
248 vpslld $1, WY, WY_TMP
249 vpsrld $31, WY, WY
250 .elseif ((i & 7) == 4)
251 vpor WY, WY_TMP, WY_TMP
252 vpslld $2, WY_TMP2, WY
253 .elseif ((i & 7) == 5)
254 vpsrld $30, WY_TMP2, WY_TMP2
255 vpxor WY, WY_TMP, WY_TMP
256 .elseif ((i & 7) == 7)
257 vpxor WY_TMP2, WY_TMP, WY
258 vpaddd K_XMM(K_BASE), WY, WY_TMP
259 vmovdqu WY_TMP, PRECALC_WK(i&~7)
260
261 PRECALC_ROTATE_WY
262 .endif
263.endm
264
265.macro PRECALC_32_79
266 /*
267 * in SHA-1 specification:
268 * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
269 * instead we compute the equivalent:
270 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
271 * which allows more efficient vectorization,
272 * since the w[i]->w[i-3] dependency is broken
273 */
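 /*
 * The rewritten form follows from substituting the original recurrence
 * into itself and holds for i >= 32 (see the Intel SHA-1 paper referenced
 * in the header). Its nearest input is w[i-6] rather than w[i-3], so wider
 * groups of consecutive w[] values can be produced per vector step.
 */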
274
275 .if ((i & 7) == 0)
276 /*
277 * blended AVX2 and ALU instruction scheduling
278 * 1 vector iteration per 8 rounds
279 */
280 vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
281 .elseif ((i & 7) == 1)
282 /* W is W_minus_32 before xor */
283 vpxor WY_minus_28, WY, WY
284 .elseif ((i & 7) == 2)
285 vpxor WY_minus_16, WY_TMP, WY_TMP
286 .elseif ((i & 7) == 3)
287 vpxor WY_TMP, WY, WY
288 .elseif ((i & 7) == 4)
289 vpslld $2, WY, WY_TMP
290 .elseif ((i & 7) == 5)
291 vpsrld $30, WY, WY
292 vpor WY, WY_TMP, WY
293 .elseif ((i & 7) == 7)
294 vpaddd K_XMM(K_BASE), WY, WY_TMP
295 vmovdqu WY_TMP, PRECALC_WK(i&~7)
296
297 PRECALC_ROTATE_WY
298 .endif
299.endm
300
301.macro PRECALC r, s
302 .set i, \r
303
304 .if (i < 40)
305 .set K_XMM, 32*0
306 .elseif (i < 80)
307 .set K_XMM, 32*1
308 .elseif (i < 120)
309 .set K_XMM, 32*2
310 .else
311 .set K_XMM, 32*3
312 .endif
313
314 .if (i<32)
315 PRECALC_00_15 \s
316 .elseif (i<64)
317 PRECALC_16_31 \s
318 .elseif (i < 160)
319 PRECALC_32_79 \s
320 .endif
321.endm
322
323.macro ROTATE_STATE
324 .set T_REG, E
325 .set E, D
326 .set D, C
327 .set C, B
328 .set B, TB
329 .set TB, A
330 .set A, T_REG
331
332 .set T_REG, RE
333 .set RE, RD
334 .set RD, RC
335 .set RC, RB
336 .set RB, RTB
337 .set RTB, RA
338 .set RA, T_REG
339.endm
340
341/* Macro relies on saved ROUND_Fx */
342
343.macro RND_FUN f, r
344 .if (\f == RND_F1)
345 ROUND_F1 \r
346 .elseif (\f == RND_F2)
347 ROUND_F2 \r
348 .elseif (\f == RND_F3)
349 ROUND_F3 \r
350 .endif
351.endm
352
353.macro RR r
354 .set round_id, (\r % 80)
355
356 .if (round_id == 0) /* Precalculate F for first round */
357 .set ROUND_FUNC, RND_F1
358 mov B, TB
359
360 rorx $(32-30), B, B /* b>>>2 */
361 andn D, TB, T1
362 and C, TB
363 xor T1, TB
364 .endif
365
366 RND_FUN ROUND_FUNC, \r
367 ROTATE_STATE
368
369 .if (round_id == 18)
370 .set ROUND_FUNC, RND_F2
371 .elseif (round_id == 38)
372 .set ROUND_FUNC, RND_F3
373 .elseif (round_id == 58)
374 .set ROUND_FUNC, RND_F2
375 .endif
376
377 .set round_id, ( (\r+1) % 80)
378
379 RND_FUN ROUND_FUNC, (\r+1)
380 ROTATE_STATE
381.endm
382
383.macro ROUND_F1 r
384 add WK(\r), E
385
386 andn C, A, T1 /* ~b&d */
387 lea (RE,RTB), E /* Add F from the previous round */
388
389 rorx $(32-5), A, TA /* T2 = A >>> 5 */
390 rorx $(32-30),A, TB /* b>>>2 for next round */
391
392 PRECALC (\r) /* msg scheduling for next 2 blocks */
393
394 /*
395 * Calculate F for the next round
396 * (b & c) ^ andn[b, d]
397 */
398 and B, A /* b&c */
399 xor T1, A /* F1 = (b&c) ^ (~b&d) */
400
401 lea (RE,RTA), E /* E += A >>> 5 */
402.endm
403
404.macro ROUND_F2 r
405 add WK(\r), E
406 lea (RE,RTB), E /* Add F from the previous round */
407
408 /* Calculate F for the next round */
409 rorx $(32-5), A, TA /* T2 = A >>> 5 */
410 .if ((round_id) < 79)
411 rorx $(32-30), A, TB /* b>>>2 for next round */
412 .endif
413 PRECALC (\r) /* msg scheduling for next 2 blocks */
414
415 .if ((round_id) < 79)
416 xor B, A
417 .endif
418
419 add TA, E /* E += A >>> 5 */
420
421 .if ((round_id) < 79)
422 xor C, A
423 .endif
424.endm
425
426.macro ROUND_F3 r
427 add WK(\r), E
428 PRECALC (\r) /* msg scheduling for next 2 blocks */
429
430 lea (RE,RTB), E /* Add F from the previous round */
431
432 mov B, T1
433 or A, T1
434
435 rorx $(32-5), A, TA /* T2 = A >>> 5 */
436 rorx $(32-30), A, TB /* b>>>2 for next round */
437
438 /* Calculate F for the next round
439 * (b and c) or (d and (b or c))
440 */
441 and C, T1
442 and B, A
443 or T1, A
444
445 add TA, E /* E += A >>> 5 */
446
447.endm
448
449/*
450 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
451 */
452.macro SHA1_PIPELINED_MAIN_BODY
453
454 REGALLOC
455
456 mov (HASH_PTR), A
457 mov 4(HASH_PTR), B
458 mov 8(HASH_PTR), C
459 mov 12(HASH_PTR), D
460 mov 16(HASH_PTR), E
461
462 mov %rsp, PRECALC_BUF
463 lea (2*4*80+32)(%rsp), WK_BUF
464
465 # Precalc WK for first 2 blocks
466 PRECALC_OFFSET = 0
467 .set i, 0
468 .rept 160
469 PRECALC i
470 .set i, i + 1
471 .endr
472 PRECALC_OFFSET = 128
473 xchg WK_BUF, PRECALC_BUF
474
475 .align 32
476_loop:
477 /*
478 * code loops through more than one block
479 * we use the K_BASE value as a signal for the last block;
480 * it is set below by: cmovae K_BASE, BUFFER_PTR
481 */
482 cmp K_BASE, BUFFER_PTR
483 jne _begin
484 .align 32
485 jmp _end
486 .align 32
487_begin:
488
489 /*
490 * Do first block
491 * rounds: 0,2,4,6,8
492 */
493 .set j, 0
494 .rept 5
495 RR j
496 .set j, j+2
497 .endr
498
499 jmp _loop0
500_loop0:
501
502 /*
503 * rounds:
504 * 10,12,14,16,18
505 * 20,22,24,26,28
506 * 30,32,34,36,38
507 * 40,42,44,46,48
508 * 50,52,54,56,58
509 */
510 .rept 25
511 RR j
512 .set j, j+2
513 .endr
514
515 add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */
516 cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */
517 cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */
518
519 /*
520 * rounds
521 * 60,62,64,66,68
522 * 70,72,74,76,78
523 */
524 .rept 10
525 RR j
526 .set j, j+2
527 .endr
528
529 UPDATE_HASH (HASH_PTR), A
530 UPDATE_HASH 4(HASH_PTR), TB
531 UPDATE_HASH 8(HASH_PTR), C
532 UPDATE_HASH 12(HASH_PTR), D
533 UPDATE_HASH 16(HASH_PTR), E
534
535 cmp K_BASE, BUFFER_PTR /* is current block the last one? */
536 je _loop
537
538 mov TB, B
539
540 /* Process second block */
541 /*
542 * rounds
543 * 0+80, 2+80, 4+80, 6+80, 8+80
544 * 10+80,12+80,14+80,16+80,18+80
545 */
546
547 .set j, 0
548 .rept 10
549 RR j+80
550 .set j, j+2
551 .endr
552
553 jmp _loop1
554_loop1:
555 /*
556 * rounds
557 * 20+80,22+80,24+80,26+80,28+80
558 * 30+80,32+80,34+80,36+80,38+80
559 */
560 .rept 10
561 RR j+80
562 .set j, j+2
563 .endr
564
565 jmp _loop2
566_loop2:
567
568 /*
569 * rounds
570 * 40+80,42+80,44+80,46+80,48+80
571 * 50+80,52+80,54+80,56+80,58+80
572 */
573 .rept 10
574 RR j+80
575 .set j, j+2
576 .endr
577
578 add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */
579
580        cmp     BUFFER_END, BUFFER_PTR2 /* is current block the last one? */
581 cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */
582
583 jmp _loop3
584_loop3:
585
586 /*
587 * rounds
588 * 60+80,62+80,64+80,66+80,68+80
589 * 70+80,72+80,74+80,76+80,78+80
590 */
591 .rept 10
592 RR j+80
593 .set j, j+2
594 .endr
595
596 UPDATE_HASH (HASH_PTR), A
597 UPDATE_HASH 4(HASH_PTR), TB
598 UPDATE_HASH 8(HASH_PTR), C
599 UPDATE_HASH 12(HASH_PTR), D
600 UPDATE_HASH 16(HASH_PTR), E
601
602 /* Reset state for AVX2 reg permutation */
603 mov A, TA
604 mov TB, A
605 mov C, TB
606 mov E, C
607 mov D, B
608 mov TA, D
609
610 REGALLOC
611
612 xchg WK_BUF, PRECALC_BUF
613
614 jmp _loop
615
616 .align 32
617 _end:
618
619.endm
620/*
621 * macro implements SHA-1 function's body for several 64-byte blocks
622 * param: function's name
623 */
624.macro SHA1_VECTOR_ASM name
625 ENTRY(\name)
626
627 push %rbx
628 push %rbp
629 push %r12
630 push %r13
631 push %r14
632 push %r15
633
634 RESERVE_STACK = (W_SIZE*4 + 8+24)
635
636 /* Align stack */
637 mov %rsp, %rbx
638 and $~(0x20-1), %rsp
639 push %rbx
640 sub $RESERVE_STACK, %rsp
641
642 avx2_zeroupper
643
644 lea K_XMM_AR(%rip), K_BASE
645
646 mov CTX, HASH_PTR
647 mov BUF, BUFFER_PTR
648 lea 64(BUF), BUFFER_PTR2
649
650 shl $6, CNT /* mul by 64 */
651 add BUF, CNT
652 add $64, CNT
653 mov CNT, BUFFER_END
654
655 cmp BUFFER_END, BUFFER_PTR2
656 cmovae K_BASE, BUFFER_PTR2
657
658 xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
659
660 SHA1_PIPELINED_MAIN_BODY
661
662 avx2_zeroupper
663
664 add $RESERVE_STACK, %rsp
665 pop %rsp
666
667 pop %r15
668 pop %r14
669 pop %r13
670 pop %r12
671 pop %rbp
672 pop %rbx
673
674 ret
675
676 ENDPROC(\name)
677.endm
678
679.section .rodata
680
681#define K1 0x5a827999
682#define K2 0x6ed9eba1
683#define K3 0x8f1bbcdc
684#define K4 0xca62c1d6
685
686.align 128
687K_XMM_AR:
688 .long K1, K1, K1, K1
689 .long K1, K1, K1, K1
690 .long K2, K2, K2, K2
691 .long K2, K2, K2, K2
692 .long K3, K3, K3, K3
693 .long K3, K3, K3, K3
694 .long K4, K4, K4, K4
695 .long K4, K4, K4, K4
696
697BSWAP_SHUFB_CTL:
698 .long 0x00010203
699 .long 0x04050607
700 .long 0x08090a0b
701 .long 0x0c0d0e0f
702 .long 0x00010203
703 .long 0x04050607
704 .long 0x08090a0b
705 .long 0x0c0d0e0f
706.text
707
708SHA1_VECTOR_ASM sha1_transform_avx2
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 4a11a9d72451..74d16ef707c7 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -10,6 +10,7 @@
10 * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> 10 * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
11 * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> 11 * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
12 * Copyright (c) Mathias Krause <minipli@googlemail.com> 12 * Copyright (c) Mathias Krause <minipli@googlemail.com>
13 * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
13 * 14 *
14 * This program is free software; you can redistribute it and/or modify it 15 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License as published by the Free 16 * under the terms of the GNU General Public License as published by the Free
@@ -39,6 +40,12 @@ asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
39asmlinkage void sha1_transform_avx(u32 *digest, const char *data, 40asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
40 unsigned int rounds); 41 unsigned int rounds);
41#endif 42#endif
43#ifdef CONFIG_AS_AVX2
44#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* AVX2 is optimal for 4*64-byte blocks and up */
45
46asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
47 unsigned int rounds);
48#endif
42 49
43static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); 50static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
44 51
@@ -165,6 +172,18 @@ static int sha1_ssse3_import(struct shash_desc *desc, const void *in)
165 return 0; 172 return 0;
166} 173}
167 174
175#ifdef CONFIG_AS_AVX2
176static void sha1_apply_transform_avx2(u32 *digest, const char *data,
177 unsigned int rounds)
178{
179 /* Select the optimal transform based on data block size */
180 if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE)
181 sha1_transform_avx2(digest, data, rounds);
182 else
183 sha1_transform_avx(digest, data, rounds);
184}
185#endif
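/*
 * In effect the AVX2 kernel is only entered for at least 4*64 = 256 bytes
 * of input; below that, the fixed cost of its two-block software pipeline
 * presumably outweighs the gain and the plain AVX transform wins.
 */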
186
168static struct shash_alg alg = { 187static struct shash_alg alg = {
169 .digestsize = SHA1_DIGEST_SIZE, 188 .digestsize = SHA1_DIGEST_SIZE,
170 .init = sha1_ssse3_init, 189 .init = sha1_ssse3_init,
@@ -201,27 +220,49 @@ static bool __init avx_usable(void)
201 220
202 return true; 221 return true;
203} 222}
223
224#ifdef CONFIG_AS_AVX2
225static bool __init avx2_usable(void)
226{
227 if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) &&
228 boot_cpu_has(X86_FEATURE_BMI2))
229 return true;
230
231 return false;
232}
233#endif
204#endif 234#endif
205 235
206static int __init sha1_ssse3_mod_init(void) 236static int __init sha1_ssse3_mod_init(void)
207{ 237{
238 char *algo_name;
239
208 /* test for SSSE3 first */ 240 /* test for SSSE3 first */
209 if (cpu_has_ssse3) 241 if (cpu_has_ssse3) {
210 sha1_transform_asm = sha1_transform_ssse3; 242 sha1_transform_asm = sha1_transform_ssse3;
243 algo_name = "SSSE3";
244 }
211 245
212#ifdef CONFIG_AS_AVX 246#ifdef CONFIG_AS_AVX
213 /* allow AVX to override SSSE3, it's a little faster */ 247 /* allow AVX to override SSSE3, it's a little faster */
214 if (avx_usable()) 248 if (avx_usable()) {
215 sha1_transform_asm = sha1_transform_avx; 249 sha1_transform_asm = sha1_transform_avx;
250 algo_name = "AVX";
251#ifdef CONFIG_AS_AVX2
252 /* allow AVX2 to override AVX, it's a little faster */
253 if (avx2_usable()) {
254 sha1_transform_asm = sha1_apply_transform_avx2;
255 algo_name = "AVX2";
256 }
257#endif
258 }
216#endif 259#endif
217 260
218 if (sha1_transform_asm) { 261 if (sha1_transform_asm) {
219 pr_info("Using %s optimized SHA-1 implementation\n", 262 pr_info("Using %s optimized SHA-1 implementation\n", algo_name);
220 sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3"
221 : "AVX");
222 return crypto_register_shash(&alg); 263 return crypto_register_shash(&alg);
223 } 264 }
224 pr_info("Neither AVX nor SSSE3 is available/usable.\n"); 265 pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n");
225 266
226 return -ENODEV; 267 return -ENODEV;
227} 268}
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 7f669853317a..3ca9762e1649 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -5,3 +5,6 @@ genhdr-y += unistd_64.h
5genhdr-y += unistd_x32.h 5genhdr-y += unistd_x32.h
6 6
7generic-y += clkdev.h 7generic-y += clkdev.h
8generic-y += early_ioremap.h
9generic-y += cputime.h
10generic-y += mcs_spinlock.h
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index a54ee1d054d9..aaac3b2fb746 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -19,7 +19,7 @@ extern int amd_cache_northbridges(void);
19extern void amd_flush_garts(void); 19extern void amd_flush_garts(void);
20extern int amd_numa_init(void); 20extern int amd_numa_init(void);
21extern int amd_get_subcaches(int); 21extern int amd_get_subcaches(int);
22extern int amd_set_subcaches(int, int); 22extern int amd_set_subcaches(int, unsigned long);
23 23
24struct amd_l3_cache { 24struct amd_l3_cache {
25 unsigned indices; 25 unsigned indices;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 1d2091a226bc..19b0ebafcd3e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -93,9 +93,6 @@ static inline int is_vsmp_box(void)
93 return 0; 93 return 0;
94} 94}
95#endif 95#endif
96extern void xapic_wait_icr_idle(void);
97extern u32 safe_xapic_wait_icr_idle(void);
98extern void xapic_icr_write(u32, u32);
99extern int setup_profiling_timer(unsigned int); 96extern int setup_profiling_timer(unsigned int);
100 97
101static inline void native_apic_mem_write(u32 reg, u32 v) 98static inline void native_apic_mem_write(u32 reg, u32 v)
@@ -184,7 +181,6 @@ extern int x2apic_phys;
184extern int x2apic_preenabled; 181extern int x2apic_preenabled;
185extern void check_x2apic(void); 182extern void check_x2apic(void);
186extern void enable_x2apic(void); 183extern void enable_x2apic(void);
187extern void x2apic_icr_write(u32 low, u32 id);
188static inline int x2apic_enabled(void) 184static inline int x2apic_enabled(void)
189{ 185{
190 u64 msr; 186 u64 msr;
@@ -221,7 +217,6 @@ static inline void x2apic_force_phys(void)
221{ 217{
222} 218}
223 219
224#define nox2apic 0
225#define x2apic_preenabled 0 220#define x2apic_preenabled 0
226#define x2apic_supported() 0 221#define x2apic_supported() 0
227#endif 222#endif
@@ -351,7 +346,7 @@ struct apic {
351 int trampoline_phys_low; 346 int trampoline_phys_low;
352 int trampoline_phys_high; 347 int trampoline_phys_high;
353 348
354 void (*wait_for_init_deassert)(atomic_t *deassert); 349 bool wait_for_init_deassert;
355 void (*smp_callin_clear_local_apic)(void); 350 void (*smp_callin_clear_local_apic)(void);
356 void (*inquire_remote_apic)(int apicid); 351 void (*inquire_remote_apic)(int apicid);
357 352
@@ -517,13 +512,6 @@ extern int default_cpu_present_to_apicid(int mps_cpu);
517extern int default_check_phys_apicid_present(int phys_apicid); 512extern int default_check_phys_apicid_present(int phys_apicid);
518#endif 513#endif
519 514
520static inline void default_wait_for_init_deassert(atomic_t *deassert)
521{
522 while (!atomic_read(deassert))
523 cpu_relax();
524 return;
525}
526
527extern void generic_bigsmp_probe(void); 515extern void generic_bigsmp_probe(void);
528 516
529 517
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h
index 0d9ec770f2f8..69f1366f1aa3 100644
--- a/arch/x86/include/asm/archrandom.h
+++ b/arch/x86/include/asm/archrandom.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * This file is part of the Linux kernel. 2 * This file is part of the Linux kernel.
3 * 3 *
4 * Copyright (c) 2011, Intel Corporation 4 * Copyright (c) 2011-2014, Intel Corporation
5 * Authors: Fenghua Yu <fenghua.yu@intel.com>, 5 * Authors: Fenghua Yu <fenghua.yu@intel.com>,
6 * H. Peter Anvin <hpa@linux.intel.com> 6 * H. Peter Anvin <hpa@linux.intel.com>
7 * 7 *
@@ -31,14 +31,41 @@
31#define RDRAND_RETRY_LOOPS 10 31#define RDRAND_RETRY_LOOPS 10
32 32
33#define RDRAND_INT ".byte 0x0f,0xc7,0xf0" 33#define RDRAND_INT ".byte 0x0f,0xc7,0xf0"
34#define RDSEED_INT ".byte 0x0f,0xc7,0xf8"
34#ifdef CONFIG_X86_64 35#ifdef CONFIG_X86_64
35# define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0" 36# define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0"
37# define RDSEED_LONG ".byte 0x48,0x0f,0xc7,0xf8"
36#else 38#else
37# define RDRAND_LONG RDRAND_INT 39# define RDRAND_LONG RDRAND_INT
40# define RDSEED_LONG RDSEED_INT
38#endif 41#endif
39 42
40#ifdef CONFIG_ARCH_RANDOM 43#ifdef CONFIG_ARCH_RANDOM
41 44
45/* Instead of arch_get_random_long() when alternatives haven't run. */
46static inline int rdrand_long(unsigned long *v)
47{
48 int ok;
49 asm volatile("1: " RDRAND_LONG "\n\t"
50 "jc 2f\n\t"
51 "decl %0\n\t"
52 "jnz 1b\n\t"
53 "2:"
54 : "=r" (ok), "=a" (*v)
55 : "0" (RDRAND_RETRY_LOOPS));
56 return ok;
57}
58
59/* A single attempt at RDSEED */
60static inline bool rdseed_long(unsigned long *v)
61{
62 unsigned char ok;
63 asm volatile(RDSEED_LONG "\n\t"
64 "setc %0"
65 : "=qm" (ok), "=a" (*v));
66 return ok;
67}
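/*
 * Note the asymmetry between the two helpers above: RDRAND output comes
 * from a DRBG that is reseeded often, so a failure is transient and worth
 * retrying (RDRAND_RETRY_LOOPS times). RDSEED draws directly from the
 * entropy conditioner and can stay exhausted under load, so a single
 * attempt is made and the caller is expected to fall back (e.g. to
 * rdrand_long()) rather than spin.
 */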
68
42#define GET_RANDOM(name, type, rdrand, nop) \ 69#define GET_RANDOM(name, type, rdrand, nop) \
43static inline int name(type *v) \ 70static inline int name(type *v) \
44{ \ 71{ \
@@ -56,18 +83,52 @@ static inline int name(type *v) \
56 return ok; \ 83 return ok; \
57} 84}
58 85
86#define GET_SEED(name, type, rdseed, nop) \
87static inline int name(type *v) \
88{ \
89 unsigned char ok; \
90 alternative_io("movb $0, %0\n\t" \
91 nop, \
92 rdseed "\n\t" \
93 "setc %0", \
94 X86_FEATURE_RDSEED, \
95 ASM_OUTPUT2("=q" (ok), "=a" (*v))); \
96 return ok; \
97}
98
59#ifdef CONFIG_X86_64 99#ifdef CONFIG_X86_64
60 100
61GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5); 101GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5);
62GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4); 102GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4);
63 103
104GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP5);
105GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4);
106
64#else 107#else
65 108
66GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3); 109GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3);
67GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); 110GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3);
68 111
112GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP4);
113GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4);
114
69#endif /* CONFIG_X86_64 */ 115#endif /* CONFIG_X86_64 */
70 116
117#define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND)
118#define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED)
119
120#else
121
122static inline int rdrand_long(unsigned long *v)
123{
124 return 0;
125}
126
127static inline bool rdseed_long(unsigned long *v)
128{
129 return 0;
130}
131
71#endif /* CONFIG_ARCH_RANDOM */ 132#endif /* CONFIG_ARCH_RANDOM */
72 133
73extern void x86_init_rdrand(struct cpuinfo_x86 *c); 134extern void x86_init_rdrand(struct cpuinfo_x86 *c);
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index c6cd358a1eec..69bbb4845020 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -85,19 +85,56 @@
85#else 85#else
86# define smp_rmb() barrier() 86# define smp_rmb() barrier()
87#endif 87#endif
88#ifdef CONFIG_X86_OOSTORE 88#define smp_wmb() barrier()
89# define smp_wmb() wmb()
90#else
91# define smp_wmb() barrier()
92#endif
93#define smp_read_barrier_depends() read_barrier_depends() 89#define smp_read_barrier_depends() read_barrier_depends()
94#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) 90#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
95#else 91#else /* !SMP */
96#define smp_mb() barrier() 92#define smp_mb() barrier()
97#define smp_rmb() barrier() 93#define smp_rmb() barrier()
98#define smp_wmb() barrier() 94#define smp_wmb() barrier()
99#define smp_read_barrier_depends() do { } while (0) 95#define smp_read_barrier_depends() do { } while (0)
100#define set_mb(var, value) do { var = value; barrier(); } while (0) 96#define set_mb(var, value) do { var = value; barrier(); } while (0)
97#endif /* SMP */
98
99#if defined(CONFIG_X86_PPRO_FENCE)
100
101/*
102 * For either of these options x86 doesn't have a strong TSO memory
103 * model and we should fall back to full barriers.
104 */
105
106#define smp_store_release(p, v) \
107do { \
108 compiletime_assert_atomic_type(*p); \
109 smp_mb(); \
110 ACCESS_ONCE(*p) = (v); \
111} while (0)
112
113#define smp_load_acquire(p) \
114({ \
115 typeof(*p) ___p1 = ACCESS_ONCE(*p); \
116 compiletime_assert_atomic_type(*p); \
117 smp_mb(); \
118 ___p1; \
119})
120
121#else /* regular x86 TSO memory ordering */
122
123#define smp_store_release(p, v) \
124do { \
125 compiletime_assert_atomic_type(*p); \
126 barrier(); \
127 ACCESS_ONCE(*p) = (v); \
128} while (0)
129
130#define smp_load_acquire(p) \
131({ \
132 typeof(*p) ___p1 = ACCESS_ONCE(*p); \
133 compiletime_assert_atomic_type(*p); \
134 barrier(); \
135 ___p1; \
136})
137
101#endif 138#endif
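/*
 * A minimal producer/consumer sketch of the two new primitives
 * (illustrative only, not part of this patch):
 *
 *	producer:	data = 42;
 *			smp_store_release(&flag, 1);
 *	consumer:	if (smp_load_acquire(&flag))
 *			        r = data;	sees 42
 *
 * On TSO x86 both primitives reduce to a compiler barrier() around a plain
 * store/load; only CONFIG_X86_PPRO_FENCE pays for a full smp_mb().
 */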
102 139
103/* 140/*
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 2f03ff018d36..ba38ebbaced3 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -1,7 +1,6 @@
1#ifndef _ASM_X86_BUG_H 1#ifndef _ASM_X86_BUG_H
2#define _ASM_X86_BUG_H 2#define _ASM_X86_BUG_H
3 3
4#ifdef CONFIG_BUG
5#define HAVE_ARCH_BUG 4#define HAVE_ARCH_BUG
6 5
7#ifdef CONFIG_DEBUG_BUGVERBOSE 6#ifdef CONFIG_DEBUG_BUGVERBOSE
@@ -33,8 +32,6 @@ do { \
33} while (0) 32} while (0)
34#endif 33#endif
35 34
36#endif /* !CONFIG_BUG */
37
38#include <asm-generic/bug.h> 35#include <asm-generic/bug.h>
39 36
40#endif /* _ASM_X86_BUG_H */ 37#endif /* _ASM_X86_BUG_H */
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 16a57f4ed64d..eda81dc0f4ae 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -3,8 +3,6 @@
3#ifndef _ASM_X86_CLOCKSOURCE_H 3#ifndef _ASM_X86_CLOCKSOURCE_H
4#define _ASM_X86_CLOCKSOURCE_H 4#define _ASM_X86_CLOCKSOURCE_H
5 5
6#ifdef CONFIG_X86_64
7
8#define VCLOCK_NONE 0 /* No vDSO clock available. */ 6#define VCLOCK_NONE 0 /* No vDSO clock available. */
9#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ 7#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */
10#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ 8#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */
@@ -14,6 +12,4 @@ struct arch_clocksource_data {
14 int vclock_mode; 12 int vclock_mode;
15}; 13};
16 14
17#endif /* CONFIG_X86_64 */
18
19#endif /* _ASM_X86_CLOCKSOURCE_H */ 15#endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 89270b4318db..e265ff95d16d 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -37,7 +37,7 @@
37#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */ 37#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */
38#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */ 38#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */
39#define X86_FEATURE_PN (0*32+18) /* Processor serial number */ 39#define X86_FEATURE_PN (0*32+18) /* Processor serial number */
40#define X86_FEATURE_CLFLSH (0*32+19) /* "clflush" CLFLUSH instruction */ 40#define X86_FEATURE_CLFLUSH (0*32+19) /* CLFLUSH instruction */
41#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */ 41#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */
42#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */ 42#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */
43#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ 43#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
@@ -216,9 +216,15 @@
216#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ 216#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
217#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ 217#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */
218#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ 218#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */
219#define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */
220#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */
219#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */ 221#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */
220#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */ 222#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */
221#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */ 223#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */
224#define X86_FEATURE_CLFLUSHOPT (9*32+23) /* CLFLUSHOPT instruction */
225#define X86_FEATURE_AVX512PF (9*32+26) /* AVX-512 Prefetch */
226#define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */
227#define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */
222 228
223/* 229/*
224 * BUG word(s) 230 * BUG word(s)
@@ -312,7 +318,7 @@ extern const char * const x86_power_flags[32];
312#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN) 318#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN)
313#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS) 319#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS)
314#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS) 320#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS)
315#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH) 321#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH)
316#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS) 322#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS)
317#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) 323#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES)
318#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) 324#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
@@ -540,6 +546,13 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
540#define static_cpu_has_bug(bit) static_cpu_has((bit)) 546#define static_cpu_has_bug(bit) static_cpu_has((bit))
541#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) 547#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit))
542 548
549#define MAX_CPU_FEATURES (NCAPINTS * 32)
550#define cpu_have_feature boot_cpu_has
551
552#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X"
553#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
554 boot_cpu_data.x86_model
555
543#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ 556#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
544 557
545#endif /* _ASM_X86_CPUFEATURE_H */ 558#endif /* _ASM_X86_CPUFEATURE_H */
diff --git a/arch/x86/include/asm/cputime.h b/arch/x86/include/asm/cputime.h
deleted file mode 100644
index 6d68ad7e0ea3..000000000000
--- a/arch/x86/include/asm/cputime.h
+++ /dev/null
@@ -1 +0,0 @@
1#include <asm-generic/cputime.h>
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index fd8f9e2ca35f..535192f6bfad 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -13,7 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len)
13} 13}
14 14
15/* Use early IO mappings for DMI because it's initialized early */ 15/* Use early IO mappings for DMI because it's initialized early */
16#define dmi_ioremap early_ioremap 16#define dmi_early_remap early_ioremap
17#define dmi_iounmap early_iounmap 17#define dmi_early_unmap early_iounmap
18#define dmi_remap ioremap
19#define dmi_unmap iounmap
18 20
19#endif /* _ASM_X86_DMI_H */ 21#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 65c6e6e3a552..0869434eaf72 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -1,9 +1,29 @@
1#ifndef _ASM_X86_EFI_H 1#ifndef _ASM_X86_EFI_H
2#define _ASM_X86_EFI_H 2#define _ASM_X86_EFI_H
3 3
4/*
5 * We map the EFI regions needed for runtime services non-contiguously,
6 * with preserved alignment on virtual addresses starting from -4G down
7 * for a total max space of 64G. This way, we provide for stable runtime
8 * services addresses across kernels so that a kexec'd kernel can still
9 * use them.
10 *
11 * This is the main reason why we're doing stable VA mappings for RT
12 * services.
13 *
14 * This flag is used in conjunction with a chicken bit called
15 * "efi=old_map" which can be used as a fallback to the old runtime
16 * services mapping method in case there's some b0rkage with a
17 * particular EFI implementation (haha, it is hard to hold up the
18 * sarcasm here...).
19 */
20#define EFI_OLD_MEMMAP EFI_ARCH_1
21
22#define EFI32_LOADER_SIGNATURE "EL32"
23#define EFI64_LOADER_SIGNATURE "EL64"
24
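
EFI_OLD_MEMMAP aliases one of the generic EFI_ARCH_n flag bits, so code elsewhere can test it with efi_enabled(). A minimal sketch of the dispatch it enables (the wrapper function is hypothetical; old_map_region() and efi_map_region() are declared further down in this header):

	/* Sketch: honor the "efi=old_map" chicken bit when mapping a region. */
	static void __init map_one_region(efi_memory_desc_t *md)
	{
		if (efi_enabled(EFI_OLD_MEMMAP))
			old_map_region(md);	/* legacy ioremap-style mapping */
		else
			efi_map_region(md);	/* stable VAs from -4G down */
	}
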
4#ifdef CONFIG_X86_32 25#ifdef CONFIG_X86_32
5 26
6#define EFI_LOADER_SIGNATURE "EL32"
7 27
8extern unsigned long asmlinkage efi_call_phys(void *, ...); 28extern unsigned long asmlinkage efi_call_phys(void *, ...);
9 29
@@ -39,8 +59,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
39 59
40#else /* !CONFIG_X86_32 */ 60#else /* !CONFIG_X86_32 */
41 61
42#define EFI_LOADER_SIGNATURE "EL64"
43
44extern u64 efi_call0(void *fp); 62extern u64 efi_call0(void *fp);
45extern u64 efi_call1(void *fp, u64 arg1); 63extern u64 efi_call1(void *fp, u64 arg1);
46extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); 64extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
@@ -69,24 +87,31 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
69 efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \ 87 efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \
70 (u64)(a4), (u64)(a5), (u64)(a6)) 88 (u64)(a4), (u64)(a5), (u64)(a6))
71 89
90#define _efi_call_virtX(x, f, ...) \
91({ \
92 efi_status_t __s; \
93 \
94 efi_sync_low_kernel_mappings(); \
95 preempt_disable(); \
96 __s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__); \
97 preempt_enable(); \
98 __s; \
99})
100
72#define efi_call_virt0(f) \ 101#define efi_call_virt0(f) \
73 efi_call0((efi.systab->runtime->f)) 102 _efi_call_virtX(0, f)
74#define efi_call_virt1(f, a1) \ 103#define efi_call_virt1(f, a1) \
75 efi_call1((efi.systab->runtime->f), (u64)(a1)) 104 _efi_call_virtX(1, f, (u64)(a1))
76#define efi_call_virt2(f, a1, a2) \ 105#define efi_call_virt2(f, a1, a2) \
77 efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2)) 106 _efi_call_virtX(2, f, (u64)(a1), (u64)(a2))
78#define efi_call_virt3(f, a1, a2, a3) \ 107#define efi_call_virt3(f, a1, a2, a3) \
79 efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ 108 _efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3))
80 (u64)(a3)) 109#define efi_call_virt4(f, a1, a2, a3, a4) \
81#define efi_call_virt4(f, a1, a2, a3, a4) \ 110 _efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4))
82 efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ 111#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
83 (u64)(a3), (u64)(a4)) 112 _efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5))
84#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ 113#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
85 efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ 114 _efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
86 (u64)(a3), (u64)(a4), (u64)(a5))
87#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
88 efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
89 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
90 115
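
The net effect is that every virtual runtime-service call now syncs the kernel mappings and runs with preemption disabled. A one-argument call such as efi_call_virt1(get_time, tm) expands to roughly the following (sketch derived from _efi_call_virtX above; tm is a placeholder argument):

	efi_status_t __s;

	efi_sync_low_kernel_mappings();	/* keep EFI page tables in sync */
	preempt_disable();		/* no migration while on EFI mappings */
	__s = efi_call1((void *)efi.systab->runtime->get_time, (u64)(tm));
	preempt_enable();
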
91extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, 116extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
92 u32 type, u64 attribute); 117 u32 type, u64 attribute);
@@ -94,13 +119,33 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
94#endif /* CONFIG_X86_32 */ 119#endif /* CONFIG_X86_32 */
95 120
96extern int add_efi_memmap; 121extern int add_efi_memmap;
97extern unsigned long x86_efi_facility; 122extern struct efi_scratch efi_scratch;
98extern void efi_set_executable(efi_memory_desc_t *md, bool executable); 123extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
99extern int efi_memblock_x86_reserve_range(void); 124extern int efi_memblock_x86_reserve_range(void);
100extern void efi_call_phys_prelog(void); 125extern void efi_call_phys_prelog(void);
101extern void efi_call_phys_epilog(void); 126extern void efi_call_phys_epilog(void);
102extern void efi_unmap_memmap(void); 127extern void efi_unmap_memmap(void);
103extern void efi_memory_uc(u64 addr, unsigned long size); 128extern void efi_memory_uc(u64 addr, unsigned long size);
129extern void __init efi_map_region(efi_memory_desc_t *md);
130extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
131extern void efi_sync_low_kernel_mappings(void);
132extern int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
133extern void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages);
134extern void __init old_map_region(efi_memory_desc_t *md);
135extern void __init runtime_code_page_mkexec(void);
136extern void __init efi_runtime_mkexec(void);
137extern void __init efi_dump_pagetable(void);
138extern void __init efi_apply_memmap_quirks(void);
139
140struct efi_setup_data {
141 u64 fw_vendor;
142 u64 runtime;
143 u64 tables;
144 u64 smbios;
145 u64 reserved[8];
146};
147
148extern u64 efi_setup;
104 149
105#ifdef CONFIG_EFI 150#ifdef CONFIG_EFI
106 151
@@ -109,8 +154,40 @@ static inline bool efi_is_native(void)
109 return IS_ENABLED(CONFIG_X86_64) == efi_enabled(EFI_64BIT); 154 return IS_ENABLED(CONFIG_X86_64) == efi_enabled(EFI_64BIT);
110} 155}
111 156
112extern struct console early_efi_console; 157static inline bool efi_runtime_supported(void)
158{
159 if (efi_is_native())
160 return true;
161
162 if (IS_ENABLED(CONFIG_EFI_MIXED) && !efi_enabled(EFI_OLD_MEMMAP))
163 return true;
164
165 return false;
166}
113 167
168extern struct console early_efi_console;
169extern void parse_efi_setup(u64 phys_addr, u32 data_len);
170
171#ifdef CONFIG_EFI_MIXED
172extern void efi_thunk_runtime_setup(void);
173extern efi_status_t efi_thunk_set_virtual_address_map(
174 void *phys_set_virtual_address_map,
175 unsigned long memory_map_size,
176 unsigned long descriptor_size,
177 u32 descriptor_version,
178 efi_memory_desc_t *virtual_map);
179#else
180static inline void efi_thunk_runtime_setup(void) {}
181static inline efi_status_t efi_thunk_set_virtual_address_map(
182 void *phys_set_virtual_address_map,
183 unsigned long memory_map_size,
184 unsigned long descriptor_size,
185 u32 descriptor_version,
186 efi_memory_desc_t *virtual_map)
187{
188 return EFI_SUCCESS;
189}
190#endif /* CONFIG_EFI_MIXED */
114#else 191#else
115/* 192/*
116 * If EFI is not configured, have the EFI calls return -ENOSYS. 193 * If EFI is not configured, have the EFI calls return -ENOSYS.
@@ -122,6 +199,7 @@ extern struct console early_efi_console;
122#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS) 199#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS)
123#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) 200#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS)
124#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) 201#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS)
202static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {}
125#endif /* CONFIG_EFI */ 203#endif /* CONFIG_EFI */
126 204
127#endif /* _ASM_X86_EFI_H */ 205#endif /* _ASM_X86_EFI_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 9c999c1674fa..2c71182d30ef 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -281,16 +281,12 @@ do { \
281 281
282#define STACK_RND_MASK (0x7ff) 282#define STACK_RND_MASK (0x7ff)
283 283
284#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO))
285
286#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) 284#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled)
287 285
288/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ 286/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
289 287
290#else /* CONFIG_X86_32 */ 288#else /* CONFIG_X86_32 */
291 289
292#define VDSO_HIGH_BASE 0xffffe000U /* CONFIG_COMPAT_VDSO address */
293
294/* 1GB for 64bit, 8MB for 32bit */ 290/* 1GB for 64bit, 8MB for 32bit */
295#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff) 291#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff)
296 292
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index e846225265ed..43f482a0db37 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -40,15 +40,8 @@
40 */ 40 */
41extern unsigned long __FIXADDR_TOP; 41extern unsigned long __FIXADDR_TOP;
42#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) 42#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
43
44#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
45#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
46#else 43#else
47#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) 44#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
48
49/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
50#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
51#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
52#endif 45#endif
53 46
54 47
@@ -74,7 +67,6 @@ extern unsigned long __FIXADDR_TOP;
74enum fixed_addresses { 67enum fixed_addresses {
75#ifdef CONFIG_X86_32 68#ifdef CONFIG_X86_32
76 FIX_HOLE, 69 FIX_HOLE,
77 FIX_VDSO,
78#else 70#else
79 VSYSCALL_LAST_PAGE, 71 VSYSCALL_LAST_PAGE,
80 VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE 72 VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
@@ -98,12 +90,6 @@ enum fixed_addresses {
98 FIX_IO_APIC_BASE_0, 90 FIX_IO_APIC_BASE_0,
99 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, 91 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
100#endif 92#endif
101#ifdef CONFIG_X86_VISWS_APIC
102 FIX_CO_CPU, /* Cobalt timer */
103 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
104 FIX_LI_PCIA, /* Lithium PCI Bridge A */
105 FIX_LI_PCIB, /* Lithium PCI Bridge B */
106#endif
107 FIX_RO_IDT, /* Virtual mapping for read-only IDT */ 93 FIX_RO_IDT, /* Virtual mapping for read-only IDT */
108#ifdef CONFIG_X86_32 94#ifdef CONFIG_X86_32
109 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ 95 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
@@ -175,64 +161,13 @@ static inline void __set_fixmap(enum fixed_addresses idx,
175} 161}
176#endif 162#endif
177 163
178#define set_fixmap(idx, phys) \ 164#include <asm-generic/fixmap.h>
179 __set_fixmap(idx, phys, PAGE_KERNEL)
180
181/*
182 * Some hardware wants to get fixmapped without caching.
183 */
184#define set_fixmap_nocache(idx, phys) \
185 __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
186
187#define clear_fixmap(idx) \
188 __set_fixmap(idx, 0, __pgprot(0))
189
190#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
191#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
192
193extern void __this_fixmap_does_not_exist(void);
194
195/*
196 * 'index to address' translation. If anyone tries to use the idx
197 * directly without translation, we catch the bug with a NULL-deference
198 * kernel oops. Illegal ranges of incoming indices are caught too.
199 */
200static __always_inline unsigned long fix_to_virt(const unsigned int idx)
201{
202 /*
203 * this branch gets completely eliminated after inlining,
204 * except when someone tries to use fixaddr indices in an
205 * illegal way. (such as mixing up address types or using
206 * out-of-range indices).
207 *
208 * If it doesn't get removed, the linker will complain
209 * loudly with a reasonably clear error message..
210 */
211 if (idx >= __end_of_fixed_addresses)
212 __this_fixmap_does_not_exist();
213
214 return __fix_to_virt(idx);
215}
216
217static inline unsigned long virt_to_fix(const unsigned long vaddr)
218{
219 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
220 return __virt_to_fix(vaddr);
221}
222
223/* Return a pointer with the offset calculated */
224static __always_inline unsigned long
225__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
226{
227 __set_fixmap(idx, phys, flags);
228 return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
229}
230 165
231#define set_fixmap_offset(idx, phys) \ 166#define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
232 __set_fixmap_offset(idx, phys, PAGE_KERNEL) 167#define __late_clear_fixmap(idx) __set_fixmap(idx, 0, __pgprot(0))
233 168
234#define set_fixmap_offset_nocache(idx, phys) \ 169void __early_set_fixmap(enum fixed_addresses idx,
235 __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE) 170 phys_addr_t phys, pgprot_t flags);
236 171
237#endif /* !__ASSEMBLY__ */ 172#endif /* !__ASSEMBLY__ */
238#endif /* _ASM_X86_FIXMAP_H */ 173#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
index d3d74698dce9..1c7eefe32502 100644
--- a/arch/x86/include/asm/floppy.h
+++ b/arch/x86/include/asm/floppy.h
@@ -145,10 +145,10 @@ static int fd_request_irq(void)
145{ 145{
146 if (can_use_virtual_dma) 146 if (can_use_virtual_dma)
147 return request_irq(FLOPPY_IRQ, floppy_hardint, 147 return request_irq(FLOPPY_IRQ, floppy_hardint,
148 IRQF_DISABLED, "floppy", NULL); 148 0, "floppy", NULL);
149 else 149 else
150 return request_irq(FLOPPY_IRQ, floppy_interrupt, 150 return request_irq(FLOPPY_IRQ, floppy_interrupt,
151 IRQF_DISABLED, "floppy", NULL); 151 0, "floppy", NULL);
152} 152}
153 153
154static unsigned long dma_mem_alloc(unsigned long size) 154static unsigned long dma_mem_alloc(unsigned long size)
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index be27ba1e947a..b4c1f5453436 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -110,26 +110,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
110static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, 110static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
111 u32 oldval, u32 newval) 111 u32 oldval, u32 newval)
112{ 112{
113 int ret = 0; 113 return user_atomic_cmpxchg_inatomic(uval, uaddr, oldval, newval);
114
115 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
116 return -EFAULT;
117
118 asm volatile("\t" ASM_STAC "\n"
119 "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
120 "2:\t" ASM_CLAC "\n"
121 "\t.section .fixup, \"ax\"\n"
122 "3:\tmov %3, %0\n"
123 "\tjmp 2b\n"
124 "\t.previous\n"
125 _ASM_EXTABLE(1b, 3b)
126 : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
127 : "i" (-EFAULT), "r" (newval), "1" (oldval)
128 : "memory"
129 );
130
131 *uval = oldval;
132 return ret;
133} 114}
134 115
135#endif 116#endif
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index ab0ae1aa6d0a..230853da4ec0 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -33,6 +33,9 @@ typedef struct {
33#ifdef CONFIG_X86_MCE_THRESHOLD 33#ifdef CONFIG_X86_MCE_THRESHOLD
34 unsigned int irq_threshold_count; 34 unsigned int irq_threshold_count;
35#endif 35#endif
36#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
37 unsigned int irq_hv_callback_count;
38#endif
36} ____cacheline_aligned irq_cpustat_t; 39} ____cacheline_aligned irq_cpustat_t;
37 40
38DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); 41DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h
new file mode 100644
index 000000000000..e8c58f88b1d4
--- /dev/null
+++ b/arch/x86/include/asm/hash.h
@@ -0,0 +1,7 @@
1#ifndef _ASM_X86_HASH_H
2#define _ASM_X86_HASH_H
3
4struct fast_hash_ops;
5extern void setup_arch_fast_hash(struct fast_hash_ops *ops);
6
7#endif /* _ASM_X86_HASH_H */
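
The header only forward-declares struct fast_hash_ops, so the override pattern has to be inferred. A hedged sketch, assuming the generic struct exposes hash/hash2 function pointers (the field and helper names below are assumptions, not quotes from this patch):

	/* Assumed helper signatures; the SSE4.2 CRC32 implementations are
	 * expected to live in arch/x86/lib/. */
	extern u32 intel_crc_hash(const void *data, u32 len, u32 seed);
	extern u32 intel_crc_hash2(const u32 *data, u32 len, u32 seed);

	void __init setup_arch_fast_hash(struct fast_hash_ops *ops)
	{
		if (cpu_has_xmm4_2) {	/* SSE4.2 CRC32 instruction available */
			ops->hash  = intel_crc_hash;	/* assumed field name */
			ops->hash2 = intel_crc_hash2;	/* assumed field name */
		}
	}
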
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index cba45d99ac1a..a307b7530e54 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -98,7 +98,6 @@ extern void trace_call_function_single_interrupt(void);
98#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) 98#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
99extern unsigned long io_apic_irqs; 99extern unsigned long io_apic_irqs;
100 100
101extern void init_VISWS_APIC_irqs(void);
102extern void setup_IO_APIC(void); 101extern void setup_IO_APIC(void);
103extern void disable_IO_APIC(void); 102extern void disable_IO_APIC(void);
104 103
@@ -191,6 +190,9 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
191#define trace_interrupt interrupt 190#define trace_interrupt interrupt
192#endif 191#endif
193 192
193#define VECTOR_UNDEFINED -1
194#define VECTOR_RETRIGGERED -2
195
194typedef int vector_irq_t[NR_VECTORS]; 196typedef int vector_irq_t[NR_VECTORS];
195DECLARE_PER_CPU(vector_irq_t, vector_irq); 197DECLARE_PER_CPU(vector_irq_t, vector_irq);
196extern void setup_vector_irq(int cpu); 198extern void setup_vector_irq(int cpu);
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index 459769d39263..e34e097b6f9d 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -51,10 +51,41 @@ struct devs_id {
51enum intel_mid_cpu_type { 51enum intel_mid_cpu_type {
52 /* 1 was Moorestown */ 52 /* 1 was Moorestown */
53 INTEL_MID_CPU_CHIP_PENWELL = 2, 53 INTEL_MID_CPU_CHIP_PENWELL = 2,
54 INTEL_MID_CPU_CHIP_CLOVERVIEW,
55 INTEL_MID_CPU_CHIP_TANGIER,
54}; 56};
55 57
56extern enum intel_mid_cpu_type __intel_mid_cpu_chip; 58extern enum intel_mid_cpu_type __intel_mid_cpu_chip;
57 59
60/**
61 * struct intel_mid_ops - Interface between intel-mid & sub archs
62 * @arch_setup: arch_setup function to re-initialize platform
63 * structures (x86_init, x86_platform_init)
64 *
65 * This structure can be extended if any new interface is required
66 * between intel-mid & its sub arch files.
67 */
68struct intel_mid_ops {
69 void (*arch_setup)(void);
70};
71
72/* Helper API's for INTEL_MID_OPS_INIT */
73#define DECLARE_INTEL_MID_OPS_INIT(cpuname, cpuid) \
74 [cpuid] = get_##cpuname##_ops
75
76/* Maximum number of CPU ops */
77#define MAX_CPU_OPS(a) (sizeof(a)/sizeof(void *))
78
79/*
80 * For every new cpu addition, a weak get_<cpuname>_ops() function needs be
81 * declared in arch/x86/platform/intel_mid/intel_mid_weak_decls.h.
82 */
83#define INTEL_MID_OPS_INIT {\
84 DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \
85 DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \
86 DECLARE_INTEL_MID_OPS_INIT(tangier, INTEL_MID_CPU_CHIP_TANGIER) \
87};
88
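
A minimal sketch of how a setup path can consume INTEL_MID_OPS_INIT (the array and variable names are illustrative; the real consumer lives under arch/x86/platform/intel-mid/):

	static void *(*get_mid_ops[])(void) = INTEL_MID_OPS_INIT;
	static struct intel_mid_ops *mid_ops;	/* illustrative */

	static void __init pick_mid_ops(void)
	{
		/* Index by detected chip, e.g. get_tangier_ops() for Tangier. */
		if (__intel_mid_cpu_chip < MAX_CPU_OPS(get_mid_ops) &&
		    get_mid_ops[__intel_mid_cpu_chip])
			mid_ops = get_mid_ops[__intel_mid_cpu_chip]();
	}
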
58#ifdef CONFIG_X86_INTEL_MID 89#ifdef CONFIG_X86_INTEL_MID
59 90
60static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void) 91static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void)
@@ -86,8 +117,21 @@ extern enum intel_mid_timer_options intel_mid_timer_options;
86 * Penwell uses spread spectrum clock, so the freq number is not exactly 117 * Penwell uses spread spectrum clock, so the freq number is not exactly
87 * the same as reported by MSR based on SDM. 118 * the same as reported by MSR based on SDM.
88 */ 119 */
89#define PENWELL_FSB_FREQ_83SKU 83200 120#define FSB_FREQ_83SKU 83200
90#define PENWELL_FSB_FREQ_100SKU 99840 121#define FSB_FREQ_100SKU 99840
122#define FSB_FREQ_133SKU 133000
123
124#define FSB_FREQ_167SKU 167000
125#define FSB_FREQ_200SKU 200000
126#define FSB_FREQ_267SKU 267000
127#define FSB_FREQ_333SKU 333000
128#define FSB_FREQ_400SKU 400000
129
130/* Bus Select SoC Fuse value */
131#define BSEL_SOC_FUSE_MASK 0x7
132#define BSEL_SOC_FUSE_001 0x1 /* FSB 133MHz */
133#define BSEL_SOC_FUSE_101 0x5 /* FSB 100MHz */
134#define BSEL_SOC_FUSE_111 0x7 /* FSB 83MHz */
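
The fuse decodes straightforwardly into an FSB SKU; a small sketch (the function name is illustrative, the mapping is taken from the comments above):

	static unsigned int bsel_to_fsb_khz(u32 fuse)
	{
		switch (fuse & BSEL_SOC_FUSE_MASK) {
		case BSEL_SOC_FUSE_001: return FSB_FREQ_133SKU;
		case BSEL_SOC_FUSE_101: return FSB_FREQ_100SKU;
		case BSEL_SOC_FUSE_111: return FSB_FREQ_83SKU;
		default:		return 0;	/* unrecognized fuse */
		}
	}
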
91 135
92#define SFI_MTMR_MAX_NUM 8 136#define SFI_MTMR_MAX_NUM 8
93#define SFI_MRTC_MAX 8 137#define SFI_MRTC_MAX 8
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 34f69cb9350a..b8237d8a1e0c 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -39,6 +39,7 @@
39#include <linux/string.h> 39#include <linux/string.h>
40#include <linux/compiler.h> 40#include <linux/compiler.h>
41#include <asm/page.h> 41#include <asm/page.h>
42#include <asm/early_ioremap.h>
42 43
43#define build_mmio_read(name, size, type, reg, barrier) \ 44#define build_mmio_read(name, size, type, reg, barrier) \
44static inline type name(const volatile void __iomem *addr) \ 45static inline type name(const volatile void __iomem *addr) \
@@ -237,7 +238,7 @@ memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
237 238
238static inline void flush_write_buffers(void) 239static inline void flush_write_buffers(void)
239{ 240{
240#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) 241#if defined(CONFIG_X86_PPRO_FENCE)
241 asm volatile("lock; addl $0,0(%%esp)": : :"memory"); 242 asm volatile("lock; addl $0,0(%%esp)": : :"memory");
242#endif 243#endif
243} 244}
@@ -316,19 +317,6 @@ extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
316 unsigned long prot_val); 317 unsigned long prot_val);
317extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); 318extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
318 319
319/*
320 * early_ioremap() and early_iounmap() are for temporary early boot-time
321 * mappings, before the real ioremap() is functional.
322 * A boot-time mapping is currently limited to at most 16 pages.
323 */
324extern void early_ioremap_init(void);
325extern void early_ioremap_reset(void);
326extern void __iomem *early_ioremap(resource_size_t phys_addr,
327 unsigned long size);
328extern void __iomem *early_memremap(resource_size_t phys_addr,
329 unsigned long size);
330extern void early_iounmap(void __iomem *addr, unsigned long size);
331extern void fixup_early_ioremap(void);
332extern bool is_early_ioremap_ptep(pte_t *ptep); 320extern bool is_early_ioremap_ptep(pte_t *ptep);
333 321
334#ifdef CONFIG_XEN 322#ifdef CONFIG_XEN
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
new file mode 100644
index 000000000000..8e71c7941767
--- /dev/null
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -0,0 +1,90 @@
1/*
2 * iosf_mbi.h: Intel OnChip System Fabric MailBox access support
3 */
4
5#ifndef IOSF_MBI_SYMS_H
6#define IOSF_MBI_SYMS_H
7
8#define MBI_MCR_OFFSET 0xD0
9#define MBI_MDR_OFFSET 0xD4
10#define MBI_MCRX_OFFSET 0xD8
11
12#define MBI_RD_MASK 0xFEFFFFFF
13#define MBI_WR_MASK 0x01000000
14
15#define MBI_MASK_HI 0xFFFFFF00
16#define MBI_MASK_LO 0x000000FF
17#define MBI_ENABLE 0xF0
18
19/* Baytrail available units */
20#define BT_MBI_UNIT_AUNIT 0x00
21#define BT_MBI_UNIT_SMC 0x01
22#define BT_MBI_UNIT_CPU 0x02
23#define BT_MBI_UNIT_BUNIT 0x03
24#define BT_MBI_UNIT_PMC 0x04
25#define BT_MBI_UNIT_GFX 0x06
26#define BT_MBI_UNIT_SMI 0x0C
27#define BT_MBI_UNIT_USB 0x43
28#define BT_MBI_UNIT_SATA 0xA3
29#define BT_MBI_UNIT_PCIE 0xA6
30
31/* Baytrail read/write opcodes */
32#define BT_MBI_AUNIT_READ 0x10
33#define BT_MBI_AUNIT_WRITE 0x11
34#define BT_MBI_SMC_READ 0x10
35#define BT_MBI_SMC_WRITE 0x11
36#define BT_MBI_CPU_READ 0x10
37#define BT_MBI_CPU_WRITE 0x11
38#define BT_MBI_BUNIT_READ 0x10
39#define BT_MBI_BUNIT_WRITE 0x11
40#define BT_MBI_PMC_READ 0x06
41#define BT_MBI_PMC_WRITE 0x07
42#define BT_MBI_GFX_READ 0x00
43#define BT_MBI_GFX_WRITE 0x01
44#define BT_MBI_SMIO_READ 0x06
45#define BT_MBI_SMIO_WRITE 0x07
46#define BT_MBI_USB_READ 0x06
47#define BT_MBI_USB_WRITE 0x07
48#define BT_MBI_SATA_READ 0x00
49#define BT_MBI_SATA_WRITE 0x01
50#define BT_MBI_PCIE_READ 0x00
51#define BT_MBI_PCIE_WRITE 0x01
52
53/**
54 * iosf_mbi_read() - MailBox Interface read command
55 * @port: port indicating subunit being accessed
56 * @opcode: port specific read or write opcode
57 * @offset: register address offset
58 * @mdr: register data to be read
59 *
60 * Locking is handled by spinlock - cannot sleep.
61 * Return: Nonzero on error
62 */
63int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr);
64
65/**
66 * iosf_mbi_write() - MailBox unmasked write command
67 * @port: port indicating subunit being accessed
68 * @opcode: port specific read or write opcode
69 * @offset: register address offset
70 * @mdr: register data to be written
71 *
72 * Locking is handled by spinlock - cannot sleep.
73 * Return: Nonzero on error
74 */
75int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr);
76
77/**
78 * iosf_mbi_modify() - MailBox masked write command
79 * @port: port indicating subunit being accessed
80 * @opcode: port specific read or write opcode
81 * @offset: register address offset
82 * @mdr: register data being modified
83 * @mask: mask indicating bits in mdr to be modified
84 *
85 * Locking is handled by spinlock - cannot sleep.
86 * Return: Nonzero on error
87 */
88int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask);
89
90#endif /* IOSF_MBI_SYMS_H */
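
The three accessors compose into the usual read-modify-write idiom; a hedged sketch (the PMC unit, register offset 0x3c and bit 0 are placeholders, not from a real driver):

	static int pmc_set_bit0(void)
	{
		u32 mdr;

		if (iosf_mbi_read(BT_MBI_UNIT_PMC, BT_MBI_PMC_READ, 0x3c, &mdr))
			return -EIO;	/* mailbox access failed */
		mdr |= BIT(0);
		return iosf_mbi_write(BT_MBI_UNIT_PMC, BT_MBI_PMC_WRITE,
				      0x3c, mdr);
		/* iosf_mbi_modify(BT_MBI_UNIT_PMC, BT_MBI_PMC_WRITE, 0x3c,
		 *		   BIT(0), BIT(0))
		 * collapses the read/write pair into one masked operation. */
	}
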
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 0ea10f27d613..cb6cfcd034cf 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -25,6 +25,7 @@ extern void irq_ctx_init(int cpu);
25 25
26#ifdef CONFIG_HOTPLUG_CPU 26#ifdef CONFIG_HOTPLUG_CPU
27#include <linux/cpumask.h> 27#include <linux/cpumask.h>
28extern int check_irq_vectors_for_cpu_disable(void);
28extern void fixup_irqs(void); 29extern void fixup_irqs(void);
29extern void irq_force_complete_move(int); 30extern void irq_force_complete_move(int);
30#endif 31#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ae5d7830855c..fcaf9c961265 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -337,6 +337,11 @@ struct kvm_pmu {
337 u64 reprogram_pmi; 337 u64 reprogram_pmi;
338}; 338};
339 339
340enum {
341 KVM_DEBUGREG_BP_ENABLED = 1,
342 KVM_DEBUGREG_WONT_EXIT = 2,
343};
344
340struct kvm_vcpu_arch { 345struct kvm_vcpu_arch {
341 /* 346 /*
342 * rip and regs accesses must go through 347 * rip and regs accesses must go through
@@ -444,7 +449,6 @@ struct kvm_vcpu_arch {
444 } st; 449 } st;
445 450
446 u64 last_guest_tsc; 451 u64 last_guest_tsc;
447 u64 last_kernel_ns;
448 u64 last_host_tsc; 452 u64 last_host_tsc;
449 u64 tsc_offset_adjustment; 453 u64 tsc_offset_adjustment;
450 u64 this_tsc_nsec; 454 u64 this_tsc_nsec;
@@ -464,7 +468,7 @@ struct kvm_vcpu_arch {
464 struct mtrr_state_type mtrr_state; 468 struct mtrr_state_type mtrr_state;
465 u32 pat; 469 u32 pat;
466 470
467 int switch_db_regs; 471 unsigned switch_db_regs;
468 unsigned long db[KVM_NR_DB_REGS]; 472 unsigned long db[KVM_NR_DB_REGS];
469 unsigned long dr6; 473 unsigned long dr6;
470 unsigned long dr7; 474 unsigned long dr7;
@@ -599,12 +603,15 @@ struct kvm_arch {
599 bool use_master_clock; 603 bool use_master_clock;
600 u64 master_kernel_ns; 604 u64 master_kernel_ns;
601 cycle_t master_cycle_now; 605 cycle_t master_cycle_now;
606 struct delayed_work kvmclock_update_work;
607 struct delayed_work kvmclock_sync_work;
602 608
603 struct kvm_xen_hvm_config xen_hvm_config; 609 struct kvm_xen_hvm_config xen_hvm_config;
604 610
605 /* fields used by HYPER-V emulation */ 611 /* fields used by HYPER-V emulation */
606 u64 hv_guest_os_id; 612 u64 hv_guest_os_id;
607 u64 hv_hypercall; 613 u64 hv_hypercall;
614 u64 hv_tsc_page;
608 615
609 #ifdef CONFIG_KVM_MMU_AUDIT 616 #ifdef CONFIG_KVM_MMU_AUDIT
610 int audit_point; 617 int audit_point;
@@ -699,6 +706,9 @@ struct kvm_x86_ops {
699 void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); 706 void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
700 void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); 707 void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
701 void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); 708 void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
709 u64 (*get_dr6)(struct kvm_vcpu *vcpu);
710 void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
711 void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
702 void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); 712 void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
703 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); 713 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
704 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 714 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
@@ -725,8 +735,8 @@ struct kvm_x86_ops {
725 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 735 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
726 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); 736 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
727 void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); 737 void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
728 int (*enable_nmi_window)(struct kvm_vcpu *vcpu); 738 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
729 int (*enable_irq_window)(struct kvm_vcpu *vcpu); 739 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
730 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 740 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
731 int (*vm_has_apicv)(struct kvm *kvm); 741 int (*vm_has_apicv)(struct kvm *kvm);
732 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); 742 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
@@ -762,6 +772,9 @@ struct kvm_x86_ops {
762 struct x86_instruction_info *info, 772 struct x86_instruction_info *info,
763 enum x86_intercept_stage stage); 773 enum x86_intercept_stage stage);
764 void (*handle_external_intr)(struct kvm_vcpu *vcpu); 774 void (*handle_external_intr)(struct kvm_vcpu *vcpu);
775 bool (*mpx_supported)(void);
776
777 int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
765}; 778};
766 779
767struct kvm_arch_async_pf { 780struct kvm_arch_async_pf {
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 1df115909758..c7678e43465b 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -85,28 +85,9 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
85 return ret; 85 return ret;
86} 86}
87 87
88static inline uint32_t kvm_cpuid_base(void)
89{
90 if (boot_cpu_data.cpuid_level < 0)
91 return 0; /* So we don't blow up on old processors */
92
93 if (cpu_has_hypervisor)
94 return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
95
96 return 0;
97}
98
99static inline bool kvm_para_available(void)
100{
101 return kvm_cpuid_base() != 0;
102}
103
104static inline unsigned int kvm_arch_para_features(void)
105{
106 return cpuid_eax(KVM_CPUID_FEATURES);
107}
108
109#ifdef CONFIG_KVM_GUEST 88#ifdef CONFIG_KVM_GUEST
89bool kvm_para_available(void);
90unsigned int kvm_arch_para_features(void);
110void __init kvm_guest_init(void); 91void __init kvm_guest_init(void);
111void kvm_async_pf_task_wait(u32 token); 92void kvm_async_pf_task_wait(u32 token);
112void kvm_async_pf_task_wake(u32 token); 93void kvm_async_pf_task_wake(u32 token);
@@ -126,6 +107,16 @@ static inline void kvm_spinlock_init(void)
126#define kvm_async_pf_task_wait(T) do {} while(0) 107#define kvm_async_pf_task_wait(T) do {} while(0)
127#define kvm_async_pf_task_wake(T) do {} while(0) 108#define kvm_async_pf_task_wake(T) do {} while(0)
128 109
110static inline bool kvm_para_available(void)
111{
112 return false;
113}
114
115static inline unsigned int kvm_arch_para_features(void)
116{
117 return 0;
118}
119
129static inline u32 kvm_read_and_reset_pf_reason(void) 120static inline u32 kvm_read_and_reset_pf_reason(void)
130{ 121{
131 return 0; 122 return 0;
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c696a8687567..6e4ce2df87cf 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -118,7 +118,6 @@ extern void mce_register_decode_chain(struct notifier_block *nb);
118extern void mce_unregister_decode_chain(struct notifier_block *nb); 118extern void mce_unregister_decode_chain(struct notifier_block *nb);
119 119
120#include <linux/percpu.h> 120#include <linux/percpu.h>
121#include <linux/init.h>
122#include <linux/atomic.h> 121#include <linux/atomic.h>
123 122
124extern int mce_p5_enabled; 123extern int mce_p5_enabled;
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index f98bd6625318..b59827e76529 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -1,6 +1,21 @@
1#ifndef _ASM_X86_MICROCODE_H 1#ifndef _ASM_X86_MICROCODE_H
2#define _ASM_X86_MICROCODE_H 2#define _ASM_X86_MICROCODE_H
3 3
4#define native_rdmsr(msr, val1, val2) \
5do { \
6 u64 __val = native_read_msr((msr)); \
7 (void)((val1) = (u32)__val); \
8 (void)((val2) = (u32)(__val >> 32)); \
9} while (0)
10
11#define native_wrmsr(msr, low, high) \
12 native_write_msr(msr, low, high)
13
14#define native_wrmsrl(msr, val) \
15 native_write_msr((msr), \
16 (u32)((u64)(val)), \
17 (u32)((u64)(val) >> 32))
18
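
These wrappers let the early microcode loader touch MSRs before the paravirt machinery is up. A sketch of the typical revision check (MSR 0x8b is IA32_BIOS_SIGN_ID, written numerically here for illustration; the revision is reported in the high half):

	u32 dummy, rev;

	native_wrmsrl(0x8b, 0);		/* clear the reported revision */
	sync_core();			/* serialize before re-reading */
	native_rdmsr(0x8b, dummy, rev);	/* rev now holds the ucode revision */
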
4struct cpu_signature { 19struct cpu_signature {
5 unsigned int sig; 20 unsigned int sig;
6 unsigned int pf; 21 unsigned int pf;
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index 4c019179a57d..b7b10b82d3e5 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -61,11 +61,10 @@ extern int __apply_microcode_amd(struct microcode_amd *mc_amd);
61extern int apply_microcode_amd(int cpu); 61extern int apply_microcode_amd(int cpu);
62extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); 62extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
63 63
64#define PATCH_MAX_SIZE PAGE_SIZE
65extern u8 amd_ucode_patch[PATCH_MAX_SIZE];
66
64#ifdef CONFIG_MICROCODE_AMD_EARLY 67#ifdef CONFIG_MICROCODE_AMD_EARLY
65#ifdef CONFIG_X86_32
66#define MPB_MAX_SIZE PAGE_SIZE
67extern u8 amd_bsp_mpb[MPB_MAX_SIZE];
68#endif
69extern void __init load_ucode_amd_bsp(void); 68extern void __init load_ucode_amd_bsp(void);
70extern void load_ucode_amd_ap(void); 69extern void load_ucode_amd_ap(void);
71extern int __init save_microcode_in_initrd_amd(void); 70extern int __init save_microcode_in_initrd_amd(void);
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 8a9b3e288cb4..1ec990bd7dc0 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -11,9 +11,6 @@
11#ifdef CONFIG_NUMA 11#ifdef CONFIG_NUMA
12extern struct pglist_data *node_data[]; 12extern struct pglist_data *node_data[];
13#define NODE_DATA(nid) (node_data[nid]) 13#define NODE_DATA(nid) (node_data[nid])
14
15#include <asm/numaq.h>
16
17#endif /* CONFIG_NUMA */ 14#endif /* CONFIG_NUMA */
18 15
19#ifdef CONFIG_DISCONTIGMEM 16#ifdef CONFIG_DISCONTIGMEM
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 3142a94c7b4b..f5a617956735 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -1,7 +1,6 @@
1#ifndef _ASM_X86_MPSPEC_H 1#ifndef _ASM_X86_MPSPEC_H
2#define _ASM_X86_MPSPEC_H 2#define _ASM_X86_MPSPEC_H
3 3
4#include <linux/init.h>
5 4
6#include <asm/mpspec_def.h> 5#include <asm/mpspec_def.h>
7#include <asm/x86_init.h> 6#include <asm/x86_init.h>
@@ -26,12 +25,6 @@ extern int pic_mode;
26 25
27extern unsigned int def_to_bigsmp; 26extern unsigned int def_to_bigsmp;
28 27
29#ifdef CONFIG_X86_NUMAQ
30extern int mp_bus_id_to_node[MAX_MP_BUSSES];
31extern int mp_bus_id_to_local[MAX_MP_BUSSES];
32extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
33#endif
34
35#else /* CONFIG_X86_64: */ 28#else /* CONFIG_X86_64: */
36 29
37#define MAX_MP_BUSSES 256 30#define MAX_MP_BUSSES 256
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index cd9c41938b8a..c163215abb9a 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_MSHYPER_H 2#define _ASM_X86_MSHYPER_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/interrupt.h>
5#include <asm/hyperv.h> 6#include <asm/hyperv.h>
6 7
7struct ms_hyperv_info { 8struct ms_hyperv_info {
@@ -16,6 +17,7 @@ void hyperv_callback_vector(void);
16#define trace_hyperv_callback_vector hyperv_callback_vector 17#define trace_hyperv_callback_vector hyperv_callback_vector
17#endif 18#endif
18void hyperv_vector_handler(struct pt_regs *regs); 19void hyperv_vector_handler(struct pt_regs *regs);
19void hv_register_vmbus_handler(int irq, irq_handler_t handler); 20void hv_setup_vmbus_irq(void (*handler)(void));
21void hv_remove_vmbus_irq(void);
20 22
21#endif 23#endif
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index e139b13f2a33..de36f22eb0b9 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -214,6 +214,8 @@ do { \
214 214
215struct msr *msrs_alloc(void); 215struct msr *msrs_alloc(void);
216void msrs_free(struct msr *msrs); 216void msrs_free(struct msr *msrs);
217int msr_set_bit(u32 msr, u8 bit);
218int msr_clear_bit(u32 msr, u8 bit);
217 219
218#ifdef CONFIG_SMP 220#ifdef CONFIG_SMP
219int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 221int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 2f366d0ac6b4..1da25a5f96f9 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_MWAIT_H 1#ifndef _ASM_X86_MWAIT_H
2#define _ASM_X86_MWAIT_H 2#define _ASM_X86_MWAIT_H
3 3
4#include <linux/sched.h>
5
4#define MWAIT_SUBSTATE_MASK 0xf 6#define MWAIT_SUBSTATE_MASK 0xf
5#define MWAIT_CSTATE_MASK 0xf 7#define MWAIT_CSTATE_MASK 0xf
6#define MWAIT_SUBSTATE_SIZE 4 8#define MWAIT_SUBSTATE_SIZE 4
@@ -13,4 +15,45 @@
13 15
14#define MWAIT_ECX_INTERRUPT_BREAK 0x1 16#define MWAIT_ECX_INTERRUPT_BREAK 0x1
15 17
18static inline void __monitor(const void *eax, unsigned long ecx,
19 unsigned long edx)
20{
21 /* "monitor %eax, %ecx, %edx;" */
22 asm volatile(".byte 0x0f, 0x01, 0xc8;"
23 :: "a" (eax), "c" (ecx), "d"(edx));
24}
25
26static inline void __mwait(unsigned long eax, unsigned long ecx)
27{
28 /* "mwait %eax, %ecx;" */
29 asm volatile(".byte 0x0f, 0x01, 0xc9;"
30 :: "a" (eax), "c" (ecx));
31}
32
33/*
34 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
35 * which can obviate IPI to trigger checking of need_resched.
36 * We execute MONITOR against need_resched and enter optimized wait state
37 * through MWAIT. Whenever someone changes need_resched, we would be woken
38 * up from MWAIT (without an IPI).
39 *
40 * New with Core Duo processors, MWAIT can take some hints based on CPU
41 * capability.
42 */
43static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
44{
45 if (!current_set_polling_and_test()) {
46 if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
47 mb();
48 clflush((void *)&current_thread_info()->flags);
49 mb();
50 }
51
52 __monitor((void *)&current_thread_info()->flags, 0, 0);
53 if (!need_resched())
54 __mwait(eax, ecx);
55 }
56 current_clr_polling();
57}
58
16#endif /* _ASM_X86_MWAIT_H */ 59#endif /* _ASM_X86_MWAIT_H */
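
Callers encode the target C-state in EAX and pass break-out flags in ECX; a one-line usage sketch (the 0x00 hint requests the shallowest state and is illustrative):

	/* Idle until woken; break out on interrupt even if IF is clear. */
	mwait_idle_with_hints(0x00, MWAIT_ECX_INTERRUPT_BREAK);
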
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 86f9301903c8..5f2fc4441b11 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_NMI_H 1#ifndef _ASM_X86_NMI_H
2#define _ASM_X86_NMI_H 2#define _ASM_X86_NMI_H
3 3
4#include <linux/irq_work.h>
4#include <linux/pm.h> 5#include <linux/pm.h>
5#include <asm/irq.h> 6#include <asm/irq.h>
6#include <asm/io.h> 7#include <asm/io.h>
@@ -38,6 +39,8 @@ typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *);
38struct nmiaction { 39struct nmiaction {
39 struct list_head list; 40 struct list_head list;
40 nmi_handler_t handler; 41 nmi_handler_t handler;
42 u64 max_duration;
43 struct irq_work irq_work;
41 unsigned long flags; 44 unsigned long flags;
42 const char *name; 45 const char *name;
43}; 46};
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
deleted file mode 100644
index c3b3c322fd87..000000000000
--- a/arch/x86/include/asm/numaq.h
+++ /dev/null
@@ -1,171 +0,0 @@
1/*
2 * Written by: Patricia Gaughen, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * Send feedback to <gone@us.ibm.com>
24 */
25
26#ifndef _ASM_X86_NUMAQ_H
27#define _ASM_X86_NUMAQ_H
28
29#ifdef CONFIG_X86_NUMAQ
30
31extern int found_numaq;
32extern int numaq_numa_init(void);
33extern int pci_numaq_init(void);
34
35extern void *xquad_portio;
36
37#define XQUAD_PORTIO_BASE 0xfe400000
38#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
39#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
40
41/*
42 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
43 */
44#define SYS_CFG_DATA_PRIV_ADDR 0x0009d000 /* place for scd in private
45 quad space */
46
47/*
48 * Communication area for each processor on lynxer-processor tests.
49 *
50 * NOTE: If you change the size of this eachproc structure you need
51 * to change the definition for EACH_QUAD_SIZE.
52 */
53struct eachquadmem {
54 unsigned int priv_mem_start; /* Starting address of this */
55 /* quad's private memory. */
56 /* This is always 0. */
57 /* In MB. */
58 unsigned int priv_mem_size; /* Size of this quad's */
59 /* private memory. */
60 /* In MB. */
61 unsigned int low_shrd_mem_strp_start;/* Starting address of this */
62 /* quad's low shared block */
63 /* (untranslated). */
64 /* In MB. */
65 unsigned int low_shrd_mem_start; /* Starting address of this */
66 /* quad's low shared memory */
67 /* (untranslated). */
68 /* In MB. */
69 unsigned int low_shrd_mem_size; /* Size of this quad's low */
70 /* shared memory. */
71 /* In MB. */
72 unsigned int lmmio_copb_start; /* Starting address of this */
73 /* quad's local memory */
74 /* mapped I/O in the */
75 /* compatibility OPB. */
76 /* In MB. */
77 unsigned int lmmio_copb_size; /* Size of this quad's local */
78 /* memory mapped I/O in the */
79 /* compatibility OPB. */
80 /* In MB. */
81 unsigned int lmmio_nopb_start; /* Starting address of this */
82 /* quad's local memory */
83 /* mapped I/O in the */
84 /* non-compatibility OPB. */
85 /* In MB. */
86 unsigned int lmmio_nopb_size; /* Size of this quad's local */
87 /* memory mapped I/O in the */
88 /* non-compatibility OPB. */
89 /* In MB. */
90 unsigned int io_apic_0_start; /* Starting address of I/O */
91 /* APIC 0. */
92 unsigned int io_apic_0_sz; /* Size I/O APIC 0. */
93 unsigned int io_apic_1_start; /* Starting address of I/O */
94 /* APIC 1. */
95 unsigned int io_apic_1_sz; /* Size I/O APIC 1. */
96 unsigned int hi_shrd_mem_start; /* Starting address of this */
97 /* quad's high shared memory.*/
98 /* In MB. */
99 unsigned int hi_shrd_mem_size; /* Size of this quad's high */
100 /* shared memory. */
101 /* In MB. */
102 unsigned int mps_table_addr; /* Address of this quad's */
103 /* MPS tables from BIOS, */
104 /* in system space.*/
105 unsigned int lcl_MDC_pio_addr; /* Port-I/O address for */
106 /* local access of MDC. */
107 unsigned int rmt_MDC_mmpio_addr; /* MM-Port-I/O address for */
108 /* remote access of MDC. */
109 unsigned int mm_port_io_start; /* Starting address of this */
110 /* quad's memory mapped Port */
111 /* I/O space. */
112 unsigned int mm_port_io_size; /* Size of this quad's memory*/
113 /* mapped Port I/O space. */
114 unsigned int mm_rmt_io_apic_start; /* Starting address of this */
115 /* quad's memory mapped */
116 /* remote I/O APIC space. */
117 unsigned int mm_rmt_io_apic_size; /* Size of this quad's memory*/
118 /* mapped remote I/O APIC */
119 /* space. */
120 unsigned int mm_isa_start; /* Starting address of this */
121 /* quad's memory mapped ISA */
122 /* space (contains MDC */
123 /* memory space). */
124 unsigned int mm_isa_size; /* Size of this quad's memory*/
125 /* mapped ISA space (contains*/
126 /* MDC memory space). */
127 unsigned int rmt_qmi_addr; /* Remote addr to access QMI.*/
128 unsigned int lcl_qmi_addr; /* Local addr to access QMI. */
129};
130
131/*
132 * Note: This structure must be NOT be changed unless the multiproc and
133 * OS are changed to reflect the new structure.
134 */
135struct sys_cfg_data {
136 unsigned int quad_id;
137 unsigned int bsp_proc_id; /* Boot Strap Processor in this quad. */
138 unsigned int scd_version; /* Version number of this table. */
139 unsigned int first_quad_id;
140 unsigned int quads_present31_0; /* 1 bit for each quad */
141 unsigned int quads_present63_32; /* 1 bit for each quad */
142 unsigned int config_flags;
143 unsigned int boot_flags;
144 unsigned int csr_start_addr; /* Absolute value (not in MB) */
145 unsigned int csr_size; /* Absolute value (not in MB) */
146 unsigned int lcl_apic_start_addr; /* Absolute value (not in MB) */
147 unsigned int lcl_apic_size; /* Absolute value (not in MB) */
148 unsigned int low_shrd_mem_base; /* 0 or 512MB or 1GB */
149 unsigned int low_shrd_mem_quad_offset; /* 0,128M,256M,512M,1G */
150 /* may not be totally populated */
151 unsigned int split_mem_enbl; /* 0 for no low shared memory */
152 unsigned int mmio_sz; /* Size of total system memory mapped I/O */
153 /* (in MB). */
154 unsigned int quad_spin_lock; /* Spare location used for quad */
155 /* bringup. */
156 unsigned int nonzero55; /* For checksumming. */
157 unsigned int nonzeroaa; /* For checksumming. */
158 unsigned int scd_magic_number;
159 unsigned int system_type;
160 unsigned int checksum;
161 /*
162 * memory configuration area for each quad
163 */
164 struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */
165};
166
167void numaq_tsc_disable(void);
168
169#endif /* CONFIG_X86_NUMAQ */
170#endif /* _ASM_X86_NUMAQ_H */
171
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index c87892442e53..775873d3be55 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -71,6 +71,7 @@ extern bool __virt_addr_valid(unsigned long kaddr);
71#include <asm-generic/getorder.h> 71#include <asm-generic/getorder.h>
72 72
73#define __HAVE_ARCH_GATE_AREA 1 73#define __HAVE_ARCH_GATE_AREA 1
74#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
74 75
75#endif /* __KERNEL__ */ 76#endif /* __KERNEL__ */
76#endif /* _ASM_X86_PAGE_H */ 77#endif /* _ASM_X86_PAGE_H */
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index 4d550d04b609..904f528cc8e8 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -5,10 +5,6 @@
5 5
6#ifndef __ASSEMBLY__ 6#ifndef __ASSEMBLY__
7 7
8#ifdef CONFIG_HUGETLB_PAGE
9#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
10#endif
11
12#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET) 8#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET)
13#ifdef CONFIG_DEBUG_VIRTUAL 9#ifdef CONFIG_DEBUG_VIRTUAL
14extern unsigned long __phys_addr(unsigned long); 10extern unsigned long __phys_addr(unsigned long);
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 43dcd804ebd5..8de6d9cf3b95 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -39,9 +39,18 @@
39#define __VIRTUAL_MASK_SHIFT 47 39#define __VIRTUAL_MASK_SHIFT 47
40 40
41/* 41/*
42 * Kernel image size is limited to 512 MB (see level2_kernel_pgt in 42 * Kernel image size is limited to 1GiB due to the fixmap living in the
43 * arch/x86/kernel/head_64.S), and it is mapped here: 43 * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
44 * 512MiB by default, leaving 1.5GiB for modules once the page tables
45 * are fully set up. If kernel ASLR is configured, it can extend the
46 * kernel page table mapping, reducing the size of the modules area.
44 */ 47 */
45#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) 48#define KERNEL_IMAGE_SIZE_DEFAULT (512 * 1024 * 1024)
49#if defined(CONFIG_RANDOMIZE_BASE) && \
50 CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT
51#define KERNEL_IMAGE_SIZE CONFIG_RANDOMIZE_BASE_MAX_OFFSET
52#else
53#define KERNEL_IMAGE_SIZE KERNEL_IMAGE_SIZE_DEFAULT
54#endif
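
A quick worked example of the trade-off described above (the ~2GiB total follows from the comment's 512MiB image + 1.5GiB modules split):

	default:            KERNEL_IMAGE_SIZE = 512MiB, modules get 2GiB - 512MiB = 1.5GiB
	ASLR, 1GiB offset:  KERNEL_IMAGE_SIZE = 1GiB,   modules get 2GiB - 1GiB   = 1GiB
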
46 55
47#endif /* _ASM_X86_PAGE_64_DEFS_H */ 56#endif /* _ASM_X86_PAGE_64_DEFS_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 401f350ef71b..cd6e1610e29e 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -781,9 +781,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
781 */ 781 */
782#define PV_CALLEE_SAVE_REGS_THUNK(func) \ 782#define PV_CALLEE_SAVE_REGS_THUNK(func) \
783 extern typeof(func) __raw_callee_save_##func; \ 783 extern typeof(func) __raw_callee_save_##func; \
784 static void *__##func##__ __used = func; \
785 \ 784 \
786 asm(".pushsection .text;" \ 785 asm(".pushsection .text;" \
786 ".globl __raw_callee_save_" #func " ; " \
787 "__raw_callee_save_" #func ": " \ 787 "__raw_callee_save_" #func ": " \
788 PV_SAVE_ALL_CALLER_REGS \ 788 PV_SAVE_ALL_CALLER_REGS \
789 "call " #func ";" \ 789 "call " #func ";" \
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index aab8f671b523..7549b8b369e4 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -388,10 +388,11 @@ extern struct pv_lock_ops pv_lock_ops;
388 _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") 388 _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
389 389
390/* Simple instruction patching code. */ 390/* Simple instruction patching code. */
391#define DEF_NATIVE(ops, name, code) \ 391#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
392 extern const char start_##ops##_##name[] __visible, \ 392
393 end_##ops##_##name[] __visible; \ 393#define DEF_NATIVE(ops, name, code) \
394 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") 394 __visible extern const char start_##ops##_##name[], end_##ops##_##name[]; \
395 asm(NATIVE_LABEL("start_", ops, name) code NATIVE_LABEL("end_", ops, name))
395 396
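
With the .globl directive folded into the label, an expansion such as DEF_NATIVE(pv_irq_ops, irq_disable, "cli") (an existing user of this macro) now produces roughly:

	__visible extern const char start_pv_irq_ops_irq_disable[],
				    end_pv_irq_ops_irq_disable[];
	asm("\n\t.globl start_pv_irq_ops_irq_disable\n"
	    "start_pv_irq_ops_irq_disable:\n\t"
	    "cli"
	    "\n\t.globl end_pv_irq_ops_irq_disable\n"
	    "end_pv_irq_ops_irq_disable:\n\t");
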
396unsigned paravirt_patch_nop(void); 397unsigned paravirt_patch_nop(void);
397unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); 398unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 947b5c417e83..96ae4f4040bb 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -26,11 +26,6 @@ extern int pci_routeirq;
26extern int noioapicquirk; 26extern int noioapicquirk;
27extern int noioapicreroute; 27extern int noioapicreroute;
28 28
29/* scan a bus after allocating a pci_sysdata for it */
30extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
31 int node);
32extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
33
34#ifdef CONFIG_PCI 29#ifdef CONFIG_PCI
35 30
36#ifdef CONFIG_PCI_DOMAINS 31#ifdef CONFIG_PCI_DOMAINS
@@ -70,7 +65,7 @@ extern unsigned long pci_mem_start;
70 65
71extern int pcibios_enabled; 66extern int pcibios_enabled;
72void pcibios_config_init(void); 67void pcibios_config_init(void);
73struct pci_bus *pcibios_scan_root(int bus); 68void pcibios_scan_root(int bus);
74 69
75void pcibios_set_master(struct pci_dev *dev); 70void pcibios_set_master(struct pci_dev *dev);
76void pcibios_penalize_isa_irq(int irq, int active); 71void pcibios_penalize_isa_irq(int irq, int active);
@@ -104,7 +99,7 @@ extern void pci_iommu_alloc(void);
104struct msi_desc; 99struct msi_desc;
105int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); 100int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
106void native_teardown_msi_irq(unsigned int irq); 101void native_teardown_msi_irq(unsigned int irq);
107void native_restore_msi_irqs(struct pci_dev *dev, int irq); 102void native_restore_msi_irqs(struct pci_dev *dev);
108int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, 103int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
109 unsigned int irq_base, unsigned int irq_offset); 104 unsigned int irq_base, unsigned int irq_offset);
110#else 105#else
@@ -125,7 +120,6 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
125 120
126/* generic pci stuff */ 121/* generic pci stuff */
127#include <asm-generic/pci.h> 122#include <asm-generic/pci.h>
128#define PCIBIOS_MAX_MEM_32 0xffffffff
129 123
130#ifdef CONFIG_NUMA 124#ifdef CONFIG_NUMA
131/* Returns the node based on pci bus */ 125/* Returns the node based on pci bus */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 94220d14d5cc..851bcdc5db04 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -52,7 +52,7 @@
52 * Compared to the generic __my_cpu_offset version, the following 52 * Compared to the generic __my_cpu_offset version, the following
53 * saves one instruction and avoids clobbering a temp register. 53 * saves one instruction and avoids clobbering a temp register.
54 */ 54 */
55#define __this_cpu_ptr(ptr) \ 55#define raw_cpu_ptr(ptr) \
56({ \ 56({ \
57 unsigned long tcp_ptr__; \ 57 unsigned long tcp_ptr__; \
58 __verify_pcpu_ptr(ptr); \ 58 __verify_pcpu_ptr(ptr); \
@@ -362,25 +362,25 @@ do { \
362 */ 362 */
363#define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) 363#define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var)))
364 364
365#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 365#define raw_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
366#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 366#define raw_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
367#define __this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 367#define raw_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
368 368
369#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) 369#define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
370#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) 370#define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
371#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) 371#define raw_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
372#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val) 372#define raw_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
373#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val) 373#define raw_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
374#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val) 374#define raw_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
375#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) 375#define raw_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
376#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) 376#define raw_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
377#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) 377#define raw_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
378#define __this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) 378#define raw_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
379#define __this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) 379#define raw_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
380#define __this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) 380#define raw_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
381#define __this_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) 381#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val)
382#define __this_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) 382#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val)
383#define __this_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) 383#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val)
384 384
385#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 385#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
386#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 386#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -401,16 +401,16 @@ do { \
401#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) 401#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
402#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) 402#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
403 403
404#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) 404#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
405#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) 405#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
406#define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) 406#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
407#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 407#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
408#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 408#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
409#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 409#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
410 410
411#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) 411#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
412#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) 412#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
413#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) 413#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
414#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 414#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
415#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 415#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
416#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 416#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
@@ -427,7 +427,7 @@ do { \
427 __ret; \ 427 __ret; \
428}) 428})
429 429
430#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double 430#define raw_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
431#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double 431#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
432#endif /* CONFIG_X86_CMPXCHG64 */ 432#endif /* CONFIG_X86_CMPXCHG64 */
433 433
@@ -436,22 +436,22 @@ do { \
436 * 32 bit must fall back to generic operations. 436 * 32 bit must fall back to generic operations.
437 */ 437 */
438#ifdef CONFIG_X86_64 438#ifdef CONFIG_X86_64
439#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 439#define raw_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
440#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 440#define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
441#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) 441#define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
442#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 442#define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
443#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 443#define raw_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
444#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) 444#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
445#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) 445#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
446#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 446#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
447 447
448#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 448#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
449#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 449#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
450#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) 450#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
451#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 451#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
452#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 452#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
453#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) 453#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
454#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) 454#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
455#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) 455#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
456 456
457/* 457/*
@@ -474,7 +474,7 @@ do { \
474 __ret; \ 474 __ret; \
475}) 475})
476 476
477#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double 477#define raw_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
478#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double 478#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
479 479
480#endif 480#endif
@@ -495,9 +495,9 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
495 unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; 495 unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
496 496
497#ifdef CONFIG_X86_64 497#ifdef CONFIG_X86_64
498 return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_8(*a)) != 0; 498 return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
499#else 499#else
500 return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_4(*a)) != 0; 500 return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_4(*a)) != 0;
501#endif 501#endif
502} 502}
503 503
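
The rename above tracks the generic percpu rework: raw_cpu_*() are the explicitly unchecked forms, while this_cpu_*() remains the preemption-safe variant. A minimal sketch of the intended call-site distinction, assuming a kernel context (demo_count is a hypothetical per-CPU variable, not from this patch):

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, demo_count);	/* hypothetical counter */

static void demo(void)
{
	/* this_cpu_add(): safe from preemptible context; on x86 the op
	 * is a single instruction, so it is atomic w.r.t. preemption. */
	this_cpu_add(demo_count, 1);

	/* raw_cpu_add(): no preemption debug checks; only correct when
	 * the caller already prevents migration. */
	preempt_disable();
	raw_cpu_add(demo_count, 1);
	preempt_enable();
}
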
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 3bf2dd0cf61f..0d193e234647 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -55,6 +55,13 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) 55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
56#endif 56#endif
57 57
58/* Bit manipulation helper on pte/pgoff entry */
59static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift,
60 unsigned long mask, unsigned int leftshift)
61{
62 return ((value >> rightshift) & mask) << leftshift;
63}
64
58#ifdef CONFIG_MEM_SOFT_DIRTY 65#ifdef CONFIG_MEM_SOFT_DIRTY
59 66
60/* 67/*
@@ -71,31 +78,34 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
71#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) 78#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
72#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) 79#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1)
73 80
74#define pte_to_pgoff(pte) \ 81#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1)
75 ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ 82#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1)
76 & ((1U << PTE_FILE_BITS1) - 1))) \ 83#define PTE_FILE_MASK3 ((1U << PTE_FILE_BITS3) - 1)
77 + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ 84
78 & ((1U << PTE_FILE_BITS2) - 1)) \ 85#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1)
79 << (PTE_FILE_BITS1)) \ 86#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2)
80 + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ 87#define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)
81 & ((1U << PTE_FILE_BITS3) - 1)) \ 88
82 << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ 89static __always_inline pgoff_t pte_to_pgoff(pte_t pte)
83 + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ 90{
84 << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) 91 return (pgoff_t)
85 92 (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) +
86#define pgoff_to_pte(off) \ 93 pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) +
87 ((pte_t) { .pte_low = \ 94 pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) +
88 ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ 95 pte_bitop(pte.pte_low, PTE_FILE_SHIFT4, -1UL, PTE_FILE_LSHIFT4));
89 + ((((off) >> PTE_FILE_BITS1) \ 96}
90 & ((1U << PTE_FILE_BITS2) - 1)) \ 97
91 << PTE_FILE_SHIFT2) \ 98static __always_inline pte_t pgoff_to_pte(pgoff_t off)
92 + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ 99{
93 & ((1U << PTE_FILE_BITS3) - 1)) \ 100 return (pte_t){
94 << PTE_FILE_SHIFT3) \ 101 .pte_low =
95 + ((((off) >> \ 102 pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) +
96 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ 103 pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) +
97 << PTE_FILE_SHIFT4) \ 104 pte_bitop(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) +
98 + _PAGE_FILE }) 105 pte_bitop(off, PTE_FILE_LSHIFT4, -1UL, PTE_FILE_SHIFT4) +
106 _PAGE_FILE,
107 };
108}
99 109
100#else /* CONFIG_MEM_SOFT_DIRTY */ 110#else /* CONFIG_MEM_SOFT_DIRTY */
101 111
@@ -115,22 +125,30 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
115#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) 125#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
116#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) 126#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
117 127
118#define pte_to_pgoff(pte) \ 128#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1)
119 ((((pte).pte_low >> PTE_FILE_SHIFT1) \ 129#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1)
120 & ((1U << PTE_FILE_BITS1) - 1)) \ 130
121 + ((((pte).pte_low >> PTE_FILE_SHIFT2) \ 131#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1)
122 & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \ 132#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2)
123 + (((pte).pte_low >> PTE_FILE_SHIFT3) \ 133
124 << (PTE_FILE_BITS1 + PTE_FILE_BITS2))) 134static __always_inline pgoff_t pte_to_pgoff(pte_t pte)
125 135{
126#define pgoff_to_pte(off) \ 136 return (pgoff_t)
127 ((pte_t) { .pte_low = \ 137 (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) +
128 (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ 138 pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) +
129 + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \ 139 pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, -1UL, PTE_FILE_LSHIFT3));
130 << PTE_FILE_SHIFT2) \ 140}
131 + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ 141
132 << PTE_FILE_SHIFT3) \ 142static __always_inline pte_t pgoff_to_pte(pgoff_t off)
133 + _PAGE_FILE }) 143{
144 return (pte_t){
145 .pte_low =
146 pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) +
147 pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) +
148 pte_bitop(off, PTE_FILE_LSHIFT3, -1UL, PTE_FILE_SHIFT3) +
149 _PAGE_FILE,
150 };
151}
134 152
135#endif /* CONFIG_MEM_SOFT_DIRTY */ 153#endif /* CONFIG_MEM_SOFT_DIRTY */
136 154
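
pte_bitop() above is a plain shift-and-mask helper that the rewritten pte_to_pgoff()/pgoff_to_pte() compose field by field. A standalone, userspace-runnable sketch of the same round trip with made-up shift/width values (the real PTE_FILE_* constants depend on the soft-dirty configuration, so SHIFT1/BITS1/SHIFT2 here are purely illustrative):

#include <assert.h>
#include <stdio.h>

/* same helper as in the patch */
static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift,
				      unsigned long mask, unsigned int leftshift)
{
	return ((value >> rightshift) & mask) << leftshift;
}

/* hypothetical layout: pgoff bits live at pte bits 1-2 and 5 upward */
#define SHIFT1	1
#define BITS1	2
#define MASK1	((1UL << BITS1) - 1)
#define SHIFT2	5

int main(void)
{
	unsigned long off = 0x1234;
	/* pack: low BITS1 bits of off at SHIFT1, the remainder at SHIFT2 */
	unsigned long pte = pte_bitop(off, 0, MASK1, SHIFT1) +
			    pte_bitop(off, BITS1, -1UL, SHIFT2);
	/* unpack: inverse of the above, as pte_to_pgoff() does */
	unsigned long back = pte_bitop(pte, SHIFT1, MASK1, 0) +
			     pte_bitop(pte, SHIFT2, -1UL, BITS1);

	assert(back == off);
	printf("0x%lx -> pte 0x%lx -> 0x%lx\n", off, pte, back);
	return 0;
}
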
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index bbc8b12fa443..b459ddf27d64 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -15,9 +15,10 @@
15 : (prot)) 15 : (prot))
16 16
17#ifndef __ASSEMBLY__ 17#ifndef __ASSEMBLY__
18
19#include <asm/x86_init.h> 18#include <asm/x86_init.h>
20 19
20void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
21
21/* 22/*
22 * ZERO_PAGE is a global shared page that is always zero: used 23 * ZERO_PAGE is a global shared page that is always zero: used
23 * for zero-mapped memory areas etc.. 24 * for zero-mapped memory areas etc..
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 2d883440cb9a..c883bf726398 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -58,7 +58,7 @@ typedef struct { pteval_t pte; } pte_t;
58#define VMALLOC_START _AC(0xffffc90000000000, UL) 58#define VMALLOC_START _AC(0xffffc90000000000, UL)
59#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) 59#define VMALLOC_END _AC(0xffffe8ffffffffff, UL)
60#define VMEMMAP_START _AC(0xffffea0000000000, UL) 60#define VMEMMAP_START _AC(0xffffea0000000000, UL)
61#define MODULES_VADDR _AC(0xffffffffa0000000, UL) 61#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
62#define MODULES_END _AC(0xffffffffff000000, UL) 62#define MODULES_END _AC(0xffffffffff000000, UL)
63#define MODULES_LEN (MODULES_END - MODULES_VADDR) 63#define MODULES_LEN (MODULES_END - MODULES_VADDR)
64 64
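
The MODULES_VADDR change replaces a hard-coded address with one derived from the kernel mapping, so the module area moves automatically if KERNEL_IMAGE_SIZE grows (as it does for kernel ASLR). With the usual x86_64 constants this is an identity change; a quick check, assuming the default 512 MiB image size:

#include <stdio.h>

int main(void)
{
	unsigned long start_kernel_map = 0xffffffff80000000UL;	/* __START_KERNEL_map */
	unsigned long kernel_image_size = 512UL << 20;		/* default KERNEL_IMAGE_SIZE */

	/* prints 0xffffffffa0000000, the old hard-coded constant */
	printf("MODULES_VADDR = 0x%lx\n", start_kernel_map + kernel_image_size);
	return 0;
}
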
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 0ecac257fb26..eb3d44945133 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -121,7 +121,8 @@
121 121
122/* Set of bits not changed in pte_modify */ 122/* Set of bits not changed in pte_modify */
123#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 123#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
124 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) 124 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
125 _PAGE_SOFT_DIRTY)
125#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) 126#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
126 127
127#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) 128#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
@@ -213,13 +214,8 @@
213#ifdef CONFIG_X86_64 214#ifdef CONFIG_X86_64
214#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC 215#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
215#else 216#else
216/*
217 * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection
218 * bits are combined, this will allow the user to access the high address mapped
219 * VDSO in the presence of CONFIG_COMPAT_VDSO
220 */
221#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ 217#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
222#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ 218#define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */
223#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ 219#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
224#endif 220#endif
225 221
@@ -381,8 +377,13 @@ static inline void update_page_count(int level, unsigned long pages) { }
381 * as a pte too. 377 * as a pte too.
382 */ 378 */
383extern pte_t *lookup_address(unsigned long address, unsigned int *level); 379extern pte_t *lookup_address(unsigned long address, unsigned int *level);
380extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
381 unsigned int *level);
384extern phys_addr_t slow_virt_to_phys(void *__address); 382extern phys_addr_t slow_virt_to_phys(void *__address);
385 383extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
384 unsigned numpages, unsigned long page_flags);
385void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
386 unsigned numpages);
386#endif /* !__ASSEMBLY__ */ 387#endif /* !__ASSEMBLY__ */
387 388
388#endif /* _ASM_X86_PGTABLE_DEFS_H */ 389#endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index c8b051933b1b..7024c12f7bfe 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -19,12 +19,12 @@ DECLARE_PER_CPU(int, __preempt_count);
19 */ 19 */
20static __always_inline int preempt_count(void) 20static __always_inline int preempt_count(void)
21{ 21{
22 return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; 22 return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
23} 23}
24 24
25static __always_inline void preempt_count_set(int pc) 25static __always_inline void preempt_count_set(int pc)
26{ 26{
27 __this_cpu_write_4(__preempt_count, pc); 27 raw_cpu_write_4(__preempt_count, pc);
28} 28}
29 29
30/* 30/*
@@ -53,17 +53,17 @@ static __always_inline void preempt_count_set(int pc)
53 53
54static __always_inline void set_preempt_need_resched(void) 54static __always_inline void set_preempt_need_resched(void)
55{ 55{
56 __this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); 56 raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
57} 57}
58 58
59static __always_inline void clear_preempt_need_resched(void) 59static __always_inline void clear_preempt_need_resched(void)
60{ 60{
61 __this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); 61 raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
62} 62}
63 63
64static __always_inline bool test_preempt_need_resched(void) 64static __always_inline bool test_preempt_need_resched(void)
65{ 65{
66 return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); 66 return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
67} 67}
68 68
69/* 69/*
@@ -72,12 +72,12 @@ static __always_inline bool test_preempt_need_resched(void)
72 72
73static __always_inline void __preempt_count_add(int val) 73static __always_inline void __preempt_count_add(int val)
74{ 74{
75 __this_cpu_add_4(__preempt_count, val); 75 raw_cpu_add_4(__preempt_count, val);
76} 76}
77 77
78static __always_inline void __preempt_count_sub(int val) 78static __always_inline void __preempt_count_sub(int val)
79{ 79{
80 __this_cpu_add_4(__preempt_count, -val); 80 raw_cpu_add_4(__preempt_count, -val);
81} 81}
82 82
83/* 83/*
@@ -95,7 +95,7 @@ static __always_inline bool __preempt_count_dec_and_test(void)
95 */ 95 */
96static __always_inline bool should_resched(void) 96static __always_inline bool should_resched(void)
97{ 97{
98 return unlikely(!__this_cpu_read_4(__preempt_count)); 98 return unlikely(!raw_cpu_read_4(__preempt_count));
99} 99}
100 100
101#ifdef CONFIG_PREEMPT 101#ifdef CONFIG_PREEMPT
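
preempt_count() above masks out PREEMPT_NEED_RESCHED, which x86 folds into the same per-CPU word with inverted polarity so that one decrement-and-test covers "count reached zero and reschedule needed". A userspace sketch of that encoding, assuming the kernel's bit value (the globals and helpers are illustrative stand-ins for the per-CPU word):

#include <stdbool.h>
#include <stdio.h>

#define PREEMPT_NEED_RESCHED	0x80000000u

/* bit set == resched NOT needed; a freshly idle count is the bit alone */
static unsigned int preempt_count_raw = PREEMPT_NEED_RESCHED;

static void set_need_resched(void)   { preempt_count_raw &= ~PREEMPT_NEED_RESCHED; }
static void clear_need_resched(void) { preempt_count_raw |=  PREEMPT_NEED_RESCHED; }

static bool should_resched(void)
{
	/* single test: count is zero AND the inverted bit is clear */
	return preempt_count_raw == 0;
}

int main(void)
{
	set_need_resched();
	printf("should_resched: %d\n", should_resched());	/* 1 */
	clear_need_resched();
	printf("should_resched: %d\n", should_resched());	/* 0 */
	return 0;
}
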
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7b034a4057f9..a4ea02351f4d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,7 +27,6 @@ struct mm_struct;
27#include <linux/cache.h> 27#include <linux/cache.h>
28#include <linux/threads.h> 28#include <linux/threads.h>
29#include <linux/math64.h> 29#include <linux/math64.h>
30#include <linux/init.h>
31#include <linux/err.h> 30#include <linux/err.h>
32#include <linux/irqflags.h> 31#include <linux/irqflags.h>
33 32
@@ -72,6 +71,7 @@ extern u16 __read_mostly tlb_lli_4m[NR_INFO];
72extern u16 __read_mostly tlb_lld_4k[NR_INFO]; 71extern u16 __read_mostly tlb_lld_4k[NR_INFO];
73extern u16 __read_mostly tlb_lld_2m[NR_INFO]; 72extern u16 __read_mostly tlb_lld_2m[NR_INFO];
74extern u16 __read_mostly tlb_lld_4m[NR_INFO]; 73extern u16 __read_mostly tlb_lld_4m[NR_INFO];
74extern u16 __read_mostly tlb_lld_1g[NR_INFO];
75extern s8 __read_mostly tlb_flushall_shift; 75extern s8 __read_mostly tlb_flushall_shift;
76 76
77/* 77/*
@@ -370,6 +370,20 @@ struct ymmh_struct {
370 u32 ymmh_space[64]; 370 u32 ymmh_space[64];
371}; 371};
372 372
373/* We don't support LWP yet: */
374struct lwp_struct {
375 u8 reserved[128];
376};
377
378struct bndregs_struct {
379 u64 bndregs[8];
380} __packed;
381
382struct bndcsr_struct {
383 u64 cfg_reg_u;
384 u64 status_reg;
385} __packed;
386
373struct xsave_hdr_struct { 387struct xsave_hdr_struct {
374 u64 xstate_bv; 388 u64 xstate_bv;
375 u64 reserved1[2]; 389 u64 reserved1[2];
@@ -380,6 +394,9 @@ struct xsave_struct {
380 struct i387_fxsave_struct i387; 394 struct i387_fxsave_struct i387;
381 struct xsave_hdr_struct xsave_hdr; 395 struct xsave_hdr_struct xsave_hdr;
382 struct ymmh_struct ymmh; 396 struct ymmh_struct ymmh;
397 struct lwp_struct lwp;
398 struct bndregs_struct bndregs;
399 struct bndcsr_struct bndcsr;
383 /* new processor state extensions will go here */ 400 /* new processor state extensions will go here */
384} __attribute__ ((packed, aligned (64))); 401} __attribute__ ((packed, aligned (64)));
385 402
@@ -432,6 +449,15 @@ struct stack_canary {
432}; 449};
433DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); 450DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
434#endif 451#endif
452/*
453 * per-CPU IRQ handling stacks
454 */
455struct irq_stack {
456 u32 stack[THREAD_SIZE/sizeof(u32)];
457} __aligned(THREAD_SIZE);
458
459DECLARE_PER_CPU(struct irq_stack *, hardirq_stack);
460DECLARE_PER_CPU(struct irq_stack *, softirq_stack);
435#endif /* X86_64 */ 461#endif /* X86_64 */
436 462
437extern unsigned int xstate_size; 463extern unsigned int xstate_size;
@@ -700,29 +726,6 @@ static inline void sync_core(void)
700#endif 726#endif
701} 727}
702 728
703static inline void __monitor(const void *eax, unsigned long ecx,
704 unsigned long edx)
705{
706 /* "monitor %eax, %ecx, %edx;" */
707 asm volatile(".byte 0x0f, 0x01, 0xc8;"
708 :: "a" (eax), "c" (ecx), "d"(edx));
709}
710
711static inline void __mwait(unsigned long eax, unsigned long ecx)
712{
713 /* "mwait %eax, %ecx;" */
714 asm volatile(".byte 0x0f, 0x01, 0xc9;"
715 :: "a" (eax), "c" (ecx));
716}
717
718static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
719{
720 trace_hardirqs_on();
721 /* "mwait %eax, %ecx;" */
722 asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
723 :: "a" (eax), "c" (ecx));
724}
725
726extern void select_idle_routine(const struct cpuinfo_x86 *c); 729extern void select_idle_routine(const struct cpuinfo_x86 *c);
727extern void init_amd_e400_c1e_mask(void); 730extern void init_amd_e400_c1e_mask(void);
728 731
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 942a08623a1a..14fd6fd75a19 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -60,7 +60,6 @@ struct pt_regs {
60 60
61#endif /* !__i386__ */ 61#endif /* !__i386__ */
62 62
63#include <linux/init.h>
64#ifdef CONFIG_PARAVIRT 63#ifdef CONFIG_PARAVIRT
65#include <asm/paravirt_types.h> 64#include <asm/paravirt_types.h>
66#endif 65#endif
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 59bcf4e22418..9264f04a4c55 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -3,7 +3,6 @@
3 3
4#include <uapi/asm/setup.h> 4#include <uapi/asm/setup.h>
5 5
6
7#define COMMAND_LINE_SIZE 2048 6#define COMMAND_LINE_SIZE 2048
8 7
9#include <linux/linkage.h> 8#include <linux/linkage.h>
@@ -29,6 +28,8 @@
29#include <asm/bootparam.h> 28#include <asm/bootparam.h>
30#include <asm/x86_init.h> 29#include <asm/x86_init.h>
31 30
31extern u64 relocated_ramdisk;
32
32/* Interrupt control for vSMPowered x86_64 systems */ 33/* Interrupt control for vSMPowered x86_64 systems */
33#ifdef CONFIG_X86_64 34#ifdef CONFIG_X86_64
34void vsmp_init(void); 35void vsmp_init(void);
@@ -38,12 +39,6 @@ static inline void vsmp_init(void) { }
38 39
39void setup_bios_corruption_check(void); 40void setup_bios_corruption_check(void);
40 41
41#ifdef CONFIG_X86_VISWS
42extern void visws_early_detect(void);
43#else
44static inline void visws_early_detect(void) { }
45#endif
46
47extern unsigned long saved_video_mode; 42extern unsigned long saved_video_mode;
48 43
49extern void reserve_standard_io_resources(void); 44extern void reserve_standard_io_resources(void);
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4137890e88e3..8cd27e08e23c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -2,7 +2,6 @@
2#define _ASM_X86_SMP_H 2#define _ASM_X86_SMP_H
3#ifndef __ASSEMBLY__ 3#ifndef __ASSEMBLY__
4#include <linux/cpumask.h> 4#include <linux/cpumask.h>
5#include <linux/init.h>
6#include <asm/percpu.h> 5#include <asm/percpu.h>
7 6
8/* 7/*
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 645cad2c95ff..e820c080a4e9 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -191,6 +191,14 @@ static inline void clflush(volatile void *__p)
191 asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); 191 asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
192} 192}
193 193
194static inline void clflushopt(volatile void *__p)
195{
196 alternative_io(".byte " __stringify(NOP_DS_PREFIX) "; clflush %P0",
197 ".byte 0x66; clflush %P0",
198 X86_FEATURE_CLFLUSHOPT,
199 "+m" (*(volatile char __force *)__p));
200}
201
194#define nop() asm volatile ("nop") 202#define nop() asm volatile ("nop")
195 203
196 204
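
clflushopt() above uses the alternatives mechanism to patch a 0x66 operand-size prefix onto clflush at boot: CLFLUSHOPT shares the CLFLUSH encoding plus that prefix and has weaker, cheaper ordering. A userspace analogue that selects the instruction via CPUID instead of runtime patching — a sketch, assuming GCC/Clang built with -mclflushopt for the intrinsic:

#include <cpuid.h>
#include <immintrin.h>
#include <stdio.h>

static char buf[64] __attribute__((aligned(64)));

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	int have_clflushopt = 0;

	/* CPUID.(EAX=7,ECX=0):EBX bit 23 advertises CLFLUSHOPT */
	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		have_clflushopt = !!(ebx & (1u << 23));

	if (have_clflushopt)
		_mm_clflushopt(buf);
	else
		_mm_clflush(buf);

	printf("flushed with %s\n", have_clflushopt ? "clflushopt" : "clflush");
	return 0;
}
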
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index bf156ded74b5..0f62f5482d91 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -26,10 +26,9 @@
26# define LOCK_PTR_REG "D" 26# define LOCK_PTR_REG "D"
27#endif 27#endif
28 28
29#if defined(CONFIG_X86_32) && \ 29#if defined(CONFIG_X86_32) && (defined(CONFIG_X86_PPRO_FENCE))
30 (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
31/* 30/*
32 * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock 31 * On PPro SMP, we use a locked operation to unlock
33 * (PPro errata 66, 92) 32 * (PPro errata 66, 92)
34 */ 33 */
35# define UNLOCK_LOCK_PREFIX LOCK_PREFIX 34# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 3ba3de457d05..47e5de25ba79 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -9,6 +9,7 @@
9 9
10#include <linux/compiler.h> 10#include <linux/compiler.h>
11#include <asm/page.h> 11#include <asm/page.h>
12#include <asm/percpu.h>
12#include <asm/types.h> 13#include <asm/types.h>
13 14
14/* 15/*
@@ -32,12 +33,6 @@ struct thread_info {
32 mm_segment_t addr_limit; 33 mm_segment_t addr_limit;
33 struct restart_block restart_block; 34 struct restart_block restart_block;
34 void __user *sysenter_return; 35 void __user *sysenter_return;
35#ifdef CONFIG_X86_32
36 unsigned long previous_esp; /* ESP of the previous stack in
37 case of nested (IRQ) stacks
38 */
39 __u8 supervisor_stack[0];
40#endif
41 unsigned int sig_on_uaccess_error:1; 36 unsigned int sig_on_uaccess_error:1;
42 unsigned int uaccess_err:1; /* uaccess failed */ 37 unsigned int uaccess_err:1; /* uaccess failed */
43}; 38};
@@ -153,9 +148,9 @@ struct thread_info {
153#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) 148#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
154#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) 149#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
155 150
156#ifdef CONFIG_X86_32 151#define STACK_WARN (THREAD_SIZE/8)
152#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8))
157 153
158#define STACK_WARN (THREAD_SIZE/8)
159/* 154/*
160 * macros/functions for gaining access to the thread information structure 155 * macros/functions for gaining access to the thread information structure
161 * 156 *
@@ -163,40 +158,6 @@ struct thread_info {
163 */ 158 */
164#ifndef __ASSEMBLY__ 159#ifndef __ASSEMBLY__
165 160
166
167/* how to get the current stack pointer from C */
168register unsigned long current_stack_pointer asm("esp") __used;
169
170/* how to get the thread information struct from C */
171static inline struct thread_info *current_thread_info(void)
172{
173 return (struct thread_info *)
174 (current_stack_pointer & ~(THREAD_SIZE - 1));
175}
176
177#else /* !__ASSEMBLY__ */
178
179/* how to get the thread information struct from ASM */
180#define GET_THREAD_INFO(reg) \
181 movl $-THREAD_SIZE, reg; \
182 andl %esp, reg
183
184/* use this one if reg already contains %esp */
185#define GET_THREAD_INFO_WITH_ESP(reg) \
186 andl $-THREAD_SIZE, reg
187
188#endif
189
190#else /* X86_32 */
191
192#include <asm/percpu.h>
193#define KERNEL_STACK_OFFSET (5*8)
194
195/*
196 * macros/functions for gaining access to the thread information structure
197 * preempt_count needs to be 1 initially, until the scheduler is functional.
198 */
199#ifndef __ASSEMBLY__
200DECLARE_PER_CPU(unsigned long, kernel_stack); 161DECLARE_PER_CPU(unsigned long, kernel_stack);
201 162
202static inline struct thread_info *current_thread_info(void) 163static inline struct thread_info *current_thread_info(void)
@@ -211,8 +172,8 @@ static inline struct thread_info *current_thread_info(void)
211 172
212/* how to get the thread information struct from ASM */ 173/* how to get the thread information struct from ASM */
213#define GET_THREAD_INFO(reg) \ 174#define GET_THREAD_INFO(reg) \
214 movq PER_CPU_VAR(kernel_stack),reg ; \ 175 _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \
215 subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg 176 _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ;
216 177
217/* 178/*
218 * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in 179 * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in
@@ -222,8 +183,6 @@ static inline struct thread_info *current_thread_info(void)
222 183
223#endif 184#endif
224 185
225#endif /* !X86_32 */
226
227/* 186/*
228 * Thread-synchronous status. 187 * Thread-synchronous status.
229 * 188 *
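
With the 32-bit-only variant removed above, both arches now locate thread_info from the per-CPU kernel_stack pointer rather than by masking the stack pointer. A sketch of the arithmetic the unified GET_THREAD_INFO/current_thread_info() performs, using the x86_64 sizes and a faked per-CPU variable (the 32-bit side differs only in the constants):

#include <stdint.h>
#include <stdio.h>

#define THREAD_SIZE		(4UL * 4096)	/* 16 KiB kernel stack */
#define KERNEL_STACK_OFFSET	(5 * 8)		/* 5*(BITS_PER_LONG/8) */

struct thread_info { int flags; /* ... */ };

/* stand-in for the per-CPU variable; it points KERNEL_STACK_OFFSET
 * below the top of the current task's stack */
static unsigned long kernel_stack;

static struct thread_info *current_thread_info(void)
{
	return (struct thread_info *)
		(kernel_stack + KERNEL_STACK_OFFSET - THREAD_SIZE);
}

int main(void)
{
	static unsigned char stack[4 * 4096] __attribute__((aligned(16384)));

	kernel_stack = (unsigned long)stack + THREAD_SIZE - KERNEL_STACK_OFFSET;
	/* thread_info sits at the base of the stack area */
	printf("ti = %p, base = %p\n", (void *)current_thread_info(), (void *)stack);
	return 0;
}
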
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 34baa0eb5d0c..a04eabd43d06 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -1,9 +1,9 @@
1#ifndef _ASM_X86_TIMER_H 1#ifndef _ASM_X86_TIMER_H
2#define _ASM_X86_TIMER_H 2#define _ASM_X86_TIMER_H
3#include <linux/init.h>
4#include <linux/pm.h> 3#include <linux/pm.h>
5#include <linux/percpu.h> 4#include <linux/percpu.h>
6#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/math64.h>
7 7
8#define TICK_SIZE (tick_nsec / 1000) 8#define TICK_SIZE (tick_nsec / 1000)
9 9
@@ -12,68 +12,26 @@ extern int recalibrate_cpu_khz(void);
12 12
13extern int no_timer_check; 13extern int no_timer_check;
14 14
15/* Accelerators for sched_clock() 15/*
16 * convert from cycles(64bits) => nanoseconds (64bits) 16 * We use the full linear equation: f(x) = a + b*x, in order to allow
17 * basic equation: 17 * a continuous function in the face of dynamic freq changes.
18 * ns = cycles / (freq / ns_per_sec)
19 * ns = cycles * (ns_per_sec / freq)
20 * ns = cycles * (10^9 / (cpu_khz * 10^3))
21 * ns = cycles * (10^6 / cpu_khz)
22 * 18 *
23 * Then we use scaling math (suggested by george@mvista.com) to get: 19 * Continuity means that when our frequency changes, so does our slope (b); we want to
24 * ns = cycles * (10^6 * SC / cpu_khz) / SC 20 * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t.
25 * ns = cycles * cyc2ns_scale / SC
26 * 21 *
27 * And since SC is a constant power of two, we can convert the div 22 * Without an offset (a) the above would not be possible.
28 * into a shift.
29 * 23 *
30 * We can use khz divisor instead of mhz to keep a better precision, since 24 * See the comment near cycles_2_ns() for details on how we compute (b).
31 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
32 * (mathieu.desnoyers@polymtl.ca)
33 *
34 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
35 *
36 * In:
37 *
38 * ns = cycles * cyc2ns_scale / SC
39 *
40 * Although we may still have enough bits to store the value of ns,
41 * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
42 * leading to an incorrect result.
43 *
44 * To avoid this, we can decompose 'cycles' into quotient and remainder
45 * of division by SC. Then,
46 *
47 * ns = (quot * SC + rem) * cyc2ns_scale / SC
48 * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
49 *
50 * - sqazi@google.com
51 */ 25 */
52 26struct cyc2ns_data {
53DECLARE_PER_CPU(unsigned long, cyc2ns); 27 u32 cyc2ns_mul;
54DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); 28 u32 cyc2ns_shift;
55 29 u64 cyc2ns_offset;
56#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 30 u32 __count;
57 31 /* u32 hole */
58static inline unsigned long long __cycles_2_ns(unsigned long long cyc) 32}; /* 24 bytes -- do not grow */
59{ 33
60 int cpu = smp_processor_id(); 34extern struct cyc2ns_data *cyc2ns_read_begin(void);
61 unsigned long long ns = per_cpu(cyc2ns_offset, cpu); 35extern void cyc2ns_read_end(struct cyc2ns_data *);
62 ns += mult_frac(cyc, per_cpu(cyc2ns, cpu),
63 (1UL << CYC2NS_SCALE_FACTOR));
64 return ns;
65}
66
67static inline unsigned long long cycles_2_ns(unsigned long long cyc)
68{
69 unsigned long long ns;
70 unsigned long flags;
71
72 local_irq_save(flags);
73 ns = __cycles_2_ns(cyc);
74 local_irq_restore(flags);
75
76 return ns;
77}
78 36
79#endif /* _ASM_X86_TIMER_H */ 37#endif /* _ASM_X86_TIMER_H */
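
The replacement structure carries a (mul, shift) pair plus an offset, i.e. ns = offset + ((cycles * mul) >> shift), and the continuity condition from the comment, a + b*t == a' + b'*t, is what the writer side (not shown in this hunk) solves for the new offset on a frequency change. A userspace sketch with made-up frequencies; the shift of 10 and the 1e6/khz formula are illustrative, not the kernel's exact calibration:

#include <stdint.h>
#include <stdio.h>

struct cyc2ns_data {
	uint32_t cyc2ns_mul;
	uint32_t cyc2ns_shift;
	uint64_t cyc2ns_offset;
};

static void cyc2ns_init(struct cyc2ns_data *d, uint64_t cpu_khz)
{
	d->cyc2ns_shift = 10;
	d->cyc2ns_mul = (uint32_t)((1000000ULL << d->cyc2ns_shift) / cpu_khz);
}

static uint64_t cycles_2_ns(const struct cyc2ns_data *d, uint64_t cyc)
{
	return d->cyc2ns_offset + ((cyc * d->cyc2ns_mul) >> d->cyc2ns_shift);
}

int main(void)
{
	struct cyc2ns_data d = { .cyc2ns_offset = 0 };
	uint64_t t = 1000000;			/* freq change at cycle 1e6 */
	uint64_t ns_at_t;
	struct cyc2ns_data d2;

	cyc2ns_init(&d, 2000000);		/* 2 GHz: slope b */
	ns_at_t = cycles_2_ns(&d, t);

	cyc2ns_init(&d2, 1000000);		/* 1 GHz: new slope b' */
	/* continuity: a' = f(t) - b'*t, so f'(t) == f(t)
	 * (unsigned wraparound in the offset is intentional) */
	d2.cyc2ns_offset = ns_at_t - ((t * d2.cyc2ns_mul) >> d2.cyc2ns_shift);

	printf("f(t)=%llu f'(t)=%llu\n",
	       (unsigned long long)ns_at_t,
	       (unsigned long long)cycles_2_ns(&d2, t));
	return 0;
}
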
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e6d90babc245..04905bfc508b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void)
62 62
63static inline void __flush_tlb_one(unsigned long addr) 63static inline void __flush_tlb_one(unsigned long addr)
64{ 64{
65 count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); 65 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
66 __flush_tlb_single(addr); 66 __flush_tlb_single(addr);
67} 67}
68 68
@@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr)
93 */ 93 */
94static inline void __flush_tlb_up(void) 94static inline void __flush_tlb_up(void)
95{ 95{
96 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); 96 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
97 __flush_tlb(); 97 __flush_tlb();
98} 98}
99 99
100static inline void flush_tlb_all(void) 100static inline void flush_tlb_all(void)
101{ 101{
102 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); 102 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
103 __flush_tlb_all(); 103 __flush_tlb_all();
104} 104}
105 105
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index d35f24e231cd..0e8f04f2c26f 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -119,9 +119,10 @@ static inline void setup_node_to_cpumask_map(void) { }
119 119
120extern const struct cpumask *cpu_coregroup_mask(int cpu); 120extern const struct cpumask *cpu_coregroup_mask(int cpu);
121 121
122#ifdef ENABLE_TOPO_DEFINES
123#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) 122#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
124#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) 123#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
124
125#ifdef ENABLE_TOPO_DEFINES
125#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) 126#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
126#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) 127#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
127#endif 128#endif
@@ -131,25 +132,7 @@ static inline void arch_fix_phys_package_id(int num, u32 slot)
131} 132}
132 133
133struct pci_bus; 134struct pci_bus;
135int x86_pci_root_bus_node(int bus);
134void x86_pci_root_bus_resources(int bus, struct list_head *resources); 136void x86_pci_root_bus_resources(int bus, struct list_head *resources);
135 137
136#ifdef CONFIG_SMP
137#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
138 (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
139#define smt_capable() (smp_num_siblings > 1)
140#endif
141
142#ifdef CONFIG_NUMA
143extern int get_mp_bus_to_node(int busnum);
144extern void set_mp_bus_to_node(int busnum, int node);
145#else
146static inline int get_mp_bus_to_node(int busnum)
147{
148 return 0;
149}
150static inline void set_mp_bus_to_node(int busnum, int node)
151{
152}
153#endif
154
155#endif /* _ASM_X86_TOPOLOGY_H */ 138#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 235be70d5bb4..94605c0e9cee 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -65,4 +65,7 @@ extern int notsc_setup(char *);
65extern void tsc_save_sched_clock_state(void); 65extern void tsc_save_sched_clock_state(void);
66extern void tsc_restore_sched_clock_state(void); 66extern void tsc_restore_sched_clock_state(void);
67 67
68/* MSR based TSC calibration for Intel Atom SoC platforms */
69unsigned long try_msr_calibrate_tsc(void);
70
68#endif /* _ASM_X86_TSC_H */ 71#endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 8ec57c07b125..0d592e0a5b84 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -40,22 +40,30 @@
40/* 40/*
41 * Test whether a block of memory is a valid user space address. 41 * Test whether a block of memory is a valid user space address.
42 * Returns 0 if the range is valid, nonzero otherwise. 42 * Returns 0 if the range is valid, nonzero otherwise.
43 *
44 * This is equivalent to the following test:
45 * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
46 *
47 * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
48 */ 43 */
44static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
45{
46 /*
47 * If we have used "sizeof()" for the size,
48 * we know it won't overflow the limit (but
49 * it might overflow the 'addr', so it's
50 * important to subtract the size from the
51 * limit, not add it to the address).
52 */
53 if (__builtin_constant_p(size))
54 return addr > limit - size;
55
56 /* Arbitrary sizes? Be careful about overflow */
57 addr += size;
58 if (addr < size)
59 return true;
60 return addr > limit;
61}
49 62
50#define __range_not_ok(addr, size, limit) \ 63#define __range_not_ok(addr, size, limit) \
51({ \ 64({ \
52 unsigned long flag, roksum; \
53 __chk_user_ptr(addr); \ 65 __chk_user_ptr(addr); \
54 asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \ 66 __chk_range_not_ok((unsigned long __force)(addr), size, limit); \
55 : "=&r" (flag), "=r" (roksum) \
56 : "1" (addr), "g" ((long)(size)), \
57 "rm" (limit)); \
58 flag; \
59}) 67})
60 68
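
__chk_range_not_ok() above replaces the old add/sbb assembly with C the compiler can see through; the non-constant path adds the size first and checks for wraparound, which is exactly the 33/65-bit carry the deleted comment described. A quick userspace check of the three interesting cases — a sketch mirroring the patch, with an arbitrary TASK_SIZE-style limit:

#include <stdbool.h>
#include <stdio.h>

static inline bool chk_range_not_ok(unsigned long addr, unsigned long size,
				    unsigned long limit)
{
	addr += size;
	if (addr < size)	/* wrapped: addr + size overflowed */
		return true;
	return addr > limit;
}

int main(void)
{
	unsigned long limit = 0x7ffffffff000UL;

	printf("%d\n", chk_range_not_ok(0x1000, 0x1000, limit));	/* 0: ok */
	printf("%d\n", chk_range_not_ok(-1UL, 2, limit));		/* 1: wraps */
	printf("%d\n", chk_range_not_ok(limit, 1, limit));		/* 1: past limit */
	return 0;
}
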
61/** 69/**
@@ -78,7 +86,7 @@
78 * this function, memory access functions may still return -EFAULT. 86 * this function, memory access functions may still return -EFAULT.
79 */ 87 */
80#define access_ok(type, addr, size) \ 88#define access_ok(type, addr, size) \
81 (likely(__range_not_ok(addr, size, user_addr_max()) == 0)) 89 likely(!__range_not_ok(addr, size, user_addr_max()))
82 90
83/* 91/*
84 * The exception table consists of pairs of addresses relative to the 92 * The exception table consists of pairs of addresses relative to the
@@ -525,6 +533,98 @@ extern __must_check long strnlen_user(const char __user *str, long n);
525unsigned long __must_check clear_user(void __user *mem, unsigned long len); 533unsigned long __must_check clear_user(void __user *mem, unsigned long len);
526unsigned long __must_check __clear_user(void __user *mem, unsigned long len); 534unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
527 535
536extern void __cmpxchg_wrong_size(void)
537 __compiletime_error("Bad argument size for cmpxchg");
538
539#define __user_atomic_cmpxchg_inatomic(uval, ptr, old, new, size) \
540({ \
541 int __ret = 0; \
542 __typeof__(ptr) __uval = (uval); \
543 __typeof__(*(ptr)) __old = (old); \
544 __typeof__(*(ptr)) __new = (new); \
545 switch (size) { \
546 case 1: \
547 { \
548 asm volatile("\t" ASM_STAC "\n" \
549 "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \
550 "2:\t" ASM_CLAC "\n" \
551 "\t.section .fixup, \"ax\"\n" \
552 "3:\tmov %3, %0\n" \
553 "\tjmp 2b\n" \
554 "\t.previous\n" \
555 _ASM_EXTABLE(1b, 3b) \
556 : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
557 : "i" (-EFAULT), "q" (__new), "1" (__old) \
558 : "memory" \
559 ); \
560 break; \
561 } \
562 case 2: \
563 { \
564 asm volatile("\t" ASM_STAC "\n" \
565 "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \
566 "2:\t" ASM_CLAC "\n" \
567 "\t.section .fixup, \"ax\"\n" \
568 "3:\tmov %3, %0\n" \
569 "\tjmp 2b\n" \
570 "\t.previous\n" \
571 _ASM_EXTABLE(1b, 3b) \
572 : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
573 : "i" (-EFAULT), "r" (__new), "1" (__old) \
574 : "memory" \
575 ); \
576 break; \
577 } \
578 case 4: \
579 { \
580 asm volatile("\t" ASM_STAC "\n" \
581 "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \
582 "2:\t" ASM_CLAC "\n" \
583 "\t.section .fixup, \"ax\"\n" \
584 "3:\tmov %3, %0\n" \
585 "\tjmp 2b\n" \
586 "\t.previous\n" \
587 _ASM_EXTABLE(1b, 3b) \
588 : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
589 : "i" (-EFAULT), "r" (__new), "1" (__old) \
590 : "memory" \
591 ); \
592 break; \
593 } \
594 case 8: \
595 { \
596 if (!IS_ENABLED(CONFIG_X86_64)) \
597 __cmpxchg_wrong_size(); \
598 \
599 asm volatile("\t" ASM_STAC "\n" \
600 "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \
601 "2:\t" ASM_CLAC "\n" \
602 "\t.section .fixup, \"ax\"\n" \
603 "3:\tmov %3, %0\n" \
604 "\tjmp 2b\n" \
605 "\t.previous\n" \
606 _ASM_EXTABLE(1b, 3b) \
607 : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
608 : "i" (-EFAULT), "r" (__new), "1" (__old) \
609 : "memory" \
610 ); \
611 break; \
612 } \
613 default: \
614 __cmpxchg_wrong_size(); \
615 } \
616 *__uval = __old; \
617 __ret; \
618})
619
620#define user_atomic_cmpxchg_inatomic(uval, ptr, old, new) \
621({ \
622 access_ok(VERIFY_WRITE, (ptr), sizeof(*(ptr))) ? \
623 __user_atomic_cmpxchg_inatomic((uval), (ptr), \
624 (old), (new), sizeof(*(ptr))) : \
625 -EFAULT; \
626})
627
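
The new user_atomic_cmpxchg_inatomic() gives futex-style code a checked compare-and-exchange on a user pointer: it brackets a LOCK CMPXCHG with STAC/CLAC and converts a fault into -EFAULT via the exception table. A hedged usage sketch (kernel context, not a standalone program; the function name and return convention are illustrative):

/* Try to move a user-visible state word from old to new.
 * Returns 0 on success, -EFAULT on a bad pointer, -EAGAIN if raced. */
static int demo_transition(u32 __user *uaddr, u32 old, u32 new)
{
	u32 cur;
	int ret = user_atomic_cmpxchg_inatomic(&cur, uaddr, old, new);

	if (ret)
		return ret;			/* -EFAULT */
	return cur == old ? 0 : -EAGAIN;	/* cur holds the observed value */
}
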
528/* 628/*
529 * movsl can be slow when source and dest are not both 8-byte aligned 629 * movsl can be slow when source and dest are not both 8-byte aligned
530 */ 630 */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 190413d0de57..12a26b979bf1 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -204,13 +204,13 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
204static __must_check __always_inline int 204static __must_check __always_inline int
205__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size) 205__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
206{ 206{
207 return __copy_from_user_nocheck(dst, (__force const void *)src, size); 207 return __copy_from_user_nocheck(dst, src, size);
208} 208}
209 209
210static __must_check __always_inline int 210static __must_check __always_inline int
211__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) 211__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
212{ 212{
213 return __copy_to_user_nocheck((__force void *)dst, src, size); 213 return __copy_to_user_nocheck(dst, src, size);
214} 214}
215 215
216extern long __copy_user_nocache(void *dst, const void __user *src, 216extern long __copy_user_nocache(void *dst, const void __user *src,
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index c2a48139c340..3f556c6a0157 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -23,6 +23,9 @@
23# include <asm/unistd_64.h> 23# include <asm/unistd_64.h>
24# include <asm/unistd_64_x32.h> 24# include <asm/unistd_64_x32.h>
25# define __ARCH_WANT_COMPAT_SYS_TIME 25# define __ARCH_WANT_COMPAT_SYS_TIME
26# define __ARCH_WANT_COMPAT_SYS_GETDENTS64
27# define __ARCH_WANT_COMPAT_SYS_PREADV64
28# define __ARCH_WANT_COMPAT_SYS_PWRITEV64
26 29
27# endif 30# endif
28 31
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 6b964a0b86d1..062921ef34e9 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -12,7 +12,6 @@ extern enum uv_system_type get_uv_system_type(void);
12extern int is_uv_system(void); 12extern int is_uv_system(void);
13extern void uv_cpu_init(void); 13extern void uv_cpu_init(void);
14extern void uv_nmi_init(void); 14extern void uv_nmi_init(void);
15extern void uv_register_nmi_notifier(void);
16extern void uv_system_init(void); 15extern void uv_system_init(void);
17extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 16extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
18 struct mm_struct *mm, 17 struct mm_struct *mm,
@@ -26,7 +25,6 @@ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
26static inline int is_uv_system(void) { return 0; } 25static inline int is_uv_system(void) { return 0; }
27static inline void uv_cpu_init(void) { } 26static inline void uv_cpu_init(void) { }
28static inline void uv_system_init(void) { } 27static inline void uv_system_init(void) { }
29static inline void uv_register_nmi_notifier(void) { }
30static inline const struct cpumask * 28static inline const struct cpumask *
31uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, 29uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
32 unsigned long start, unsigned long end, unsigned int cpu) 30 unsigned long start, unsigned long end, unsigned int cpu)
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index fddb53d63915..d1dc55404ff1 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -1,8 +1,45 @@
1#ifndef _ASM_X86_VDSO_H 1#ifndef _ASM_X86_VDSO_H
2#define _ASM_X86_VDSO_H 2#define _ASM_X86_VDSO_H
3 3
4#include <asm/page_types.h>
5#include <linux/linkage.h>
6
7#ifdef __ASSEMBLER__
8
9#define DEFINE_VDSO_IMAGE(symname, filename) \
10__PAGE_ALIGNED_DATA ; \
11 .globl symname##_start, symname##_end ; \
12 .align PAGE_SIZE ; \
13 symname##_start: ; \
14 .incbin filename ; \
15 symname##_end: ; \
16 .align PAGE_SIZE /* extra data here leaks to userspace. */ ; \
17 \
18.previous ; \
19 \
20 .globl symname##_pages ; \
21 .bss ; \
22 .align 8 ; \
23 .type symname##_pages, @object ; \
24 symname##_pages: ; \
25 .zero (symname##_end - symname##_start + PAGE_SIZE - 1) / PAGE_SIZE * (BITS_PER_LONG / 8) ; \
26 .size symname##_pages, .-symname##_pages
27
28#else
29
30#define DECLARE_VDSO_IMAGE(symname) \
31 extern char symname##_start[], symname##_end[]; \
32 extern struct page *symname##_pages[]
33
4#if defined CONFIG_X86_32 || defined CONFIG_COMPAT 34#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
5extern const char VDSO32_PRELINK[]; 35
36#include <asm/vdso32.h>
37
38DECLARE_VDSO_IMAGE(vdso32_int80);
39#ifdef CONFIG_COMPAT
40DECLARE_VDSO_IMAGE(vdso32_syscall);
41#endif
42DECLARE_VDSO_IMAGE(vdso32_sysenter);
6 43
7/* 44/*
8 * Given a pointer to the vDSO image, find the pointer to VDSO32_name 45 * Given a pointer to the vDSO image, find the pointer to VDSO32_name
@@ -11,8 +48,7 @@ extern const char VDSO32_PRELINK[];
11#define VDSO32_SYMBOL(base, name) \ 48#define VDSO32_SYMBOL(base, name) \
12({ \ 49({ \
13 extern const char VDSO32_##name[]; \ 50 extern const char VDSO32_##name[]; \
14 (void __user *)(VDSO32_##name - VDSO32_PRELINK + \ 51 (void __user *)(VDSO32_##name + (unsigned long)(base)); \
15 (unsigned long)(base)); \
16}) 52})
17#endif 53#endif
18 54
@@ -23,12 +59,8 @@ extern const char VDSO32_PRELINK[];
23extern void __user __kernel_sigreturn; 59extern void __user __kernel_sigreturn;
24extern void __user __kernel_rt_sigreturn; 60extern void __user __kernel_rt_sigreturn;
25 61
26/* 62void __init patch_vdso32(void *vdso, size_t len);
27 * These symbols are defined by vdso32.S to mark the bounds 63
28 * of the ELF DSO images included therein. 64#endif /* __ASSEMBLER__ */
29 */
30extern const char vdso32_int80_start, vdso32_int80_end;
31extern const char vdso32_syscall_start, vdso32_syscall_end;
32extern const char vdso32_sysenter_start, vdso32_sysenter_end;
33 65
34#endif /* _ASM_X86_VDSO_H */ 66#endif /* _ASM_X86_VDSO_H */
diff --git a/arch/x86/include/asm/vdso32.h b/arch/x86/include/asm/vdso32.h
new file mode 100644
index 000000000000..7efb7018406e
--- /dev/null
+++ b/arch/x86/include/asm/vdso32.h
@@ -0,0 +1,11 @@
1#ifndef _ASM_X86_VDSO32_H
2#define _ASM_X86_VDSO32_H
3
4#define VDSO_BASE_PAGE 0
5#define VDSO_VVAR_PAGE 1
6#define VDSO_HPET_PAGE 2
7#define VDSO_PAGES 3
8#define VDSO_PREV_PAGES 2
9#define VDSO_OFFSET(x) ((x) * PAGE_SIZE)
10
11#endif
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 46e24d36b7da..3c3366c2e37f 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -1,30 +1,73 @@
1#ifndef _ASM_X86_VGTOD_H 1#ifndef _ASM_X86_VGTOD_H
2#define _ASM_X86_VGTOD_H 2#define _ASM_X86_VGTOD_H
3 3
4#include <asm/vsyscall.h> 4#include <linux/compiler.h>
5#include <linux/clocksource.h> 5#include <linux/clocksource.h>
6 6
7#ifdef BUILD_VDSO32_64
8typedef u64 gtod_long_t;
9#else
10typedef unsigned long gtod_long_t;
11#endif
12/*
 13 * vsyscall_gtod_data is accessed by 32-bit and 64-bit code at the same time,
 14 * so be careful when modifying this structure.
15 */
7struct vsyscall_gtod_data { 16struct vsyscall_gtod_data {
8 seqcount_t seq; 17 unsigned seq;
9 18
10 struct { /* extract of a clocksource struct */ 19 int vclock_mode;
11 int vclock_mode; 20 cycle_t cycle_last;
12 cycle_t cycle_last; 21 cycle_t mask;
13 cycle_t mask; 22 u32 mult;
14 u32 mult; 23 u32 shift;
15 u32 shift;
16 } clock;
17 24
18 /* open coded 'struct timespec' */ 25 /* open coded 'struct timespec' */
19 time_t wall_time_sec;
20 u64 wall_time_snsec; 26 u64 wall_time_snsec;
27 gtod_long_t wall_time_sec;
28 gtod_long_t monotonic_time_sec;
21 u64 monotonic_time_snsec; 29 u64 monotonic_time_snsec;
22 time_t monotonic_time_sec; 30 gtod_long_t wall_time_coarse_sec;
31 gtod_long_t wall_time_coarse_nsec;
32 gtod_long_t monotonic_time_coarse_sec;
33 gtod_long_t monotonic_time_coarse_nsec;
23 34
24 struct timezone sys_tz; 35 int tz_minuteswest;
25 struct timespec wall_time_coarse; 36 int tz_dsttime;
26 struct timespec monotonic_time_coarse;
27}; 37};
28extern struct vsyscall_gtod_data vsyscall_gtod_data; 38extern struct vsyscall_gtod_data vsyscall_gtod_data;
29 39
40static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s)
41{
42 unsigned ret;
43
44repeat:
45 ret = ACCESS_ONCE(s->seq);
46 if (unlikely(ret & 1)) {
47 cpu_relax();
48 goto repeat;
49 }
50 smp_rmb();
51 return ret;
52}
53
54static inline int gtod_read_retry(const struct vsyscall_gtod_data *s,
55 unsigned start)
56{
57 smp_rmb();
58 return unlikely(s->seq != start);
59}
60
61static inline void gtod_write_begin(struct vsyscall_gtod_data *s)
62{
63 ++s->seq;
64 smp_wmb();
65}
66
67static inline void gtod_write_end(struct vsyscall_gtod_data *s)
68{
69 smp_wmb();
70 ++s->seq;
71}
72
30#endif /* _ASM_X86_VGTOD_H */ 73#endif /* _ASM_X86_VGTOD_H */
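
The gtod_read_begin/gtod_read_retry and gtod_write_begin/gtod_write_end helpers above open-code a sequence counter: the writer makes seq odd while updating, readers spin on an odd value and retry if seq changed across the read. A self-contained userspace rendering of the same protocol; barrier() stands in for the kernel's smp_rmb()/smp_wmb(), and the single-threaded demo elides the real multi-CPU ordering concerns:

#include <stdio.h>

struct gtod_demo {
	unsigned seq;
	unsigned long sec, nsec;
};

#define barrier() __asm__ __volatile__("" ::: "memory")

static unsigned read_begin(const struct gtod_demo *s)
{
	unsigned ret;

	while ((ret = *(volatile unsigned *)&s->seq) & 1)
		;		/* writer in progress: spin */
	barrier();		/* kernel: smp_rmb() */
	return ret;
}

static int read_retry(const struct gtod_demo *s, unsigned start)
{
	barrier();		/* kernel: smp_rmb() */
	return s->seq != start;
}

static void write_begin(struct gtod_demo *s) { ++s->seq; barrier(); }
static void write_end(struct gtod_demo *s)   { barrier(); ++s->seq; }

int main(void)
{
	struct gtod_demo g = { 0, 1000, 500 };
	unsigned long sec, nsec;
	unsigned seq;

	do {
		seq = read_begin(&g);
		sec = g.sec;
		nsec = g.nsec;
	} while (read_retry(&g, seq));

	printf("read %lu.%09lu consistently\n", sec, nsec);

	write_begin(&g);	/* seq now odd: readers would spin/retry */
	g.sec = 1001;
	write_end(&g);		/* seq even again */
	return 0;
}
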
diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h
deleted file mode 100644
index 2edb37637ead..000000000000
--- a/arch/x86/include/asm/visws/cobalt.h
+++ /dev/null
@@ -1,127 +0,0 @@
1#ifndef _ASM_X86_VISWS_COBALT_H
2#define _ASM_X86_VISWS_COBALT_H
3
4#include <asm/fixmap.h>
5
6/*
7 * Cobalt SGI Visual Workstation system ASIC
8 */
9
10#define CO_CPU_NUM_PHYS 0x1e00
11#define CO_CPU_TAB_PHYS (CO_CPU_NUM_PHYS + 2)
12
13#define CO_CPU_MAX 4
14
15#define CO_CPU_PHYS 0xc2000000
16#define CO_APIC_PHYS 0xc4000000
17
18/* see set_fixmap() and asm/fixmap.h */
19#define CO_CPU_VADDR (fix_to_virt(FIX_CO_CPU))
20#define CO_APIC_VADDR (fix_to_virt(FIX_CO_APIC))
21
22/* Cobalt CPU registers -- relative to CO_CPU_VADDR, use co_cpu_*() */
23#define CO_CPU_REV 0x08
24#define CO_CPU_CTRL 0x10
25#define CO_CPU_STAT 0x20
26#define CO_CPU_TIMEVAL 0x30
27
28/* CO_CPU_CTRL bits */
29#define CO_CTRL_TIMERUN 0x04 /* 0 == disabled */
30#define CO_CTRL_TIMEMASK 0x08 /* 0 == unmasked */
31
32/* CO_CPU_STATUS bits */
33#define CO_STAT_TIMEINTR 0x02 /* (r) 1 == int pend, (w) 0 == clear */
34
35/* CO_CPU_TIMEVAL value */
36#define CO_TIME_HZ 100000000 /* Cobalt core rate */
37
38/* Cobalt APIC registers -- relative to CO_APIC_VADDR, use co_apic_*() */
39#define CO_APIC_HI(n) (((n) * 0x10) + 4)
40#define CO_APIC_LO(n) ((n) * 0x10)
41#define CO_APIC_ID 0x0ffc
42
43/* CO_APIC_ID bits */
44#define CO_APIC_ENABLE 0x00000100
45
46/* CO_APIC_LO bits */
47#define CO_APIC_MASK 0x00010000 /* 0 = enabled */
48#define CO_APIC_LEVEL 0x00008000 /* 0 = edge */
49
50/*
51 * Where things are physically wired to Cobalt
52 * #defines with no board _<type>_<rev>_ are common to all (thus far)
53 */
54#define CO_APIC_IDE0 4
55#define CO_APIC_IDE1 2 /* Only on 320 */
56
57#define CO_APIC_8259 12 /* serial, floppy, par-l-l */
58
59/* Lithium PCI Bridge A -- "the one with 82557 Ethernet" */
60#define CO_APIC_PCIA_BASE0 0 /* and 1 */ /* slot 0, line 0 */
61#define CO_APIC_PCIA_BASE123 5 /* and 6 */ /* slot 0, line 1 */
62
63#define CO_APIC_PIIX4_USB 7 /* this one is weird */
64
65/* Lithium PCI Bridge B -- "the one with PIIX4" */
66#define CO_APIC_PCIB_BASE0 8 /* and 9-12 *//* slot 0, line 0 */
67#define CO_APIC_PCIB_BASE123 13 /* 14.15 */ /* slot 0, line 1 */
68
69#define CO_APIC_VIDOUT0 16
70#define CO_APIC_VIDOUT1 17
71#define CO_APIC_VIDIN0 18
72#define CO_APIC_VIDIN1 19
73
74#define CO_APIC_LI_AUDIO 22
75
76#define CO_APIC_AS 24
77#define CO_APIC_RE 25
78
79#define CO_APIC_CPU 28 /* Timer and Cache interrupt */
80#define CO_APIC_NMI 29
81#define CO_APIC_LAST CO_APIC_NMI
82
83/*
84 * This is how irqs are assigned on the Visual Workstation.
85 * Legacy devices get irq's 1-15 (system clock is 0 and is CO_APIC_CPU).
86 * All other devices (including PCI) go to Cobalt and are irq's 16 on up.
87 */
88#define CO_IRQ_APIC0 16 /* irq of apic entry 0 */
89#define IS_CO_APIC(irq) ((irq) >= CO_IRQ_APIC0)
90#define CO_IRQ(apic) (CO_IRQ_APIC0 + (apic)) /* apic ent to irq */
91#define CO_APIC(irq) ((irq) - CO_IRQ_APIC0) /* irq to apic ent */
92#define CO_IRQ_IDE0 14 /* knowledge of... */
93#define CO_IRQ_IDE1 15 /* ... ide driver defaults! */
94#define CO_IRQ_8259 CO_IRQ(CO_APIC_8259)
95
96#ifdef CONFIG_X86_VISWS_APIC
97static inline void co_cpu_write(unsigned long reg, unsigned long v)
98{
99 *((volatile unsigned long *)(CO_CPU_VADDR+reg))=v;
100}
101
102static inline unsigned long co_cpu_read(unsigned long reg)
103{
104 return *((volatile unsigned long *)(CO_CPU_VADDR+reg));
105}
106
107static inline void co_apic_write(unsigned long reg, unsigned long v)
108{
109 *((volatile unsigned long *)(CO_APIC_VADDR+reg))=v;
110}
111
112static inline unsigned long co_apic_read(unsigned long reg)
113{
114 return *((volatile unsigned long *)(CO_APIC_VADDR+reg));
115}
116#endif
117
118extern char visws_board_type;
119
120#define VISWS_320 0
121#define VISWS_540 1
122
123extern char visws_board_rev;
124
125extern int pci_visws_init(void);
126
127#endif /* _ASM_X86_VISWS_COBALT_H */
diff --git a/arch/x86/include/asm/visws/lithium.h b/arch/x86/include/asm/visws/lithium.h
deleted file mode 100644
index a10d89bc1270..000000000000
--- a/arch/x86/include/asm/visws/lithium.h
+++ /dev/null
@@ -1,53 +0,0 @@
1#ifndef _ASM_X86_VISWS_LITHIUM_H
2#define _ASM_X86_VISWS_LITHIUM_H
3
4#include <asm/fixmap.h>
5
6/*
7 * Lithium is the SGI Visual Workstation I/O ASIC
8 */
9
10#define LI_PCI_A_PHYS 0xfc000000 /* Enet is dev 3 */
11#define LI_PCI_B_PHYS 0xfd000000 /* PIIX4 is here */
12
13/* see set_fixmap() and asm/fixmap.h */
14#define LI_PCIA_VADDR (fix_to_virt(FIX_LI_PCIA))
15#define LI_PCIB_VADDR (fix_to_virt(FIX_LI_PCIB))
16
17/* Not a standard PCI? (not in linux/pci.h) */
18#define LI_PCI_BUSNUM 0x44 /* lo8: primary, hi8: sub */
19#define LI_PCI_INTEN 0x46
20
21/* LI_PCI_INTENT bits */
22#define LI_INTA_0 0x0001
23#define LI_INTA_1 0x0002
24#define LI_INTA_2 0x0004
25#define LI_INTA_3 0x0008
26#define LI_INTA_4 0x0010
27#define LI_INTB 0x0020
28#define LI_INTC 0x0040
29#define LI_INTD 0x0080
30
31/* More special purpose macros... */
32static inline void li_pcia_write16(unsigned long reg, unsigned short v)
33{
34 *((volatile unsigned short *)(LI_PCIA_VADDR+reg))=v;
35}
36
37static inline unsigned short li_pcia_read16(unsigned long reg)
38{
39 return *((volatile unsigned short *)(LI_PCIA_VADDR+reg));
40}
41
42static inline void li_pcib_write16(unsigned long reg, unsigned short v)
43{
44 *((volatile unsigned short *)(LI_PCIB_VADDR+reg))=v;
45}
46
47static inline unsigned short li_pcib_read16(unsigned long reg)
48{
49 return *((volatile unsigned short *)(LI_PCIB_VADDR+reg));
50}
51
52#endif /* _ASM_X86_VISWS_LITHIUM_H */
53
diff --git a/arch/x86/include/asm/visws/piix4.h b/arch/x86/include/asm/visws/piix4.h
deleted file mode 100644
index d0af4d338e7f..000000000000
--- a/arch/x86/include/asm/visws/piix4.h
+++ /dev/null
@@ -1,107 +0,0 @@
1#ifndef _ASM_X86_VISWS_PIIX4_H
2#define _ASM_X86_VISWS_PIIX4_H
3
4/*
5 * PIIX4 as used on SGI Visual Workstations
6 */
7
8#define PIIX_PM_START 0x0F80
9
10#define SIO_GPIO_START 0x0FC0
11
12#define SIO_PM_START 0x0FC8
13
14#define PMBASE PIIX_PM_START
15#define GPIREG0 (PMBASE+0x30)
16#define GPIREG(x) (GPIREG0+((x)/8))
17#define GPIBIT(x) (1 << ((x)%8))
18
19#define PIIX_GPI_BD_ID1 18
20#define PIIX_GPI_BD_ID2 19
21#define PIIX_GPI_BD_ID3 20
22#define PIIX_GPI_BD_ID4 21
23#define PIIX_GPI_BD_REG GPIREG(PIIX_GPI_BD_ID1)
24#define PIIX_GPI_BD_MASK (GPIBIT(PIIX_GPI_BD_ID1) | \
25 GPIBIT(PIIX_GPI_BD_ID2) | \
26 GPIBIT(PIIX_GPI_BD_ID3) | \
27 GPIBIT(PIIX_GPI_BD_ID4) )
28
29#define PIIX_GPI_BD_SHIFT (PIIX_GPI_BD_ID1 % 8)
30
31#define SIO_INDEX 0x2e
32#define SIO_DATA 0x2f
33
34#define SIO_DEV_SEL 0x7
35#define SIO_DEV_ENB 0x30
36#define SIO_DEV_MSB 0x60
37#define SIO_DEV_LSB 0x61
38
39#define SIO_GP_DEV 0x7
40
41#define SIO_GP_BASE SIO_GPIO_START
42#define SIO_GP_MSB (SIO_GP_BASE>>8)
43#define SIO_GP_LSB (SIO_GP_BASE&0xff)
44
45#define SIO_GP_DATA1 (SIO_GP_BASE+0)
46
47#define SIO_PM_DEV 0x8
48
49#define SIO_PM_BASE SIO_PM_START
50#define SIO_PM_MSB (SIO_PM_BASE>>8)
51#define SIO_PM_LSB (SIO_PM_BASE&0xff)
52#define SIO_PM_INDEX (SIO_PM_BASE+0)
53#define SIO_PM_DATA (SIO_PM_BASE+1)
54
55#define SIO_PM_FER2 0x1
56
57#define SIO_PM_GP_EN 0x80
58
59
60
61/*
62 * This is the dev/reg where generating a config cycle will
63 * result in a PCI special cycle.
64 */
65#define SPECIAL_DEV 0xff
66#define SPECIAL_REG 0x00
67
68/*
69 * PIIX4 needs to see a special cycle with the following data
70 * to be convinced the processor has gone into the stop grant
71 * state. PIIX4 insists on seeing this before it will power
72 * down a system.
73 */
74#define PIIX_SPECIAL_STOP 0x00120002
75
76#define PIIX4_RESET_PORT 0xcf9
77#define PIIX4_RESET_VAL 0x6
78
79#define PMSTS_PORT 0xf80 // 2 bytes PM Status
80#define PMEN_PORT 0xf82 // 2 bytes PM Enable
81#define PMCNTRL_PORT 0xf84 // 2 bytes PM Control
82
83#define PM_SUSPEND_ENABLE 0x2000 // start sequence to suspend state
84
85/*
86 * PMSTS and PMEN I/O bit definitions.
87 * (Bits are the same in both registers)
88 */
89#define PM_STS_RSM (1<<15) // Resume Status
90#define PM_STS_PWRBTNOR (1<<11) // Power Button Override
91#define PM_STS_RTC (1<<10) // RTC status
92#define PM_STS_PWRBTN (1<<8) // Power Button Pressed?
93#define PM_STS_GBL (1<<5) // Global Status
94#define PM_STS_BM (1<<4) // Bus Master Status
95#define PM_STS_TMROF (1<<0) // Timer Overflow Status.
96
97/*
98 * Stop clock GPI register
99 */
100#define PIIX_GPIREG0 (0xf80 + 0x30)
101
102/*
103 * Stop clock GPI bit in GPIREG0
104 */
105#define PIIX_GPI_STPCLK 0x4 // STPCLK signal routed back in
106
107#endif /* _ASM_X86_VISWS_PIIX4_H */
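
As a concrete use of the reset constants above, a minimal sketch (an assumption of how they would be driven, not the VisWS reboot path itself); outb() is the standard <asm/io.h> port accessor:

#include <asm/io.h>

static void piix4_hard_reset_sketch(void)
{
	/* writing 0x6 to reset-control port 0xcf9 asserts a hard reset */
	outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
}
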
diff --git a/arch/x86/include/asm/visws/sgivw.h b/arch/x86/include/asm/visws/sgivw.h
deleted file mode 100644
index 5fbf63e1003c..000000000000
--- a/arch/x86/include/asm/visws/sgivw.h
+++ /dev/null
@@ -1,5 +0,0 @@
1/*
2 * Frame buffer position and size:
3 */
4extern unsigned long sgivwfb_mem_phys;
5extern unsigned long sgivwfb_mem_size;
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 966502d4682e..7004d21e6219 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -85,6 +85,7 @@
85#define VM_EXIT_SAVE_IA32_EFER 0x00100000 85#define VM_EXIT_SAVE_IA32_EFER 0x00100000
86#define VM_EXIT_LOAD_IA32_EFER 0x00200000 86#define VM_EXIT_LOAD_IA32_EFER 0x00200000
87#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 87#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
88#define VM_EXIT_CLEAR_BNDCFGS 0x00800000
88 89
89#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff 90#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff
90 91
@@ -95,11 +96,13 @@
95#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 96#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
96#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 97#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
97#define VM_ENTRY_LOAD_IA32_EFER 0x00008000 98#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
99#define VM_ENTRY_LOAD_BNDCFGS 0x00010000
98 100
99#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff 101#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff
100 102
101#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f 103#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
102#define VMX_MISC_SAVE_EFER_LMA 0x00000020 104#define VMX_MISC_SAVE_EFER_LMA 0x00000020
105#define VMX_MISC_ACTIVITY_HLT 0x00000040
103 106
104/* VMCS Encodings */ 107/* VMCS Encodings */
105enum vmcs_field { 108enum vmcs_field {
@@ -173,6 +176,8 @@ enum vmcs_field {
173 GUEST_PDPTR2_HIGH = 0x0000280f, 176 GUEST_PDPTR2_HIGH = 0x0000280f,
174 GUEST_PDPTR3 = 0x00002810, 177 GUEST_PDPTR3 = 0x00002810,
175 GUEST_PDPTR3_HIGH = 0x00002811, 178 GUEST_PDPTR3_HIGH = 0x00002811,
179 GUEST_BNDCFGS = 0x00002812,
180 GUEST_BNDCFGS_HIGH = 0x00002813,
176 HOST_IA32_PAT = 0x00002c00, 181 HOST_IA32_PAT = 0x00002c00,
177 HOST_IA32_PAT_HIGH = 0x00002c01, 182 HOST_IA32_PAT_HIGH = 0x00002c01,
178 HOST_IA32_EFER = 0x00002c02, 183 HOST_IA32_EFER = 0x00002c02,
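
The new BNDCFGS controls above are only usable when the CPU advertises them. A minimal probe sketch (an assumption based on the standard VMX capability MSR layout, not a quote of KVM): the high 32 bits of MSR_IA32_VMX_TRUE_EXIT_CTLS are the allowed-1 mask for VM-exit controls, so a control is available only if its bit is set there.

#include <asm/msr.h>

static bool vmx_exit_ctrl_supported_sketch(u32 ctrl)
{
	u32 low, high;

	rdmsr(MSR_IA32_VMX_TRUE_EXIT_CTLS, low, high);
	return (high & ctrl) == ctrl;	/* e.g. ctrl = VM_EXIT_CLEAR_BNDCFGS */
}
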
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index d76ac40da206..081d909bc495 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -16,8 +16,8 @@
16 * you mess up, the linker will catch it.) 16 * you mess up, the linker will catch it.)
17 */ 17 */
18 18
19/* Base address of vvars. This is not ABI. */ 19#ifndef _ASM_X86_VVAR_H
20#define VVAR_ADDRESS (-10*1024*1024 - 4096) 20#define _ASM_X86_VVAR_H
21 21
22#if defined(__VVAR_KERNEL_LDS) 22#if defined(__VVAR_KERNEL_LDS)
23 23
@@ -29,16 +29,35 @@
29 29
30#else 30#else
31 31
32#ifdef BUILD_VDSO32
33
34#define DECLARE_VVAR(offset, type, name) \
35 extern type vvar_ ## name __attribute__((visibility("hidden")));
36
37#define VVAR(name) (vvar_ ## name)
38
39#else
40
41extern char __vvar_page;
42
43/* Base address of vvars. This is not ABI. */
44#ifdef CONFIG_X86_64
45#define VVAR_ADDRESS (-10*1024*1024 - 4096)
46#else
47#define VVAR_ADDRESS (&__vvar_page)
48#endif
49
32#define DECLARE_VVAR(offset, type, name) \ 50#define DECLARE_VVAR(offset, type, name) \
33 static type const * const vvaraddr_ ## name = \ 51 static type const * const vvaraddr_ ## name = \
34 (void *)(VVAR_ADDRESS + (offset)); 52 (void *)(VVAR_ADDRESS + (offset));
35 53
54#define VVAR(name) (*vvaraddr_ ## name)
55#endif
56
36#define DEFINE_VVAR(type, name) \ 57#define DEFINE_VVAR(type, name) \
37 type name \ 58 type name \
38 __attribute__((section(".vvar_" #name), aligned(16))) __visible 59 __attribute__((section(".vvar_" #name), aligned(16))) __visible
39 60
40#define VVAR(name) (*vvaraddr_ ## name)
41
42#endif 61#endif
43 62
44/* DECLARE_VVAR(offset, type, name) */ 63/* DECLARE_VVAR(offset, type, name) */
@@ -48,3 +67,5 @@ DECLARE_VVAR(16, int, vgetcpu_mode)
48DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) 67DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
49 68
50#undef DECLARE_VVAR 69#undef DECLARE_VVAR
70
71#endif
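
A minimal usage sketch of the vvar machinery above (the name "example" is hypothetical): the kernel image defines the variable once, and vDSO code reads it through VVAR(), which resolves to a fixed-address dereference on 64-bit and to a hidden symbol under BUILD_VDSO32.

/* kernel side: placed in the .vvar_example section of the vvar page */
DEFINE_VVAR(int, example);

/* vDSO side, after a matching DECLARE_VVAR(offset, int, example): */
static inline int read_example(void)
{
	return VVAR(example);
}
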
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 0f1be11e43d2..e45e4da96bf1 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -181,7 +181,7 @@ struct x86_msi_ops {
181 u8 hpet_id); 181 u8 hpet_id);
182 void (*teardown_msi_irq)(unsigned int irq); 182 void (*teardown_msi_irq)(unsigned int irq);
183 void (*teardown_msi_irqs)(struct pci_dev *dev); 183 void (*teardown_msi_irqs)(struct pci_dev *dev);
184 void (*restore_msi_irqs)(struct pci_dev *dev, int irq); 184 void (*restore_msi_irqs)(struct pci_dev *dev);
185 int (*setup_hpet_msi)(unsigned int irq, unsigned int id); 185 int (*setup_hpet_msi)(unsigned int irq, unsigned int id);
186 u32 (*msi_mask_irq)(struct msi_desc *desc, u32 mask, u32 flag); 186 u32 (*msi_mask_irq)(struct msi_desc *desc, u32 mask, u32 flag);
187 u32 (*msix_mask_irq)(struct msi_desc *desc, u32 flag); 187 u32 (*msix_mask_irq)(struct msi_desc *desc, u32 flag);
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index b913915e8e63..c949923a5668 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -49,10 +49,17 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
49extern unsigned long set_phys_range_identity(unsigned long pfn_s, 49extern unsigned long set_phys_range_identity(unsigned long pfn_s,
50 unsigned long pfn_e); 50 unsigned long pfn_e);
51 51
52extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
53 struct gnttab_map_grant_ref *kmap_ops,
54 struct page **pages, unsigned int count);
52extern int m2p_add_override(unsigned long mfn, struct page *page, 55extern int m2p_add_override(unsigned long mfn, struct page *page,
53 struct gnttab_map_grant_ref *kmap_op); 56 struct gnttab_map_grant_ref *kmap_op);
57extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
58 struct gnttab_map_grant_ref *kmap_ops,
59 struct page **pages, unsigned int count);
54extern int m2p_remove_override(struct page *page, 60extern int m2p_remove_override(struct page *page,
55 struct gnttab_map_grant_ref *kmap_op); 61 struct gnttab_map_grant_ref *kmap_op,
62 unsigned long mfn);
56extern struct page *m2p_find_override(unsigned long mfn); 63extern struct page *m2p_find_override(unsigned long mfn);
57extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); 64extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
58 65
@@ -121,7 +128,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
121 pfn = m2p_find_override_pfn(mfn, ~0); 128 pfn = m2p_find_override_pfn(mfn, ~0);
122 } 129 }
123 130
124 /* 131 /*
125 * pfn is ~0 if there are no entries in the m2p for mfn or if the 132 * pfn is ~0 if there are no entries in the m2p for mfn or if the
126 * entry doesn't map back to the mfn and m2p_override doesn't have a 133 * entry doesn't map back to the mfn and m2p_override doesn't have a
127 * valid entry for it. 134 * valid entry for it.
@@ -167,7 +174,12 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
167 */ 174 */
168static inline unsigned long mfn_to_local_pfn(unsigned long mfn) 175static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
169{ 176{
170 unsigned long pfn = mfn_to_pfn(mfn); 177 unsigned long pfn;
178
179 if (xen_feature(XENFEAT_auto_translated_physmap))
180 return mfn;
181
182 pfn = mfn_to_pfn(mfn);
171 if (get_phys_to_machine(pfn) != mfn) 183 if (get_phys_to_machine(pfn) != mfn)
172 return -1; /* force !pfn_valid() */ 184 return -1; /* force !pfn_valid() */
173 return pfn; 185 return pfn;
@@ -222,5 +234,6 @@ void make_lowmem_page_readonly(void *vaddr);
222void make_lowmem_page_readwrite(void *vaddr); 234void make_lowmem_page_readwrite(void *vaddr);
223 235
224#define xen_remap(cookie, size) ioremap((cookie), (size)); 236#define xen_remap(cookie, size) ioremap((cookie), (size));
237#define xen_unmap(cookie) iounmap((cookie))
225 238
226#endif /* _ASM_X86_XEN_PAGE_H */ 239#endif /* _ASM_X86_XEN_PAGE_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 0415cdabb5a6..d949ef28c48b 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -6,11 +6,18 @@
6 6
7#define XSTATE_CPUID 0x0000000d 7#define XSTATE_CPUID 0x0000000d
8 8
9#define XSTATE_FP 0x1 9#define XSTATE_FP 0x1
10#define XSTATE_SSE 0x2 10#define XSTATE_SSE 0x2
11#define XSTATE_YMM 0x4 11#define XSTATE_YMM 0x4
12#define XSTATE_BNDREGS 0x8
13#define XSTATE_BNDCSR 0x10
14#define XSTATE_OPMASK 0x20
15#define XSTATE_ZMM_Hi256 0x40
16#define XSTATE_Hi16_ZMM 0x80
12 17
13#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) 18#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE)
19/* Bit 63 of XCR0 is reserved for future expansion */
20#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63)))
14 21
15#define FXSAVE_SIZE 512 22#define FXSAVE_SIZE 512
16 23
@@ -20,10 +27,15 @@
20#define XSAVE_YMM_SIZE 256 27#define XSAVE_YMM_SIZE 256
21#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) 28#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
22 29
23/* 30/* Supported features which support lazy state saving */
24 * These are the features that the OS can handle currently. 31#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
25 */ 32 | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
26#define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) 33
34/* Supported features which require eager state saving */
35#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR)
36
37/* All currently supported features */
38#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER)
27 39
28#ifdef CONFIG_X86_64 40#ifdef CONFIG_X86_64
29#define REX_PREFIX "0x48, " 41#define REX_PREFIX "0x48, "
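
A minimal sketch (not kernel code) of testing the masks above against XCR0. The xgetbv instruction is spelled out in bytes for the sake of older assemblers; checking XSTATE_EAGER, i.e. the MPX bndregs/bndcsr states, is just one example:

#include <linux/types.h>

static inline u64 xgetbv_sketch(u32 index)
{
	u32 eax, edx;

	/* xgetbv: read extended control register 'index' into edx:eax */
	asm volatile(".byte 0x0f,0x01,0xd0"
		     : "=a" (eax), "=d" (edx) : "c" (index));
	return eax | ((u64)edx << 32);
}

static inline bool mpx_states_enabled_sketch(void)
{
	/* index 0 selects XCR0, the xfeature enabled mask */
	return (xgetbv_sketch(0) & XSTATE_EAGER) == XSTATE_EAGER;
}
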
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 9c3733c5f8f7..225b0988043a 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -6,6 +6,7 @@
6#define SETUP_E820_EXT 1 6#define SETUP_E820_EXT 1
7#define SETUP_DTB 2 7#define SETUP_DTB 2
8#define SETUP_PCI 3 8#define SETUP_PCI 3
9#define SETUP_EFI 4
9 10
10/* ram_size flags */ 11/* ram_size flags */
11#define RAMDISK_IMAGE_START_MASK 0x07FF 12#define RAMDISK_IMAGE_START_MASK 0x07FF
@@ -23,6 +24,7 @@
23#define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1) 24#define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1)
24#define XLF_EFI_HANDOVER_32 (1<<2) 25#define XLF_EFI_HANDOVER_32 (1<<2)
25#define XLF_EFI_HANDOVER_64 (1<<3) 26#define XLF_EFI_HANDOVER_64 (1<<3)
27#define XLF_EFI_KEXEC (1<<4)
26 28
27#ifndef __ASSEMBLY__ 29#ifndef __ASSEMBLY__
28 30
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index b8f1c0176cbc..462efe746d77 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -28,6 +28,9 @@
28/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ 28/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
29#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) 29#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
30 30
31/* A partition's reference time stamp counter (TSC) page */
32#define HV_X64_MSR_REFERENCE_TSC 0x40000021
33
31/* 34/*
32 * There is a single feature flag that signifies the presence of the MSR 35 * There is a single feature flag that signifies the presence of the MSR
33 * that can be used to retrieve both the local APIC Timer frequency as 36 * that can be used to retrieve both the local APIC Timer frequency as
@@ -198,6 +201,9 @@
198#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ 201#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
199 (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) 202 (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
200 203
204#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
205#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
206
201#define HV_PROCESSOR_POWER_STATE_C0 0 207#define HV_PROCESSOR_POWER_STATE_C0 0
202#define HV_PROCESSOR_POWER_STATE_C1 1 208#define HV_PROCESSOR_POWER_STATE_C1 1
203#define HV_PROCESSOR_POWER_STATE_C2 2 209#define HV_PROCESSOR_POWER_STATE_C2 2
@@ -210,4 +216,11 @@
210#define HV_STATUS_INVALID_ALIGNMENT 4 216#define HV_STATUS_INVALID_ALIGNMENT 4
211#define HV_STATUS_INSUFFICIENT_BUFFERS 19 217#define HV_STATUS_INSUFFICIENT_BUFFERS 19
212 218
219typedef struct _HV_REFERENCE_TSC_PAGE {
220 __u32 tsc_sequence;
221 __u32 res1;
222 __u64 tsc_scale;
223 __s64 tsc_offset;
224} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
225
213#endif 226#endif
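
The reference TSC page introduced above follows a lock-free sequence protocol; a minimal read sketch (an inference from the struct layout, not the Hyper-V clocksource driver). tsc_sequence guards a consistent (tsc_scale, tsc_offset) pair, and the reference time is computed as ((tsc * tsc_scale) >> 64) + tsc_offset:

#include <linux/types.h>

static inline __u64 rdtsc_sketch(void)
{
	__u32 lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return lo | ((__u64)hi << 32);
}

static __u64 hv_read_ref_time_sketch(volatile HV_REFERENCE_TSC_PAGE *p)
{
	__u32 seq;
	__u64 tsc, scale;
	__s64 offset;

	do {
		seq = p->tsc_sequence;
		if (!seq)	/* page not (yet) valid: fall back to the MSR */
			return 0;
		tsc = rdtsc_sketch();
		scale = p->tsc_scale;
		offset = p->tsc_offset;
	} while (p->tsc_sequence != seq);

	/* 128-bit multiply; assumes a 64-bit build with __int128 support */
	return (__u64)(((unsigned __int128)tsc * scale) >> 64) + offset;
}
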
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 37813b5ddc37..c827ace3121b 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -184,6 +184,7 @@
184#define MSR_AMD64_PATCH_LOADER 0xc0010020 184#define MSR_AMD64_PATCH_LOADER 0xc0010020
185#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 185#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
186#define MSR_AMD64_OSVW_STATUS 0xc0010141 186#define MSR_AMD64_OSVW_STATUS 0xc0010141
187#define MSR_AMD64_LS_CFG 0xc0011020
187#define MSR_AMD64_DC_CFG 0xc0011022 188#define MSR_AMD64_DC_CFG 0xc0011022
188#define MSR_AMD64_BU_CFG2 0xc001102a 189#define MSR_AMD64_BU_CFG2 0xc001102a
189#define MSR_AMD64_IBSFETCHCTL 0xc0011030 190#define MSR_AMD64_IBSFETCHCTL 0xc0011030
@@ -294,6 +295,7 @@
294#define MSR_SMI_COUNT 0x00000034 295#define MSR_SMI_COUNT 0x00000034
295#define MSR_IA32_FEATURE_CONTROL 0x0000003a 296#define MSR_IA32_FEATURE_CONTROL 0x0000003a
296#define MSR_IA32_TSC_ADJUST 0x0000003b 297#define MSR_IA32_TSC_ADJUST 0x0000003b
298#define MSR_IA32_BNDCFGS 0x00000d90
297 299
298#define FEATURE_CONTROL_LOCKED (1<<0) 300#define FEATURE_CONTROL_LOCKED (1<<0)
299#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) 301#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
@@ -367,33 +369,58 @@
367#define THERM_LOG_THRESHOLD1 (1 << 9) 369#define THERM_LOG_THRESHOLD1 (1 << 9)
368 370
369/* MISC_ENABLE bits: architectural */ 371/* MISC_ENABLE bits: architectural */
370#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) 372#define MSR_IA32_MISC_ENABLE_FAST_STRING_BIT 0
371#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) 373#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << MSR_IA32_MISC_ENABLE_FAST_STRING_BIT)
372#define MSR_IA32_MISC_ENABLE_EMON (1ULL << 7) 374#define MSR_IA32_MISC_ENABLE_TCC_BIT 1
373#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << 11) 375#define MSR_IA32_MISC_ENABLE_TCC (1ULL << MSR_IA32_MISC_ENABLE_TCC_BIT)
374#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << 12) 376#define MSR_IA32_MISC_ENABLE_EMON_BIT 7
375#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << 16) 377#define MSR_IA32_MISC_ENABLE_EMON (1ULL << MSR_IA32_MISC_ENABLE_EMON_BIT)
376#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << 18) 378#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT 11
377#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << 22) 379#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT)
378#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << 23) 380#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT 12
379#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << 34) 381#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT)
382#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT 16
383#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT)
384#define MSR_IA32_MISC_ENABLE_MWAIT_BIT 18
385#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << MSR_IA32_MISC_ENABLE_MWAIT_BIT)
386#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT 22
387#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT)
388#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT 23
389#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT)
390#define MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT 34
391#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT)
380 392
381/* MISC_ENABLE bits: model-specific, meaning may vary from core to core */ 393/* MISC_ENABLE bits: model-specific, meaning may vary from core to core */
382#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << 2) 394#define MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT 2
383#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << 3) 395#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT)
384#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << 4) 396#define MSR_IA32_MISC_ENABLE_TM1_BIT 3
385#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << 6) 397#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << MSR_IA32_MISC_ENABLE_TM1_BIT)
386#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << 8) 398#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT 4
387#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << 9) 399#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT)
388#define MSR_IA32_MISC_ENABLE_FERR (1ULL << 10) 400#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT 6
389#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << 10) 401#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT)
390#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << 13) 402#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT 8
391#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << 19) 403#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT)
392#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << 20) 404#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT 9
393#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << 24) 405#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
394#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << 37) 406#define MSR_IA32_MISC_ENABLE_FERR_BIT 10
395#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) 407#define MSR_IA32_MISC_ENABLE_FERR (1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT)
396#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) 408#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT 10
409#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT)
410#define MSR_IA32_MISC_ENABLE_TM2_BIT 13
411#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << MSR_IA32_MISC_ENABLE_TM2_BIT)
412#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT 19
413#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT)
414#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT 20
415#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT)
416#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT 24
417#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT)
418#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT 37
419#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT)
420#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT 38
421#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT)
422#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39
423#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT)
397 424
398#define MSR_IA32_TSC_DEADLINE 0x000006E0 425#define MSR_IA32_TSC_DEADLINE 0x000006E0
399 426
@@ -527,6 +554,7 @@
527#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e 554#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
528#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f 555#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f
529#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 556#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490
557#define MSR_IA32_VMX_VMFUNC 0x00000491
530 558
531/* VMX_BASIC bits and bitmasks */ 559/* VMX_BASIC bits and bitmasks */
532#define VMX_BASIC_VMCS_SIZE_SHIFT 32 560#define VMX_BASIC_VMCS_SIZE_SHIFT 32
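
A minimal sketch showing why the *_BIT/mask pairs above are both kept (not from the tree; rdmsrl() is the standard <asm/msr.h> accessor): the mask form tests a value already read, while the _BIT form suits shift- and bit-number-based helpers.

#include <asm/msr.h>

static bool mwait_enabled_sketch(void)
{
	u64 misc;

	rdmsrl(MSR_IA32_MISC_ENABLE, misc);
	return misc & MSR_IA32_MISC_ENABLE_MWAIT;
}
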
diff --git a/arch/x86/include/uapi/asm/sembuf.h b/arch/x86/include/uapi/asm/sembuf.h
index ee50c801f7b7..cc2d6a3aeae7 100644
--- a/arch/x86/include/uapi/asm/sembuf.h
+++ b/arch/x86/include/uapi/asm/sembuf.h
@@ -13,12 +13,12 @@
13struct semid64_ds { 13struct semid64_ds {
14 struct ipc64_perm sem_perm; /* permissions .. see ipc.h */ 14 struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
15 __kernel_time_t sem_otime; /* last semop time */ 15 __kernel_time_t sem_otime; /* last semop time */
16 unsigned long __unused1; 16 __kernel_ulong_t __unused1;
17 __kernel_time_t sem_ctime; /* last change time */ 17 __kernel_time_t sem_ctime; /* last change time */
18 unsigned long __unused2; 18 __kernel_ulong_t __unused2;
19 unsigned long sem_nsems; /* no. of semaphores in array */ 19 __kernel_ulong_t sem_nsems; /* no. of semaphores in array */
20 unsigned long __unused3; 20 __kernel_ulong_t __unused3;
21 unsigned long __unused4; 21 __kernel_ulong_t __unused4;
22}; 22};
23 23
24#endif /* _ASM_X86_SEMBUF_H */ 24#endif /* _ASM_X86_SEMBUF_H */
diff --git a/arch/x86/include/uapi/asm/stat.h b/arch/x86/include/uapi/asm/stat.h
index 7b3ddc348585..bc03eb5d6360 100644
--- a/arch/x86/include/uapi/asm/stat.h
+++ b/arch/x86/include/uapi/asm/stat.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_STAT_H 1#ifndef _ASM_X86_STAT_H
2#define _ASM_X86_STAT_H 2#define _ASM_X86_STAT_H
3 3
4#include <asm/posix_types.h>
5
4#define STAT_HAVE_NSEC 1 6#define STAT_HAVE_NSEC 1
5 7
6#ifdef __i386__ 8#ifdef __i386__
@@ -78,26 +80,26 @@ struct stat64 {
78#else /* __i386__ */ 80#else /* __i386__ */
79 81
80struct stat { 82struct stat {
81 unsigned long st_dev; 83 __kernel_ulong_t st_dev;
82 unsigned long st_ino; 84 __kernel_ulong_t st_ino;
83 unsigned long st_nlink; 85 __kernel_ulong_t st_nlink;
84 86
85 unsigned int st_mode; 87 unsigned int st_mode;
86 unsigned int st_uid; 88 unsigned int st_uid;
87 unsigned int st_gid; 89 unsigned int st_gid;
88 unsigned int __pad0; 90 unsigned int __pad0;
89 unsigned long st_rdev; 91 __kernel_ulong_t st_rdev;
90 long st_size; 92 __kernel_long_t st_size;
91 long st_blksize; 93 __kernel_long_t st_blksize;
92 long st_blocks; /* Number 512-byte blocks allocated. */ 94 __kernel_long_t st_blocks; /* Number 512-byte blocks allocated. */
93 95
94 unsigned long st_atime; 96 __kernel_ulong_t st_atime;
95 unsigned long st_atime_nsec; 97 __kernel_ulong_t st_atime_nsec;
96 unsigned long st_mtime; 98 __kernel_ulong_t st_mtime;
97 unsigned long st_mtime_nsec; 99 __kernel_ulong_t st_mtime_nsec;
98 unsigned long st_ctime; 100 __kernel_ulong_t st_ctime;
99 unsigned long st_ctime_nsec; 101 __kernel_ulong_t st_ctime_nsec;
100 long __unused[3]; 102 __kernel_long_t __unused[3];
101}; 103};
102 104
103/* We don't need to memset the whole thing just to initialize the padding */ 105/* We don't need to memset the whole thing just to initialize the padding */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9b0a34e2cd79..f4d96000d33a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,13 +26,14 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
26obj-y += probe_roms.o 26obj-y += probe_roms.o
27obj-$(CONFIG_X86_32) += i386_ksyms_32.o 27obj-$(CONFIG_X86_32) += i386_ksyms_32.o
28obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 28obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
29obj-y += syscall_$(BITS).o 29obj-y += syscall_$(BITS).o vsyscall_gtod.o
30obj-$(CONFIG_X86_64) += vsyscall_64.o 30obj-$(CONFIG_X86_64) += vsyscall_64.o
31obj-$(CONFIG_X86_64) += vsyscall_emu_64.o 31obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
32obj-$(CONFIG_SYSFS) += ksysfs.o
32obj-y += bootflag.o e820.o 33obj-y += bootflag.o e820.o
33obj-y += pci-dma.o quirks.o topology.o kdebugfs.o 34obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
34obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 35obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
35obj-y += tsc.o io_delay.o rtc.o 36obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
36obj-y += pci-iommu_table.o 37obj-y += pci-iommu_table.o
37obj-y += resource.o 38obj-y += resource.o
38 39
@@ -91,15 +92,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
91 92
92obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 93obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
93 94
94obj-$(CONFIG_MICROCODE_EARLY) += microcode_core_early.o
95obj-$(CONFIG_MICROCODE_INTEL_EARLY) += microcode_intel_early.o
96obj-$(CONFIG_MICROCODE_INTEL_LIB) += microcode_intel_lib.o
97microcode-y := microcode_core.o
98microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
99microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
100obj-$(CONFIG_MICROCODE_AMD_EARLY) += microcode_amd_early.o
101obj-$(CONFIG_MICROCODE) += microcode.o
102
103obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o 95obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
104 96
105obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 97obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
@@ -111,6 +103,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
111 103
112obj-$(CONFIG_PERF_EVENTS) += perf_regs.o 104obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
113obj-$(CONFIG_TRACING) += tracepoint.o 105obj-$(CONFIG_TRACING) += tracepoint.o
106obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
114 107
115### 108###
116# 64 bit specific files 109# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6c0b43bd024b..86281ffb96d6 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -46,7 +46,6 @@
46 46
47#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */ 47#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
48static int __initdata acpi_force = 0; 48static int __initdata acpi_force = 0;
49u32 acpi_rsdt_forced;
50int acpi_disabled; 49int acpi_disabled;
51EXPORT_SYMBOL(acpi_disabled); 50EXPORT_SYMBOL(acpi_disabled);
52 51
@@ -54,10 +53,6 @@ EXPORT_SYMBOL(acpi_disabled);
54# include <asm/proto.h> 53# include <asm/proto.h>
55#endif /* X86 */ 54#endif /* X86 */
56 55
57#define BAD_MADT_ENTRY(entry, end) ( \
58 (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
59 ((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
60
61#define PREFIX "ACPI: " 56#define PREFIX "ACPI: "
62 57
63int acpi_noirq; /* skip ACPI IRQ initialization */ 58int acpi_noirq; /* skip ACPI IRQ initialization */
@@ -614,10 +609,10 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
614 int nid; 609 int nid;
615 610
616 nid = acpi_get_node(handle); 611 nid = acpi_get_node(handle);
617 if (nid == -1 || !node_online(nid)) 612 if (nid != -1) {
618 return; 613 set_apicid_to_node(physid, nid);
619 set_apicid_to_node(physid, nid); 614 numa_set_node(cpu, nid);
620 numa_set_node(cpu, nid); 615 }
621#endif 616#endif
622} 617}
623 618
@@ -908,10 +903,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
908#ifdef CONFIG_X86_IO_APIC 903#ifdef CONFIG_X86_IO_APIC
909#define MP_ISA_BUS 0 904#define MP_ISA_BUS 0
910 905
911#ifdef CONFIG_X86_ES7000
912extern int es7000_plat;
913#endif
914
915void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) 906void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
916{ 907{
917 int ioapic; 908 int ioapic;
@@ -961,14 +952,6 @@ void __init mp_config_acpi_legacy_irqs(void)
961 set_bit(MP_ISA_BUS, mp_bus_not_pci); 952 set_bit(MP_ISA_BUS, mp_bus_not_pci);
962 pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); 953 pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
963 954
964#ifdef CONFIG_X86_ES7000
965 /*
966 * Older generations of ES7000 have no legacy identity mappings
967 */
968 if (es7000_plat == 1)
969 return;
970#endif
971
972 /* 955 /*
973 * Use the default configuration for the IRQs 0-15. Unless 956 * Use the default configuration for the IRQs 0-15. Unless
974 * overridden by (MADT) interrupt source override entries. 957 * overridden by (MADT) interrupt source override entries.
@@ -1034,9 +1017,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
1034 1017
1035 if (!acpi_ioapic) 1018 if (!acpi_ioapic)
1036 return 0; 1019 return 0;
1037 if (!dev) 1020 if (!dev || !dev_is_pci(dev))
1038 return 0;
1039 if (dev->bus != &pci_bus_type)
1040 return 0; 1021 return 0;
1041 1022
1042 pdev = to_pci_dev(dev); 1023 pdev = to_pci_dev(dev);
@@ -1564,7 +1545,7 @@ static int __init parse_acpi(char *arg)
1564 } 1545 }
1565 /* acpi=rsdt use RSDT instead of XSDT */ 1546 /* acpi=rsdt use RSDT instead of XSDT */
1566 else if (strcmp(arg, "rsdt") == 0) { 1547 else if (strcmp(arg, "rsdt") == 0) {
1567 acpi_rsdt_forced = 1; 1548 acpi_gbl_do_not_use_xsdt = TRUE;
1568 } 1549 }
1569 /* "acpi=noirq" disables ACPI interrupt routing */ 1550 /* "acpi=noirq" disables ACPI interrupt routing */
1570 else if (strcmp(arg, "noirq") == 0) { 1551 else if (strcmp(arg, "noirq") == 0) {
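
The dev_is_pci() test adopted above folds the two removed checks into the generic <linux/pci.h> helper; as the deleted lines show, it is equivalent to comparing dev->bus against &pci_bus_type. A sketch of the combined !dev || !dev_is_pci(dev) condition in open-coded form (for context only):

#include <linux/device.h>
#include <linux/pci.h>

static bool gsi_dev_is_pci_sketch(struct device *dev)
{
	return dev && dev->bus == &pci_bus_type;
}
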
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index d2b7f27781bc..4b28159e0421 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -87,7 +87,9 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
87 num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; 87 num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
88 88
89 retval = 0; 89 retval = 0;
90 if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) { 90 /* If the HW does not support any sub-states in this C-state */
91 if (num_cstate_subtype == 0) {
92 pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part);
91 retval = -1; 93 retval = -1;
92 goto out; 94 goto out;
93 } 95 }
@@ -150,29 +152,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
150} 152}
151EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); 153EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
152 154
153/*
154 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
155 * which can obviate IPI to trigger checking of need_resched.
156 * We execute MONITOR against need_resched and enter optimized wait state
157 * through MWAIT. Whenever someone changes need_resched, we would be woken
158 * up from MWAIT (without an IPI).
159 *
160 * New with Core Duo processors, MWAIT can take some hints based on CPU
161 * capability.
162 */
163void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
164{
165 if (!need_resched()) {
166 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
167 clflush((void *)&current_thread_info()->flags);
168
169 __monitor((void *)&current_thread_info()->flags, 0, 0);
170 smp_mb();
171 if (!need_resched())
172 __mwait(ax, cx);
173 }
174}
175
176void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) 155void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
177{ 156{
178 unsigned int cpu = smp_processor_id(); 157 unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 59554dca96ec..f04dbb3069b8 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -22,6 +22,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
22 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, 22 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
23 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, 23 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
24 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, 24 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
25 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
25 {} 26 {}
26}; 27};
27EXPORT_SYMBOL(amd_nb_misc_ids); 28EXPORT_SYMBOL(amd_nb_misc_ids);
@@ -30,6 +31,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
30 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, 31 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
31 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, 32 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
32 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, 33 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
34 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
33 {} 35 {}
34}; 36};
35 37
@@ -179,7 +181,7 @@ int amd_get_subcaches(int cpu)
179 return (mask >> (4 * cuid)) & 0xf; 181 return (mask >> (4 * cuid)) & 0xf;
180} 182}
181 183
182int amd_set_subcaches(int cpu, int mask) 184int amd_set_subcaches(int cpu, unsigned long mask)
183{ 185{
184 static unsigned int reset, ban; 186 static unsigned int reset, ban;
185 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); 187 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index fd972a3e4cbb..9fa8aa051f54 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -18,7 +18,6 @@
18#include <linux/pci_ids.h> 18#include <linux/pci_ids.h>
19#include <linux/pci.h> 19#include <linux/pci.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/ioport.h>
22#include <linux/suspend.h> 21#include <linux/suspend.h>
23#include <asm/e820.h> 22#include <asm/e820.h>
24#include <asm/io.h> 23#include <asm/io.h>
@@ -54,18 +53,6 @@ int fallback_aper_force __initdata;
54 53
55int fix_aperture __initdata = 1; 54int fix_aperture __initdata = 1;
56 55
57static struct resource gart_resource = {
58 .name = "GART",
59 .flags = IORESOURCE_MEM,
60};
61
62static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
63{
64 gart_resource.start = aper_base;
65 gart_resource.end = aper_base + aper_size - 1;
66 insert_resource(&iomem_resource, &gart_resource);
67}
68
69/* This code runs before the PCI subsystem is initialized, so just 56/* This code runs before the PCI subsystem is initialized, so just
70 access the northbridge directly. */ 57 access the northbridge directly. */
71 58
@@ -96,7 +83,6 @@ static u32 __init allocate_aperture(void)
96 memblock_reserve(addr, aper_size); 83 memblock_reserve(addr, aper_size);
97 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", 84 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
98 aper_size >> 10, addr); 85 aper_size >> 10, addr);
99 insert_aperture_resource((u32)addr, aper_size);
100 register_nosave_region(addr >> PAGE_SHIFT, 86 register_nosave_region(addr >> PAGE_SHIFT,
101 (addr+aper_size) >> PAGE_SHIFT); 87 (addr+aper_size) >> PAGE_SHIFT);
102 88
@@ -444,12 +430,8 @@ int __init gart_iommu_hole_init(void)
444 430
445out: 431out:
446 if (!fix && !fallback_aper_force) { 432 if (!fix && !fallback_aper_force) {
447 if (last_aper_base) { 433 if (last_aper_base)
448 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
449
450 insert_aperture_resource((u32)last_aper_base, n);
451 return 1; 434 return 1;
452 }
453 return 0; 435 return 0;
454 } 436 }
455 437
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 0ae0323b1f9c..dcb5b15401ce 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -18,10 +18,7 @@ obj-y += apic_flat_64.o
18endif 18endif
19 19
20# APIC probe will depend on the listing order here 20# APIC probe will depend on the listing order here
21obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
22obj-$(CONFIG_X86_SUMMIT) += summit_32.o
23obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o 21obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
24obj-$(CONFIG_X86_ES7000) += es7000_32.o
25 22
26# For 32bit, probe_32 need to be listed last 23# For 32bit, probe_32 need to be listed last
27obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o 24obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d278736bf774..ad28db7e6bde 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -75,6 +75,13 @@ unsigned int max_physical_apicid;
75physid_mask_t phys_cpu_present_map; 75physid_mask_t phys_cpu_present_map;
76 76
77/* 77/*
78 * Processor to be disabled specified by kernel parameter
79 * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
80 * avoid undefined behaviour caused by sending INIT from AP to BSP.
81 */
82static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
83
84/*
78 * Map cpu index to physical APIC ID 85 * Map cpu index to physical APIC ID
79 */ 86 */
80DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); 87DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
@@ -126,6 +133,10 @@ static inline void imcr_apic_to_pic(void)
126 * +1=force-enable 133 * +1=force-enable
127 */ 134 */
128static int force_enable_local_apic __initdata; 135static int force_enable_local_apic __initdata;
136
137/* Control whether x2APIC mode is enabled or not */
138static bool nox2apic __initdata;
139
129/* 140/*
130 * APIC command line parameters 141 * APIC command line parameters
131 */ 142 */
@@ -155,8 +166,7 @@ int x2apic_mode;
155/* x2apic enabled before OS handover */ 166/* x2apic enabled before OS handover */
156int x2apic_preenabled; 167int x2apic_preenabled;
157static int x2apic_disabled; 168static int x2apic_disabled;
158static int nox2apic; 169static int __init setup_nox2apic(char *str)
159static __init int setup_nox2apic(char *str)
160{ 170{
161 if (x2apic_enabled()) { 171 if (x2apic_enabled()) {
162 int apicid = native_apic_msr_read(APIC_ID); 172 int apicid = native_apic_msr_read(APIC_ID);
@@ -171,7 +181,7 @@ static __init int setup_nox2apic(char *str)
171 } else 181 } else
172 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 182 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
173 183
174 nox2apic = 1; 184 nox2apic = true;
175 185
176 return 0; 186 return 0;
177} 187}
@@ -276,8 +286,12 @@ u32 native_safe_apic_wait_icr_idle(void)
276 286
277void native_apic_icr_write(u32 low, u32 id) 287void native_apic_icr_write(u32 low, u32 id)
278{ 288{
289 unsigned long flags;
290
291 local_irq_save(flags);
279 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); 292 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
280 apic_write(APIC_ICR, low); 293 apic_write(APIC_ICR, low);
294 local_irq_restore(flags);
281} 295}
282 296
283u64 native_apic_icr_read(void) 297u64 native_apic_icr_read(void)
@@ -1968,7 +1982,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
1968 */ 1982 */
1969static inline void __smp_error_interrupt(struct pt_regs *regs) 1983static inline void __smp_error_interrupt(struct pt_regs *regs)
1970{ 1984{
1971 u32 v0, v1; 1985 u32 v;
1972 u32 i = 0; 1986 u32 i = 0;
1973 static const char * const error_interrupt_reason[] = { 1987 static const char * const error_interrupt_reason[] = {
1974 "Send CS error", /* APIC Error Bit 0 */ 1988 "Send CS error", /* APIC Error Bit 0 */
@@ -1982,21 +1996,21 @@ static inline void __smp_error_interrupt(struct pt_regs *regs)
1982 }; 1996 };
1983 1997
1984 /* First tickle the hardware, only then report what went on. -- REW */ 1998 /* First tickle the hardware, only then report what went on. -- REW */
1985 v0 = apic_read(APIC_ESR); 1999 if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
1986 apic_write(APIC_ESR, 0); 2000 apic_write(APIC_ESR, 0);
1987 v1 = apic_read(APIC_ESR); 2001 v = apic_read(APIC_ESR);
1988 ack_APIC_irq(); 2002 ack_APIC_irq();
1989 atomic_inc(&irq_err_count); 2003 atomic_inc(&irq_err_count);
1990 2004
1991 apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)", 2005 apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
1992 smp_processor_id(), v0 , v1); 2006 smp_processor_id(), v);
1993 2007
1994 v1 = v1 & 0xff; 2008 v &= 0xff;
1995 while (v1) { 2009 while (v) {
1996 if (v1 & 0x1) 2010 if (v & 0x1)
1997 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); 2011 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
1998 i++; 2012 i++;
1999 v1 >>= 1; 2013 v >>= 1;
2000 } 2014 }
2001 2015
2002 apic_printk(APIC_DEBUG, KERN_CONT "\n"); 2016 apic_printk(APIC_DEBUG, KERN_CONT "\n");
@@ -2115,6 +2129,38 @@ int generic_processor_info(int apicid, int version)
2115 phys_cpu_present_map); 2129 phys_cpu_present_map);
2116 2130
2117 /* 2131 /*
2132 * boot_cpu_physical_apicid is designed to have the apicid
2133 * returned by read_apic_id(), i.e., the apicid of the
2134 * currently booting-up processor. However, on some platforms,
2135 * it is temporarily modified by the apicid reported as BSP
2136 * through MP table. Concretely:
2137 *
2138 * - arch/x86/kernel/mpparse.c: MP_processor_info()
2139 * - arch/x86/mm/amdtopology.c: amd_numa_init()
2140 *
2141 * This function is executed with the modified
2142 * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel
2143 * parameter doesn't work to disable APs on kdump 2nd kernel.
2144 *
2145 * Since fixing handling of boot_cpu_physical_apicid requires
2146 * another discussion and tests on each platform, we leave it
2147 * for now and here we use read_apic_id() directly in this
2148 * function, generic_processor_info().
2149 */
2150 if (disabled_cpu_apicid != BAD_APICID &&
2151 disabled_cpu_apicid != read_apic_id() &&
2152 disabled_cpu_apicid == apicid) {
2153 int thiscpu = num_processors + disabled_cpus;
2154
2155 pr_warning("APIC: Disabling requested cpu."
2156 " Processor %d/0x%x ignored.\n",
2157 thiscpu, apicid);
2158
2159 disabled_cpus++;
2160 return -ENODEV;
2161 }
2162
2163 /*
2118 * If boot cpu has not been detected yet, then only allow upto 2164 * If boot cpu has not been detected yet, then only allow upto
2119 * nr_cpu_ids - 1 processors and keep one slot free for boot cpu 2165 * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
2120 */ 2166 */
@@ -2592,3 +2638,12 @@ static int __init lapic_insert_resource(void)
2592 * that is using request_resource 2638 * that is using request_resource
2593 */ 2639 */
2594late_initcall(lapic_insert_resource); 2640late_initcall(lapic_insert_resource);
2641
2642static int __init apic_set_disabled_cpu_apicid(char *arg)
2643{
2644 if (!arg || !get_option(&arg, &disabled_cpu_apicid))
2645 return -EINVAL;
2646
2647 return 0;
2648}
2649early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
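
Usage example for the new parameter (the value is illustrative): a kdump capture kernel that must not send INIT to the crashed kernel's BSP, say APIC ID 0, would boot with

	disable_cpu_apicid=0

on its command line; generic_processor_info() then ignores that processor, as described in the comment added above.
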
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 00c77cf78e9e..7c1b29479513 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -14,16 +14,13 @@
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <linux/hardirq.h> 17#include <linux/hardirq.h>
19#include <linux/module.h> 18#include <linux/module.h>
20#include <asm/smp.h> 19#include <asm/smp.h>
21#include <asm/apic.h> 20#include <asm/apic.h>
22#include <asm/ipi.h> 21#include <asm/ipi.h>
23 22
24#ifdef CONFIG_ACPI 23#include <linux/acpi.h>
25#include <acpi/acpi_bus.h>
26#endif
27 24
28static struct apic apic_physflat; 25static struct apic apic_physflat;
29static struct apic apic_flat; 26static struct apic apic_flat;
@@ -201,7 +198,7 @@ static struct apic apic_flat = {
201 198
202 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 199 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
203 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 200 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
204 .wait_for_init_deassert = NULL, 201 .wait_for_init_deassert = false,
205 .smp_callin_clear_local_apic = NULL, 202 .smp_callin_clear_local_apic = NULL,
206 .inquire_remote_apic = default_inquire_remote_apic, 203 .inquire_remote_apic = default_inquire_remote_apic,
207 204
@@ -317,7 +314,7 @@ static struct apic apic_physflat = {
317 314
318 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 315 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
319 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 316 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
320 .wait_for_init_deassert = NULL, 317 .wait_for_init_deassert = false,
321 .smp_callin_clear_local_apic = NULL, 318 .smp_callin_clear_local_apic = NULL,
322 .inquire_remote_apic = default_inquire_remote_apic, 319 .inquire_remote_apic = default_inquire_remote_apic,
323 320
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e145f28b4099..8c7c98249c20 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -15,7 +15,6 @@
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <linux/errno.h> 18#include <linux/errno.h>
20#include <asm/fixmap.h> 19#include <asm/fixmap.h>
21#include <asm/mpspec.h> 20#include <asm/mpspec.h>
@@ -173,8 +172,7 @@ struct apic apic_noop = {
173 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 172 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
174 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 173 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
175 174
176 .wait_for_init_deassert = NULL, 175 .wait_for_init_deassert = false,
177
178 .smp_callin_clear_local_apic = NULL, 176 .smp_callin_clear_local_apic = NULL,
179 .inquire_remote_apic = NULL, 177 .inquire_remote_apic = NULL,
180 178
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 3e67f9e3d7ef..a5b45df8bc88 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -248,7 +248,7 @@ static const struct apic apic_numachip __refconst = {
248 .wakeup_secondary_cpu = numachip_wakeup_secondary, 248 .wakeup_secondary_cpu = numachip_wakeup_secondary,
249 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 249 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
250 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 250 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
251 .wait_for_init_deassert = NULL, 251 .wait_for_init_deassert = false,
252 .smp_callin_clear_local_apic = NULL, 252 .smp_callin_clear_local_apic = NULL,
253 .inquire_remote_apic = NULL, /* REMRD not supported */ 253 .inquire_remote_apic = NULL, /* REMRD not supported */
254 254
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index d50e3640d5ae..e4840aa7a255 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -199,8 +199,7 @@ static struct apic apic_bigsmp = {
199 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 199 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
200 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 200 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
201 201
202 .wait_for_init_deassert = default_wait_for_init_deassert, 202 .wait_for_init_deassert = true,
203
204 .smp_callin_clear_local_apic = NULL, 203 .smp_callin_clear_local_apic = NULL,
205 .inquire_remote_apic = default_inquire_remote_apic, 204 .inquire_remote_apic = default_inquire_remote_apic,
206 205
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
deleted file mode 100644
index c55224731b2d..000000000000
--- a/arch/x86/kernel/apic/es7000_32.c
+++ /dev/null
@@ -1,746 +0,0 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 *
5 * This file contains the code to configure and interface
6 * with Unisys ES7000 series hardware system manager.
7 *
8 * Copyright (c) 2003 Unisys Corporation.
9 * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
10 *
11 * All Rights Reserved.
12 *
13 * This program is free software; you can redistribute it and/or modify it
14 * under the terms of version 2 of the GNU General Public License as
15 * published by the Free Software Foundation.
16 *
17 * This program is distributed in the hope that it would be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write the Free Software Foundation, Inc., 59
23 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
24 *
25 * Contact information: Unisys Corporation, Township Line & Union Meeting
26 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
27 *
28 * http://www.unisys.com
29 */
30
31#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
32
33#include <linux/notifier.h>
34#include <linux/spinlock.h>
35#include <linux/cpumask.h>
36#include <linux/threads.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/reboot.h>
40#include <linux/string.h>
41#include <linux/types.h>
42#include <linux/errno.h>
43#include <linux/acpi.h>
44#include <linux/init.h>
45#include <linux/gfp.h>
46#include <linux/nmi.h>
47#include <linux/smp.h>
48#include <linux/io.h>
49
50#include <asm/apicdef.h>
51#include <linux/atomic.h>
52#include <asm/fixmap.h>
53#include <asm/mpspec.h>
54#include <asm/setup.h>
55#include <asm/apic.h>
56#include <asm/ipi.h>
57
58/*
59 * ES7000 chipsets
60 */
61
62#define NON_UNISYS 0
63#define ES7000_CLASSIC 1
64#define ES7000_ZORRO 2
65
66#define MIP_REG 1
67#define MIP_PSAI_REG 4
68
69#define MIP_BUSY 1
70#define MIP_SPIN 0xf0000
71#define MIP_VALID 0x0100000000000000ULL
72#define MIP_SW_APIC 0x1020b
73
74#define MIP_PORT(val) ((val >> 32) & 0xffff)
75
76#define MIP_RD_LO(val) (val & 0xffffffff)
77
78struct mip_reg {
79 unsigned long long off_0x00;
80 unsigned long long off_0x08;
81 unsigned long long off_0x10;
82 unsigned long long off_0x18;
83 unsigned long long off_0x20;
84 unsigned long long off_0x28;
85 unsigned long long off_0x30;
86 unsigned long long off_0x38;
87};
88
89struct mip_reg_info {
90 unsigned long long mip_info;
91 unsigned long long delivery_info;
92 unsigned long long host_reg;
93 unsigned long long mip_reg;
94};
95
96struct psai {
97 unsigned long long entry_type;
98 unsigned long long addr;
99 unsigned long long bep_addr;
100};
101
102#ifdef CONFIG_ACPI
103
104struct es7000_oem_table {
105 struct acpi_table_header Header;
106 u32 OEMTableAddr;
107 u32 OEMTableSize;
108};
109
110static unsigned long oem_addrX;
111static unsigned long oem_size;
112
113#endif
114
115/*
116 * ES7000 Globals
117 */
118
119static volatile unsigned long *psai;
120static struct mip_reg *mip_reg;
121static struct mip_reg *host_reg;
122static int mip_port;
123static unsigned long mip_addr;
124static unsigned long host_addr;
125
126int es7000_plat;
127
128/*
129 * GSI override for ES7000 platforms.
130 */
131
132
133static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
134{
135 unsigned long vect = 0, psaival = 0;
136
137 if (psai == NULL)
138 return -1;
139
140 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
141 psaival = (0x1000000 | vect | cpu);
142
143 while (*psai & 0x1000000)
144 ;
145
146 *psai = psaival;
147
148 return 0;
149}
150
151static int es7000_apic_is_cluster(void)
152{
153 /* MPENTIUMIII */
154 if (boot_cpu_data.x86 == 6 &&
155 (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11))
156 return 1;
157
158 return 0;
159}
160
161static void setup_unisys(void)
162{
163 /*
164 * Determine the generation of the ES7000 currently running.
165 *
166 * es7000_plat = 1 if the machine is a 5xx ES7000 box
167 * es7000_plat = 2 if the machine is a x86_64 ES7000 box
168 *
169 */
170 if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
171 es7000_plat = ES7000_ZORRO;
172 else
173 es7000_plat = ES7000_CLASSIC;
174}
175
176/*
177 * Parse the OEM Table:
178 */
179static int parse_unisys_oem(char *oemptr)
180{
181 int i;
182 int success = 0;
183 unsigned char type, size;
184 unsigned long val;
185 char *tp = NULL;
186 struct psai *psaip = NULL;
187 struct mip_reg_info *mi;
188 struct mip_reg *host, *mip;
189
190 tp = oemptr;
191
192 tp += 8;
193
194 for (i = 0; i <= 6; i++) {
195 type = *tp++;
196 size = *tp++;
197 tp -= 2;
198 switch (type) {
199 case MIP_REG:
200 mi = (struct mip_reg_info *)tp;
201 val = MIP_RD_LO(mi->host_reg);
202 host_addr = val;
203 host = (struct mip_reg *)val;
204 host_reg = __va(host);
205 val = MIP_RD_LO(mi->mip_reg);
206 mip_port = MIP_PORT(mi->mip_info);
207 mip_addr = val;
208 mip = (struct mip_reg *)val;
209 mip_reg = __va(mip);
210 pr_debug("host_reg = 0x%lx\n",
211 (unsigned long)host_reg);
212 pr_debug("mip_reg = 0x%lx\n",
213 (unsigned long)mip_reg);
214 success++;
215 break;
216 case MIP_PSAI_REG:
217 psaip = (struct psai *)tp;
218 if (tp != NULL) {
219 if (psaip->addr)
220 psai = __va(psaip->addr);
221 else
222 psai = NULL;
223 success++;
224 }
225 break;
226 default:
227 break;
228 }
229 tp += size;
230 }
231
232 if (success < 2)
233 es7000_plat = NON_UNISYS;
234 else
235 setup_unisys();
236
237 return es7000_plat;
238}
239
240#ifdef CONFIG_ACPI
241static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
242{
243 struct acpi_table_header *header = NULL;
244 struct es7000_oem_table *table;
245 acpi_size tbl_size;
246 acpi_status ret;
247 int i = 0;
248
249 for (;;) {
250 ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size);
251 if (!ACPI_SUCCESS(ret))
252 return -1;
253
254 if (!memcmp((char *) &header->oem_id, "UNISYS", 6))
255 break;
256
257 early_acpi_os_unmap_memory(header, tbl_size);
258 }
259
260 table = (void *)header;
261
262 oem_addrX = table->OEMTableAddr;
263 oem_size = table->OEMTableSize;
264
265 early_acpi_os_unmap_memory(header, tbl_size);
266
267 *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, oem_size);
268
269 return 0;
270}
271
272static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
273{
274 if (!oem_addr)
275 return;
276
277 __acpi_unmap_table((char *)oem_addr, oem_size);
278}
279
280static int es7000_check_dsdt(void)
281{
282 struct acpi_table_header header;
283
284 if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
285 !strncmp(header.oem_id, "UNISYS", 6))
286 return 1;
287 return 0;
288}
289
290static int es7000_acpi_ret;
291
292/* Hook from generic ACPI tables.c */
293static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
294{
295 unsigned long oem_addr = 0;
296 int check_dsdt;
297 int ret = 0;
298
299 /* Check the DSDT first, so we don't clobber the fixmap mapping used for oem_addr: */
300 check_dsdt = es7000_check_dsdt();
301
302 if (!find_unisys_acpi_oem_table(&oem_addr)) {
303 if (check_dsdt) {
304 ret = parse_unisys_oem((char *)oem_addr);
305 } else {
306 setup_unisys();
307 ret = 1;
308 }
309 /*
310 * we need to unmap it
311 */
312 unmap_unisys_acpi_oem_table(oem_addr);
313 }
314
315 es7000_acpi_ret = ret;
316
317 return ret && !es7000_apic_is_cluster();
318}
319
320static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
321{
322 int ret = es7000_acpi_ret;
323
324 return ret && es7000_apic_is_cluster();
325}
326
327#else /* !CONFIG_ACPI: */
328static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
329{
330 return 0;
331}
332
333static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
334{
335 return 0;
336}
337#endif /* !CONFIG_ACPI */
338
339static void es7000_spin(int n)
340{
341 int i = 0;
342
343 while (i++ < n)
344 rep_nop();
345}
346
347static int es7000_mip_write(struct mip_reg *mip_reg)
348{
349 int status = 0;
350 int spin;
351
352 spin = MIP_SPIN;
353 while ((host_reg->off_0x38 & MIP_VALID) != 0) {
354 if (--spin <= 0) {
355 WARN(1, "Timeout waiting for Host Valid Flag\n");
356 return -1;
357 }
358 es7000_spin(MIP_SPIN);
359 }
360
361 memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
362 outb(1, mip_port);
363
364 spin = MIP_SPIN;
365
366 while ((mip_reg->off_0x38 & MIP_VALID) == 0) {
367 if (--spin <= 0) {
368 WARN(1, "Timeout waiting for MIP Valid Flag\n");
369 return -1;
370 }
371 es7000_spin(MIP_SPIN);
372 }
373
374 status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48;
375 mip_reg->off_0x38 &= ~MIP_VALID;
376
377 return status;
378}
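
es7000_mip_write() above is a mailbox-with-doorbell handshake: wait for the host slot's valid bit to clear, copy the request in, ring the doorbell with outb(), poll for the MIP side's valid bit, then pull the status out of the upper bits of the reply and acknowledge it. A hedged sketch of the same protocol against a memory-mapped mailbox; the struct layout, flag bit, and status position are stand-ins, not the ES7000 register map:

#include <stdint.h>
#include <string.h>

#define MBOX_VALID  (1u << 0)	/* stand-in for MIP_VALID */
#define MBOX_SPIN   1000000

struct mbox { uint64_t cmd; uint32_t flags; };

/* Returns the reply status byte, or -1 on timeout. */
static int mbox_write(volatile struct mbox *host, volatile struct mbox *mip,
		      const struct mbox *req, void (*ring)(void))
{
	int spin = MBOX_SPIN;

	while (host->flags & MBOX_VALID)	/* wait for host slot to drain */
		if (--spin <= 0)
			return -1;

	memcpy((void *)host, req, sizeof(*req));
	ring();					/* doorbell; outb() on ES7000 */

	spin = MBOX_SPIN;
	while (!(mip->flags & MBOX_VALID))	/* wait for the reply */
		if (--spin <= 0)
			return -1;

	mip->flags &= ~MBOX_VALID;		/* acknowledge the reply slot */
	return (mip->cmd >> 48) & 0xff;		/* status lives in the top bits */
}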
379
380static void es7000_enable_apic_mode(void)
381{
382 struct mip_reg es7000_mip_reg;
383 int mip_status;
384
385 if (!es7000_plat)
386 return;
387
388 pr_info("Enabling APIC mode.\n");
389 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
390 es7000_mip_reg.off_0x00 = MIP_SW_APIC;
391 es7000_mip_reg.off_0x38 = MIP_VALID;
392
393 while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
394 WARN(1, "Command failed, status = %x\n", mip_status);
395}
396
397static void es7000_wait_for_init_deassert(atomic_t *deassert)
398{
399 while (!atomic_read(deassert))
400 cpu_relax();
401}
402
403static unsigned int es7000_get_apic_id(unsigned long x)
404{
405 return (x >> 24) & 0xFF;
406}
407
408static void es7000_send_IPI_mask(const struct cpumask *mask, int vector)
409{
410 default_send_IPI_mask_sequence_phys(mask, vector);
411}
412
413static void es7000_send_IPI_allbutself(int vector)
414{
415 default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
416}
417
418static void es7000_send_IPI_all(int vector)
419{
420 es7000_send_IPI_mask(cpu_online_mask, vector);
421}
422
423static int es7000_apic_id_registered(void)
424{
425 return 1;
426}
427
428static const struct cpumask *target_cpus_cluster(void)
429{
430 return cpu_all_mask;
431}
432
433static const struct cpumask *es7000_target_cpus(void)
434{
435 return cpumask_of(smp_processor_id());
436}
437
438static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
439{
440 return 0;
441}
442
443static unsigned long es7000_check_apicid_present(int bit)
444{
445 return physid_isset(bit, phys_cpu_present_map);
446}
447
448static int es7000_early_logical_apicid(int cpu)
449{
450 /* on es7000, logical apicid is the same as physical */
451 return early_per_cpu(x86_bios_cpu_apicid, cpu);
452}
453
454static unsigned long calculate_ldr(int cpu)
455{
456 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
457
458 return SET_APIC_LOGICAL_ID(id);
459}
460
461/*
462 * Set up the logical destination ID.
463 *
464 * Intel recommends to set DFR, LdR and TPR before enabling
465 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
466 * document number 292116). So here it goes...
467 */
468static void es7000_init_apic_ldr_cluster(void)
469{
470 unsigned long val;
471 int cpu = smp_processor_id();
472
473 apic_write(APIC_DFR, APIC_DFR_CLUSTER);
474 val = calculate_ldr(cpu);
475 apic_write(APIC_LDR, val);
476}
477
478static void es7000_init_apic_ldr(void)
479{
480 unsigned long val;
481 int cpu = smp_processor_id();
482
483 apic_write(APIC_DFR, APIC_DFR_FLAT);
484 val = calculate_ldr(cpu);
485 apic_write(APIC_LDR, val);
486}
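
The two variants differ only in the destination format: APIC_DFR_CLUSTER splits the 8-bit logical ID into a 4-bit cluster number and a 4-bit CPU bitmap, while APIC_DFR_FLAT treats all 8 bits as one bitmap. Either way the ID itself lands in the top byte of the LDR; a worked example, assuming the conventional id << 24 encoding behind SET_APIC_LOGICAL_ID():

#include <stdio.h>

int main(void)
{
	unsigned long id = 0x12;		/* cluster 1, CPU bit 1 */
	unsigned long ldr = id << 24;		/* SET_APIC_LOGICAL_ID() */

	printf("LDR = 0x%08lx (cluster %lu, cpus 0x%lx)\n",
	       ldr, (id >> 4) & 0xf, id & 0xf);
	return 0;
}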
487
488static void es7000_setup_apic_routing(void)
489{
490 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
491
492 pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
493 (apic_version[apic] == 0x14) ?
494 "Physical Cluster" : "Logical Cluster",
495 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
496}
497
498static int es7000_cpu_present_to_apicid(int mps_cpu)
499{
500 if (!mps_cpu)
501 return boot_cpu_physical_apicid;
502 else if (mps_cpu < nr_cpu_ids)
503 return per_cpu(x86_bios_cpu_apicid, mps_cpu);
504 else
505 return BAD_APICID;
506}
507
508static int cpu_id;
509
510static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
511{
512 physid_set_mask_of_physid(cpu_id, retmap);
513 ++cpu_id;
514}
515
516static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
517{
518 /* For clustered we don't have a good way to do this yet - hack */
519 physids_promote(0xFFL, retmap);
520}
521
522static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
523{
524 boot_cpu_physical_apicid = read_apic_id();
525 return 1;
526}
527
528static inline int
529es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
530{
531 unsigned int round = 0;
532 unsigned int cpu, uninitialized_var(apicid);
533
534 /*
535 * The cpus in the mask must all be on the apic cluster.
536 */
537 for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
538 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
539
540 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
541 WARN(1, "Not a valid mask!");
542
543 return -EINVAL;
544 }
545 apicid |= new_apicid;
546 round++;
547 }
548 if (!round)
549 return -EINVAL;
550 *dest_id = apicid;
551 return 0;
552}
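
This works because clustered logical IDs are a cluster nibble plus a one-hot CPU bit, so IDs within a single cluster can be OR-ed into one multi-CPU destination, and any mask that crosses clusters is rejected. A sketch of that invariant, assuming APIC_CLUSTER() masks the top nibble of the 8-bit logical ID:

#include <stdio.h>

#define APIC_CLUSTER(id)  ((id) & 0xf0)	/* top nibble = cluster */

/* OR logical IDs together; fail if they span clusters. */
static int mask_to_apicid(const unsigned char *ids, int n, unsigned int *dest)
{
	unsigned int apicid = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (i && APIC_CLUSTER(apicid) != APIC_CLUSTER(ids[i]))
			return -1;	/* not a valid mask */
		apicid |= ids[i];
	}
	*dest = apicid;
	return 0;
}

int main(void)
{
	unsigned char same[] = { 0x11, 0x12, 0x14 };	/* all cluster 1 */
	unsigned int dest;

	if (!mask_to_apicid(same, 3, &dest))
		printf("dest = 0x%02x\n", dest);	/* 0x17 */
	return 0;
}

Running it ORs 0x11, 0x12 and 0x14 into the destination 0x17; adding an ID from cluster 2 would make the check fail.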
553
554static int
555es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
556 const struct cpumask *andmask,
557 unsigned int *apicid)
558{
559 cpumask_var_t cpumask;
560 *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
561
562 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
563 return 0;
564
565 cpumask_and(cpumask, inmask, andmask);
566 es7000_cpu_mask_to_apicid(cpumask, apicid);
567
568 free_cpumask_var(cpumask);
569
570 return 0;
571}
572
573static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
574{
575 return cpuid_apic >> index_msb;
576}
577
578static int probe_es7000(void)
579{
580 /* probed later in mptable/ACPI hooks */
581 return 0;
582}
583
584static int es7000_mps_ret;
585static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem,
586 char *productid)
587{
588 int ret = 0;
589
590 if (mpc->oemptr) {
591 struct mpc_oemtable *oem_table =
592 (struct mpc_oemtable *)mpc->oemptr;
593
594 if (!strncmp(oem, "UNISYS", 6))
595 ret = parse_unisys_oem((char *)oem_table);
596 }
597
598 es7000_mps_ret = ret;
599
600 return ret && !es7000_apic_is_cluster();
601}
602
603static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
604 char *productid)
605{
606 int ret = es7000_mps_ret;
607
608 return ret && es7000_apic_is_cluster();
609}
610
611/* We've been warned by a false positive warning. Use __refdata to keep it calm. */
612static struct apic __refdata apic_es7000_cluster = {
613
614 .name = "es7000",
615 .probe = probe_es7000,
616 .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
617 .apic_id_valid = default_apic_id_valid,
618 .apic_id_registered = es7000_apic_id_registered,
619
620 .irq_delivery_mode = dest_LowestPrio,
621 /* logical delivery broadcast to all procs: */
622 .irq_dest_mode = 1,
623
624 .target_cpus = target_cpus_cluster,
625 .disable_esr = 1,
626 .dest_logical = 0,
627 .check_apicid_used = es7000_check_apicid_used,
628 .check_apicid_present = es7000_check_apicid_present,
629
630 .vector_allocation_domain = flat_vector_allocation_domain,
631 .init_apic_ldr = es7000_init_apic_ldr_cluster,
632
633 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
634 .setup_apic_routing = es7000_setup_apic_routing,
635 .multi_timer_check = NULL,
636 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
637 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
638 .setup_portio_remap = NULL,
639 .check_phys_apicid_present = es7000_check_phys_apicid_present,
640 .enable_apic_mode = es7000_enable_apic_mode,
641 .phys_pkg_id = es7000_phys_pkg_id,
642 .mps_oem_check = es7000_mps_oem_check_cluster,
643
644 .get_apic_id = es7000_get_apic_id,
645 .set_apic_id = NULL,
646 .apic_id_mask = 0xFF << 24,
647
648 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
649
650 .send_IPI_mask = es7000_send_IPI_mask,
651 .send_IPI_mask_allbutself = NULL,
652 .send_IPI_allbutself = es7000_send_IPI_allbutself,
653 .send_IPI_all = es7000_send_IPI_all,
654 .send_IPI_self = default_send_IPI_self,
655
656 .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip,
657
658 .trampoline_phys_low = 0x467,
659 .trampoline_phys_high = 0x469,
660
661 .wait_for_init_deassert = NULL,
662
663 /* Nothing to do for most platforms, since cleared by the INIT cycle: */
664 .smp_callin_clear_local_apic = NULL,
665 .inquire_remote_apic = default_inquire_remote_apic,
666
667 .read = native_apic_mem_read,
668 .write = native_apic_mem_write,
669 .eoi_write = native_apic_mem_write,
670 .icr_read = native_apic_icr_read,
671 .icr_write = native_apic_icr_write,
672 .wait_icr_idle = native_apic_wait_icr_idle,
673 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
674
675 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
676};
677
678static struct apic __refdata apic_es7000 = {
679
680 .name = "es7000",
681 .probe = probe_es7000,
682 .acpi_madt_oem_check = es7000_acpi_madt_oem_check,
683 .apic_id_valid = default_apic_id_valid,
684 .apic_id_registered = es7000_apic_id_registered,
685
686 .irq_delivery_mode = dest_Fixed,
687 /* phys delivery to target CPUs: */
688 .irq_dest_mode = 0,
689
690 .target_cpus = es7000_target_cpus,
691 .disable_esr = 1,
692 .dest_logical = 0,
693 .check_apicid_used = es7000_check_apicid_used,
694 .check_apicid_present = es7000_check_apicid_present,
695
696 .vector_allocation_domain = flat_vector_allocation_domain,
697 .init_apic_ldr = es7000_init_apic_ldr,
698
699 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
700 .setup_apic_routing = es7000_setup_apic_routing,
701 .multi_timer_check = NULL,
702 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
703 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
704 .setup_portio_remap = NULL,
705 .check_phys_apicid_present = es7000_check_phys_apicid_present,
706 .enable_apic_mode = es7000_enable_apic_mode,
707 .phys_pkg_id = es7000_phys_pkg_id,
708 .mps_oem_check = es7000_mps_oem_check,
709
710 .get_apic_id = es7000_get_apic_id,
711 .set_apic_id = NULL,
712 .apic_id_mask = 0xFF << 24,
713
714 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
715
716 .send_IPI_mask = es7000_send_IPI_mask,
717 .send_IPI_mask_allbutself = NULL,
718 .send_IPI_allbutself = es7000_send_IPI_allbutself,
719 .send_IPI_all = es7000_send_IPI_all,
720 .send_IPI_self = default_send_IPI_self,
721
722 .trampoline_phys_low = 0x467,
723 .trampoline_phys_high = 0x469,
724
725 .wait_for_init_deassert = es7000_wait_for_init_deassert,
726
727 /* Nothing to do for most platforms, since cleared by the INIT cycle: */
728 .smp_callin_clear_local_apic = NULL,
729 .inquire_remote_apic = default_inquire_remote_apic,
730
731 .read = native_apic_mem_read,
732 .write = native_apic_mem_write,
733 .eoi_write = native_apic_mem_write,
734 .icr_read = native_apic_icr_read,
735 .icr_write = native_apic_icr_write,
736 .wait_icr_idle = native_apic_wait_icr_idle,
737 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
738
739 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
740};
741
742/*
743 * Need to check for es7000 followed by es7000_cluster, so this order
744 * in apic_drivers is important.
745 */
746apic_drivers(apic_es7000, apic_es7000_cluster);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e63a5bd2a78f..6ad4658de705 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -37,9 +37,6 @@
 #include <linux/kthread.h>
 #include <linux/jiffies.h>	/* time_after() */
 #include <linux/slab.h>
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
 #include <linux/bootmem.h>
 #include <linux/dmar.h>
 #include <linux/hpet.h>
@@ -1142,9 +1139,10 @@ next:
 		if (test_bit(vector, used_vectors))
 			goto next;
 
-		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
-			if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+			if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED)
 				goto next;
+		}
 		/* Found one! */
 		current_vector = vector;
 		current_offset = offset;
@@ -1183,7 +1181,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 
 	vector = cfg->vector;
 	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
-		per_cpu(vector_irq, cpu)[vector] = -1;
+		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 
 	cfg->vector = 0;
 	cpumask_clear(cfg->domain);
@@ -1191,11 +1189,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 	if (likely(!cfg->move_in_progress))
 		return;
 	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
-		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
-		     vector++) {
+		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 			if (per_cpu(vector_irq, cpu)[vector] != irq)
 				continue;
-			per_cpu(vector_irq, cpu)[vector] = -1;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 			break;
 		}
 	}
@@ -1228,12 +1225,12 @@ void __setup_vector_irq(int cpu)
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		irq = per_cpu(vector_irq, cpu)[vector];
-		if (irq < 0)
+		if (irq <= VECTOR_UNDEFINED)
 			continue;
 
 		cfg = irq_cfg(irq);
 		if (!cpumask_test_cpu(cpu, cfg->domain))
-			per_cpu(vector_irq, cpu)[vector] = -1;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 	}
 	raw_spin_unlock(&vector_lock);
 }
@@ -2202,13 +2199,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		unsigned int irq;
+		int irq;
 		unsigned int irr;
 		struct irq_desc *desc;
 		struct irq_cfg *cfg;
 		irq = __this_cpu_read(vector_irq[vector]);
 
-		if (irq == -1)
+		if (irq <= VECTOR_UNDEFINED)
 			continue;
 
 		desc = irq_to_desc(irq);
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 7434d8556d09..62071569bd50 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,6 +1,5 @@
 #include <linux/cpumask.h>
 #include <linux/interrupt.h>
-#include <linux/init.h>
 
 #include <linux/mm.h>
 #include <linux/delay.h>
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
deleted file mode 100644
index 1e42e8f305ee..000000000000
--- a/arch/x86/kernel/apic/numaq_32.c
+++ /dev/null
@@ -1,525 +0,0 @@
1/*
2 * Written by: Patricia Gaughen, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to <gone@us.ibm.com>
25 */
26#include <linux/nodemask.h>
27#include <linux/topology.h>
28#include <linux/bootmem.h>
29#include <linux/memblock.h>
30#include <linux/threads.h>
31#include <linux/cpumask.h>
32#include <linux/kernel.h>
33#include <linux/mmzone.h>
34#include <linux/module.h>
35#include <linux/string.h>
36#include <linux/init.h>
37#include <linux/numa.h>
38#include <linux/smp.h>
39#include <linux/io.h>
40#include <linux/mm.h>
41
42#include <asm/processor.h>
43#include <asm/fixmap.h>
44#include <asm/mpspec.h>
45#include <asm/numaq.h>
46#include <asm/setup.h>
47#include <asm/apic.h>
48#include <asm/e820.h>
49#include <asm/ipi.h>
50
51int found_numaq;
52
53/*
54 * Have to match translation table entries to main table entries by counter,
55 * hence the mpc_record variable .... can't see a less disgusting way of
56 * doing this ....
57 */
58struct mpc_trans {
59 unsigned char mpc_type;
60 unsigned char trans_len;
61 unsigned char trans_type;
62 unsigned char trans_quad;
63 unsigned char trans_global;
64 unsigned char trans_local;
65 unsigned short trans_reserved;
66};
67
68static int mpc_record;
69
70static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
71
72int mp_bus_id_to_node[MAX_MP_BUSSES];
73int mp_bus_id_to_local[MAX_MP_BUSSES];
74int quad_local_to_mp_bus_id[NR_CPUS/4][4];
75
76
77static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
78{
79 struct eachquadmem *eq = scd->eq + node;
80 u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
81 u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
82 int ret;
83
84 node_set(node, numa_nodes_parsed);
85 ret = numa_add_memblk(node, start, end);
86 BUG_ON(ret < 0);
87}
88
89/*
90 * Function: smp_dump_qct()
91 *
92 * Description: gets memory layout from the quad config table. This
93 * function also updates numa_nodes_parsed with the nodes (quads) present.
94 */
95static void __init smp_dump_qct(void)
96{
97 struct sys_cfg_data *scd;
98 int node;
99
100 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
101
102 for_each_node(node) {
103 if (scd->quads_present31_0 & (1 << node))
104 numaq_register_node(node, scd);
105 }
106}
107
108void numaq_tsc_disable(void)
109{
110 if (!found_numaq)
111 return;
112
113 if (num_online_nodes() > 1) {
114 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
115 setup_clear_cpu_cap(X86_FEATURE_TSC);
116 }
117}
118
119static void __init numaq_tsc_init(void)
120{
121 numaq_tsc_disable();
122}
123
124static inline int generate_logical_apicid(int quad, int phys_apicid)
125{
126 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
127}
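
So a NUMA-Q logical APIC ID carries the quad number in the high nibble and a one-hot CPU bit in the low nibble, with the quad's boot processor (physical ID 0) pinned to bit 0. A quick stand-alone check of the encoding:

#include <stdio.h>

static int gen_logical(int quad, int phys)
{
	return (quad << 4) + (phys ? phys << 1 : 1);
}

int main(void)
{
	/* quad 2: physical ID 1 -> 0x22, physical ID 0 -> 0x21 */
	printf("%#x %#x\n", gen_logical(2, 1), gen_logical(2, 0));
	return 0;
}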
128
129/* x86_quirks member */
130static int mpc_apic_id(struct mpc_cpu *m)
131{
132 int quad = translation_table[mpc_record]->trans_quad;
133 int logical_apicid = generate_logical_apicid(quad, m->apicid);
134
135 printk(KERN_DEBUG
136 "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
137 m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
138 (m->cpufeature & CPU_MODEL_MASK) >> 4,
139 m->apicver, quad, logical_apicid);
140
141 return logical_apicid;
142}
143
144/* x86_quirks member */
145static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
146{
147 int quad = translation_table[mpc_record]->trans_quad;
148 int local = translation_table[mpc_record]->trans_local;
149
150 mp_bus_id_to_node[m->busid] = quad;
151 mp_bus_id_to_local[m->busid] = local;
152
153 printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad);
154}
155
156/* x86_quirks member */
157static void mpc_oem_pci_bus(struct mpc_bus *m)
158{
159 int quad = translation_table[mpc_record]->trans_quad;
160 int local = translation_table[mpc_record]->trans_local;
161
162 quad_local_to_mp_bus_id[quad][local] = m->busid;
163}
164
165/*
166 * Called from mpparse code.
167 * mode = 0: prescan
168 * mode = 1: one mpc entry scanned
169 */
170static void numaq_mpc_record(unsigned int mode)
171{
172 if (!mode)
173 mpc_record = 0;
174 else
175 mpc_record++;
176}
177
178static void __init MP_translation_info(struct mpc_trans *m)
179{
180 printk(KERN_INFO
181 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
182 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
183 m->trans_local);
184
185 if (mpc_record >= MAX_MPC_ENTRY)
186 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
187 else
188 translation_table[mpc_record] = m; /* stash this for later */
189
190 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
191 node_set_online(m->trans_quad);
192}
193
194static int __init mpf_checksum(unsigned char *mp, int len)
195{
196 int sum = 0;
197
198 while (len--)
199 sum += *mp++;
200
201 return sum & 0xFF;
202}
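
This is the MP-specification 8-bit additive checksum: a table is valid when all of its bytes, including the stored checksum byte, sum to zero modulo 256, so mpf_checksum() returning 0 means the table checks out. A sketch of generating and verifying such a checksum (the sample payload is arbitrary):

#include <stdio.h>

static unsigned char sum8(const unsigned char *p, int len)
{
	unsigned int sum = 0;

	while (len--)
		sum += *p++;
	return sum & 0xff;
}

int main(void)
{
	unsigned char table[8] = { 'O', 'E', 'M', 1, 2, 3, 4, 0 };

	table[7] = -sum8(table, 7);	/* store the two's-complement fixup */
	printf("valid: %d\n", sum8(table, 8) == 0);
	return 0;
}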
203
204/*
205 * Read/parse the MPC oem tables
206 */
207static void __init smp_read_mpc_oem(struct mpc_table *mpc)
208{
209 struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr;
210 int count = sizeof(*oemtable); /* the header size */
211 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
212
213 mpc_record = 0;
214 printk(KERN_INFO
215 "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
216
217 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
218 printk(KERN_WARNING
219 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
220 oemtable->signature[0], oemtable->signature[1],
221 oemtable->signature[2], oemtable->signature[3]);
222 return;
223 }
224
225 if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
226 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
227 return;
228 }
229
230 while (count < oemtable->length) {
231 switch (*oemptr) {
232 case MP_TRANSLATION:
233 {
234 struct mpc_trans *m = (void *)oemptr;
235
236 MP_translation_info(m);
237 oemptr += sizeof(*m);
238 count += sizeof(*m);
239 ++mpc_record;
240 break;
241 }
242 default:
243 printk(KERN_WARNING
244 "Unrecognised OEM table entry type! - %d\n",
245 (int)*oemptr);
246 return;
247 }
248 }
249}
250
251static __init void early_check_numaq(void)
252{
253 /*
254 * get boot-time SMP configuration:
255 */
256 if (smp_found_config)
257 early_get_smp_config();
258
259 if (found_numaq) {
260 x86_init.mpparse.mpc_record = numaq_mpc_record;
261 x86_init.mpparse.setup_ioapic_ids = x86_init_noop;
262 x86_init.mpparse.mpc_apic_id = mpc_apic_id;
263 x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem;
264 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
265 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
266 x86_init.timers.tsc_pre_init = numaq_tsc_init;
267 x86_init.pci.init = pci_numaq_init;
268 }
269}
270
271int __init numaq_numa_init(void)
272{
273 early_check_numaq();
274 if (!found_numaq)
275 return -ENOENT;
276 smp_dump_qct();
277
278 return 0;
279}
280
281#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
282
283static inline unsigned int numaq_get_apic_id(unsigned long x)
284{
285 return (x >> 24) & 0x0F;
286}
287
288static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector)
289{
290 default_send_IPI_mask_sequence_logical(mask, vector);
291}
292
293static inline void numaq_send_IPI_allbutself(int vector)
294{
295 default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
296}
297
298static inline void numaq_send_IPI_all(int vector)
299{
300 numaq_send_IPI_mask(cpu_online_mask, vector);
301}
302
303#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8)
304#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa)
305
306/*
307 * Because we use NMIs rather than the INIT-STARTUP sequence to
308 * bootstrap the CPUs, the APIC may be in a weird state. Kick it:
309 */
310static inline void numaq_smp_callin_clear_local_apic(void)
311{
312 clear_local_APIC();
313}
314
315static inline const struct cpumask *numaq_target_cpus(void)
316{
317 return cpu_all_mask;
318}
319
320static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
321{
322 return physid_isset(apicid, *map);
323}
324
325static inline unsigned long numaq_check_apicid_present(int bit)
326{
327 return physid_isset(bit, phys_cpu_present_map);
328}
329
330static inline int numaq_apic_id_registered(void)
331{
332 return 1;
333}
334
335static inline void numaq_init_apic_ldr(void)
336{
337 /* Already done in NUMA-Q firmware */
338}
339
340static inline void numaq_setup_apic_routing(void)
341{
342 printk(KERN_INFO
343 "Enabling APIC mode: NUMA-Q. Using %d I/O APICs\n",
344 nr_ioapics);
345}
346
347/*
348 * Skip adding the timer int on secondary nodes, which causes
349 * a small but painful rift in the time-space continuum.
350 */
351static inline int numaq_multi_timer_check(int apic, int irq)
352{
353 return apic != 0 && irq == 0;
354}
355
356static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
357{
358 /* We don't have a good way to do this yet - hack */
359 return physids_promote(0xFUL, retmap);
360}
361
362/*
363 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
364 * cpu to APIC ID relation to properly interact with the intelligent
365 * mode of the cluster controller.
366 */
367static inline int numaq_cpu_present_to_apicid(int mps_cpu)
368{
369 if (mps_cpu < 60)
370 return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
371 else
372 return BAD_APICID;
373}
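
In other words, CPUs are numbered four to a quad: the quad index (mps_cpu >> 2) fills the high nibble and a one-hot bit for the CPU within the quad fills the low nibble, matching generate_logical_apicid() above. A quick table of the first eight mappings:

#include <stdio.h>

static int numaq_apicid(int mps_cpu)
{
	return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
}

int main(void)
{
	int cpu;

	/* cpu 0 -> 0x1, cpu 3 -> 0x8, cpu 5 (quad 1, local 1) -> 0x12 ... */
	for (cpu = 0; cpu < 8; cpu++)
		printf("cpu %d -> apicid %#x\n", cpu, numaq_apicid(cpu));
	return 0;
}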
374
375static inline int numaq_apicid_to_node(int logical_apicid)
376{
377 return logical_apicid >> 4;
378}
379
380static int numaq_numa_cpu_node(int cpu)
381{
382 int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
383
384 if (logical_apicid != BAD_APICID)
385 return numaq_apicid_to_node(logical_apicid);
386 return NUMA_NO_NODE;
387}
388
389static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
390{
391 int node = numaq_apicid_to_node(logical_apicid);
392 int cpu = __ffs(logical_apicid & 0xf);
393
394 physid_set_mask_of_physid(cpu + 4*node, retmap);
395}
396
397/* Where the IO area was mapped on multiquad, always 0 otherwise */
398void *xquad_portio;
399
400static inline int numaq_check_phys_apicid_present(int phys_apicid)
401{
402 return 1;
403}
404
405/*
406 * We use physical apicids here, not logical, so just return the default
407 * physical broadcast to stop people from breaking us
408 */
409static int
410numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
411 const struct cpumask *andmask,
412 unsigned int *apicid)
413{
414 *apicid = 0x0F;
415 return 0;
416}
417
418/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
419static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb)
420{
421 return cpuid_apic >> index_msb;
422}
423
424static int
425numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
426{
427 if (strncmp(oem, "IBM NUMA", 8))
428 printk(KERN_ERR "Warning! Not a NUMA-Q system!\n");
429 else
430 found_numaq = 1;
431
432 return found_numaq;
433}
434
435static int probe_numaq(void)
436{
437 /* already know from get_memcfg_numaq() */
438 return found_numaq;
439}
440
441static void numaq_setup_portio_remap(void)
442{
443 int num_quads = num_online_nodes();
444
445 if (num_quads <= 1)
446 return;
447
448 printk(KERN_INFO
449 "Remapping cross-quad port I/O for %d quads\n", num_quads);
450
451 xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
452
453 printk(KERN_INFO
454 "xquad_portio vaddr 0x%08lx, len %08lx\n",
455 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
456}
457
458/* Use __refdata to silence a false-positive warning. */
459static struct apic __refdata apic_numaq = {
460
461 .name = "NUMAQ",
462 .probe = probe_numaq,
463 .acpi_madt_oem_check = NULL,
464 .apic_id_valid = default_apic_id_valid,
465 .apic_id_registered = numaq_apic_id_registered,
466
467 .irq_delivery_mode = dest_LowestPrio,
468 /* physical delivery on LOCAL quad: */
469 .irq_dest_mode = 0,
470
471 .target_cpus = numaq_target_cpus,
472 .disable_esr = 1,
473 .dest_logical = APIC_DEST_LOGICAL,
474 .check_apicid_used = numaq_check_apicid_used,
475 .check_apicid_present = numaq_check_apicid_present,
476
477 .vector_allocation_domain = flat_vector_allocation_domain,
478 .init_apic_ldr = numaq_init_apic_ldr,
479
480 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
481 .setup_apic_routing = numaq_setup_apic_routing,
482 .multi_timer_check = numaq_multi_timer_check,
483 .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
484 .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
485 .setup_portio_remap = numaq_setup_portio_remap,
486 .check_phys_apicid_present = numaq_check_phys_apicid_present,
487 .enable_apic_mode = NULL,
488 .phys_pkg_id = numaq_phys_pkg_id,
489 .mps_oem_check = numaq_mps_oem_check,
490
491 .get_apic_id = numaq_get_apic_id,
492 .set_apic_id = NULL,
493 .apic_id_mask = 0x0F << 24,
494
495 .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
496
497 .send_IPI_mask = numaq_send_IPI_mask,
498 .send_IPI_mask_allbutself = NULL,
499 .send_IPI_allbutself = numaq_send_IPI_allbutself,
500 .send_IPI_all = numaq_send_IPI_all,
501 .send_IPI_self = default_send_IPI_self,
502
503 .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi,
504 .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW,
505 .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
506
507 /* We don't do anything here because we use NMI's to boot instead */
508 .wait_for_init_deassert = NULL,
509
510 .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic,
511 .inquire_remote_apic = NULL,
512
513 .read = native_apic_mem_read,
514 .write = native_apic_mem_write,
515 .eoi_write = native_apic_mem_write,
516 .icr_read = native_apic_icr_read,
517 .icr_write = native_apic_icr_write,
518 .wait_icr_idle = native_apic_wait_icr_idle,
519 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
520
521 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
522 .x86_32_numa_cpu_node = numaq_numa_cpu_node,
523};
524
525apic_driver(apic_numaq);
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index eb35ef9ee63f..cceb352c968c 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -119,8 +119,7 @@ static struct apic apic_default = {
 	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 
-	.wait_for_init_deassert		= default_wait_for_init_deassert,
-
+	.wait_for_init_deassert		= true,
 	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
deleted file mode 100644
index 77c95c0e1bf7..000000000000
--- a/arch/x86/kernel/apic/summit_32.c
+++ /dev/null
@@ -1,552 +0,0 @@
1/*
2 * IBM Summit-Specific Code
3 *
4 * Written By: Matthew Dobson, IBM Corporation
5 *
6 * Copyright (c) 2003 IBM Corp.
7 *
8 * All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or (at
13 * your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
18 * NON INFRINGEMENT. See the GNU General Public License for more
19 * details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * Send feedback to <colpatch@us.ibm.com>
26 *
27 */
28
29#define pr_fmt(fmt) "summit: %s: " fmt, __func__
30
31#include <linux/mm.h>
32#include <linux/init.h>
33#include <asm/io.h>
34#include <asm/bios_ebda.h>
35
36/*
37 * APIC driver for the IBM "Summit" chipset.
38 */
39#include <linux/threads.h>
40#include <linux/cpumask.h>
41#include <asm/mpspec.h>
42#include <asm/apic.h>
43#include <asm/smp.h>
44#include <asm/fixmap.h>
45#include <asm/apicdef.h>
46#include <asm/ipi.h>
47#include <linux/kernel.h>
48#include <linux/string.h>
49#include <linux/gfp.h>
50#include <linux/smp.h>
51
52static unsigned summit_get_apic_id(unsigned long x)
53{
54 return (x >> 24) & 0xFF;
55}
56
57static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector)
58{
59 default_send_IPI_mask_sequence_logical(mask, vector);
60}
61
62static void summit_send_IPI_allbutself(int vector)
63{
64 default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
65}
66
67static void summit_send_IPI_all(int vector)
68{
69 summit_send_IPI_mask(cpu_online_mask, vector);
70}
71
72#include <asm/tsc.h>
73
74extern int use_cyclone;
75
76#ifdef CONFIG_X86_SUMMIT_NUMA
77static void setup_summit(void);
78#else
79static inline void setup_summit(void) {}
80#endif
81
82static int summit_mps_oem_check(struct mpc_table *mpc, char *oem,
83 char *productid)
84{
85 if (!strncmp(oem, "IBM ENSW", 8) &&
86 (!strncmp(productid, "VIGIL SMP", 9)
87 || !strncmp(productid, "EXA", 3)
88 || !strncmp(productid, "RUTHLESS SMP", 12))){
89 mark_tsc_unstable("Summit based system");
90 use_cyclone = 1; /*enable cyclone-timer*/
91 setup_summit();
92 return 1;
93 }
94 return 0;
95}
96
97/* Hook from generic ACPI tables.c */
98static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
99{
100 if (!strncmp(oem_id, "IBM", 3) &&
101 (!strncmp(oem_table_id, "SERVIGIL", 8)
102 || !strncmp(oem_table_id, "EXA", 3))){
103 mark_tsc_unstable("Summit based system");
104 use_cyclone = 1; /*enable cyclone-timer*/
105 setup_summit();
106 return 1;
107 }
108 return 0;
109}
110
111struct rio_table_hdr {
112 unsigned char version; /* Version number of this data structure */
113 /* Version 3 adds chassis_num & WP_index */
114 unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
115 unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
116} __attribute__((packed));
117
118struct scal_detail {
119 unsigned char node_id; /* Scalability Node ID */
120 unsigned long CBAR; /* Address of 1MB register space */
121 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
122 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
123 unsigned char port1node; /* Node ID port connected to: 0xFF = None */
124 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
125 unsigned char port2node; /* Node ID port connected to: 0xFF = None */
126 unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
127 unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
128} __attribute__((packed));
129
130struct rio_detail {
131 unsigned char node_id; /* RIO Node ID */
132 unsigned long BBAR; /* Address of 1MB register space */
133 unsigned char type; /* Type of device */
134 unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
135 /* For CYC: Node ID of Twister that owns this CYC */
136 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
137 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
138 unsigned char port1node; /* Node ID port connected to: 0xFF=None */
139 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
140 unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
141 /* For CYC: 0 */
142 unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
143 /* = 0 : the XAPIC is not used, ie:*/
144 /* ints fwded to another XAPIC */
145 /* Bits1:7 Reserved */
146 /* For CYC: Bits0:7 Reserved */
147 unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
148 /* lower slot numbers/PCI bus numbers */
149 /* For CYC: No meaning */
150 unsigned char chassis_num; /* 1 based Chassis number */
151 /* For LookOut WPEGs this field indicates the */
152 /* Expansion Chassis #, enumerated from Boot */
153 /* Node WPEG external port, then Boot Node CYC */
154 /* external port, then Next Vigil chassis WPEG */
155 /* external port, etc. */
156 /* Shared Lookouts have only 1 chassis number (the */
157 /* first one assigned) */
158} __attribute__((packed));
159
160
161typedef enum {
162 CompatTwister = 0, /* Compatibility Twister */
163 AltTwister = 1, /* Alternate Twister of internal 8-way */
164 CompatCyclone = 2, /* Compatibility Cyclone */
165 AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
166 CompatWPEG = 4, /* Compatibility WPEG */
167 AltWPEG = 5, /* Second Planar WPEG */
168 LookOutAWPEG = 6, /* LookOut WPEG */
169 LookOutBWPEG = 7, /* LookOut WPEG */
170} node_type;
171
172static inline int is_WPEG(struct rio_detail *rio){
173 return (rio->type == CompatWPEG || rio->type == AltWPEG ||
174 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
175}
176
177#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
178
179static const struct cpumask *summit_target_cpus(void)
180{
181 /* CPU_MASK_ALL (0xff) has undefined behaviour with
182 * dest_LowestPrio mode logical clustered apic interrupt routing.
183 * Just start on cpu 0. IRQ balancing will spread the load.
184 */
185 return cpumask_of(0);
186}
187
188static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
189{
190 return 0;
191}
192
193/* we don't use the phys_cpu_present_map to indicate apicid presence */
194static unsigned long summit_check_apicid_present(int bit)
195{
196 return 1;
197}
198
199static int summit_early_logical_apicid(int cpu)
200{
201 int count = 0;
202 u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu);
203 u8 my_cluster = APIC_CLUSTER(my_id);
204#ifdef CONFIG_SMP
205 u8 lid;
206 int i;
207
208 /* Create logical APIC IDs by counting CPUs already in cluster. */
209 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
210 lid = early_per_cpu(x86_cpu_to_logical_apicid, i);
211 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
212 ++count;
213 }
214#endif
215 /* We only have a 4 wide bitmap in cluster mode. If a deranged
216 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
217 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
218 return my_cluster | (1UL << count);
219}
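
Summit thus keeps the physical ID's cluster nibble and hands out one-hot low bits in discovery order: the first CPU seen in a cluster gets bit 0, the next bit 1, and a fifth CPU in any cluster trips the BUG_ON since the bitmap is only four bits wide. A user-space sketch of the same assignment, with per-cluster counters standing in for the early_per_cpu() scan:

#include <stdio.h>

#define APIC_CLUSTER(id)  ((id) & 0xf0)

int main(void)
{
	unsigned char phys[] = { 0x10, 0x11, 0x13, 0x20 };	/* sample IDs */
	int count[16] = { 0 };
	int i;

	for (i = 0; i < 4; i++) {
		int cluster = APIC_CLUSTER(phys[i]);
		int logical = cluster | (1 << count[cluster >> 4]++);

		/* 0x10 -> 0x11, 0x11 -> 0x12, 0x13 -> 0x14, 0x20 -> 0x21 */
		printf("phys %#x -> logical %#x\n", phys[i], logical);
	}
	return 0;
}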
220
221static void summit_init_apic_ldr(void)
222{
223 int cpu = smp_processor_id();
224 unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
225 unsigned long val;
226
227 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
228 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
229 val |= SET_APIC_LOGICAL_ID(id);
230 apic_write(APIC_LDR, val);
231}
232
233static int summit_apic_id_registered(void)
234{
235 return 1;
236}
237
238static void summit_setup_apic_routing(void)
239{
240 pr_info("Enabling APIC mode: Summit. Using %d I/O APICs\n",
241 nr_ioapics);
242}
243
244static int summit_cpu_present_to_apicid(int mps_cpu)
245{
246 if (mps_cpu < nr_cpu_ids)
247 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
248 else
249 return BAD_APICID;
250}
251
252static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
253{
254 /* For clustered we don't have a good way to do this yet - hack */
255 physids_promote(0x0FL, retmap);
256}
257
258static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
259{
260 physid_set_mask_of_physid(0, retmap);
261}
262
263static int summit_check_phys_apicid_present(int physical_apicid)
264{
265 return 1;
266}
267
268static inline int
269summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
270{
271 unsigned int round = 0;
272 unsigned int cpu, apicid = 0;
273
274 /*
275 * The cpus in the mask must all be on the apic cluster.
276 */
277 for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
278 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
279
280 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
281 pr_err("Not a valid mask!\n");
282 return -EINVAL;
283 }
284 apicid |= new_apicid;
285 round++;
286 }
287 if (!round)
288 return -EINVAL;
289 *dest_id = apicid;
290 return 0;
291}
292
293static int
294summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
295 const struct cpumask *andmask,
296 unsigned int *apicid)
297{
298 cpumask_var_t cpumask;
299 *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
300
301 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
302 return 0;
303
304 cpumask_and(cpumask, inmask, andmask);
305 summit_cpu_mask_to_apicid(cpumask, apicid);
306
307 free_cpumask_var(cpumask);
308
309 return 0;
310}
311
312/*
313 * cpuid returns the value latched in the HW at reset, not the APIC ID
314 * register's value. For any box whose BIOS changes APIC IDs, like
315 * clustered APIC systems, we must use hard_smp_processor_id.
316 *
317 * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
318 */
319static int summit_phys_pkg_id(int cpuid_apic, int index_msb)
320{
321 return hard_smp_processor_id() >> index_msb;
322}
323
324static int probe_summit(void)
325{
326 /* probed later in mptable/ACPI hooks */
327 return 0;
328}
329
330#ifdef CONFIG_X86_SUMMIT_NUMA
331static struct rio_table_hdr *rio_table_hdr;
332static struct scal_detail *scal_devs[MAX_NUMNODES];
333static struct rio_detail *rio_devs[MAX_NUMNODES*4];
334
335#ifndef CONFIG_X86_NUMAQ
336static int mp_bus_id_to_node[MAX_MP_BUSSES];
337#endif
338
339static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
340{
341 int twister = 0, node = 0;
342 int i, bus, num_buses;
343
344 for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
345 if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
346 twister = rio_devs[i]->owner_id;
347 break;
348 }
349 }
350 if (i == rio_table_hdr->num_rio_dev) {
351 pr_err("Couldn't find owner Cyclone for Winnipeg!\n");
352 return last_bus;
353 }
354
355 for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
356 if (scal_devs[i]->node_id == twister) {
357 node = scal_devs[i]->node_id;
358 break;
359 }
360 }
361 if (i == rio_table_hdr->num_scal_dev) {
362 pr_err("Couldn't find owner Twister for Cyclone!\n");
363 return last_bus;
364 }
365
366 switch (rio_devs[wpeg_num]->type) {
367 case CompatWPEG:
368 /*
369 * The Compatibility Winnipeg controls the 2 legacy buses,
370 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
371 * a PCI-PCI bridge card is used in either slot: total 5 buses.
372 */
373 num_buses = 5;
374 break;
375 case AltWPEG:
376 /*
377 * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
378 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
379 * the "extra" buses for each of those slots: total 7 buses.
380 */
381 num_buses = 7;
382 break;
383 case LookOutAWPEG:
384 case LookOutBWPEG:
385 /*
386 * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
387 * & the "extra" buses for each of those slots: total 9 buses.
388 */
389 num_buses = 9;
390 break;
391 default:
392 pr_info("Unsupported Winnipeg type!\n");
393 return last_bus;
394 }
395
396 for (bus = last_bus; bus < last_bus + num_buses; bus++)
397 mp_bus_id_to_node[bus] = node;
398 return bus;
399}
400
401static int build_detail_arrays(void)
402{
403 unsigned long ptr;
404 int i, scal_detail_size, rio_detail_size;
405
406 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
407 pr_warn("MAX_NUMNODES too low! Defined as %d, but system has %d nodes\n",
408 MAX_NUMNODES, rio_table_hdr->num_scal_dev);
409 return 0;
410 }
411
412 switch (rio_table_hdr->version) {
413 default:
414 pr_warn("Invalid Rio Grande Table Version: %d\n",
415 rio_table_hdr->version);
416 return 0;
417 case 2:
418 scal_detail_size = 11;
419 rio_detail_size = 13;
420 break;
421 case 3:
422 scal_detail_size = 12;
423 rio_detail_size = 15;
424 break;
425 }
426
427 ptr = (unsigned long)rio_table_hdr + 3;
428 for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
429 scal_devs[i] = (struct scal_detail *)ptr;
430
431 for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
432 rio_devs[i] = (struct rio_detail *)ptr;
433
434 return 1;
435}
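
Since the record sizes depend on the header version, build_detail_arrays() never copies anything; it just lays pointers into the raw table at the appropriate strides. A sketch of that stride walk, using the version-3 sizes quoted above and dummy record counts:

#include <stdio.h>

#define SCAL_SIZE  12	/* version-3 record sizes, per the switch above */
#define RIO_SIZE   15

int main(void)
{
	unsigned char table[3 + 2 * SCAL_SIZE + 2 * RIO_SIZE];
	const unsigned char *scal[2], *rio[2];
	unsigned long ptr = (unsigned long)table + 3;	/* skip 3-byte header */
	int i;

	for (i = 0; i < 2; i++, ptr += SCAL_SIZE)
		scal[i] = (const unsigned char *)ptr;
	for (i = 0; i < 2; i++, ptr += RIO_SIZE)
		rio[i] = (const unsigned char *)ptr;

	/* scal[1] lands at offset 15, rio[0] at offset 27 */
	printf("scal[1] at offset %td, rio[0] at offset %td\n",
	       scal[1] - table, rio[0] - table);
	return 0;
}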
436
437void setup_summit(void)
438{
439 unsigned long ptr;
440 unsigned short offset;
441 int i, next_wpeg, next_bus = 0;
442
443 /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
444 ptr = get_bios_ebda();
445 ptr = (unsigned long)phys_to_virt(ptr);
446
447 rio_table_hdr = NULL;
448 offset = 0x180;
449 while (offset) {
450 /* The block id is stored in the 2nd word */
451 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
452 /* set the pointer past the offset & block id */
453 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
454 break;
455 }
456 /* The next offset is stored in the 1st word. 0 means no more */
457 offset = *((unsigned short *)(ptr + offset));
458 }
459 if (!rio_table_hdr) {
460 pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n");
461 return;
462 }
463
464 if (!build_detail_arrays())
465 return;
466
467 /* The first Winnipeg we're looking for has an index of 0 */
468 next_wpeg = 0;
469 do {
470 for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
471 if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
472 /* It's the Winnipeg we're looking for! */
473 next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
474 next_wpeg++;
475 break;
476 }
477 }
478 /*
479 * If we go through all Rio devices and don't find one with
480 * the next index, it means we've found all the Winnipegs,
481 * and thus all the PCI buses.
482 */
483 if (i == rio_table_hdr->num_rio_dev)
484 next_wpeg = 0;
485 } while (next_wpeg != 0);
486}
487#endif
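
setup_summit() locates the Rio Grande table by chasing a chain of EBDA blocks: the first word of each block holds the offset of the next block (zero terminates the chain), the second word holds a block ID, and the table body starts right after those two words. A sketch of the same chain walk over a fabricated little-endian EBDA buffer; 0x4752 is the ID matched above:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define RIO_BLOCK_ID  0x4752

static const void *find_block(const uint8_t *ebda, uint16_t offset)
{
	while (offset) {
		uint16_t next, id;

		memcpy(&next, ebda + offset, 2);	/* word 0: next offset */
		memcpy(&id, ebda + offset + 2, 2);	/* word 1: block ID */
		if (id == RIO_BLOCK_ID)
			return ebda + offset + 4;	/* body follows header */
		offset = next;				/* 0 ends the chain */
	}
	return NULL;
}

int main(void)
{
	uint8_t ebda[0x400] = { 0 };

	/* chain: 0x180 -> 0x200; the 0x200 block carries the Rio table */
	ebda[0x180] = 0x00; ebda[0x181] = 0x02;		/* next = 0x200 */
	ebda[0x202] = 0x52; ebda[0x203] = 0x47;		/* ID 0x4752, LE */

	printf("table at offset %td\n",
	       (const uint8_t *)find_block(ebda, 0x180) - ebda);
	return 0;
}

Here the 0x180 block's ID does not match, so the walk follows its next pointer to 0x200 and returns the body at offset 0x204 (printed as 516).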
488
489static struct apic apic_summit = {
490
491 .name = "summit",
492 .probe = probe_summit,
493 .acpi_madt_oem_check = summit_acpi_madt_oem_check,
494 .apic_id_valid = default_apic_id_valid,
495 .apic_id_registered = summit_apic_id_registered,
496
497 .irq_delivery_mode = dest_LowestPrio,
498 /* logical delivery broadcast to all CPUs: */
499 .irq_dest_mode = 1,
500
501 .target_cpus = summit_target_cpus,
502 .disable_esr = 1,
503 .dest_logical = APIC_DEST_LOGICAL,
504 .check_apicid_used = summit_check_apicid_used,
505 .check_apicid_present = summit_check_apicid_present,
506
507 .vector_allocation_domain = flat_vector_allocation_domain,
508 .init_apic_ldr = summit_init_apic_ldr,
509
510 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
511 .setup_apic_routing = summit_setup_apic_routing,
512 .multi_timer_check = NULL,
513 .cpu_present_to_apicid = summit_cpu_present_to_apicid,
514 .apicid_to_cpu_present = summit_apicid_to_cpu_present,
515 .setup_portio_remap = NULL,
516 .check_phys_apicid_present = summit_check_phys_apicid_present,
517 .enable_apic_mode = NULL,
518 .phys_pkg_id = summit_phys_pkg_id,
519 .mps_oem_check = summit_mps_oem_check,
520
521 .get_apic_id = summit_get_apic_id,
522 .set_apic_id = NULL,
523 .apic_id_mask = 0xFF << 24,
524
525 .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
526
527 .send_IPI_mask = summit_send_IPI_mask,
528 .send_IPI_mask_allbutself = NULL,
529 .send_IPI_allbutself = summit_send_IPI_allbutself,
530 .send_IPI_all = summit_send_IPI_all,
531 .send_IPI_self = default_send_IPI_self,
532
533 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
534 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
535
536 .wait_for_init_deassert = default_wait_for_init_deassert,
537
538 .smp_callin_clear_local_apic = NULL,
539 .inquire_remote_apic = default_inquire_remote_apic,
540
541 .read = native_apic_mem_read,
542 .write = native_apic_mem_write,
543 .eoi_write = native_apic_mem_write,
544 .icr_read = native_apic_icr_read,
545 .icr_write = native_apic_icr_write,
546 .wait_icr_idle = native_apic_wait_icr_idle,
547 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
548
549 .x86_32_early_logical_apicid = summit_early_logical_apicid,
550};
551
552apic_driver(apic_summit);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 140e29db478d..e66766bf1641 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -3,7 +3,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/dmar.h>
 #include <linux/cpu.h>
 
@@ -280,7 +279,7 @@ static struct apic apic_x2apic_cluster = {
 
 	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
-	.wait_for_init_deassert		= NULL,
+	.wait_for_init_deassert		= false,
 	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= NULL,
 
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 562a76d433c8..6d600ebf6c12 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -3,7 +3,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/dmar.h>
 
 #include <asm/smp.h>
@@ -134,7 +133,7 @@ static struct apic apic_x2apic_phys = {
 
 	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
-	.wait_for_init_deassert		= NULL,
+	.wait_for_init_deassert		= false,
 	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= NULL,
 
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index ad0dc0428baf..7834389ba5be 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -396,7 +396,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
 	.wakeup_secondary_cpu		= uv_wakeup_secondary,
 	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
-	.wait_for_init_deassert		= NULL,
+	.wait_for_init_deassert		= false,
 	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= NULL,
 
@@ -980,7 +980,6 @@ void __init uv_system_init(void)
 	uv_nmi_setup();
 	uv_cpu_init();
 	uv_scir_register_cpu_notifier();
-	uv_register_nmi_notifier();
 	proc_mkdir("sgi_uv", NULL);
 
 	/* register Legacy VGA I/O redirection handler */
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index e2dbcb7dabdd..83a7995625a6 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void)
 
 	corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
 
-	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+	for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
 		start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
 				PAGE_SIZE, corruption_check_size);
 		end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 47b56a7e99cb..7fd54f09b011 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -36,12 +36,13 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
 endif
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o perf_event_intel_rapl.o
 endif
 
 
 obj-$(CONFIG_X86_MCE)			+= mcheck/
 obj-$(CONFIG_MTRR)			+= mtrr/
+obj-$(CONFIG_MICROCODE)			+= microcode/
 
 obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o perf_event_amd_ibs.o
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bca023bdd6b2..ce8b8ff0e0ef 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,5 +1,4 @@
 #include <linux/export.h>
-#include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/elf.h>
 #include <linux/mm.h>
@@ -219,7 +218,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)
         */
        WARN_ONCE(1, "WARNING: This combination of AMD"
                " processors is not suitable for SMP.\n");
-       add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE);
+       add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
 }
 
 static void init_amd_k7(struct cpuinfo_x86 *c)
@@ -234,9 +233,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
        if (c->x86_model >= 6 && c->x86_model <= 10) {
                if (!cpu_has(c, X86_FEATURE_XMM)) {
                        printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
-                       rdmsr(MSR_K7_HWCR, l, h);
-                       l &= ~0x00008000;
-                       wrmsr(MSR_K7_HWCR, l, h);
+                       msr_clear_bit(MSR_K7_HWCR, 15);
                        set_cpu_cap(c, X86_FEATURE_XMM);
                }
        }
@@ -487,7 +484,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
                set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
                if (!check_tsc_unstable())
-                       sched_clock_stable = 1;
+                       set_sched_clock_stable();
        }
 
 #ifdef CONFIG_X86_64
@@ -508,6 +505,10 @@ static void early_init_amd(struct cpuinfo_x86 *c)
                set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
        }
 #endif
+
+       /* F16h erratum 793, CVE-2013-6885 */
+       if (c->x86 == 0x16 && c->x86_model <= 0xf)
+               msr_set_bit(MSR_AMD64_LS_CFG, 15);
 }
 
 static const int amd_erratum_383[];
@@ -527,11 +528,8 @@ static void init_amd(struct cpuinfo_x86 *c)
         * Errata 63 for SH-B3 steppings
         * Errata 122 for all steppings (F+ have it disabled by default)
         */
-       if (c->x86 == 0xf) {
-               rdmsrl(MSR_K7_HWCR, value);
-               value |= 1 << 6;
-               wrmsrl(MSR_K7_HWCR, value);
-       }
+       if (c->x86 == 0xf)
+               msr_set_bit(MSR_K7_HWCR, 6);
 #endif
 
        early_init_amd(c);
@@ -614,14 +612,11 @@ static void init_amd(struct cpuinfo_x86 *c)
            (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
            !cpu_has(c, X86_FEATURE_TOPOEXT)) {
 
-               if (!rdmsrl_safe(0xc0011005, &value)) {
-                       value |= 1ULL << 54;
-                       wrmsrl_safe(0xc0011005, value);
+               if (msr_set_bit(0xc0011005, 54) > 0) {
                        rdmsrl(0xc0011005, value);
-                       if (value & (1ULL << 54)) {
+                       if (value & BIT_64(54)) {
                                set_cpu_cap(c, X86_FEATURE_TOPOEXT);
-                               printk(KERN_INFO FW_INFO "CPU: Re-enabling "
-                                      "disabled Topology Extensions Support\n");
+                               pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
                        }
                }
        }
@@ -700,19 +695,12 @@ static void init_amd(struct cpuinfo_x86 *c)
                 * Disable GART TLB Walk Errors on Fam10h. We do this here
                 * because this is always needed when GART is enabled, even in a
                 * kernel which has no MCE support built in.
-                * BIOS should disable GartTlbWlk Errors themself. If
-                * it doesn't do it here as suggested by the BKDG.
+                * BIOS should disable GartTlbWlk Errors already. If
+                * it doesn't, do it here as suggested by the BKDG.
                 *
                 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
                 */
-               u64 mask;
-               int err;
-
-               err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
-               if (err == 0) {
-                       mask |= (1 << 10);
-                       wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);
-               }
+               msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
 
                /*
                 * On family 10h BIOS may not have properly enabled WC+ support,
@@ -724,10 +712,7 @@ static void init_amd(struct cpuinfo_x86 *c)
         * NOTE: we want to use the _safe accessors so as not to #GP kvm
         * guests on older kvm hosts.
         */
-
-       rdmsrl_safe(MSR_AMD64_BU_CFG2, &value);
-       value &= ~(1ULL << 24);
-       wrmsrl_safe(MSR_AMD64_BU_CFG2, value);
+       msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
 
        if (cpu_has_amd_erratum(c, amd_erratum_383))
                set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
@@ -758,10 +743,7 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 
 static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
 {
-       tlb_flushall_shift = 5;
-
-       if (c->x86 <= 0x11)
-               tlb_flushall_shift = 4;
+       tlb_flushall_shift = 6;
 }
 
 static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
@@ -790,14 +772,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
        }
 
        /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
-       if (!((eax >> 16) & mask)) {
-               u32 a, b, c, d;
-
-               cpuid(0x80000005, &a, &b, &c, &d);
-               tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff;
-       } else {
+       if (!((eax >> 16) & mask))
+               tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+       else
                tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
-       }
 
        /* a 4M entry uses two 2M entries */
        tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
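
The msr_set_bit()/msr_clear_bit() helpers used throughout this patch live in arch/x86/lib/msr.c. Their return convention is what makes the "> 0" tests above meaningful: negative on MSR access failure, 0 if the bit already had the requested value, 1 if the MSR was actually written. A minimal sketch of that contract (not the kernel's exact code):

        static int flip_bit_sketch(u32 msr, u8 bit, bool set)
        {
                u64 val, new;

                if (rdmsrl_safe(msr, &val))
                        return -EIO;    /* MSR not accessible */

                new = set ? (val | BIT_64(bit)) : (val & ~BIT_64(bit));
                if (new == val)
                        return 0;       /* already in the requested state */

                if (wrmsrl_safe(msr, new))
                        return -EIO;

                return 1;               /* bit was actually flipped */
        }
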
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 8d5652dc99dd..d8fba5c15fbd 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,6 +1,5 @@
 #include <linux/bitops.h>
 #include <linux/kernel.h>
-#include <linux/init.h>
 
 #include <asm/processor.h>
 #include <asm/e820.h>
@@ -9,236 +8,6 @@
 
 #include "cpu.h"
 
-#ifdef CONFIG_X86_OOSTORE
-
-static u32 power2(u32 x)
-{
-       u32 s = 1;
-
-       while (s <= x)
-               s <<= 1;
-
-       return s >>= 1;
-}
-
-
-/*
- * Set up an actual MCR
- */
-static void centaur_mcr_insert(int reg, u32 base, u32 size, int key)
-{
-       u32 lo, hi;
-
-       hi = base & ~0xFFF;
-       lo = ~(size-1);         /* Size is a power of 2 so this makes a mask */
-       lo &= ~0xFFF;           /* Remove the ctrl value bits */
-       lo |= key;              /* Attribute we wish to set */
-       wrmsr(reg+MSR_IDT_MCR0, lo, hi);
-       mtrr_centaur_report_mcr(reg, lo, hi);   /* Tell the mtrr driver */
-}
-
-/*
- * Figure what we can cover with MCR's
- *
- * Shortcut: We know you can't put 4Gig of RAM on a winchip
- */
-static u32 ramtop(void)
-{
-       u32 clip = 0xFFFFFFFFUL;
-       u32 top = 0;
-       int i;
-
-       for (i = 0; i < e820.nr_map; i++) {
-               unsigned long start, end;
-
-               if (e820.map[i].addr > 0xFFFFFFFFUL)
-                       continue;
-               /*
-                * Don't MCR over reserved space. Ignore the ISA hole
-                * we frob around that catastrophe already
-                */
-               if (e820.map[i].type == E820_RESERVED) {
-                       if (e820.map[i].addr >= 0x100000UL &&
-                           e820.map[i].addr < clip)
-                               clip = e820.map[i].addr;
-                       continue;
-               }
-               start = e820.map[i].addr;
-               end = e820.map[i].addr + e820.map[i].size;
-               if (start >= end)
-                       continue;
-               if (end > top)
-                       top = end;
-       }
-       /*
-        * Everything below 'top' should be RAM except for the ISA hole.
-        * Because of the limited MCR's we want to map NV/ACPI into our
-        * MCR range for gunk in RAM
-        *
-        * Clip might cause us to MCR insufficient RAM but that is an
-        * acceptable failure mode and should only bite obscure boxes with
-        * a VESA hole at 15Mb
-        *
-        * The second case Clip sometimes kicks in is when the EBDA is marked
-        * as reserved. Again we fail safe with reasonable results
-        */
-       if (top > clip)
-               top = clip;
-
-       return top;
-}
-
-/*
- * Compute a set of MCR's to give maximum coverage
- */
-static int centaur_mcr_compute(int nr, int key)
-{
-       u32 mem = ramtop();
-       u32 root = power2(mem);
-       u32 base = root;
-       u32 top = root;
-       u32 floor = 0;
-       int ct = 0;
-
-       while (ct < nr) {
-               u32 fspace = 0;
-               u32 high;
-               u32 low;
-
-               /*
-                * Find the largest block we will fill going upwards
-                */
-               high = power2(mem-top);
-
-               /*
-                * Find the largest block we will fill going downwards
-                */
-               low = base/2;
-
-               /*
-                * Don't fill below 1Mb going downwards as there
-                * is an ISA hole in the way.
-                */
-               if (base <= 1024*1024)
-                       low = 0;
-
-               /*
-                * See how much space we could cover by filling below
-                * the ISA hole
-                */
-
-               if (floor == 0)
-                       fspace = 512*1024;
-               else if (floor == 512*1024)
-                       fspace = 128*1024;
-
-               /* And forget ROM space */
-
-               /*
-                * Now install the largest coverage we get
-                */
-               if (fspace > high && fspace > low) {
-                       centaur_mcr_insert(ct, floor, fspace, key);
-                       floor += fspace;
-               } else if (high > low) {
-                       centaur_mcr_insert(ct, top, high, key);
-                       top += high;
-               } else if (low > 0) {
-                       base -= low;
-                       centaur_mcr_insert(ct, base, low, key);
-               } else
-                       break;
-               ct++;
-       }
-       /*
-        * We loaded ct values. We now need to set the mask. The caller
-        * must do this bit.
-        */
-       return ct;
-}
-
-static void centaur_create_optimal_mcr(void)
-{
-       int used;
-       int i;
-
-       /*
-        * Allocate up to 6 mcrs to mark as much of ram as possible
-        * as write combining and weak write ordered.
-        *
-        * To experiment with: Linux never uses stack operations for
-        * mmio spaces so we could globally enable stack operation wc
-        *
-        * Load the registers with type 31 - full write combining, all
-        * writes weakly ordered.
-        */
-       used = centaur_mcr_compute(6, 31);
-
-       /*
-        * Wipe unused MCRs
-        */
-       for (i = used; i < 8; i++)
-               wrmsr(MSR_IDT_MCR0+i, 0, 0);
-}
-
-static void winchip2_create_optimal_mcr(void)
-{
-       u32 lo, hi;
-       int used;
-       int i;
-
-       /*
-        * Allocate up to 6 mcrs to mark as much of ram as possible
-        * as write combining, weak store ordered.
-        *
-        * Load the registers with type 25
-        *      8 - weak write ordering
-        *      16 - weak read ordering
-        *      1 - write combining
-        */
-       used = centaur_mcr_compute(6, 25);
-
-       /*
-        * Mark the registers we are using.
-        */
-       rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
-       for (i = 0; i < used; i++)
-               lo |= 1<<(9+i);
-       wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-
-       /*
-        * Wipe unused MCRs
-        */
-
-       for (i = used; i < 8; i++)
-               wrmsr(MSR_IDT_MCR0+i, 0, 0);
-}
-
-/*
- * Handle the MCR key on the Winchip 2.
- */
-static void winchip2_unprotect_mcr(void)
-{
-       u32 lo, hi;
-       u32 key;
-
-       rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
-       lo &= ~0x1C0;           /* blank bits 8-6 */
-       key = (lo>>17) & 7;
-       lo |= key<<6;           /* replace with unlock key */
-       wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-
-static void winchip2_protect_mcr(void)
-{
-       u32 lo, hi;
-
-       rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
-       lo &= ~0x1C0;           /* blank bits 8-6 */
-       wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-#endif /* CONFIG_X86_OOSTORE */
-
 #define ACE_PRESENT (1 << 6)
 #define ACE_ENABLED (1 << 7)
 #define ACE_FCR (1 << 28)      /* MSR_VIA_FCR */
@@ -363,20 +132,6 @@ static void init_centaur(struct cpuinfo_x86 *c)
                        fcr_clr = DPDC;
                        printk(KERN_NOTICE "Disabling bugged TSC.\n");
                        clear_cpu_cap(c, X86_FEATURE_TSC);
-#ifdef CONFIG_X86_OOSTORE
-                       centaur_create_optimal_mcr();
-                       /*
-                        * Enable:
-                        *      write combining on non-stack, non-string
-                        *      write combining on string, all types
-                        *      weak write ordering
-                        *
-                        * The C6 original lacks weak read order
-                        *
-                        * Note 0x120 is write only on Winchip 1
-                        */
-                       wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0);
-#endif
                        break;
                case 8:
                        switch (c->x86_mask) {
@@ -393,40 +148,12 @@ static void init_centaur(struct cpuinfo_x86 *c)
                        fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
                                  E2MMX|EAMD3D;
                        fcr_clr = DPDC;
-#ifdef CONFIG_X86_OOSTORE
-                       winchip2_unprotect_mcr();
-                       winchip2_create_optimal_mcr();
-                       rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
-                       /*
-                        * Enable:
-                        *      write combining on non-stack, non-string
-                        *      write combining on string, all types
-                        *      weak write ordering
-                        */
-                       lo |= 31;
-                       wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-                       winchip2_protect_mcr();
-#endif
                        break;
                case 9:
                        name = "3";
                        fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
                                  E2MMX|EAMD3D;
                        fcr_clr = DPDC;
-#ifdef CONFIG_X86_OOSTORE
-                       winchip2_unprotect_mcr();
-                       winchip2_create_optimal_mcr();
-                       rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
-                       /*
-                        * Enable:
-                        *      write combining on non-stack, non-string
-                        *      write combining on string, all types
-                        *      weak write ordering
-                        */
-                       lo |= 31;
-                       wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-                       winchip2_protect_mcr();
-#endif
                        break;
                default:
                        name = "??";
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6abc172b8258..a135239badb7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -284,8 +284,13 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
        raw_local_save_flags(eflags);
        BUG_ON(eflags & X86_EFLAGS_AC);
 
-       if (cpu_has(c, X86_FEATURE_SMAP))
+       if (cpu_has(c, X86_FEATURE_SMAP)) {
+#ifdef CONFIG_X86_SMAP
                set_in_cr4(X86_CR4_SMAP);
+#else
+               clear_in_cr4(X86_CR4_SMAP);
+#endif
+       }
 }
 
 /*
@@ -472,6 +477,7 @@ u16 __read_mostly tlb_lli_4m[NR_INFO];
 u16 __read_mostly tlb_lld_4k[NR_INFO];
 u16 __read_mostly tlb_lld_2m[NR_INFO];
 u16 __read_mostly tlb_lld_4m[NR_INFO];
+u16 __read_mostly tlb_lld_1g[NR_INFO];
 
 /*
  * tlb_flushall_shift shows the balance point in replacing cr3 write
@@ -486,13 +492,13 @@ void cpu_detect_tlb(struct cpuinfo_x86 *c)
        if (this_cpu->c_detect_tlb)
                this_cpu->c_detect_tlb(c);
 
-       printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
-               "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
+       printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
+               "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
                "tlb_flushall_shift: %d\n",
                tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
                tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
                tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
-               tlb_flushall_shift);
+               tlb_lld_1g[ENTRIES], tlb_flushall_shift);
 }
 
 void detect_ht(struct cpuinfo_x86 *c)
@@ -1019,7 +1025,8 @@ __setup("show_msr=", setup_show_msr);
 
 static __init int setup_noclflush(char *arg)
 {
-       setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+       setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
+       setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT);
        return 1;
 }
 __setup("noclflush", setup_noclflush);
@@ -1072,6 +1079,10 @@ static __init int setup_disablecpuid(char *arg)
 }
 __setup("clearcpuid=", setup_disablecpuid);
 
+DEFINE_PER_CPU(unsigned long, kernel_stack) =
+       (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(kernel_stack);
+
 #ifdef CONFIG_X86_64
 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
 struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
@@ -1088,10 +1099,6 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
        &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
 
-DEFINE_PER_CPU(unsigned long, kernel_stack) =
-       (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
-
 DEFINE_PER_CPU(char *, irq_stack_ptr) =
        init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
 
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index d0969c75ab54..aaf152e79637 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/delay.h>
 #include <linux/pci.h>
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ea04b342c026..a80029035bf2 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 #include <linux/kernel.h>
 
 #include <linux/string.h>
@@ -32,11 +31,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 
        /* Unmask CPUID levels if masked: */
        if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
-               rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
-               if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
-                       misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
-                       wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+               if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
+                                 MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
                        c->cpuid_level = cpuid_eax(0);
                        get_cpu_cap(c);
                }
@@ -93,7 +89,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
                set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
                if (!check_tsc_unstable())
-                       sched_clock_stable = 1;
+                       set_sched_clock_stable();
        }
 
        /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
@@ -130,16 +126,10 @@ static void early_init_intel(struct cpuinfo_x86 *c)
         * Ingo Molnar reported a Pentium D (model 6) and a Xeon
         * (model 2) with the same problem.
         */
-       if (c->x86 == 15) {
-               rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
-               if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
-                       printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
-
-                       misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
-                       wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-               }
-       }
+       if (c->x86 == 15)
+               if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
+                                 MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0)
+                       pr_info("kmemcheck: Disabling fast string operations\n");
 #endif
 
        /*
@@ -196,10 +186,16 @@ static void intel_smp_check(struct cpuinfo_x86 *c)
        }
 }
 
-static void intel_workarounds(struct cpuinfo_x86 *c)
+static int forcepae;
+static int __init forcepae_setup(char *__unused)
 {
-       unsigned long lo, hi;
+       forcepae = 1;
+       return 1;
+}
+__setup("forcepae", forcepae_setup);
 
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{
 #ifdef CONFIG_X86_F00F_BUG
        /*
         * All current models of Pentium and Pentium with MMX technology CPUs
@@ -226,16 +222,26 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
                clear_cpu_cap(c, X86_FEATURE_SEP);
 
        /*
+        * PAE CPUID issue: many Pentium M report no PAE but may have a
+        * functionally usable PAE implementation.
+        * Forcefully enable PAE if kernel parameter "forcepae" is present.
+        */
+       if (forcepae) {
+               printk(KERN_WARNING "PAE forced!\n");
+               set_cpu_cap(c, X86_FEATURE_PAE);
+               add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
+       }
+
+       /*
         * P4 Xeon errata 037 workaround.
         * Hardware prefetcher may cause stale data to be loaded into the cache.
         */
        if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
-               rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
-               if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
-                       printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
-                       printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
-                       lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
-                       wrmsr(MSR_IA32_MISC_ENABLE, lo, hi);
+               if (msr_set_bit(MSR_IA32_MISC_ENABLE,
+                               MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
+                   > 0) {
+                       pr_info("CPU: C0 stepping P4 Xeon detected.\n");
+                       pr_info("CPU: Disabling hardware prefetching (Errata 037)\n");
                }
        }
 
@@ -268,10 +274,6 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
        }
 #endif
 
-#ifdef CONFIG_X86_NUMAQ
-       numaq_tsc_disable();
-#endif
-
        intel_smp_check(c);
 }
 #else
@@ -506,6 +508,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 #define TLB_DATA0_2M_4M        0x23
 
 #define STLB_4K                0x41
+#define STLB_4K_2M     0x42
 
 static const struct _tlb_table intel_tlb_table[] = {
        { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
@@ -526,13 +529,20 @@ static const struct _tlb_table intel_tlb_table[] = {
        { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
        { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
        { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
+       { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, fully associative" },
+       { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
+       { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
        { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
        { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
        { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
        { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
        { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
+       { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" },
+       { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" },
        { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
        { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
+       { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
+       { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
        { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
        { 0x00, 0, 0 }
 };
@@ -558,6 +568,20 @@ static void intel_tlb_lookup(const unsigned char desc)
                if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
                        tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
                break;
+       case STLB_4K_2M:
+               if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+                       tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+               if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+                       tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+               if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+                       tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+               if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+                       tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+               if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+                       tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+               if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+                       tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+               break;
        case TLB_INST_ALL:
                if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
                        tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
@@ -603,6 +627,10 @@ static void intel_tlb_lookup(const unsigned char desc)
                if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
                        tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
                break;
+       case TLB_DATA_1G:
+               if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
+                       tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
+               break;
        }
 }
 
@@ -615,21 +643,17 @@ static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
        case 0x61d: /* six-core 45 nm xeon "Dunnington" */
                tlb_flushall_shift = -1;
                break;
+       case 0x63a: /* Ivybridge */
+               tlb_flushall_shift = 2;
+               break;
        case 0x61a: /* 45 nm nehalem, "Bloomfield" */
        case 0x61e: /* 45 nm nehalem, "Lynnfield" */
        case 0x625: /* 32 nm nehalem, "Clarkdale" */
        case 0x62c: /* 32 nm nehalem, "Gulftown" */
        case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
        case 0x62f: /* 32 nm Xeon E7 */
-               tlb_flushall_shift = 6;
-               break;
        case 0x62a: /* SandyBridge */
        case 0x62d: /* SandyBridge, "Romely-EP" */
-               tlb_flushall_shift = 5;
-               break;
-       case 0x63a: /* Ivybridge */
-               tlb_flushall_shift = 1;
-               break;
        default:
                tlb_flushall_shift = 6;
        }
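
tlb_flushall_shift feeds the range-flush heuristic in flush_tlb_mm_range() (arch/x86/mm/tlb.c). Roughly — names simplified, not the kernel's exact code — the decision looks like:

        /*
         * Flush page by page only while the range stays below the TLB
         * size scaled down by tlb_flushall_shift; -1 means never flush
         * by range, and a larger shift trips the full flush sooner.
         */
        if (tlb_flushall_shift == -1 ||
            (end - start) >> PAGE_SHIFT > tlb_entries >> tlb_flushall_shift)
                local_flush_tlb();                      /* drop everything */
        else
                flush_pages_one_by_one(start, end);     /* invlpg loop */

so raising Ivybridge from 1 to 2, and collapsing the remaining models into the default of 6, tilts the balance toward full flushes.
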
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 0641113e2965..a952e9c85b6f 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1225,21 +1225,24 @@ static struct notifier_block cacheinfo_cpu_notifier = {
 
 static int __init cache_sysfs_init(void)
 {
-       int i;
+       int i, err = 0;
 
        if (num_cache_leaves == 0)
                return 0;
 
+       cpu_notifier_register_begin();
        for_each_online_cpu(i) {
-               int err;
                struct device *dev = get_cpu_device(i);
 
                err = cache_add_dev(dev);
                if (err)
-                       return err;
+                       goto out;
        }
-       register_hotcpu_notifier(&cacheinfo_cpu_notifier);
-       return 0;
+       __register_hotcpu_notifier(&cacheinfo_cpu_notifier);
+
+out:
+       cpu_notifier_register_done();
+       return err;
 }
 
 device_initcall(cache_sysfs_init);
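
This conversion — repeated in the mce and therm_throt diffs below — moves callers to the cpu_notifier_register_begin()/cpu_notifier_register_done() protocol, which holds CPU hotplug stable across the online-CPU walk and the notifier registration instead of relying on a private mutex. The pattern, sketched for a hypothetical driver:

        static int __init mydrv_init(void)
        {
                int cpu;

                cpu_notifier_register_begin();  /* hotplug is blocked here */

                for_each_online_cpu(cpu)
                        mydrv_setup_cpu(cpu);   /* hypothetical per-CPU setup */

                /* double-underscore variant: registration lock already held */
                __register_hotcpu_notifier(&mydrv_cpu_notifier);

                cpu_notifier_register_done();
                return 0;
        }
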
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 36565373af87..afa9f0d487ea 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -47,45 +47,3 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
        return NULL;
 }
 EXPORT_SYMBOL(x86_match_cpu);
-
-ssize_t arch_print_cpu_modalias(struct device *dev,
-                               struct device_attribute *attr,
-                               char *bufptr)
-{
-       int size = PAGE_SIZE;
-       int i, n;
-       char *buf = bufptr;
-
-       n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:"
-                    "model:%04X:feature:",
-                    boot_cpu_data.x86_vendor,
-                    boot_cpu_data.x86,
-                    boot_cpu_data.x86_model);
-       size -= n;
-       buf += n;
-       size -= 1;
-       for (i = 0; i < NCAPINTS*32; i++) {
-               if (boot_cpu_has(i)) {
-                       n = snprintf(buf, size, ",%04X", i);
-                       if (n >= size) {
-                               WARN(1, "x86 features overflow page\n");
-                               break;
-                       }
-                       size -= n;
-                       buf += n;
-               }
-       }
-       *buf++ = '\n';
-       return buf - bufptr;
-}
-
-int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env)
-{
-       char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
-       if (buf) {
-               arch_print_cpu_modalias(NULL, NULL, buf);
-               add_uevent_var(env, "MODALIAS=%s", buf);
-               kfree(buf);
-       }
-       return 0;
-}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index de8b60a53f69..a1aef9533154 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -33,22 +33,28 @@
 #include <linux/acpi.h>
 #include <linux/cper.h>
 #include <acpi/apei.h>
+#include <acpi/ghes.h>
 #include <asm/mce.h>
 
 #include "mce-internal.h"
 
-void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
 {
        struct mce m;
 
-       /* Only corrected MC is reported */
-       if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA))
+       if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
                return;
 
        mce_setup(&m);
        m.bank = 1;
-       /* Fake a memory read corrected error with unknown channel */
+       /* Fake a memory read error with unknown channel */
        m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+
+       if (severity >= GHES_SEV_RECOVERABLE)
+               m.status |= MCI_STATUS_UC;
+       if (severity >= GHES_SEV_PANIC)
+               m.status |= MCI_STATUS_PCC;
+
        m.addr = mem_err->physical_addr;
        mce_log(&m);
        mce_notify_irq();
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b3218cdee95f..eeee23ff75ef 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -89,6 +89,9 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int cpu_missing;
 
+/* CMCI storm detection filter */
+static DEFINE_PER_CPU(unsigned long, mce_polled_error);
+
 /*
  * MCA banks polled by the period polling timer for corrected events.
  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
@@ -595,6 +598,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 {
        struct mce m;
        int i;
+       unsigned long *v;
 
        this_cpu_inc(mce_poll_count);
 
@@ -614,6 +618,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
                if (!(m.status & MCI_STATUS_VAL))
                        continue;
 
+               v = &get_cpu_var(mce_polled_error);
+               set_bit(0, v);
                /*
                 * Uncorrected or signalled events are handled by the exception
                 * handler when it is enabled, so don't process those here.
@@ -1278,10 +1284,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
 static unsigned long (*mce_adjust_timer)(unsigned long interval) =
        mce_adjust_timer_default;
 
+static int cmc_error_seen(void)
+{
+       unsigned long *v = &__get_cpu_var(mce_polled_error);
+
+       return test_and_clear_bit(0, v);
+}
+
 static void mce_timer_fn(unsigned long data)
 {
        struct timer_list *t = &__get_cpu_var(mce_timer);
        unsigned long iv;
+       int notify;
 
        WARN_ON(smp_processor_id() != data);
 
@@ -1296,7 +1310,9 @@ static void mce_timer_fn(unsigned long data)
         * polling interval, otherwise increase the polling interval.
         */
        iv = __this_cpu_read(mce_next_interval);
-       if (mce_notify_irq()) {
+       notify = mce_notify_irq();
+       notify |= cmc_error_seen();
+       if (notify) {
                iv = max(iv / 2, (unsigned long) HZ/100);
        } else {
                iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
@@ -1638,15 +1654,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 
 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
 {
-       unsigned long iv = mce_adjust_timer(check_interval * HZ);
-
-       __this_cpu_write(mce_next_interval, iv);
+       unsigned long iv = check_interval * HZ;
 
        if (mca_cfg.ignore_ce || !iv)
                return;
 
+       per_cpu(mce_next_interval, cpu) = iv;
+
        t->expires = round_jiffies(jiffies + iv);
-       add_timer_on(t, smp_processor_id());
+       add_timer_on(t, cpu);
 }
 
 static void __mcheck_cpu_init_timer(void)
@@ -2272,8 +2288,10 @@ static int mce_device_create(unsigned int cpu)
        dev->release = &mce_device_release;
 
        err = device_register(dev);
-       if (err)
+       if (err) {
+               put_device(dev);
                return err;
+       }
 
        for (i = 0; mce_device_attrs[i]; i++) {
                err = device_create_file(dev, mce_device_attrs[i]);
@@ -2432,14 +2450,18 @@ static __init int mcheck_init_device(void)
        if (err)
                return err;
 
+       cpu_notifier_register_begin();
        for_each_online_cpu(i) {
                err = mce_device_create(i);
-               if (err)
+               if (err) {
+                       cpu_notifier_register_done();
                        return err;
+               }
        }
 
        register_syscore_ops(&mce_syscore_ops);
-       register_hotcpu_notifier(&mce_cpu_notifier);
+       __register_hotcpu_notifier(&mce_cpu_notifier);
+       cpu_notifier_register_done();
 
        /* register character device /dev/mcelog */
        misc_register(&mce_chrdev_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 4cfe0458ca66..3bdb95ae8c43 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -6,10 +6,10 @@
  */
 
 #include <linux/gfp.h>
-#include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
 #include <linux/sched.h>
+#include <linux/cpumask.h>
 #include <asm/apic.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
@@ -138,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
        }
 }
 
+static void cmci_storm_disable_banks(void)
+{
+       unsigned long flags, *owned;
+       int bank;
+       u64 val;
+
+       raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+       owned = __get_cpu_var(mce_banks_owned);
+       for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+               rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+               val &= ~MCI_CTL2_CMCI_EN;
+               wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+       }
+       raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
 static bool cmci_storm_detect(void)
 {
        unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
@@ -159,7 +175,7 @@ static bool cmci_storm_detect(void)
        if (cnt <= CMCI_STORM_THRESHOLD)
                return false;
 
-       cmci_clear();
+       cmci_storm_disable_banks();
        __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
        r = atomic_add_return(1, &cmci_storm_on_cpus);
        mce_timer_kick(CMCI_POLL_INTERVAL);
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 1c044b1ccc59..a3042989398c 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -5,7 +5,6 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 3eec7de76efb..d921b7ee6595 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -271,9 +271,6 @@ static void thermal_throttle_remove_dev(struct device *dev)
        sysfs_remove_group(&dev->kobj, &thermal_attr_group);
 }
 
-/* Mutex protecting device creation against CPU hotplug: */
-static DEFINE_MUTEX(therm_cpu_lock);
-
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 static int
 thermal_throttle_cpu_callback(struct notifier_block *nfb,
@@ -289,18 +286,14 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-               mutex_lock(&therm_cpu_lock);
                err = thermal_throttle_add_dev(dev, cpu);
-               mutex_unlock(&therm_cpu_lock);
                WARN_ON(err);
                break;
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-               mutex_lock(&therm_cpu_lock);
                thermal_throttle_remove_dev(dev);
-               mutex_unlock(&therm_cpu_lock);
                break;
        }
        return notifier_from_errno(err);
@@ -319,19 +312,16 @@ static __init int thermal_throttle_init_device(void)
        if (!atomic_read(&therm_throt_en))
                return 0;
 
-       register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
+       cpu_notifier_register_begin();
 
-#ifdef CONFIG_HOTPLUG_CPU
-       mutex_lock(&therm_cpu_lock);
-#endif
        /* connect live CPUs to sysfs */
        for_each_online_cpu(cpu) {
                err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
                WARN_ON(err);
        }
-#ifdef CONFIG_HOTPLUG_CPU
-       mutex_unlock(&therm_cpu_lock);
-#endif
+
+       __register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
+       cpu_notifier_register_done();
 
        return 0;
 }
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index e9a701aecaa1..7dc5564d0cdf 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -5,7 +5,6 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/init.h>
 
 #include <asm/processor.h>
 #include <asm/mce.h>
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
new file mode 100644
index 000000000000..285c85427c32
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -0,0 +1,7 @@
+microcode-y := core.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
+microcode-$(CONFIG_MICROCODE_AMD) += amd.o
+obj-$(CONFIG_MICROCODE_EARLY) += core_early.o
+obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o
+obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index c3d4cc972eca..8fffd845e22b 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -182,10 +182,10 @@ int __apply_microcode_amd(struct microcode_amd *mc_amd)
 {
        u32 rev, dummy;
 
-       wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
+       native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
 
        /* verify patch application was successful */
-       rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+       native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
        if (rev != mc_amd->hdr.patch_id)
                return -1;
 
@@ -332,6 +332,9 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
        patch->patch_id = mc_hdr->patch_id;
        patch->equiv_cpu = proc_id;
 
+       pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+                __func__, patch->patch_id, proc_id);
+
        /* ... and add to cache. */
        update_cache(patch);
 
@@ -390,9 +393,9 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
        if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {
                struct ucode_patch *p = find_patch(smp_processor_id());
                if (p) {
-                       memset(amd_bsp_mpb, 0, MPB_MAX_SIZE);
-                       memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data),
-                                                          MPB_MAX_SIZE));
+                       memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
+                       memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
+                                                              PATCH_MAX_SIZE));
                }
        }
 #endif
@@ -430,7 +433,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
        if (c->x86 >= 0x15)
                snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
 
-       if (request_firmware(&fw, (const char *)fw_name, device)) {
+       if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
                pr_debug("failed to load file %s\n", fw_name);
                goto out;
        }
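
The switch to native_wrmsrl()/native_rdmsr() matters because __apply_microcode_amd() is now also reached from the early loader, before the paravirt machinery is set up; the native_ forms always issue the raw instruction instead of going through pv_cpu_ops. Simplified sketch of the distinction (the real definitions are in arch/x86/include/asm/msr.h and paravirt.h):

        /* Simplified: the native form always executes the bare WRMSR. */
        static inline void native_wrmsrl_sketch(u32 msr, u64 val)
        {
                asm volatile("wrmsr"
                             : /* no outputs */
                             : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)));
        }
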
diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
index 6073104ccaa3..617a9e284245 100644
--- a/arch/x86/kernel/microcode_amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 Advanced Micro Devices, Inc.
  *
  * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -15,10 +16,18 @@
 #include <asm/setup.h>
 #include <asm/microcode_amd.h>
 
-static bool ucode_loaded;
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd before jettisoning its contents.
+ */
+static u8 *container;
+static size_t container_size;
+
 static u32 ucode_new_rev;
-static unsigned long ucode_offset;
-static size_t ucode_size;
+u8 amd_ucode_patch[PATCH_MAX_SIZE];
+static u16 this_equiv_id;
+
+struct cpio_data ucode_cpio;
 
 /*
  * Microcode patch container file is prepended to the initrd in cpio format.
@@ -32,9 +41,6 @@ static struct cpio_data __init find_ucode_in_initrd(void)
        char *path;
        void *start;
        size_t size;
-       unsigned long *uoffset;
-       size_t *usize;
-       struct cpio_data cd;
 
 #ifdef CONFIG_X86_32
        struct boot_params *p;
@@ -47,30 +53,50 @@ static struct cpio_data __init find_ucode_in_initrd(void)
        path = (char *)__pa_nodebug(ucode_path);
        start = (void *)p->hdr.ramdisk_image;
        size = p->hdr.ramdisk_size;
-       uoffset = (unsigned long *)__pa_nodebug(&ucode_offset);
-       usize = (size_t *)__pa_nodebug(&ucode_size);
 #else
        path = ucode_path;
        start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
        size = boot_params.hdr.ramdisk_size;
-       uoffset = &ucode_offset;
-       usize = &ucode_size;
 #endif
 
-       cd = find_cpio_data(path, start, size, &offset);
-       if (!cd.data)
-               return cd;
+       return find_cpio_data(path, start, size, &offset);
+}
 
-       if (*(u32 *)cd.data != UCODE_MAGIC) {
-               cd.data = NULL;
-               cd.size = 0;
-               return cd;
-       }
+static size_t compute_container_size(u8 *data, u32 total_size)
+{
+       size_t size = 0;
+       u32 *header = (u32 *)data;
+
+       if (header[0] != UCODE_MAGIC ||
+           header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+           header[2] == 0)                            /* size */
+               return size;
+
+       size = header[2] + CONTAINER_HDR_SZ;
+       total_size -= size;
+       data += size;
 
-       *uoffset = (u8 *)cd.data - (u8 *)start;
-       *usize = cd.size;
+       while (total_size) {
+               u16 patch_size;
+
+               header = (u32 *)data;
+
+               if (header[0] != UCODE_UCODE_TYPE)
+                       break;
+
+               /*
+                * Sanity-check patch size.
+                */
+               patch_size = header[1];
+               if (patch_size > PATCH_MAX_SIZE)
+                       break;
+
+               size += patch_size + SECTION_HDR_SIZE;
+               data += patch_size + SECTION_HDR_SIZE;
+               total_size -= patch_size + SECTION_HDR_SIZE;
+       }
 
-       return cd;
+       return size;
 }
 
 /*
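
compute_container_size() walks the on-disk AMD container format. Pieced together from the constants it uses (layout shown for illustration only; field names are not the kernel's):

        /*
         * container header (CONTAINER_HDR_SZ = 12 bytes):
         *      u32 magic;      UCODE_MAGIC
         *      u32 type;       UCODE_EQUIV_CPU_TABLE_TYPE
         *      u32 size;       bytes of equivalence table that follow
         * equivalence table:
         *      struct equiv_cpu_entry equiv[];   "size" bytes
         * then, repeated until the container ends (SECTION_HDR_SIZE = 8):
         *      u32 type;       UCODE_UCODE_TYPE
         *      u32 patch_size; <= PATCH_MAX_SIZE
         *      u8  patch[patch_size];
         */
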
@@ -85,23 +111,22 @@ static struct cpio_data __init find_ucode_in_initrd(void)
85static void apply_ucode_in_initrd(void *ucode, size_t size) 111static void apply_ucode_in_initrd(void *ucode, size_t size)
86{ 112{
87 struct equiv_cpu_entry *eq; 113 struct equiv_cpu_entry *eq;
114 size_t *cont_sz;
88 u32 *header; 115 u32 *header;
89 u8 *data; 116 u8 *data, **cont;
90 u16 eq_id = 0; 117 u16 eq_id = 0;
91 int offset, left; 118 int offset, left;
92 u32 rev, eax; 119 u32 rev, eax, ebx, ecx, edx;
93 u32 *new_rev; 120 u32 *new_rev;
94 unsigned long *uoffset;
95 size_t *usize;
96 121
97#ifdef CONFIG_X86_32 122#ifdef CONFIG_X86_32
98 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); 123 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
99 uoffset = (unsigned long *)__pa_nodebug(&ucode_offset); 124 cont_sz = (size_t *)__pa_nodebug(&container_size);
100 usize = (size_t *)__pa_nodebug(&ucode_size); 125 cont = (u8 **)__pa_nodebug(&container);
101#else 126#else
102 new_rev = &ucode_new_rev; 127 new_rev = &ucode_new_rev;
103 uoffset = &ucode_offset; 128 cont_sz = &container_size;
104 usize = &ucode_size; 129 cont = &container;
105#endif 130#endif
106 131
107 data = ucode; 132 data = ucode;
@@ -109,23 +134,37 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
109 header = (u32 *)data; 134 header = (u32 *)data;
110 135
111 /* find equiv cpu table */ 136 /* find equiv cpu table */
112 137 if (header[0] != UCODE_MAGIC ||
113 if (header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ 138 header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
114 header[2] == 0) /* size */ 139 header[2] == 0) /* size */
115 return; 140 return;
116 141
117 eax = cpuid_eax(0x00000001); 142 eax = 0x00000001;
143 ecx = 0;
144 native_cpuid(&eax, &ebx, &ecx, &edx);
118 145
119 while (left > 0) { 146 while (left > 0) {
120 eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); 147 eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
121 148
149 *cont = data;
150
151 /* Advance past the container header */
122 offset = header[2] + CONTAINER_HDR_SZ; 152 offset = header[2] + CONTAINER_HDR_SZ;
123 data += offset; 153 data += offset;
124 left -= offset; 154 left -= offset;
125 155
126 eq_id = find_equiv_id(eq, eax); 156 eq_id = find_equiv_id(eq, eax);
127 if (eq_id) 157 if (eq_id) {
158 this_equiv_id = eq_id;
159 *cont_sz = compute_container_size(*cont, left + offset);
160
161 /*
162 * truncate how much we need to iterate over in the
163 * ucode update loop below
164 */
165 left = *cont_sz - offset;
128 break; 166 break;
167 }
129 168
130 /* 169 /*
131 * support multiple container files appended together. if this 170 * support multiple container files appended together. if this
@@ -145,19 +184,18 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
145 184
146 /* mark where the next microcode container file starts */ 185 /* mark where the next microcode container file starts */
147 offset = data - (u8 *)ucode; 186 offset = data - (u8 *)ucode;
148 *uoffset += offset;
149 *usize -= offset;
150 ucode = data; 187 ucode = data;
151 } 188 }
152 189
153 if (!eq_id) { 190 if (!eq_id) {
154 *usize = 0; 191 *cont = NULL;
192 *cont_sz = 0;
155 return; 193 return;
156 } 194 }
157 195
158 /* find ucode and update if needed */ 196 /* find ucode and update if needed */
159 197
160 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); 198 native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
161 199
162 while (left > 0) { 200 while (left > 0) {
163 struct microcode_amd *mc; 201 struct microcode_amd *mc;
@@ -168,134 +206,190 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
168 break; 206 break;
169 207
170 mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); 208 mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
171 if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) 209
172 if (__apply_microcode_amd(mc) == 0) { 210 if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
211
212 if (!__apply_microcode_amd(mc)) {
173 rev = mc->hdr.patch_id; 213 rev = mc->hdr.patch_id;
174 *new_rev = rev; 214 *new_rev = rev;
215
216 /* save ucode patch */
217 memcpy(amd_ucode_patch, mc,
218 min_t(u32, header[1], PATCH_MAX_SIZE));
175 } 219 }
220 }
176 221
177 offset = header[1] + SECTION_HDR_SIZE; 222 offset = header[1] + SECTION_HDR_SIZE;
178 data += offset; 223 data += offset;
179 left -= offset; 224 left -= offset;
180 } 225 }
181
182 /* mark where this microcode container file ends */
183 offset = *usize - (data - (u8 *)ucode);
184 *usize -= offset;
185
186 if (!(*new_rev))
187 *usize = 0;
188} 226}
189 227
190void __init load_ucode_amd_bsp(void) 228void __init load_ucode_amd_bsp(void)
191{ 229{
192 struct cpio_data cd = find_ucode_in_initrd(); 230 struct cpio_data cp;
193 if (!cd.data) 231 void **data;
232 size_t *size;
233
234#ifdef CONFIG_X86_32
235 data = (void **)__pa_nodebug(&ucode_cpio.data);
236 size = (size_t *)__pa_nodebug(&ucode_cpio.size);
237#else
238 data = &ucode_cpio.data;
239 size = &ucode_cpio.size;
240#endif
241
242 cp = find_ucode_in_initrd();
243 if (!cp.data)
194 return; 244 return;
195 245
196 apply_ucode_in_initrd(cd.data, cd.size); 246 *data = cp.data;
247 *size = cp.size;
248
249 apply_ucode_in_initrd(cp.data, cp.size);
197} 250}
198 251
199#ifdef CONFIG_X86_32 252#ifdef CONFIG_X86_32
200u8 amd_bsp_mpb[MPB_MAX_SIZE];
201
202/* 253/*
203 * On 32-bit, since AP's early load occurs before paging is turned on, we 254 * On 32-bit, since AP's early load occurs before paging is turned on, we
204 * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during 255 * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
205 * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During 256 * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
206 * save_microcode_in_initrd_amd() BSP's patch is copied to amd_bsp_mpb, which 257 * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
207 * is used upon resume from suspend. 258 * which is used upon resume from suspend.
208 */ 259 */
209void load_ucode_amd_ap(void) 260void load_ucode_amd_ap(void)
210{ 261{
211 struct microcode_amd *mc; 262 struct microcode_amd *mc;
212 unsigned long *initrd;
213 unsigned long *uoffset;
214 size_t *usize; 263 size_t *usize;
215 void *ucode; 264 void **ucode;
216 265
217 mc = (struct microcode_amd *)__pa(amd_bsp_mpb); 266 mc = (struct microcode_amd *)__pa(amd_ucode_patch);
218 if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { 267 if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
219 __apply_microcode_amd(mc); 268 __apply_microcode_amd(mc);
220 return; 269 return;
221 } 270 }
222 271
223 initrd = (unsigned long *)__pa(&initrd_start); 272 ucode = (void *)__pa_nodebug(&container);
224 uoffset = (unsigned long *)__pa(&ucode_offset); 273 usize = (size_t *)__pa_nodebug(&container_size);
225 usize = (size_t *)__pa(&ucode_size);
226 274
227 if (!*usize || !*initrd) 275 if (!*ucode || !*usize)
228 return; 276 return;
229 277
230 ucode = (void *)((unsigned long)__pa(*initrd) + *uoffset); 278 apply_ucode_in_initrd(*ucode, *usize);
231 apply_ucode_in_initrd(ucode, *usize);
232} 279}
233 280
234static void __init collect_cpu_sig_on_bsp(void *arg) 281static void __init collect_cpu_sig_on_bsp(void *arg)
235{ 282{
236 unsigned int cpu = smp_processor_id(); 283 unsigned int cpu = smp_processor_id();
237 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 284 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
285
238 uci->cpu_sig.sig = cpuid_eax(0x00000001); 286 uci->cpu_sig.sig = cpuid_eax(0x00000001);
239} 287}
288
289static void __init get_bsp_sig(void)
290{
291 unsigned int bsp = boot_cpu_data.cpu_index;
292 struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
293
294 if (!uci->cpu_sig.sig)
295 smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
296}
240#else 297#else
241void load_ucode_amd_ap(void) 298void load_ucode_amd_ap(void)
242{ 299{
243 unsigned int cpu = smp_processor_id(); 300 unsigned int cpu = smp_processor_id();
244 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 301 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
302 struct equiv_cpu_entry *eq;
303 struct microcode_amd *mc;
245 u32 rev, eax; 304 u32 rev, eax;
305 u16 eq_id;
306
307 /* Exit if called on the BSP. */
308 if (!cpu)
309 return;
310
311 if (!container)
312 return;
246 313
247 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); 314 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
248 eax = cpuid_eax(0x00000001);
249 315
250 uci->cpu_sig.rev = rev; 316 uci->cpu_sig.rev = rev;
251 uci->cpu_sig.sig = eax; 317 uci->cpu_sig.sig = eax;
252 318
253 if (cpu && !ucode_loaded) { 319 eax = cpuid_eax(0x00000001);
254 void *ucode; 320 eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
321
322 eq_id = find_equiv_id(eq, eax);
323 if (!eq_id)
324 return;
255 325
256 if (!ucode_size || !initrd_start) 326 if (eq_id == this_equiv_id) {
257 return; 327 mc = (struct microcode_amd *)amd_ucode_patch;
328
329 if (mc && rev < mc->hdr.patch_id) {
330 if (!__apply_microcode_amd(mc))
331 ucode_new_rev = mc->hdr.patch_id;
332 }
258 333
259 ucode = (void *)(initrd_start + ucode_offset); 334 } else {
260 eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); 335 if (!ucode_cpio.data)
261 if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK)
262 return; 336 return;
263 337
264 ucode_loaded = true; 338 /*
339 * AP has a different equivalence ID than BSP, looks like
340 * mixed-steppings silicon so go through the ucode blob anew.
341 */
342 apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);
265 } 343 }
266
267 apply_microcode_amd(cpu);
268} 344}
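/*
 * Editor's note -- illustrative sketch, not part of this patch: the
 * find_equiv_id() used above walks the container's equivalence table,
 * mapping the CPUID(1) signature to the equivalence ID matched against
 * each patch header.  Assuming a zero-terminated table, roughly:
 *
 *	static u16 find_equiv_id(struct equiv_cpu_entry *table, u32 sig)
 *	{
 *		int i;
 *
 *		for (i = 0; table && table[i].installed_cpu; i++)
 *			if (sig == table[i].installed_cpu)
 *				return table[i].equiv_cpu;
 *		return 0;
 *	}
 */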
269#endif 345#endif
270 346
271int __init save_microcode_in_initrd_amd(void) 347int __init save_microcode_in_initrd_amd(void)
272{ 348{
349 unsigned long cont;
273 enum ucode_state ret; 350 enum ucode_state ret;
274 void *ucode;
275 u32 eax; 351 u32 eax;
276 352
277#ifdef CONFIG_X86_32 353 if (!container)
278 unsigned int bsp = boot_cpu_data.cpu_index; 354 return -EINVAL;
279 struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
280 355
281 if (!uci->cpu_sig.sig) 356#ifdef CONFIG_X86_32
282 smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); 357 get_bsp_sig();
358 cont = (unsigned long)container;
359#else
360 /*
361 * We need the physical address of the container for both bitness since
362 * boot_params.hdr.ramdisk_image is a physical address.
363 */
364 cont = __pa(container);
283#endif 365#endif
366
367 /*
368 * Take into account the fact that the ramdisk might get relocated and
369 * therefore we need to recompute the container's position in virtual
370 * memory space.
371 */
372 if (relocated_ramdisk)
373 container = (u8 *)(__va(relocated_ramdisk) +
374 (cont - boot_params.hdr.ramdisk_image));
375
284 if (ucode_new_rev) 376 if (ucode_new_rev)
285 pr_info("microcode: updated early to new patch_level=0x%08x\n", 377 pr_info("microcode: updated early to new patch_level=0x%08x\n",
286 ucode_new_rev); 378 ucode_new_rev);
287 379
288 if (ucode_loaded || !ucode_size || !initrd_start)
289 return 0;
290
291 ucode = (void *)(initrd_start + ucode_offset);
292 eax = cpuid_eax(0x00000001); 380 eax = cpuid_eax(0x00000001);
293 eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); 381 eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
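/*
 * Editor's note -- worked example: for CPUID(1).eax == 0x00600f12
 * (family 15h), bits 11:8 give the base family 0xf and bits 27:20 the
 * extended family 0x6, so eax ends up as 0xf + 0x6 = 0x15.
 */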
294 382
295 ret = load_microcode_amd(eax, ucode, ucode_size); 383 ret = load_microcode_amd(eax, container, container_size);
296 if (ret != UCODE_OK) 384 if (ret != UCODE_OK)
297 return -EINVAL; 385 return -EINVAL;
298 386
299 ucode_loaded = true; 387 /*
388 * This will be freed any msec now, stash patches for the current
389 * family and switch to patch cache for cpu hotplug, etc later.
390 */
391 container = NULL;
392 container_size = 0;
393
300 return 0; 394 return 0;
301} 395}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/cpu/microcode/core.c
index 15c987698b0f..15c987698b0f 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index be7f8514f577..be7f8514f577 100644
--- a/arch/x86/kernel/microcode_core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 5fb2cebf556b..a276fa75d9b5 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -278,7 +278,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
278 sprintf(name, "intel-ucode/%02x-%02x-%02x", 278 sprintf(name, "intel-ucode/%02x-%02x-%02x",
279 c->x86, c->x86_model, c->x86_mask); 279 c->x86, c->x86_model, c->x86_mask);
280 280
281 if (request_firmware(&firmware, name, device)) { 281 if (request_firmware_direct(&firmware, name, device)) {
282 pr_debug("data file %s load failed\n", name); 282 pr_debug("data file %s load failed\n", name);
283 return UCODE_NFOUND; 283 return UCODE_NFOUND;
284 } 284 }
diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 1575deb2e636..18f739129e72 100644
--- a/arch/x86/kernel/microcode_intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -365,16 +365,6 @@ out:
365 return state; 365 return state;
366} 366}
367 367
368#define native_rdmsr(msr, val1, val2) \
369do { \
370 u64 __val = native_read_msr((msr)); \
371 (void)((val1) = (u32)__val); \
372 (void)((val2) = (u32)(__val >> 32)); \
373} while (0)
374
375#define native_wrmsr(msr, low, high) \
376 native_write_msr(msr, low, high);
377
378static int collect_cpu_info_early(struct ucode_cpu_info *uci) 368static int collect_cpu_info_early(struct ucode_cpu_info *uci)
379{ 369{
380 unsigned int val[2]; 370 unsigned int val[2];
diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index ce69320d0179..ce69320d0179 100644
--- a/arch/x86/kernel/microcode_intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 9f7ca266864a..76f98fe5b35c 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -17,6 +17,7 @@
17#include <linux/hardirq.h> 17#include <linux/hardirq.h>
18#include <linux/efi.h> 18#include <linux/efi.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/irq.h>
20#include <asm/processor.h> 21#include <asm/processor.h>
21#include <asm/hypervisor.h> 22#include <asm/hypervisor.h>
22#include <asm/hyperv.h> 23#include <asm/hyperv.h>
@@ -26,10 +27,50 @@
26#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
27#include <asm/i8259.h> 28#include <asm/i8259.h>
28#include <asm/apic.h> 29#include <asm/apic.h>
30#include <asm/timer.h>
29 31
30struct ms_hyperv_info ms_hyperv; 32struct ms_hyperv_info ms_hyperv;
31EXPORT_SYMBOL_GPL(ms_hyperv); 33EXPORT_SYMBOL_GPL(ms_hyperv);
32 34
35#if IS_ENABLED(CONFIG_HYPERV)
36static void (*vmbus_handler)(void);
37
38void hyperv_vector_handler(struct pt_regs *regs)
39{
40 struct pt_regs *old_regs = set_irq_regs(regs);
41
42 irq_enter();
43 exit_idle();
44
45 inc_irq_stat(irq_hv_callback_count);
46 if (vmbus_handler)
47 vmbus_handler();
48
49 irq_exit();
50 set_irq_regs(old_regs);
51}
52
53void hv_setup_vmbus_irq(void (*handler)(void))
54{
55 vmbus_handler = handler;
56 /*
 57 * Set up the IDT entry for the hypervisor callback and prevent
 58 * re-allocation at module reload.
59 */
60 if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
61 alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
62 hyperv_callback_vector);
63}
64
65void hv_remove_vmbus_irq(void)
66{
67 /* We have no way to deallocate the interrupt gate */
68 vmbus_handler = NULL;
69}
70EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
71EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
72#endif
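/*
 * Editor's note -- hypothetical usage sketch: the VMBus driver hands
 * its callback to the new hooks instead of the removed
 * hv_register_vmbus_handler():
 *
 *	hv_setup_vmbus_irq(vmbus_isr);	// on vmbus probe
 *	hv_remove_vmbus_irq();		// on vmbus teardown
 *
 * where vmbus_isr is the driver's void (*)(void) callback.
 */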
73
33static uint32_t __init ms_hyperv_platform(void) 74static uint32_t __init ms_hyperv_platform(void)
34{ 75{
35 u32 eax; 76 u32 eax;
@@ -105,6 +146,11 @@ static void __init ms_hyperv_init_platform(void)
105 146
106 if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) 147 if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
107 clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); 148 clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
149
150#ifdef CONFIG_X86_IO_APIC
151 no_timer_check = 1;
152#endif
153
108} 154}
109 155
110const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { 156const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
@@ -113,41 +159,3 @@ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
113 .init_platform = ms_hyperv_init_platform, 159 .init_platform = ms_hyperv_init_platform,
114}; 160};
115EXPORT_SYMBOL(x86_hyper_ms_hyperv); 161EXPORT_SYMBOL(x86_hyper_ms_hyperv);
116
117#if IS_ENABLED(CONFIG_HYPERV)
118static int vmbus_irq = -1;
119static irq_handler_t vmbus_isr;
120
121void hv_register_vmbus_handler(int irq, irq_handler_t handler)
122{
123 /*
124 * Setup the IDT for hypervisor callback.
125 */
126 alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
127
128 vmbus_irq = irq;
129 vmbus_isr = handler;
130}
131
132void hyperv_vector_handler(struct pt_regs *regs)
133{
134 struct pt_regs *old_regs = set_irq_regs(regs);
135 struct irq_desc *desc;
136
137 irq_enter();
138 exit_idle();
139
140 desc = irq_to_desc(vmbus_irq);
141
142 if (desc)
143 generic_handle_irq_desc(vmbus_irq, desc);
144
145 irq_exit();
146 set_irq_regs(old_regs);
147}
148#else
149void hv_register_vmbus_handler(int irq, irq_handler_t handler)
150{
151}
152#endif
153EXPORT_SYMBOL_GPL(hv_register_vmbus_handler);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index ce2d0a2c3e4f..0e25a1bc5ab5 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
683 } 683 }
684 684
685 /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ 685 /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
686 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); 686 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
687 __flush_tlb(); 687 __flush_tlb();
688 688
689 /* Save MTRR state */ 689 /* Save MTRR state */
@@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
697static void post_set(void) __releases(set_atomicity_lock) 697static void post_set(void) __releases(set_atomicity_lock)
698{ 698{
699 /* Flush TLBs (no need to flush caches - they are disabled) */ 699 /* Flush TLBs (no need to flush caches - they are disabled) */
700 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); 700 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
701 __flush_tlb(); 701 __flush_tlb();
702 702
703 /* Intel (P6) standard MTRRs */ 703 /* Intel (P6) standard MTRRs */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8e132931614d..ae407f7226c8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -892,7 +892,6 @@ static void x86_pmu_enable(struct pmu *pmu)
892 * hw_perf_group_sched_in() or x86_pmu_enable() 892 * hw_perf_group_sched_in() or x86_pmu_enable()
893 * 893 *
894 * step1: save events moving to new counters 894 * step1: save events moving to new counters
895 * step2: reprogram moved events into new counters
896 */ 895 */
897 for (i = 0; i < n_running; i++) { 896 for (i = 0; i < n_running; i++) {
898 event = cpuc->event_list[i]; 897 event = cpuc->event_list[i];
@@ -918,6 +917,9 @@ static void x86_pmu_enable(struct pmu *pmu)
918 x86_pmu_stop(event, PERF_EF_UPDATE); 917 x86_pmu_stop(event, PERF_EF_UPDATE);
919 } 918 }
920 919
920 /*
921 * step2: reprogram moved events into new counters
922 */
921 for (i = 0; i < cpuc->n_events; i++) { 923 for (i = 0; i < cpuc->n_events; i++) {
922 event = cpuc->event_list[i]; 924 event = cpuc->event_list[i];
923 hwc = &event->hw; 925 hwc = &event->hw;
@@ -1043,7 +1045,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
1043 /* 1045 /*
1044 * If group events scheduling transaction was started, 1046 * If group events scheduling transaction was started,
1045 * skip the schedulability test here, it will be performed 1047 * skip the schedulability test here, it will be performed
1046 * at commit time (->commit_txn) as a whole 1048 * at commit time (->commit_txn) as a whole.
1047 */ 1049 */
1048 if (cpuc->group_flag & PERF_EVENT_TXN) 1050 if (cpuc->group_flag & PERF_EVENT_TXN)
1049 goto done_collect; 1051 goto done_collect;
@@ -1058,6 +1060,10 @@ static int x86_pmu_add(struct perf_event *event, int flags)
1058 memcpy(cpuc->assign, assign, n*sizeof(int)); 1060 memcpy(cpuc->assign, assign, n*sizeof(int));
1059 1061
1060done_collect: 1062done_collect:
1063 /*
1064 * Commit the collect_events() state. See x86_pmu_del() and
1065 * x86_pmu_*_txn().
1066 */
1061 cpuc->n_events = n; 1067 cpuc->n_events = n;
1062 cpuc->n_added += n - n0; 1068 cpuc->n_added += n - n0;
1063 cpuc->n_txn += n - n0; 1069 cpuc->n_txn += n - n0;
@@ -1183,25 +1189,38 @@ static void x86_pmu_del(struct perf_event *event, int flags)
1183 * If we're called during a txn, we don't need to do anything. 1189 * If we're called during a txn, we don't need to do anything.
1184 * The events never got scheduled and ->cancel_txn will truncate 1190 * The events never got scheduled and ->cancel_txn will truncate
1185 * the event_list. 1191 * the event_list.
1192 *
1193 * XXX assumes any ->del() called during a TXN will only be on
1194 * an event added during that same TXN.
1186 */ 1195 */
1187 if (cpuc->group_flag & PERF_EVENT_TXN) 1196 if (cpuc->group_flag & PERF_EVENT_TXN)
1188 return; 1197 return;
1189 1198
1199 /*
1200 * Not a TXN, therefore cleanup properly.
1201 */
1190 x86_pmu_stop(event, PERF_EF_UPDATE); 1202 x86_pmu_stop(event, PERF_EF_UPDATE);
1191 1203
1192 for (i = 0; i < cpuc->n_events; i++) { 1204 for (i = 0; i < cpuc->n_events; i++) {
1193 if (event == cpuc->event_list[i]) { 1205 if (event == cpuc->event_list[i])
1206 break;
1207 }
1194 1208
1195 if (x86_pmu.put_event_constraints) 1209 if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
1196 x86_pmu.put_event_constraints(cpuc, event); 1210 return;
1197 1211
1198 while (++i < cpuc->n_events) 1212 /* If we have a newly added event; make sure to decrease n_added. */
1199 cpuc->event_list[i-1] = cpuc->event_list[i]; 1213 if (i >= cpuc->n_events - cpuc->n_added)
1214 --cpuc->n_added;
1215
1216 if (x86_pmu.put_event_constraints)
1217 x86_pmu.put_event_constraints(cpuc, event);
1218
1219 /* Delete the array entry. */
1220 while (++i < cpuc->n_events)
1221 cpuc->event_list[i-1] = cpuc->event_list[i];
1222 --cpuc->n_events;
1200 1223
1201 --cpuc->n_events;
1202 break;
1203 }
1204 }
1205 perf_event_update_userpage(event); 1224 perf_event_update_userpage(event);
1206} 1225}
1207 1226
@@ -1521,6 +1540,8 @@ static int __init init_hw_perf_events(void)
1521 1540
1522 pr_cont("%s PMU driver.\n", x86_pmu.name); 1541 pr_cont("%s PMU driver.\n", x86_pmu.name);
1523 1542
1543 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
1544
1524 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) 1545 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
1525 quirk->func(); 1546 quirk->func();
1526 1547
@@ -1534,7 +1555,6 @@ static int __init init_hw_perf_events(void)
1534 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 1555 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1535 0, x86_pmu.num_counters, 0, 0); 1556 0, x86_pmu.num_counters, 0, 0);
1536 1557
1537 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
1538 x86_pmu_format_group.attrs = x86_pmu.format_attrs; 1558 x86_pmu_format_group.attrs = x86_pmu.format_attrs;
1539 1559
1540 if (x86_pmu.event_attrs) 1560 if (x86_pmu.event_attrs)
@@ -1594,7 +1614,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
1594{ 1614{
1595 __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); 1615 __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1596 /* 1616 /*
1597 * Truncate the collected events. 1617 * Truncate collected array by the number of events added in this
1618 * transaction. See x86_pmu_add() and x86_pmu_*_txn().
1598 */ 1619 */
1599 __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); 1620 __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1600 __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); 1621 __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
@@ -1605,6 +1626,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
1605 * Commit group events scheduling transaction 1626 * Commit group events scheduling transaction
1606 * Perform the group schedulability test as a whole 1627 * Perform the group schedulability test as a whole
1607 * Return 0 if success 1628 * Return 0 if success
1629 *
1630 * Does not cancel the transaction on failure; expects the caller to do this.
1608 */ 1631 */
1609static int x86_pmu_commit_txn(struct pmu *pmu) 1632static int x86_pmu_commit_txn(struct pmu *pmu)
1610{ 1633{
@@ -1820,9 +1843,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
1820 if (ret) 1843 if (ret)
1821 return ret; 1844 return ret;
1822 1845
1846 if (x86_pmu.attr_rdpmc_broken)
1847 return -ENOTSUPP;
1848
1823 if (!!val != !!x86_pmu.attr_rdpmc) { 1849 if (!!val != !!x86_pmu.attr_rdpmc) {
1824 x86_pmu.attr_rdpmc = !!val; 1850 x86_pmu.attr_rdpmc = !!val;
1825 smp_call_function(change_rdpmc, (void *)val, 1); 1851 on_each_cpu(change_rdpmc, (void *)val, 1);
1826 } 1852 }
1827 1853
1828 return count; 1854 return count;
@@ -1883,21 +1909,27 @@ static struct pmu pmu = {
1883 1909
1884void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 1910void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
1885{ 1911{
1912 struct cyc2ns_data *data;
1913
1886 userpg->cap_user_time = 0; 1914 userpg->cap_user_time = 0;
1887 userpg->cap_user_time_zero = 0; 1915 userpg->cap_user_time_zero = 0;
1888 userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; 1916 userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
1889 userpg->pmc_width = x86_pmu.cntval_bits; 1917 userpg->pmc_width = x86_pmu.cntval_bits;
1890 1918
1891 if (!sched_clock_stable) 1919 if (!sched_clock_stable())
1892 return; 1920 return;
1893 1921
1922 data = cyc2ns_read_begin();
1923
1894 userpg->cap_user_time = 1; 1924 userpg->cap_user_time = 1;
1895 userpg->time_mult = this_cpu_read(cyc2ns); 1925 userpg->time_mult = data->cyc2ns_mul;
1896 userpg->time_shift = CYC2NS_SCALE_FACTOR; 1926 userpg->time_shift = data->cyc2ns_shift;
1897 userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; 1927 userpg->time_offset = data->cyc2ns_offset - now;
1898 1928
1899 userpg->cap_user_time_zero = 1; 1929 userpg->cap_user_time_zero = 1;
1900 userpg->time_zero = this_cpu_read(cyc2ns_offset); 1930 userpg->time_zero = data->cyc2ns_offset;
1931
1932 cyc2ns_read_end(data);
1901} 1933}
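/*
 * Editor's note -- sketch of the consumer side, per the documented
 * perf_event_mmap_page protocol (an assumption, not part of this
 * patch): with cap_user_time_zero set, userspace converts a TSC value
 * to nanoseconds as
 *
 *	quot = cyc >> time_shift;
 *	rem  = cyc & ((1ULL << time_shift) - 1);
 *	ns   = time_zero + quot * time_mult +
 *	       ((rem * time_mult) >> time_shift);
 *
 * re-reading the fields under the mmap page's seqlock.
 */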
1902 1934
1903/* 1935/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index c1a861829d81..3b2f9bdd974b 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -130,9 +130,11 @@ struct cpu_hw_events {
130 unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 130 unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
131 int enabled; 131 int enabled;
132 132
133 int n_events; 133 int n_events; /* the # of events in the below arrays */
134 int n_added; 134 int n_added; /* the # last events in the below arrays;
135 int n_txn; 135 they've never been enabled yet */
136 int n_txn; /* the # last events in the below arrays;
137 added in the current transaction */
136 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ 138 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
137 u64 tags[X86_PMC_IDX_MAX]; 139 u64 tags[X86_PMC_IDX_MAX];
138 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ 140 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
@@ -409,6 +411,7 @@ struct x86_pmu {
409 /* 411 /*
410 * sysfs attrs 412 * sysfs attrs
411 */ 413 */
414 int attr_rdpmc_broken;
412 int attr_rdpmc; 415 int attr_rdpmc;
413 struct attribute **format_attrs; 416 struct attribute **format_attrs;
414 struct attribute **event_attrs; 417 struct attribute **event_attrs;
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 4b8e4d3cd6ea..4c36bbe3173a 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -926,13 +926,13 @@ static __init int amd_ibs_init(void)
926 goto out; 926 goto out;
927 927
928 perf_ibs_pm_init(); 928 perf_ibs_pm_init();
929 get_online_cpus(); 929 cpu_notifier_register_begin();
930 ibs_caps = caps; 930 ibs_caps = caps;
931 /* make ibs_caps visible to other cpus: */ 931 /* make ibs_caps visible to other cpus: */
932 smp_mb(); 932 smp_mb();
933 perf_cpu_notifier(perf_ibs_cpu_notifier);
934 smp_call_function(setup_APIC_ibs, NULL, 1); 933 smp_call_function(setup_APIC_ibs, NULL, 1);
935 put_online_cpus(); 934 __perf_cpu_notifier(perf_ibs_cpu_notifier);
935 cpu_notifier_register_done();
936 936
937 ret = perf_event_ibs_init(); 937 ret = perf_event_ibs_init();
938out: 938out:
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
index 754291adec33..3bbdf4cd38b9 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
@@ -531,15 +531,16 @@ static int __init amd_uncore_init(void)
531 if (ret) 531 if (ret)
532 return -ENODEV; 532 return -ENODEV;
533 533
534 get_online_cpus(); 534 cpu_notifier_register_begin();
535
535 /* init cpus already online before registering for hotplug notifier */ 536 /* init cpus already online before registering for hotplug notifier */
536 for_each_online_cpu(cpu) { 537 for_each_online_cpu(cpu) {
537 amd_uncore_cpu_up_prepare(cpu); 538 amd_uncore_cpu_up_prepare(cpu);
538 smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); 539 smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
539 } 540 }
540 541
541 register_cpu_notifier(&amd_uncore_cpu_notifier_block); 542 __register_cpu_notifier(&amd_uncore_cpu_notifier_block);
542 put_online_cpus(); 543 cpu_notifier_register_done();
543 544
544 return 0; 545 return 0;
545} 546}
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 0fa4f242f050..aa333d966886 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1361,10 +1361,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1361 intel_pmu_disable_all(); 1361 intel_pmu_disable_all();
1362 handled = intel_pmu_drain_bts_buffer(); 1362 handled = intel_pmu_drain_bts_buffer();
1363 status = intel_pmu_get_status(); 1363 status = intel_pmu_get_status();
1364 if (!status) { 1364 if (!status)
1365 intel_pmu_enable_all(0); 1365 goto done;
1366 return handled;
1367 }
1368 1366
1369 loops = 0; 1367 loops = 0;
1370again: 1368again:
@@ -2310,10 +2308,7 @@ __init int intel_pmu_init(void)
2310 if (version > 1) 2308 if (version > 1)
2311 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); 2309 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
2312 2310
2313 /* 2311 if (boot_cpu_has(X86_FEATURE_PDCM)) {
2314 * v2 and above have a perf capabilities MSR
2315 */
2316 if (version > 1) {
2317 u64 capabilities; 2312 u64 capabilities;
2318 2313
2319 rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); 2314 rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
new file mode 100644
index 000000000000..059218ed5208
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -0,0 +1,680 @@
1/*
2 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
3 * Copyright (C) 2013 Google, Inc., Stephane Eranian
4 *
5 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
6 * section 14.7.1 (September 2013)
7 *
8 * RAPL provides more controls than just reporting energy consumption
9 * however here we only expose the 3 energy consumption free running
10 * counters (pp0, pkg, dram).
11 *
12 * Each of those counters increments in a power unit defined by the
13 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
14 * but it can vary.
15 *
16 * Counter to rapl events mappings:
17 *
18 * pp0 counter: consumption of all physical cores (power plane 0)
19 * event: rapl_energy_cores
20 * perf code: 0x1
21 *
22 * pkg counter: consumption of the whole processor package
23 * event: rapl_energy_pkg
24 * perf code: 0x2
25 *
26 * dram counter: consumption of the dram domain (servers only)
27 * event: rapl_energy_dram
28 * perf code: 0x3
29 *
30 * dram counter: consumption of the builtin-gpu domain (client only)
31 * event: rapl_energy_gpu
32 * perf code: 0x4
33 *
34 * We manage those counters as free running (read-only). They may be
 35 * used simultaneously by other tools, such as turbostat.
36 *
37 * The events only support system-wide mode counting. There is no
38 * sampling support because it does not make sense and is not
39 * supported by the RAPL hardware.
40 *
41 * Because we want to avoid floating-point operations in the kernel,
 42 * the events are all reported in fixed-point arithmetic (32.32).
 43 * Tools must scale the counts to Joules, e.g. with
 44 * ldexp(raw_count, -32), and divide by the measurement duration
 45 * to obtain Watts.
46 */
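/*
 * Editor's note -- worked example: with the API unit fixed at 2^-32
 * Joules, a raw count of 0x80000000 is ldexp(0x80000000, -32) = 0.5 J;
 * over a 2 s measurement window that is an average of 0.25 W.
 */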
47#include <linux/module.h>
48#include <linux/slab.h>
49#include <linux/perf_event.h>
50#include <asm/cpu_device_id.h>
51#include "perf_event.h"
52
53/*
54 * RAPL energy status counters
55 */
56#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */
57#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */
58#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */
59#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */
60#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */
61#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */
 62#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
63#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
64
65/* Clients have PP0, PKG */
66#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
67 1<<RAPL_IDX_PKG_NRG_STAT|\
68 1<<RAPL_IDX_PP1_NRG_STAT)
69
70/* Servers have PP0, PKG, RAM */
71#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\
72 1<<RAPL_IDX_PKG_NRG_STAT|\
73 1<<RAPL_IDX_RAM_NRG_STAT)
74
75/*
76 * event code: LSB 8 bits, passed in attr->config
77 * any other bit is reserved
78 */
79#define RAPL_EVENT_MASK 0xFFULL
80
81#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
82static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
83 struct kobj_attribute *attr, \
84 char *page) \
85{ \
86 BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
87 return sprintf(page, _format "\n"); \
88} \
89static struct kobj_attribute format_attr_##_var = \
90 __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
91
92#define RAPL_EVENT_DESC(_name, _config) \
93{ \
94 .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
95 .config = _config, \
96}
97
98#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
99
100struct rapl_pmu {
101 spinlock_t lock;
102 int hw_unit; /* 1/2^hw_unit Joule */
103 int n_active; /* number of active events */
104 struct list_head active_list;
105 struct pmu *pmu; /* pointer to rapl_pmu_class */
106 ktime_t timer_interval; /* in ktime_t unit */
107 struct hrtimer hrtimer;
108};
109
110static struct pmu rapl_pmu_class;
111static cpumask_t rapl_cpu_mask;
112static int rapl_cntr_mask;
113
114static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
115static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
116
117static inline u64 rapl_read_counter(struct perf_event *event)
118{
119 u64 raw;
120 rdmsrl(event->hw.event_base, raw);
121 return raw;
122}
123
124static inline u64 rapl_scale(u64 v)
125{
126 /*
 127 * scale delta to the smallest unit (2^-32 Joules);
 128 * users must then scale back: count * 2^-32 to get Joules,
 129 * e.g. via ldexp(count, -32).
 130 * Watts = Joules / time delta
131 */
132 return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit);
133}
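/*
 * Editor's note -- e.g. with hw_unit == 16 (SandyBridge: 1/2^16 J per
 * count) the shift is 32 - 16 = 16, so one raw count becomes 2^16
 * units of 2^-32 J, i.e. exactly 1/2^16 J again -- only the exponent
 * changes, not the measured energy.
 */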
134
135static u64 rapl_event_update(struct perf_event *event)
136{
137 struct hw_perf_event *hwc = &event->hw;
138 u64 prev_raw_count, new_raw_count;
139 s64 delta, sdelta;
140 int shift = RAPL_CNTR_WIDTH;
141
142again:
143 prev_raw_count = local64_read(&hwc->prev_count);
144 rdmsrl(event->hw.event_base, new_raw_count);
145
146 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
147 new_raw_count) != prev_raw_count) {
148 cpu_relax();
149 goto again;
150 }
151
152 /*
153 * Now we have the new raw value and have updated the prev
154 * timestamp already. We can now calculate the elapsed delta
155 * (event-)time and add that to the generic event.
156 *
157 * Careful, not all hw sign-extends above the physical width
158 * of the count.
159 */
160 delta = (new_raw_count << shift) - (prev_raw_count << shift);
161 delta >>= shift;
162
163 sdelta = rapl_scale(delta);
164
165 local64_add(sdelta, &event->count);
166
167 return new_raw_count;
168}
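/*
 * Editor's note: the shift pair above is the usual width-extension
 * trick -- with shift == 32, (new << 32) - (prev << 32) computed in
 * s64 and shifted back down yields the wrapped 32-bit delta with
 * correct sign, so a counter rollover between reads is still counted
 * (e.g. prev = 0xffffffff, new = 1 gives delta = 2).
 */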
169
170static void rapl_start_hrtimer(struct rapl_pmu *pmu)
171{
172 __hrtimer_start_range_ns(&pmu->hrtimer,
173 pmu->timer_interval, 0,
174 HRTIMER_MODE_REL_PINNED, 0);
175}
176
177static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
178{
179 hrtimer_cancel(&pmu->hrtimer);
180}
181
182static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
183{
184 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
185 struct perf_event *event;
186 unsigned long flags;
187
188 if (!pmu->n_active)
189 return HRTIMER_NORESTART;
190
191 spin_lock_irqsave(&pmu->lock, flags);
192
193 list_for_each_entry(event, &pmu->active_list, active_entry) {
194 rapl_event_update(event);
195 }
196
197 spin_unlock_irqrestore(&pmu->lock, flags);
198
199 hrtimer_forward_now(hrtimer, pmu->timer_interval);
200
201 return HRTIMER_RESTART;
202}
203
204static void rapl_hrtimer_init(struct rapl_pmu *pmu)
205{
206 struct hrtimer *hr = &pmu->hrtimer;
207
208 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
209 hr->function = rapl_hrtimer_handle;
210}
211
212static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
213 struct perf_event *event)
214{
215 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
216 return;
217
218 event->hw.state = 0;
219
220 list_add_tail(&event->active_entry, &pmu->active_list);
221
222 local64_set(&event->hw.prev_count, rapl_read_counter(event));
223
224 pmu->n_active++;
225 if (pmu->n_active == 1)
226 rapl_start_hrtimer(pmu);
227}
228
229static void rapl_pmu_event_start(struct perf_event *event, int mode)
230{
231 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
232 unsigned long flags;
233
234 spin_lock_irqsave(&pmu->lock, flags);
235 __rapl_pmu_event_start(pmu, event);
236 spin_unlock_irqrestore(&pmu->lock, flags);
237}
238
239static void rapl_pmu_event_stop(struct perf_event *event, int mode)
240{
241 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
242 struct hw_perf_event *hwc = &event->hw;
243 unsigned long flags;
244
245 spin_lock_irqsave(&pmu->lock, flags);
246
247 /* mark event as deactivated and stopped */
248 if (!(hwc->state & PERF_HES_STOPPED)) {
249 WARN_ON_ONCE(pmu->n_active <= 0);
250 pmu->n_active--;
251 if (pmu->n_active == 0)
252 rapl_stop_hrtimer(pmu);
253
254 list_del(&event->active_entry);
255
256 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
257 hwc->state |= PERF_HES_STOPPED;
258 }
259
260 /* check if update of sw counter is necessary */
261 if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
262 /*
 263 * Drain the remaining delta count out of an event
264 * that we are disabling:
265 */
266 rapl_event_update(event);
267 hwc->state |= PERF_HES_UPTODATE;
268 }
269
270 spin_unlock_irqrestore(&pmu->lock, flags);
271}
272
273static int rapl_pmu_event_add(struct perf_event *event, int mode)
274{
275 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
276 struct hw_perf_event *hwc = &event->hw;
277 unsigned long flags;
278
279 spin_lock_irqsave(&pmu->lock, flags);
280
281 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
282
283 if (mode & PERF_EF_START)
284 __rapl_pmu_event_start(pmu, event);
285
286 spin_unlock_irqrestore(&pmu->lock, flags);
287
288 return 0;
289}
290
291static void rapl_pmu_event_del(struct perf_event *event, int flags)
292{
293 rapl_pmu_event_stop(event, PERF_EF_UPDATE);
294}
295
296static int rapl_pmu_event_init(struct perf_event *event)
297{
298 u64 cfg = event->attr.config & RAPL_EVENT_MASK;
299 int bit, msr, ret = 0;
300
301 /* only look at RAPL events */
302 if (event->attr.type != rapl_pmu_class.type)
303 return -ENOENT;
304
305 /* check only supported bits are set */
306 if (event->attr.config & ~RAPL_EVENT_MASK)
307 return -EINVAL;
308
309 /*
310 * check event is known (determines counter)
311 */
312 switch (cfg) {
313 case INTEL_RAPL_PP0:
314 bit = RAPL_IDX_PP0_NRG_STAT;
315 msr = MSR_PP0_ENERGY_STATUS;
316 break;
317 case INTEL_RAPL_PKG:
318 bit = RAPL_IDX_PKG_NRG_STAT;
319 msr = MSR_PKG_ENERGY_STATUS;
320 break;
321 case INTEL_RAPL_RAM:
322 bit = RAPL_IDX_RAM_NRG_STAT;
323 msr = MSR_DRAM_ENERGY_STATUS;
324 break;
325 case INTEL_RAPL_PP1:
326 bit = RAPL_IDX_PP1_NRG_STAT;
327 msr = MSR_PP1_ENERGY_STATUS;
328 break;
329 default:
330 return -EINVAL;
331 }
332 /* check event supported */
333 if (!(rapl_cntr_mask & (1 << bit)))
334 return -EINVAL;
335
336 /* unsupported modes and filters */
337 if (event->attr.exclude_user ||
338 event->attr.exclude_kernel ||
339 event->attr.exclude_hv ||
340 event->attr.exclude_idle ||
341 event->attr.exclude_host ||
342 event->attr.exclude_guest ||
343 event->attr.sample_period) /* no sampling */
344 return -EINVAL;
345
346 /* must be done before validate_group */
347 event->hw.event_base = msr;
348 event->hw.config = cfg;
349 event->hw.idx = bit;
350
351 return ret;
352}
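/*
 * Editor's note -- hypothetical usage sketch: the events register
 * under /sys/bus/event_source/devices/power/, so from userspace
 *
 *	perf stat -a -e power/energy-pkg/ -- sleep 1
 *
 * counts package energy system-wide; the -a is required because the
 * PMU uses perf_invalid_context and rejects per-task counting.
 */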
353
354static void rapl_pmu_event_read(struct perf_event *event)
355{
356 rapl_event_update(event);
357}
358
359static ssize_t rapl_get_attr_cpumask(struct device *dev,
360 struct device_attribute *attr, char *buf)
361{
362 int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
363
364 buf[n++] = '\n';
365 buf[n] = '\0';
366 return n;
367}
368
369static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
370
371static struct attribute *rapl_pmu_attrs[] = {
372 &dev_attr_cpumask.attr,
373 NULL,
374};
375
376static struct attribute_group rapl_pmu_attr_group = {
377 .attrs = rapl_pmu_attrs,
378};
379
380EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
381EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
382EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
383EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
384
385EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
386EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
387EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
388EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
389
390/*
 391 * we report in 2^-32 Joule (~0.23 nJ) increments regardless of the MSR unit
392 */
393EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
394EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
395EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
396EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
397
398static struct attribute *rapl_events_srv_attr[] = {
399 EVENT_PTR(rapl_cores),
400 EVENT_PTR(rapl_pkg),
401 EVENT_PTR(rapl_ram),
402
403 EVENT_PTR(rapl_cores_unit),
404 EVENT_PTR(rapl_pkg_unit),
405 EVENT_PTR(rapl_ram_unit),
406
407 EVENT_PTR(rapl_cores_scale),
408 EVENT_PTR(rapl_pkg_scale),
409 EVENT_PTR(rapl_ram_scale),
410 NULL,
411};
412
413static struct attribute *rapl_events_cln_attr[] = {
414 EVENT_PTR(rapl_cores),
415 EVENT_PTR(rapl_pkg),
416 EVENT_PTR(rapl_gpu),
417
418 EVENT_PTR(rapl_cores_unit),
419 EVENT_PTR(rapl_pkg_unit),
420 EVENT_PTR(rapl_gpu_unit),
421
422 EVENT_PTR(rapl_cores_scale),
423 EVENT_PTR(rapl_pkg_scale),
424 EVENT_PTR(rapl_gpu_scale),
425 NULL,
426};
427
428static struct attribute_group rapl_pmu_events_group = {
429 .name = "events",
430 .attrs = NULL, /* patched at runtime */
431};
432
433DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
434static struct attribute *rapl_formats_attr[] = {
435 &format_attr_event.attr,
436 NULL,
437};
438
439static struct attribute_group rapl_pmu_format_group = {
440 .name = "format",
441 .attrs = rapl_formats_attr,
442};
443
444const struct attribute_group *rapl_attr_groups[] = {
445 &rapl_pmu_attr_group,
446 &rapl_pmu_format_group,
447 &rapl_pmu_events_group,
448 NULL,
449};
450
451static struct pmu rapl_pmu_class = {
452 .attr_groups = rapl_attr_groups,
453 .task_ctx_nr = perf_invalid_context, /* system-wide only */
454 .event_init = rapl_pmu_event_init,
455 .add = rapl_pmu_event_add, /* must have */
456 .del = rapl_pmu_event_del, /* must have */
457 .start = rapl_pmu_event_start,
458 .stop = rapl_pmu_event_stop,
459 .read = rapl_pmu_event_read,
460};
461
462static void rapl_cpu_exit(int cpu)
463{
464 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
465 int i, phys_id = topology_physical_package_id(cpu);
466 int target = -1;
467
468 /* find a new cpu on same package */
469 for_each_online_cpu(i) {
470 if (i == cpu)
471 continue;
472 if (phys_id == topology_physical_package_id(i)) {
473 target = i;
474 break;
475 }
476 }
477 /*
 478 * Clear the cpu from the cpumask; if it was set and some cpu
 479 * on the same package is still online, move the duty over to
 480 * that new cpu.
481 */
482 if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
483 cpumask_set_cpu(target, &rapl_cpu_mask);
484
485 WARN_ON(cpumask_empty(&rapl_cpu_mask));
486 /*
487 * migrate events and context to new cpu
488 */
489 if (target >= 0)
490 perf_pmu_migrate_context(pmu->pmu, cpu, target);
491
492 /* cancel overflow polling timer for CPU */
493 rapl_stop_hrtimer(pmu);
494}
495
496static void rapl_cpu_init(int cpu)
497{
498 int i, phys_id = topology_physical_package_id(cpu);
499
 500 /* check if phys_id is already covered */
501 for_each_cpu(i, &rapl_cpu_mask) {
502 if (phys_id == topology_physical_package_id(i))
503 return;
504 }
505 /* was not found, so add it */
506 cpumask_set_cpu(cpu, &rapl_cpu_mask);
507}
508
509static int rapl_cpu_prepare(int cpu)
510{
511 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
512 int phys_id = topology_physical_package_id(cpu);
513 u64 ms;
514
515 if (pmu)
516 return 0;
517
518 if (phys_id < 0)
519 return -1;
520
521 pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
522 if (!pmu)
523 return -1;
524
525 spin_lock_init(&pmu->lock);
526
527 INIT_LIST_HEAD(&pmu->active_list);
528
529 /*
 530 * grab the power unit: counters tick in 1/2^unit Joules;
 531 *
 532 * we cache it in the local PMU instance
533 */
534 rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit);
535 pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL;
536 pmu->pmu = &rapl_pmu_class;
537
538 /*
 539 * Use a reference of 200W (200 Joules/sec) for scaling the
 540 * timeout so that no counter overflow is missed, and halve the
 541 * interval to avoid sampling in lockstep with the wrap period
 542 * (hence the 2 * 100); if the hw unit is 32, this degenerates
 543 * to a 2 ms interval.
544 */
545 if (pmu->hw_unit < 32)
546 ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
547 else
548 ms = 2;
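/*
 * Editor's note -- worked example: for hw_unit == 16 this gives
 * ms = 5 * 2^15 = 163840 ms.  Cross-check: a 32-bit counter in 2^-16 J
 * units wraps after 2^16 J = 65536 J, which at the 200 W reference
 * takes ~328 s; polling every ~164 s is the intended half interval.
 */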
549
550 pmu->timer_interval = ms_to_ktime(ms);
551
552 rapl_hrtimer_init(pmu);
553
554 /* set RAPL pmu for this cpu for now */
555 per_cpu(rapl_pmu, cpu) = pmu;
556 per_cpu(rapl_pmu_to_free, cpu) = NULL;
557
558 return 0;
559}
560
561static void rapl_cpu_kfree(int cpu)
562{
563 struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
564
565 kfree(pmu);
566
567 per_cpu(rapl_pmu_to_free, cpu) = NULL;
568}
569
570static int rapl_cpu_dying(int cpu)
571{
572 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
573
574 if (!pmu)
575 return 0;
576
577 per_cpu(rapl_pmu, cpu) = NULL;
578
579 per_cpu(rapl_pmu_to_free, cpu) = pmu;
580
581 return 0;
582}
583
584static int rapl_cpu_notifier(struct notifier_block *self,
585 unsigned long action, void *hcpu)
586{
587 unsigned int cpu = (long)hcpu;
588
589 switch (action & ~CPU_TASKS_FROZEN) {
590 case CPU_UP_PREPARE:
591 rapl_cpu_prepare(cpu);
592 break;
593 case CPU_STARTING:
594 rapl_cpu_init(cpu);
595 break;
596 case CPU_UP_CANCELED:
597 case CPU_DYING:
598 rapl_cpu_dying(cpu);
599 break;
600 case CPU_ONLINE:
601 case CPU_DEAD:
602 rapl_cpu_kfree(cpu);
603 break;
604 case CPU_DOWN_PREPARE:
605 rapl_cpu_exit(cpu);
606 break;
607 default:
608 break;
609 }
610
611 return NOTIFY_OK;
612}
613
614static const struct x86_cpu_id rapl_cpu_match[] = {
615 [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
616 [1] = {},
617};
618
619static int __init rapl_pmu_init(void)
620{
621 struct rapl_pmu *pmu;
622 int cpu, ret;
623
624 /*
625 * check for Intel processor family 6
626 */
627 if (!x86_match_cpu(rapl_cpu_match))
628 return 0;
629
630 /* check supported CPU */
631 switch (boot_cpu_data.x86_model) {
632 case 42: /* Sandy Bridge */
633 case 58: /* Ivy Bridge */
634 case 60: /* Haswell */
635 case 69: /* Haswell-Celeron */
636 rapl_cntr_mask = RAPL_IDX_CLN;
637 rapl_pmu_events_group.attrs = rapl_events_cln_attr;
638 break;
639 case 45: /* Sandy Bridge-EP */
640 case 62: /* IvyTown */
641 rapl_cntr_mask = RAPL_IDX_SRV;
642 rapl_pmu_events_group.attrs = rapl_events_srv_attr;
643 break;
644
645 default:
646 /* unsupported */
647 return 0;
648 }
649
650 cpu_notifier_register_begin();
651
652 for_each_online_cpu(cpu) {
653 rapl_cpu_prepare(cpu);
654 rapl_cpu_init(cpu);
655 }
656
657 __perf_cpu_notifier(rapl_cpu_notifier);
658
659 ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
660 if (WARN_ON(ret)) {
661 pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
662 cpu_notifier_register_done();
663 return -1;
664 }
665
666 pmu = __get_cpu_var(rapl_pmu);
667
668 pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
669 " API unit is 2^-32 Joules,"
670 " %d fixed counters"
671 " %llu ms ovfl timer\n",
672 pmu->hw_unit,
673 hweight32(rapl_cntr_mask),
674 ktime_to_ms(pmu->timer_interval));
675
676 cpu_notifier_register_done();
677
678 return 0;
679}
680device_initcall(rapl_pmu_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 29c248799ced..65bbbea38b9c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -66,6 +66,47 @@ DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
66DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); 66DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
67DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); 67DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
68 68
69static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
70static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
71static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
72static void uncore_pmu_event_read(struct perf_event *event);
73
74static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
75{
76 return container_of(event->pmu, struct intel_uncore_pmu, pmu);
77}
78
79static struct intel_uncore_box *
80uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
81{
82 struct intel_uncore_box *box;
83
84 box = *per_cpu_ptr(pmu->box, cpu);
85 if (box)
86 return box;
87
88 raw_spin_lock(&uncore_box_lock);
89 list_for_each_entry(box, &pmu->box_list, list) {
90 if (box->phys_id == topology_physical_package_id(cpu)) {
91 atomic_inc(&box->refcnt);
92 *per_cpu_ptr(pmu->box, cpu) = box;
93 break;
94 }
95 }
96 raw_spin_unlock(&uncore_box_lock);
97
98 return *per_cpu_ptr(pmu->box, cpu);
99}
100
101static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
102{
103 /*
 104 * The perf core schedules events on a per-cpu basis; uncore events are
105 * collected by one of the cpus inside a physical package.
106 */
107 return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
108}
109
69static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) 110static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
70{ 111{
71 u64 count; 112 u64 count;
@@ -501,8 +542,11 @@ static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
501 SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, 542 SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
502 SNBEP_CBO_PMON_CTL_TID_EN, 0x1), 543 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
503 SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), 544 SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
545 SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
504 SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), 546 SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
547 SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
505 SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), 548 SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
549 SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
506 SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), 550 SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
507 SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), 551 SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
508 SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), 552 SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
@@ -1178,10 +1222,15 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
1178 SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, 1222 SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
1179 SNBEP_CBO_PMON_CTL_TID_EN, 0x1), 1223 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
1180 SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), 1224 SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
1225 SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
1226 SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
1227 SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
1181 SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), 1228 SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
1229 SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
1182 SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), 1230 SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
1231 SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
1183 SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), 1232 SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
1184 SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), 1233 SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
1185 SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10), 1234 SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
1186 SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), 1235 SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
1187 SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), 1236 SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
@@ -1631,6 +1680,349 @@ static struct intel_uncore_type *snb_msr_uncores[] = {
1631 &snb_uncore_cbox, 1680 &snb_uncore_cbox,
1632 NULL, 1681 NULL,
1633}; 1682};
1683
1684enum {
1685 SNB_PCI_UNCORE_IMC,
1686};
1687
1688static struct uncore_event_desc snb_uncore_imc_events[] = {
1689 INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
1690 INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
1691 INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
1692
1693 INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
1694 INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
1695 INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
1696
1697 { /* end: all zeroes */ },
1698};
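/*
 * Editor's note: the scale 6.103515625e-5 is 64/2^20, which matches a
 * count unit of one 64-byte cache line reported directly in MiB.
 */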
1699
1700#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff
1701#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48
1702
1703/* page size multiple covering all config regs */
1704#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000
1705
1706#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1
1707#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050
1708#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2
1709#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
1710#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
1711
1712static struct attribute *snb_uncore_imc_formats_attr[] = {
1713 &format_attr_event.attr,
1714 NULL,
1715};
1716
1717static struct attribute_group snb_uncore_imc_format_group = {
1718 .name = "format",
1719 .attrs = snb_uncore_imc_formats_attr,
1720};
1721
1722static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
1723{
1724 struct pci_dev *pdev = box->pci_dev;
1725 int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
1726 resource_size_t addr;
1727 u32 pci_dword;
1728
1729 pci_read_config_dword(pdev, where, &pci_dword);
1730 addr = pci_dword;
1731
1732#ifdef CONFIG_PHYS_ADDR_T_64BIT
1733 pci_read_config_dword(pdev, where + 4, &pci_dword);
1734 addr |= ((resource_size_t)pci_dword << 32);
1735#endif
1736
1737 addr &= ~(PAGE_SIZE - 1);
1738
1739 box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
1740 box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
1741}
1742
1743static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
1744{}
1745
1746static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
1747{}
1748
1749static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
1750{}
1751
1752static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
1753{}
1754
1755static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
1756{
1757 struct hw_perf_event *hwc = &event->hw;
1758
1759 return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
1760}
1761
1762/*
 1763 * Custom event_init() function: we define our own fixed, free-
 1764 * running counters, so we do not want to conflict with the
 1765 * generic uncore logic; this also simplifies the processing.
1766 */
1767static int snb_uncore_imc_event_init(struct perf_event *event)
1768{
1769 struct intel_uncore_pmu *pmu;
1770 struct intel_uncore_box *box;
1771 struct hw_perf_event *hwc = &event->hw;
1772 u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
1773 int idx, base;
1774
1775 if (event->attr.type != event->pmu->type)
1776 return -ENOENT;
1777
1778 pmu = uncore_event_to_pmu(event);
1779 /* no device found for this pmu */
1780 if (pmu->func_id < 0)
1781 return -ENOENT;
1782
1783 /* Sampling not supported yet */
1784 if (hwc->sample_period)
1785 return -EINVAL;
1786
1787 /* unsupported modes and filters */
1788 if (event->attr.exclude_user ||
1789 event->attr.exclude_kernel ||
1790 event->attr.exclude_hv ||
1791 event->attr.exclude_idle ||
1792 event->attr.exclude_host ||
1793 event->attr.exclude_guest ||
1794 event->attr.sample_period) /* no sampling */
1795 return -EINVAL;
1796
1797 /*
1798 * Place all uncore events for a particular physical package
1799 * onto a single cpu
1800 */
1801 if (event->cpu < 0)
1802 return -EINVAL;
1803
1804 /* check only supported bits are set */
1805 if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
1806 return -EINVAL;
1807
1808 box = uncore_pmu_to_box(pmu, event->cpu);
1809 if (!box || box->cpu < 0)
1810 return -EINVAL;
1811
1812 event->cpu = box->cpu;
1813
1814 event->hw.idx = -1;
1815 event->hw.last_tag = ~0ULL;
1816 event->hw.extra_reg.idx = EXTRA_REG_NONE;
1817 event->hw.branch_reg.idx = EXTRA_REG_NONE;
1818 /*
1819 * check event is known (whitelist, determines counter)
1820 */
1821 switch (cfg) {
1822 case SNB_UNCORE_PCI_IMC_DATA_READS:
1823 base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
1824 idx = UNCORE_PMC_IDX_FIXED;
1825 break;
1826 case SNB_UNCORE_PCI_IMC_DATA_WRITES:
1827 base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
1828 idx = UNCORE_PMC_IDX_FIXED + 1;
1829 break;
1830 default:
1831 return -EINVAL;
1832 }
1833
1834 /* must be done before validate_group */
1835 event->hw.event_base = base;
1836 event->hw.config = cfg;
1837 event->hw.idx = idx;
1838
1839 /* no group validation needed, we have free running counters */
1840
1841 return 0;
1842}
1843
1844static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
1845{
1846 return 0;
1847}
1848
1849static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
1850{
1851 struct intel_uncore_box *box = uncore_event_to_box(event);
1852 u64 count;
1853
1854 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1855 return;
1856
1857 event->hw.state = 0;
1858 box->n_active++;
1859
1860 list_add_tail(&event->active_entry, &box->active_list);
1861
1862 count = snb_uncore_imc_read_counter(box, event);
1863 local64_set(&event->hw.prev_count, count);
1864
1865 if (box->n_active == 1)
1866 uncore_pmu_start_hrtimer(box);
1867}
1868
1869static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
1870{
1871 struct intel_uncore_box *box = uncore_event_to_box(event);
1872 struct hw_perf_event *hwc = &event->hw;
1873
1874 if (!(hwc->state & PERF_HES_STOPPED)) {
1875 box->n_active--;
1876
1877 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1878 hwc->state |= PERF_HES_STOPPED;
1879
1880 list_del(&event->active_entry);
1881
1882 if (box->n_active == 0)
1883 uncore_pmu_cancel_hrtimer(box);
1884 }
1885
1886 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1887 /*
 1888 * Drain the remaining delta count out of an event
1889 * that we are disabling:
1890 */
1891 uncore_perf_event_update(box, event);
1892 hwc->state |= PERF_HES_UPTODATE;
1893 }
1894}
1895
1896static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
1897{
1898 struct intel_uncore_box *box = uncore_event_to_box(event);
1899 struct hw_perf_event *hwc = &event->hw;
1900
1901 if (!box)
1902 return -ENODEV;
1903
1904 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1905 if (!(flags & PERF_EF_START))
1906 hwc->state |= PERF_HES_ARCH;
1907
1908 snb_uncore_imc_event_start(event, 0);
1909
1910 box->n_events++;
1911
1912 return 0;
1913}
1914
1915static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
1916{
1917 struct intel_uncore_box *box = uncore_event_to_box(event);
1918 int i;
1919
1920 snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
1921
1922 for (i = 0; i < box->n_events; i++) {
1923 if (event == box->event_list[i]) {
1924 --box->n_events;
1925 break;
1926 }
1927 }
1928}
1929
1930static int snb_pci2phy_map_init(int devid)
1931{
1932 struct pci_dev *dev = NULL;
1933 int bus;
1934
1935 dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
1936 if (!dev)
1937 return -ENOTTY;
1938
1939 bus = dev->bus->number;
1940
1941 pcibus_to_physid[bus] = 0;
1942
1943 pci_dev_put(dev);
1944
1945 return 0;
1946}
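/*
 * Editor's note: client SNB/IVB/HSW parts are single-package, so the
 * lone IMC bus can be mapped to physical package id 0 unconditionally.
 */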
1947
1948static struct pmu snb_uncore_imc_pmu = {
1949 .task_ctx_nr = perf_invalid_context,
1950 .event_init = snb_uncore_imc_event_init,
1951 .add = snb_uncore_imc_event_add,
1952 .del = snb_uncore_imc_event_del,
1953 .start = snb_uncore_imc_event_start,
1954 .stop = snb_uncore_imc_event_stop,
1955 .read = uncore_pmu_event_read,
1956};
1957
1958static struct intel_uncore_ops snb_uncore_imc_ops = {
1959 .init_box = snb_uncore_imc_init_box,
1960 .enable_box = snb_uncore_imc_enable_box,
1961 .disable_box = snb_uncore_imc_disable_box,
1962 .disable_event = snb_uncore_imc_disable_event,
1963 .enable_event = snb_uncore_imc_enable_event,
1964 .hw_config = snb_uncore_imc_hw_config,
1965 .read_counter = snb_uncore_imc_read_counter,
1966};
1967
1968static struct intel_uncore_type snb_uncore_imc = {
1969 .name = "imc",
1970 .num_counters = 2,
1971 .num_boxes = 1,
1972 .fixed_ctr_bits = 32,
1973 .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE,
1974 .event_descs = snb_uncore_imc_events,
1975 .format_group = &snb_uncore_imc_format_group,
1976 .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
1977 .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK,
1978 .ops = &snb_uncore_imc_ops,
1979 .pmu = &snb_uncore_imc_pmu,
1980};
1981
1982static struct intel_uncore_type *snb_pci_uncores[] = {
1983 [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc,
1984 NULL,
1985};
1986
1987static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = {
1988 { /* IMC */
1989 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
1990 .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
1991 },
1992 { /* end: all zeroes */ },
1993};
1994
1995static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = {
1996 { /* IMC */
1997 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
1998 .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
1999 },
2000 { /* end: all zeroes */ },
2001};
2002
2003static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = {
2004 { /* IMC */
2005 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
2006 .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
2007 },
2008 { /* end: all zeroes */ },
2009};
2010
2011static struct pci_driver snb_uncore_pci_driver = {
2012 .name = "snb_uncore",
2013 .id_table = snb_uncore_pci_ids,
2014};
2015
2016static struct pci_driver ivb_uncore_pci_driver = {
2017 .name = "ivb_uncore",
2018 .id_table = ivb_uncore_pci_ids,
2019};
2020
2021static struct pci_driver hsw_uncore_pci_driver = {
2022 .name = "hsw_uncore",
2023 .id_table = hsw_uncore_pci_ids,
2024};
2025
1634/* end of Sandy Bridge uncore support */
1635
1636/* Nehalem uncore support */
@@ -2781,6 +3173,7 @@ again:
2781static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
2782{
2783 struct intel_uncore_box *box;
3176 struct perf_event *event;
2784 unsigned long flags;
2785 int bit;
2786
@@ -2793,19 +3186,27 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
2793 */
2794 local_irq_save(flags);
2795
3189 /*
3190 * handle boxes with an active event list as opposed to active
3191 * counters
3192 */
3193 list_for_each_entry(event, &box->active_list, active_entry) {
3194 uncore_perf_event_update(box, event);
3195 }
3196
2796 for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
2797 uncore_perf_event_update(box, box->events[bit]);
2798
2799 local_irq_restore(flags);
2800
2801 hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL));
3202 hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
2802 return HRTIMER_RESTART;
2803}
2804
2805static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
2806{
2807 __hrtimer_start_range_ns(&box->hrtimer,
2808 ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0,
3209 ns_to_ktime(box->hrtimer_duration), 0,
2809 HRTIMER_MODE_REL_PINNED, 0);
2810}
2811
@@ -2839,43 +3240,12 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
2839 box->cpu = -1;
2840 box->phys_id = -1;
2841
2842 return box;
2843}
3243 /* set default hrtimer timeout */
3244 box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
2844
2845static struct intel_uncore_box *
2846uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
2847{
2848 struct intel_uncore_box *box;
2849
2850 box = *per_cpu_ptr(pmu->box, cpu);
2851 if (box)
2852 return box;
2853
2854 raw_spin_lock(&uncore_box_lock);
2855 list_for_each_entry(box, &pmu->box_list, list) {
2856 if (box->phys_id == topology_physical_package_id(cpu)) {
2857 atomic_inc(&box->refcnt);
2858 *per_cpu_ptr(pmu->box, cpu) = box;
2859 break;
2860 }
2861 }
2862 raw_spin_unlock(&uncore_box_lock);
2863
2864 return *per_cpu_ptr(pmu->box, cpu);
2865}
2866
2867static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
3246 INIT_LIST_HEAD(&box->active_list);
2868{
2869 return container_of(event->pmu, struct intel_uncore_pmu, pmu);
2870}
2871
2872static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
3248 return box;
2873{
2874 /*
2875 * perf core schedules event on the basis of cpu, uncore events are
2876 * collected by one of the cpus inside a physical package.
2877 */
2878 return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
2879}
2880
2881static int
@@ -3271,16 +3641,21 @@ static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
3271{
3272 int ret;
3273
3274 pmu->pmu = (struct pmu) {
3275 .attr_groups = pmu->type->attr_groups,
3276 .task_ctx_nr = perf_invalid_context,
3277 .event_init = uncore_pmu_event_init,
3278 .add = uncore_pmu_event_add,
3279 .del = uncore_pmu_event_del,
3280 .start = uncore_pmu_event_start,
3281 .stop = uncore_pmu_event_stop,
3282 .read = uncore_pmu_event_read,
3283 };
3644 if (!pmu->type->pmu) {
3645 pmu->pmu = (struct pmu) {
3646 .attr_groups = pmu->type->attr_groups,
3647 .task_ctx_nr = perf_invalid_context,
3648 .event_init = uncore_pmu_event_init,
3649 .add = uncore_pmu_event_add,
3650 .del = uncore_pmu_event_del,
3651 .start = uncore_pmu_event_start,
3652 .stop = uncore_pmu_event_stop,
3653 .read = uncore_pmu_event_read,
3654 };
3655 } else {
3656 pmu->pmu = *pmu->type->pmu;
3657 pmu->pmu.attr_groups = pmu->type->attr_groups;
3658 }
3284
3285 if (pmu->type->num_boxes == 1) {
3286 if (strlen(pmu->type->name) > 0)
@@ -3326,6 +3701,8 @@ static int __init uncore_type_init(struct intel_uncore_type *type)
3326 if (!pmus)
3327 return -ENOMEM;
3328
3704 type->pmus = pmus;
3705
3329 type->unconstrainted = (struct event_constraint)
3330 __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
3331 0, type->num_counters, 0, 0);
@@ -3361,7 +3738,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type)
3361 }
3362
3363 type->pmu_group = &uncore_pmu_attr_group;
3364 type->pmus = pmus;
3365 return 0;
3366fail:
3367 uncore_type_exit(type);
@@ -3493,6 +3869,28 @@ static int __init uncore_pci_init(void)
3493 pci_uncores = ivt_pci_uncores;
3494 uncore_pci_driver = &ivt_uncore_pci_driver;
3495 break;
3872 case 42: /* Sandy Bridge */
3873 ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC);
3874 if (ret)
3875 return ret;
3876 pci_uncores = snb_pci_uncores;
3877 uncore_pci_driver = &snb_uncore_pci_driver;
3878 break;
3879 case 58: /* Ivy Bridge */
3880 ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC);
3881 if (ret)
3882 return ret;
3883 pci_uncores = snb_pci_uncores;
3884 uncore_pci_driver = &ivb_uncore_pci_driver;
3885 break;
3886 case 60: /* Haswell */
3887 case 69: /* Haswell Celeron */
3888 ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC);
3889 if (ret)
3890 return ret;
3891 pci_uncores = snb_pci_uncores;
3892 uncore_pci_driver = &hsw_uncore_pci_driver;
3893 break;
3496 default:
3497 return 0;
3498 }
@@ -3764,7 +4162,7 @@ static void __init uncore_cpu_setup(void *dummy)
3764
3765static int __init uncore_cpu_init(void)
3766{
3767 int ret, cpu, max_cores;
4165 int ret, max_cores;
3768
3769 max_cores = boot_cpu_data.x86_max_cores;
3770 switch (boot_cpu_data.x86_model) {
@@ -3808,29 +4206,6 @@ static int __init uncore_cpu_init(void)
3808 if (ret)
3809 return ret;
3810
3811 get_online_cpus();
3812
3813 for_each_online_cpu(cpu) {
3814 int i, phys_id = topology_physical_package_id(cpu);
3815
3816 for_each_cpu(i, &uncore_cpu_mask) {
3817 if (phys_id == topology_physical_package_id(i)) {
3818 phys_id = -1;
3819 break;
3820 }
3821 }
3822 if (phys_id < 0)
3823 continue;
3824
3825 uncore_cpu_prepare(cpu, phys_id);
3826 uncore_event_init_cpu(cpu);
3827 }
3828 on_each_cpu(uncore_cpu_setup, NULL, 1);
3829
3830 register_cpu_notifier(&uncore_cpu_nb);
3831
3832 put_online_cpus();
3833
3834 return 0;
3835}
3836
@@ -3859,6 +4234,41 @@ static int __init uncore_pmus_register(void)
3859 return 0;
3860}
3861
4237static void __init uncore_cpumask_init(void)
4238{
4239 int cpu;
4240
4241 /*
4242 * only invoke once from msr or pci init code
4243 */
4244 if (!cpumask_empty(&uncore_cpu_mask))
4245 return;
4246
4247 cpu_notifier_register_begin();
4248
4249 for_each_online_cpu(cpu) {
4250 int i, phys_id = topology_physical_package_id(cpu);
4251
4252 for_each_cpu(i, &uncore_cpu_mask) {
4253 if (phys_id == topology_physical_package_id(i)) {
4254 phys_id = -1;
4255 break;
4256 }
4257 }
4258 if (phys_id < 0)
4259 continue;
4260
4261 uncore_cpu_prepare(cpu, phys_id);
4262 uncore_event_init_cpu(cpu);
4263 }
4264 on_each_cpu(uncore_cpu_setup, NULL, 1);
4265
4266 __register_cpu_notifier(&uncore_cpu_nb);
4267
4268 cpu_notifier_register_done();
4269}
4270
4271
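As a concrete illustration (a made-up topology, not from this patch): on a two-package machine with CPUs 0-3 on package 0 and CPUs 4-7 on package 1, the loop above leaves uncore_cpu_mask holding just CPUs 0 and 4, so each package's uncore events are collected by exactly one CPU.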
3862static int __init intel_uncore_init(void)
3863{
3864 int ret;
@@ -3877,6 +4287,7 @@ static int __init intel_uncore_init(void)
3877 uncore_pci_exit();
3878 goto fail;
3879 }
4290 uncore_cpumask_init();
3880
3881 uncore_pmus_register();
3882 return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index a80ab71a883d..90236f0c94a9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -6,6 +6,7 @@
6
7#define UNCORE_PMU_NAME_LEN 32
8#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
9#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
9
10#define UNCORE_FIXED_EVENT 0xff
11#define UNCORE_PMC_IDX_MAX_GENERIC 8
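The shorter client-IMC interval is a counter-width consideration: the IMC fixed counters are only 32 bits wide. As a rough, purely illustrative calculation, a counter that ticks once per 64-byte cache line at 10 GB/s advances about 1.6 x 10^8 counts per second, so 2^32 counts wrap in roughly 27 seconds; a 60-second poll could miss a wraparound entirely, while 5 seconds leaves a wide margin.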
@@ -440,6 +441,7 @@ struct intel_uncore_type {
440 struct intel_uncore_ops *ops;
441 struct uncore_event_desc *event_descs;
442 const struct attribute_group *attr_groups[4];
444 struct pmu *pmu; /* for custom pmu ops */
443};
444
445#define pmu_group attr_groups[0]
@@ -488,8 +490,11 @@ struct intel_uncore_box {
488 u64 tags[UNCORE_PMC_IDX_MAX];
489 struct pci_dev *pci_dev;
490 struct intel_uncore_pmu *pmu;
493 u64 hrtimer_duration; /* hrtimer timeout for this box */
491 struct hrtimer hrtimer;
492 struct list_head list;
496 struct list_head active_list;
497 void *io_addr;
493 struct intel_uncore_extra_reg shared_regs[0];
494};
495
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 3486e6660357..5d466b7d8609 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1257,7 +1257,24 @@ again:
1257 pass++;
1258 goto again;
1259 }
1260
1260 /*
1261 * Perf does test runs to see if a whole group can be assigned
1262 * together successfully. There can be multiple rounds of this.
1263 * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
1264 * bits, such that the next round of group assignments will
1265 * cause the above p4_should_swap_ts to pass instead of fail.
1266 * This leads to counters exclusive to thread0 being used by
1267 * thread1.
1268 *
1269 * Solve this with a cheap hack, reset the idx back to -1 to
1270 * force a new lookup (p4_next_cntr) to get the right counter
1271 * for the right thread.
1272 *
1273 * This probably doesn't comply with the general spirit of how
1274 * perf wants to work, but P4 is special. :-(
1275 */
1276 if (p4_should_swap_ts(hwc->config, cpu))
1277 hwc->idx = -1;
1261 p4_pmu_swap_config_ts(hwc, cpu);
1262 if (assign)
1263 assign[i] = cntr_idx;
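Stripped of the PMU details, the fix above is a cache-invalidation pattern: if the cached counter index may now belong to the sibling thread, forget it and redo the slow search. A detached sketch with hypothetical helpers (should_swap() and slow_lookup() stand in for p4_should_swap_ts() and p4_next_cntr()):

static int cached_idx = -1;

int get_counter(u64 config, int cpu)
{
	if (should_swap(config, cpu))	/* cached slot stale for this thread? */
		cached_idx = -1;	/* force a fresh lookup next time */
	if (cached_idx < 0)
		cached_idx = slow_lookup(config, cpu);
	return cached_idx;
}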
@@ -1322,6 +1339,7 @@ static __initconst const struct x86_pmu p4_pmu = {
1322__init int p4_pmu_init(void)
1323{
1324 unsigned int low, high;
1342 int i, reg;
1325
1326 /* If we get stripped -- indexing fails */
1327 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
@@ -1340,5 +1358,19 @@ __init int p4_pmu_init(void)
1340
1341 x86_pmu = p4_pmu;
1342
1361 /*
1362 * Even though the counters are configured to interrupt a particular
1363 * logical processor when an overflow happens, testing has shown that
1364 * on kdump kernels (which use a single cpu), thread1's counter
1365 * continues to run and will report an NMI on thread0. Due to the
1366 * overflow bug, this leads to a stream of unknown NMIs.
1367 *
1368 * Solve this by zeroing out the registers to mimic a reset.
1369 */
1370 for (i = 0; i < x86_pmu.num_counters; i++) {
1371 reg = x86_pmu_config_addr(i);
1372 wrmsrl_safe(reg, 0ULL);
1373 }
1374
1343 return 0;
1344}
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index b1e2fe115323..7c1a0c07b607 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -231,31 +231,49 @@ static __initconst const struct x86_pmu p6_pmu = {
231
232};
233
234static __init void p6_pmu_rdpmc_quirk(void)
235{
236 if (boot_cpu_data.x86_mask < 9) {
237 /*
238 * PPro erratum 26; fixed in stepping 9 and above.
239 */
240 pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
241 x86_pmu.attr_rdpmc_broken = 1;
242 x86_pmu.attr_rdpmc = 0;
243 }
244}
245
234__init int p6_pmu_init(void)
235{
248 x86_pmu = p6_pmu;
249
236 switch (boot_cpu_data.x86_model) {
237 case 1:
238 case 3: /* Pentium Pro */
239 case 5:
240 case 6: /* Pentium II */
241 case 7:
242 case 8:
243 case 11: /* Pentium III */
244 case 9:
245 case 13:
246 /* Pentium M */
251 case 1: /* Pentium Pro */
252 x86_add_quirk(p6_pmu_rdpmc_quirk);
253 break;
254
255 case 3: /* Pentium II - Klamath */
256 case 5: /* Pentium II - Deschutes */
257 case 6: /* Pentium II - Mendocino */
247 break;
259
260 case 7: /* Pentium III - Katmai */
261 case 8: /* Pentium III - Coppermine */
262 case 10: /* Pentium III Xeon */
263 case 11: /* Pentium III - Tualatin */
264 break;
265
266 case 9: /* Pentium M - Banias */
267 case 13: /* Pentium M - Dothan */
268 break;
269
248 default:
249 pr_cont("unsupported p6 CPU model %d ",
250 boot_cpu_data.x86_model);
271 pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
251 return -ENODEV;
252 }
253
254 x86_pmu = p6_pmu;
255
256 memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
257 sizeof(hw_cache_event_ids));
258
259
260 return 0;
261}
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 88db010845cb..384df5105fbc 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -31,20 +31,6 @@ static int __init x86_rdrand_setup(char *s)
31}
32__setup("nordrand", x86_rdrand_setup);
33
34/* We can't use arch_get_random_long() here since alternatives haven't run */
35static inline int rdrand_long(unsigned long *v)
36{
37 int ok;
38 asm volatile("1: " RDRAND_LONG "\n\t"
39 "jc 2f\n\t"
40 "decl %0\n\t"
41 "jnz 1b\n\t"
42 "2:"
43 : "=r" (ok), "=a" (*v)
44 : "0" (RDRAND_RETRY_LOOPS));
45 return ok;
46}
47
48/*
49 * Force a reseed cycle; we are architecturally guaranteed a reseed
50 * after no more than 512 128-bit chunks of random data. This also
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index aa0430d69b90..3fa0e5ad86b4 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,6 +1,5 @@
1#include <linux/kernel.h>
2#include <linux/mm.h>
3#include <linux/init.h>
4#include <asm/processor.h>
5#include <asm/msr.h>
6#include "cpu.h"
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index 75c5ad5d35cc..ef9c2a0078bd 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -1,5 +1,4 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <asm/processor.h>
4#include "cpu.h"
5
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 7d9481c743f8..3225ae6c5180 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -198,14 +198,15 @@ static int __init cpuid_init(void)
198 goto out_chrdev;
199 }
200 cpuid_class->devnode = cpuid_devnode;
201 get_online_cpus();
201
202 cpu_notifier_register_begin();
202 for_each_online_cpu(i) {
203 err = cpuid_device_create(i);
204 if (err != 0)
205 goto out_class;
206 }
207 register_hotcpu_notifier(&cpuid_class_cpu_notifier);
208 put_online_cpus();
208 __register_hotcpu_notifier(&cpuid_class_cpu_notifier);
209 cpu_notifier_register_done();
209
210 err = 0;
211 goto out;
@@ -215,7 +216,7 @@ out_class:
215 for_each_online_cpu(i) {
216 cpuid_device_destroy(i);
217 }
218 put_online_cpus();
219 cpu_notifier_register_done();
219 class_destroy(cpuid_class);
220out_chrdev:
221 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
@@ -227,13 +228,13 @@ static void __exit cpuid_exit(void)
227{
228 int cpu = 0;
229
230 get_online_cpus();
231 cpu_notifier_register_begin();
231 for_each_online_cpu(cpu)
232 cpuid_device_destroy(cpu);
233 class_destroy(cpuid_class);
234 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
235 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
236 put_online_cpus();
236 __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
237 cpu_notifier_register_done();
237}
238
239module_init(cpuid_init);
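The conversion follows the CPU-notifier registration protocol this series introduces elsewhere; in outline (a sketch, with a hypothetical per-cpu setup helper):

cpu_notifier_register_begin();		/* hold off CPU hotplug */
for_each_online_cpu(cpu)
	setup_percpu_device(cpu);	/* hypothetical helper */
__register_hotcpu_notifier(&nb);	/* double-underscore variant: lock already held */
cpu_notifier_register_done();

This closes the race where a CPU could come online between the setup loop and the notifier registration.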
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 18677a90d6a3..507de8066594 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -7,7 +7,6 @@
7 *
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/smp.h>
@@ -58,9 +57,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
58{
59#ifdef CONFIG_X86_32
60 struct pt_regs fixed_regs;
61#endif
62
63#ifdef CONFIG_X86_32
64 if (!user_mode_vm(regs)) {
65 crash_fixup_ss_esp(&fixed_regs, regs);
66 regs = &fixed_regs;
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index 5d3fe8d36e4a..f6dfd9334b67 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -1,6 +1,5 @@
1#include <linux/mm.h>
2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/init_task.h>
5#include <linux/fs.h>
6
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f2a1770ca176..5abd4cd4230c 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,12 +16,35 @@
16
17#include <asm/stacktrace.h>
18
19static void *is_irq_stack(void *p, void *irq)
20{
21 if (p < irq || p >= (irq + THREAD_SIZE))
22 return NULL;
23 return irq + THREAD_SIZE;
24}
25
26
27static void *is_hardirq_stack(unsigned long *stack, int cpu)
28{
29 void *irq = per_cpu(hardirq_stack, cpu);
30
31 return is_irq_stack(stack, irq);
32}
33
34static void *is_softirq_stack(unsigned long *stack, int cpu)
35{
36 void *irq = per_cpu(softirq_stack, cpu);
37
38 return is_irq_stack(stack, irq);
39}
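For illustration (made-up addresses): with THREAD_SIZE = 8192 and a hardirq stack based at 0xf600a000, a stack pointer of 0xf600bf80 lies inside [irq, irq + THREAD_SIZE), so is_irq_stack() returns the walk limit 0xf600c000; any pointer outside the window yields NULL. Returning the end pointer rather than a boolean lets dump_trace() below pass the limit straight to ops->walk_stack().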
19
20void dump_trace(struct task_struct *task, struct pt_regs *regs,
21 unsigned long *stack, unsigned long bp,
22 const struct stacktrace_ops *ops, void *data)
23{
45 const unsigned cpu = get_cpu();
24 int graph = 0;
47 u32 *prev_esp;
25
26 if (!task)
27 task = current;
@@ -30,7 +53,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
30 unsigned long dummy;
31
32 stack = &dummy;
33 if (task && task != current)
56 if (task != current)
34 stack = (unsigned long *)task->thread.sp;
35 }
36
@@ -39,18 +62,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
39
40 for (;;) {
41 struct thread_info *context;
65 void *end_stack;
66
67 end_stack = is_hardirq_stack(stack, cpu);
68 if (!end_stack)
69 end_stack = is_softirq_stack(stack, cpu);
42
43 context = (struct thread_info *)
44 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
45 bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph);
71 context = task_thread_info(task);
72 bp = ops->walk_stack(context, stack, bp, ops, data,
73 end_stack, &graph);
46
47 stack = (unsigned long *)context->previous_esp;
75 /* Stop if not on irq stack */
76 if (!end_stack)
77 break;
78
79 /* The previous esp is saved on the bottom of the stack */
80 prev_esp = (u32 *)(end_stack - THREAD_SIZE);
81 stack = (unsigned long *)*prev_esp;
48 if (!stack)
49 break;
84
50 if (ops->stack(data, "IRQ") < 0)
51 break;
52 touch_nmi_watchdog();
53 }
89 put_cpu();
54}
55EXPORT_SYMBOL(dump_trace);
56
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index addb207dab92..1abcb50b48ae 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -104,6 +104,44 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
104 return (stack >= irq_stack && stack < irq_stack_end);
105}
106
107static const unsigned long irq_stack_size =
108 (IRQ_STACK_SIZE - 64) / sizeof(unsigned long);
109
110enum stack_type {
111 STACK_IS_UNKNOWN,
112 STACK_IS_NORMAL,
113 STACK_IS_EXCEPTION,
114 STACK_IS_IRQ,
115};
116
117static enum stack_type
118analyze_stack(int cpu, struct task_struct *task, unsigned long *stack,
119 unsigned long **stack_end, unsigned long *irq_stack,
120 unsigned *used, char **id)
121{
122 unsigned long addr;
123
124 addr = ((unsigned long)stack & (~(THREAD_SIZE - 1)));
125 if ((unsigned long)task_stack_page(task) == addr)
126 return STACK_IS_NORMAL;
127
128 *stack_end = in_exception_stack(cpu, (unsigned long)stack,
129 used, id);
130 if (*stack_end)
131 return STACK_IS_EXCEPTION;
132
133 if (!irq_stack)
134 return STACK_IS_NORMAL;
135
136 *stack_end = irq_stack;
137 irq_stack = irq_stack - irq_stack_size;
138
139 if (in_irq_stack(stack, irq_stack, *stack_end))
140 return STACK_IS_IRQ;
141
142 return STACK_IS_UNKNOWN;
143}
144
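analyze_stack() leans on the fact that kernel thread stacks are THREAD_SIZE-aligned, so a single mask recovers the stack base from any pointer into it. A small illustration with a made-up address:

unsigned long sp   = 0xffff880036403d48UL;	/* some slot on a stack */
unsigned long base = sp & ~(THREAD_SIZE - 1UL);	/* base of that stack */

/* base == (unsigned long)task_stack_page(task) => STACK_IS_NORMAL */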
107/*
108 * x86-64 can have up to three kernel stacks:
109 * process stack
@@ -116,12 +154,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
116 const struct stacktrace_ops *ops, void *data)
117{
118 const unsigned cpu = get_cpu();
119 unsigned long *irq_stack_end =
120 (unsigned long *)per_cpu(irq_stack_ptr, cpu);
121 unsigned used = 0;
122 struct thread_info *tinfo;
123 int graph = 0;
158 unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
124 unsigned long dummy;
160 unsigned used = 0;
161 int graph = 0;
162 int done = 0;
125
126 if (!task)
127 task = current;
@@ -143,49 +181,61 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
143 * exceptions
144 */
145 tinfo = task_thread_info(task);
146 for (;;) {
184 while (!done) {
185 unsigned long *stack_end;
186 enum stack_type stype;
147 char *id;
148 unsigned long *estack_end;
149 estack_end = in_exception_stack(cpu, (unsigned long)stack,
150 &used, &id);
151
152 if (estack_end) {
189 stype = analyze_stack(cpu, task, stack, &stack_end,
190 irq_stack, &used, &id);
191
192 /* Default finish unless specified to continue */
193 done = 1;
194
195 switch (stype) {
196
197 /* Break out early if we are on the thread stack */
198 case STACK_IS_NORMAL:
199 break;
200
201 case STACK_IS_EXCEPTION:
202
153 if (ops->stack(data, id) < 0)
154 break;
155
156 bp = ops->walk_stack(tinfo, stack, bp, ops,
157 data, estack_end, &graph);
207 data, stack_end, &graph);
158 ops->stack(data, "<EOE>");
159 /*
160 * We link to the next stack via the
161 * second-to-last pointer (index -2 to end) in the
162 * exception stack:
163 */
164 stack = (unsigned long *) estack_end[-2];
165 continue;
166 }
167 if (irq_stack_end) {
168 unsigned long *irq_stack;
169 irq_stack = irq_stack_end -
170 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
171
172 if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
173 if (ops->stack(data, "IRQ") < 0)
174 break;
175 bp = ops->walk_stack(tinfo, stack, bp,
176 ops, data, irq_stack_end, &graph);
177 /*
178 * We link to the next stack (which would be
179 * the process stack normally) the last
180 * pointer (index -1 to end) in the IRQ stack:
181 */
182 stack = (unsigned long *) (irq_stack_end[-1]);
183 irq_stack_end = NULL;
184 ops->stack(data, "EOI");
185 continue;
186 }
214 stack = (unsigned long *) stack_end[-2];
215 done = 0;
216 break;
217
218 case STACK_IS_IRQ:
219
220 if (ops->stack(data, "IRQ") < 0)
221 break;
222 bp = ops->walk_stack(tinfo, stack, bp,
223 ops, data, stack_end, &graph);
224 /*
225 * We link to the next stack (which would be
226 * the process stack normally) the last
227 * pointer (index -1 to end) in the IRQ stack:
228 */
229 stack = (unsigned long *) (stack_end[-1]);
230 irq_stack = NULL;
231 ops->stack(data, "EOI");
232 done = 0;
233 break;
234
235 case STACK_IS_UNKNOWN:
236 ops->stack(data, "UNK");
237 break;
187 }
188 break;
189 }
190
191 /*
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 174da5fc5a7b..988c00a1f60d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1120,7 +1120,7 @@ void __init memblock_find_dma_reserve(void)
1120 nr_pages += end_pfn - start_pfn;
1121 }
1122
1123 for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
1123 for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
1124 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
1125 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
1126 if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index bc4a088f9023..b0cc3809723d 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -203,18 +203,15 @@ static void __init intel_remapping_check(int num, int slot, int func)
203 revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID);
204
205 /*
206 * Revision 13 of all triggering devices id in this quirk have
207 * a problem draining interrupts when irq remapping is enabled,
208 * and should be flagged as broken. Additionally revisions 0x12
209 * and 0x22 of device id 0x3405 has this problem.
206 * Revision <= 13 of all triggering devices id in this quirk
207 * have a problem draining interrupts when irq remapping is
208 * enabled, and should be flagged as broken. Additionally
209 * revision 0x22 of device id 0x3405 has this problem.
210 */
211 if (revision == 0x13)
211 if (revision <= 0x13)
212 set_irq_remapping_broken();
213 else if ((device == 0x3405) &&
214 ((revision == 0x12) ||
215 (revision == 0x22)))
213 else if (device == 0x3405 && revision == 0x22)
216 set_irq_remapping_broken();
217
218}
219
220/*
@@ -228,7 +225,7 @@ static void __init intel_remapping_check(int num, int slot, int func)
228 *
229 * And yes, so far on current devices the base addr is always under 4G.
230 */
231static u32 __init intel_stolen_base(int num, int slot, int func)
228static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_size)
232{
233 u32 base;
234
@@ -247,6 +244,114 @@ static u32 __init intel_stolen_base(int num, int slot, int func)
247#define MB(x) (KB (KB (x)))
248#define GB(x) (MB (KB (x)))
249
247static size_t __init i830_tseg_size(void)
248{
249 u8 tmp = read_pci_config_byte(0, 0, 0, I830_ESMRAMC);
250
251 if (!(tmp & TSEG_ENABLE))
252 return 0;
253
254 if (tmp & I830_TSEG_SIZE_1M)
255 return MB(1);
256 else
257 return KB(512);
258}
259
260static size_t __init i845_tseg_size(void)
261{
262 u8 tmp = read_pci_config_byte(0, 0, 0, I845_ESMRAMC);
263
264 if (!(tmp & TSEG_ENABLE))
265 return 0;
266
267 switch (tmp & I845_TSEG_SIZE_MASK) {
268 case I845_TSEG_SIZE_512K:
269 return KB(512);
270 case I845_TSEG_SIZE_1M:
271 return MB(1);
272 default:
273 WARN_ON(1);
274 return 0;
275 }
276}
277
278static size_t __init i85x_tseg_size(void)
279{
280 u8 tmp = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC);
281
282 if (!(tmp & TSEG_ENABLE))
283 return 0;
284
285 return MB(1);
286}
287
288static size_t __init i830_mem_size(void)
289{
290 return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32);
291}
292
293static size_t __init i85x_mem_size(void)
294{
295 return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32);
296}
297
298/*
299 * On 830/845/85x the stolen memory base isn't available in any
300 * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size.
301 */
302static u32 __init i830_stolen_base(int num, int slot, int func, size_t stolen_size)
303{
304 return i830_mem_size() - i830_tseg_size() - stolen_size;
305}
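Worked through with illustrative numbers: if DRB3 reads 8, i830_mem_size() is 8 * 32 MB = 256 MB; with the 1 MB TSEG variant enabled and 8 MB of stolen graphics memory, the stolen base computes to 256 MB - 1 MB - 8 MB = 247 MB.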
306
307static u32 __init i845_stolen_base(int num, int slot, int func, size_t stolen_size)
308{
309 return i830_mem_size() - i845_tseg_size() - stolen_size;
310}
311
312static u32 __init i85x_stolen_base(int num, int slot, int func, size_t stolen_size)
313{
314 return i85x_mem_size() - i85x_tseg_size() - stolen_size;
315}
316
317static u32 __init i865_stolen_base(int num, int slot, int func, size_t stolen_size)
318{
319 /*
320 * FIXME is the graphics stolen memory region
321 * always at TOUD? Ie. is it always the last
322 * one to be allocated by the BIOS?
323 */
324 return read_pci_config_16(0, 0, 0, I865_TOUD) << 16;
325}
326
327static size_t __init i830_stolen_size(int num, int slot, int func)
328{
329 size_t stolen_size;
330 u16 gmch_ctrl;
331
332 gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
333
334 switch (gmch_ctrl & I830_GMCH_GMS_MASK) {
335 case I830_GMCH_GMS_STOLEN_512:
336 stolen_size = KB(512);
337 break;
338 case I830_GMCH_GMS_STOLEN_1024:
339 stolen_size = MB(1);
340 break;
341 case I830_GMCH_GMS_STOLEN_8192:
342 stolen_size = MB(8);
343 break;
344 case I830_GMCH_GMS_LOCAL:
345 /* local memory isn't part of the normal address space */
346 stolen_size = 0;
347 break;
348 default:
349 return 0;
350 }
351
352 return stolen_size;
353}
354
250static size_t __init gen3_stolen_size(int num, int slot, int func)
251{
252 size_t stolen_size;
@@ -313,7 +418,7 @@ static size_t __init gen6_stolen_size(int num, int slot, int func)
313 return gmch_ctrl << 25; /* 32 MB units */
314}
315
316static inline size_t gen8_stolen_size(int num, int slot, int func)
421static size_t gen8_stolen_size(int num, int slot, int func)
317{
318 u16 gmch_ctrl;
319
@@ -323,31 +428,74 @@ static inline size_t gen8_stolen_size(int num, int slot, int func)
323 return gmch_ctrl << 25; /* 32 MB units */
324}
325
326typedef size_t (*stolen_size_fn)(int num, int slot, int func);
431
432struct intel_stolen_funcs {
433 size_t (*size)(int num, int slot, int func);
434 u32 (*base)(int num, int slot, int func, size_t size);
435};
436
437static const struct intel_stolen_funcs i830_stolen_funcs = {
438 .base = i830_stolen_base,
439 .size = i830_stolen_size,
440};
441
442static const struct intel_stolen_funcs i845_stolen_funcs = {
443 .base = i845_stolen_base,
444 .size = i830_stolen_size,
445};
446
447static const struct intel_stolen_funcs i85x_stolen_funcs = {
448 .base = i85x_stolen_base,
449 .size = gen3_stolen_size,
450};
451
452static const struct intel_stolen_funcs i865_stolen_funcs = {
453 .base = i865_stolen_base,
454 .size = gen3_stolen_size,
455};
456
457static const struct intel_stolen_funcs gen3_stolen_funcs = {
458 .base = intel_stolen_base,
459 .size = gen3_stolen_size,
460};
461
462static const struct intel_stolen_funcs gen6_stolen_funcs = {
463 .base = intel_stolen_base,
464 .size = gen6_stolen_size,
465};
466
467static const struct intel_stolen_funcs gen8_stolen_funcs = {
468 .base = intel_stolen_base,
469 .size = gen8_stolen_size,
470};
327
328static struct pci_device_id intel_stolen_ids[] __initdata = {
329 INTEL_I915G_IDS(gen3_stolen_size),
330 INTEL_I915GM_IDS(gen3_stolen_size),
331 INTEL_I945G_IDS(gen3_stolen_size),
332 INTEL_I945GM_IDS(gen3_stolen_size),
333 INTEL_VLV_M_IDS(gen6_stolen_size),
334 INTEL_VLV_D_IDS(gen6_stolen_size),
335 INTEL_PINEVIEW_IDS(gen3_stolen_size),
336 INTEL_I965G_IDS(gen3_stolen_size),
337 INTEL_G33_IDS(gen3_stolen_size),
338 INTEL_I965GM_IDS(gen3_stolen_size),
339 INTEL_GM45_IDS(gen3_stolen_size),
340 INTEL_G45_IDS(gen3_stolen_size),
341 INTEL_IRONLAKE_D_IDS(gen3_stolen_size),
342 INTEL_IRONLAKE_M_IDS(gen3_stolen_size),
343 INTEL_SNB_D_IDS(gen6_stolen_size),
344 INTEL_SNB_M_IDS(gen6_stolen_size),
345 INTEL_IVB_M_IDS(gen6_stolen_size),
346 INTEL_IVB_D_IDS(gen6_stolen_size),
347 INTEL_HSW_D_IDS(gen6_stolen_size),
348 INTEL_HSW_M_IDS(gen6_stolen_size),
349 INTEL_BDW_M_IDS(gen8_stolen_size),
350 INTEL_BDW_D_IDS(gen8_stolen_size)
473 INTEL_I830_IDS(&i830_stolen_funcs),
474 INTEL_I845G_IDS(&i845_stolen_funcs),
475 INTEL_I85X_IDS(&i85x_stolen_funcs),
476 INTEL_I865G_IDS(&i865_stolen_funcs),
477 INTEL_I915G_IDS(&gen3_stolen_funcs),
478 INTEL_I915GM_IDS(&gen3_stolen_funcs),
479 INTEL_I945G_IDS(&gen3_stolen_funcs),
480 INTEL_I945GM_IDS(&gen3_stolen_funcs),
481 INTEL_VLV_M_IDS(&gen6_stolen_funcs),
482 INTEL_VLV_D_IDS(&gen6_stolen_funcs),
483 INTEL_PINEVIEW_IDS(&gen3_stolen_funcs),
484 INTEL_I965G_IDS(&gen3_stolen_funcs),
485 INTEL_G33_IDS(&gen3_stolen_funcs),
486 INTEL_I965GM_IDS(&gen3_stolen_funcs),
487 INTEL_GM45_IDS(&gen3_stolen_funcs),
488 INTEL_G45_IDS(&gen3_stolen_funcs),
489 INTEL_IRONLAKE_D_IDS(&gen3_stolen_funcs),
490 INTEL_IRONLAKE_M_IDS(&gen3_stolen_funcs),
491 INTEL_SNB_D_IDS(&gen6_stolen_funcs),
492 INTEL_SNB_M_IDS(&gen6_stolen_funcs),
493 INTEL_IVB_M_IDS(&gen6_stolen_funcs),
494 INTEL_IVB_D_IDS(&gen6_stolen_funcs),
495 INTEL_HSW_D_IDS(&gen6_stolen_funcs),
496 INTEL_HSW_M_IDS(&gen6_stolen_funcs),
497 INTEL_BDW_M_IDS(&gen8_stolen_funcs),
498 INTEL_BDW_D_IDS(&gen8_stolen_funcs)
351};
352
353static void __init intel_graphics_stolen(int num, int slot, int func)
@@ -364,11 +512,13 @@ static void __init intel_graphics_stolen(int num, int slot, int func)
364
365 for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) {
366 if (intel_stolen_ids[i].device == device) {
367 stolen_size_fn stolen_size =
368 (stolen_size_fn)intel_stolen_ids[i].driver_data;
369 size = stolen_size(num, slot, func);
370 start = intel_stolen_base(num, slot, func);
515 const struct intel_stolen_funcs *stolen_funcs =
516 (const struct intel_stolen_funcs *)intel_stolen_ids[i].driver_data;
517 size = stolen_funcs->size(num, slot, func);
518 start = stolen_funcs->base(num, slot, func, size);
371 if (size && start) {
520 printk(KERN_INFO "Reserving Intel graphics stolen memory at 0x%x-0x%x\n",
521 start, start + (u32)size - 1);
372 /* Mark this space as reserved */
373 e820_add_region(start, size, E820_RESERVED);
374 sanitize_e820_map(e820.map,
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d4bdd253fea7..52819e816f87 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -77,8 +77,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
77 return addr >= start && addr < end;
78}
79
80static int
81do_ftrace_mod_code(unsigned long ip, const void *new_code)
80static unsigned long text_ip_addr(unsigned long ip)
82{
83 /*
84 * On x86_64, kernel text mappings are mapped read-only with
@@ -91,7 +90,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
91 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
92 ip = (unsigned long)__va(__pa_symbol(ip));
93
94 return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
93 return ip;
95}
96
97static const unsigned char *ftrace_nop_replace(void)
@@ -123,8 +122,10 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
123 if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
124 return -EINVAL;
125
125 ip = text_ip_addr(ip);
126
126 /* replace the text with the new text */
127 if (do_ftrace_mod_code(ip, new_code))
128 if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
128 return -EPERM;
129
130 sync_core();
@@ -221,37 +222,51 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
221 return -EINVAL;
222}
223
224int ftrace_update_ftrace_func(ftrace_func_t func)
225static unsigned long ftrace_update_func;
226
227static int update_ftrace_func(unsigned long ip, void *new)
225{
226 unsigned long ip = (unsigned long)(&ftrace_call);
227 unsigned char old[MCOUNT_INSN_SIZE], *new;
229 unsigned char old[MCOUNT_INSN_SIZE];
228 int ret;
229
230 memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
231 new = ftrace_call_replace(ip, (unsigned long)func);
232 memcpy(old, (void *)ip, MCOUNT_INSN_SIZE);
233
234 ftrace_update_func = ip;
235 /* Make sure the breakpoints see the ftrace_update_func update */
236 smp_wmb();
232
233 /* See comment above by declaration of modifying_ftrace_code */
234 atomic_inc(&modifying_ftrace_code);
235
236 ret = ftrace_modify_code(ip, old, new);
237
243 atomic_dec(&modifying_ftrace_code);
244
245 return ret;
246}
247
248int ftrace_update_ftrace_func(ftrace_func_t func)
249{
250 unsigned long ip = (unsigned long)(&ftrace_call);
251 unsigned char *new;
252 int ret;
253
254 new = ftrace_call_replace(ip, (unsigned long)func);
255 ret = update_ftrace_func(ip, new);
256
238 /* Also update the regs callback function */
239 if (!ret) {
240 ip = (unsigned long)(&ftrace_regs_call);
241 memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE);
242 new = ftrace_call_replace(ip, (unsigned long)func);
243 ret = ftrace_modify_code(ip, old, new);
261 ret = update_ftrace_func(ip, new);
244 }
245
246 atomic_dec(&modifying_ftrace_code);
247
248 return ret;
249}
250
251static int is_ftrace_caller(unsigned long ip)
252{
253 if (ip == (unsigned long)(&ftrace_call) ||
254 ip == (unsigned long)(&ftrace_regs_call))
269 if (ip == ftrace_update_func)
255 return 1;
256
257 return 0;
@@ -293,7 +308,10 @@ static int ftrace_write(unsigned long ip, const char *val, int size)
293 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
294 ip = (unsigned long)__va(__pa_symbol(ip));
295
296 return probe_kernel_write((void *)ip, val, size);
311 if (probe_kernel_write((void *)ip, val, size))
312 return -EPERM;
313
314 return 0;
297}
298
299static int add_break(unsigned long ip, const char *old)
@@ -308,10 +326,7 @@ static int add_break(unsigned long ip, const char *old)
308 if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
309 return -EINVAL;
310
311 if (ftrace_write(ip, &brk, 1))
312 return -EPERM;
313
314 return 0;
329 return ftrace_write(ip, &brk, 1);
315}
316
317static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -410,7 +425,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
410
411 /* If this does not have a breakpoint, we are done */
412 if (ins[0] != brk)
413 return -1;
428 return 0;
414
415 nop = ftrace_nop_replace();
416
@@ -440,7 +455,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
440 }
441
442 update:
443 return probe_kernel_write((void *)ip, &nop[0], 1);
458 return ftrace_write(ip, nop, 1);
444}
445
446static int add_update_code(unsigned long ip, unsigned const char *new)
@@ -448,9 +463,7 @@ static int add_update_code(unsigned long ip, unsigned const char *new)
448 /* skip breakpoint */
449 ip++;
450 new++;
451 if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1))
452 return -EPERM;
453 return 0;
466 return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1);
454}
455
456static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -505,10 +518,7 @@ static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
505
506 new = ftrace_call_replace(ip, addr);
507
508 if (ftrace_write(ip, new, 1))
509 return -EPERM;
510
511 return 0;
521 return ftrace_write(ip, new, 1);
512}
513
514static int finish_update_nop(struct dyn_ftrace *rec)
@@ -518,9 +528,7 @@ static int finish_update_nop(struct dyn_ftrace *rec)
518
519 new = ftrace_nop_replace();
520
521 if (ftrace_write(ip, new, 1))
522 return -EPERM;
523 return 0;
531 return ftrace_write(ip, new, 1);
524}
525
526static int finish_update(struct dyn_ftrace *rec, int enable)
@@ -617,8 +625,14 @@ void ftrace_replace_code(int enable)
617 printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
618 for_ftrace_rec_iter(iter) {
619 rec = ftrace_rec_iter_record(iter);
620 remove_breakpoint(rec);
628 /*
629 * Breakpoints are handled only when this function is in
630 * progress. The system could not work with them.
631 */
632 if (remove_breakpoint(rec))
633 BUG();
621 }
635 run_sync();
622}
623
624static int
@@ -640,16 +654,19 @@ ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
640 run_sync();
641
642 ret = ftrace_write(ip, new_code, 1);
643 if (ret) {
644 ret = -EPERM;
645 goto out;
646 }
647 run_sync();
657 /*
658 * The breakpoint is handled only when this function is in progress.
659 * The system could not work if we could not remove it.
660 */
661 BUG_ON(ret);
648 out:
663 run_sync();
649 return ret;
650
651 fail_update:
652 probe_kernel_write((void *)ip, &old_code[0], 1);
667 /* Also here the system could not work with the breakpoint */
668 if (ftrace_write(ip, old_code, 1))
669 BUG();
653 goto out;
654}
655 672
@@ -663,11 +680,8 @@ void arch_ftrace_update_code(int command)
663 atomic_dec(&modifying_ftrace_code); 680 atomic_dec(&modifying_ftrace_code);
664} 681}
665 682
666int __init ftrace_dyn_arch_init(void *data) 683int __init ftrace_dyn_arch_init(void)
667{ 684{
668 /* The return code is retured via data */
669 *(unsigned long *)data = 0;
670
671 return 0; 685 return 0;
672} 686}
673#endif 687#endif
@@ -677,45 +691,41 @@ int __init ftrace_dyn_arch_init(void *data)
677#ifdef CONFIG_DYNAMIC_FTRACE
678extern void ftrace_graph_call(void);
679
680static int ftrace_mod_jmp(unsigned long ip,
681 int old_offset, int new_offset)
682{
683 unsigned char code[MCOUNT_INSN_SIZE];
684
685 if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
686 return -EFAULT;
687
688 if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
689 return -EINVAL;
690
691 *(int *)(&code[1]) = new_offset;
692
693 if (do_ftrace_mod_code(ip, &code))
694 return -EPERM;
695
696 return 0;
697}
694static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
695{
696 static union ftrace_code_union calc;
697
698 /* Jmp not a call (ignore the .e8) */
699 calc.e8 = 0xe9;
700 calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
701
702 /*
703 * ftrace external locks synchronize the access to the static variable.
704 */
705 return calc.code;
706}
707
708static int ftrace_mod_jmp(unsigned long ip, void *func)
709{
710 unsigned char *new;
711
712 new = ftrace_jmp_replace(ip, (unsigned long)func);
713
714 return update_ftrace_func(ip, new);
715}
698
699int ftrace_enable_ftrace_graph_caller(void)
700{
701 unsigned long ip = (unsigned long)(&ftrace_graph_call);
702 int old_offset, new_offset;
703
704 old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
705 new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
706
707 return ftrace_mod_jmp(ip, old_offset, new_offset);
721 return ftrace_mod_jmp(ip, &ftrace_graph_caller);
708}
709
710int ftrace_disable_ftrace_graph_caller(void)
711{
712 unsigned long ip = (unsigned long)(&ftrace_graph_call);
713 int old_offset, new_offset;
714
715 old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
716 new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
717
718 return ftrace_mod_jmp(ip, old_offset, new_offset);
728 return ftrace_mod_jmp(ip, &ftrace_stub);
719}
720
721#endif /* !CONFIG_DYNAMIC_FTRACE */
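Taken together, the helpers above implement breakpoint-assisted live patching of call sites. In outline (an approximation of the flow driven by ftrace_replace_code(), not its literal control flow):

add_break(ip, old);		/* 1. plant an int3 on the first opcode byte */
run_sync();			/*    IPI so no CPU straddles the old insn  */
add_update_code(ip, new);	/* 2. rewrite the remaining instruction bytes */
run_sync();
finish_update_call(rec, addr);	/* 3. replace the int3 with the new first byte */
run_sync();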
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 81ba27679f18..f36bd42d6f0c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -544,6 +544,10 @@ ENDPROC(early_idt_handlers)
544 /* This is global to keep gas from relaxing the jumps */
545ENTRY(early_idt_handler)
546 cld
547
548 cmpl $2,(%esp) # X86_TRAP_NMI
549 je is_nmi # Ignore NMI
550
547 cmpl $2,%ss:early_recursion_flag
548 je hlt_loop
549 incl %ss:early_recursion_flag
@@ -594,8 +598,9 @@ ex_entry:
594 pop %edx
595 pop %ecx
596 pop %eax
597 addl $8,%esp /* drop vector number and error code */
598 decl %ss:early_recursion_flag
602is_nmi:
603 addl $8,%esp /* drop vector number and error code */
599 iret
600ENDPROC(early_idt_handler)
601
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e1aabdb314c8..a468c0a65c42 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -343,6 +343,9 @@ early_idt_handlers:
343ENTRY(early_idt_handler)
344 cld
345
346 cmpl $2,(%rsp) # X86_TRAP_NMI
347 je is_nmi # Ignore NMI
348
346 cmpl $2,early_recursion_flag(%rip)
347 jz 1f
348 incl early_recursion_flag(%rip)
@@ -405,8 +408,9 @@ ENTRY(early_idt_handler)
405 popq %rdx
406 popq %rcx
407 popq %rax
408 addq $16,%rsp # drop vector number and error code
409 decl early_recursion_flag(%rip)
412is_nmi:
413 addq $16,%rsp # drop vector number and error code
410 INTERRUPT_RETURN
411ENDPROC(early_idt_handler)
412
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index da85a8e830a1..8d80ae011603 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -521,7 +521,7 @@ static int hpet_setup_irq(struct hpet_dev *dev)
521{
522
523 if (request_irq(dev->irq, hpet_interrupt_handler,
524 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
524 IRQF_TIMER | IRQF_NOBALANCING,
525 dev->name, dev))
526 return -1;
527
@@ -699,7 +699,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
699 /* FIXME: add schedule_work_on() */
700 schedule_delayed_work_on(cpu, &work.work, 0);
701 wait_for_completion(&work.complete);
702 destroy_timer_on_stack(&work.work.timer);
702 destroy_delayed_work_on_stack(&work.work);
703 break;
704 case CPU_DEAD:
705 if (hdev) {
@@ -752,9 +752,7 @@ static struct clocksource clocksource_hpet = {
752 .mask = HPET_MASK,
753 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
754 .resume = hpet_resume_counter,
755#ifdef CONFIG_X86_64
756 .archdata = { .vclock_mode = VCLOCK_HPET },
757#endif
758};
759
760static int hpet_clocksource_register(void)
@@ -943,12 +941,14 @@ static __init int hpet_late_init(void)
943 if (boot_cpu_has(X86_FEATURE_ARAT))
944 return 0;
945
944 cpu_notifier_register_begin();
946 for_each_online_cpu(cpu) {
947 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
948 }
949
950 /* This notifier should be called after workqueue is ready */
951 hotcpu_notifier(hpet_cpuhp_notify, -20);
950 __hotcpu_notifier(hpet_cpuhp_notify, -20);
951 cpu_notifier_register_done();
952
953 return 0;
954}
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index f66ff162dce8..a67b47c31314 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -38,7 +38,6 @@
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/sched.h>
41#include <linux/init.h>
42#include <linux/smp.h>
43
44#include <asm/hw_breakpoint.h>
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e8368c6dd2a2..d5dd80814419 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -86,10 +86,19 @@ EXPORT_SYMBOL(__kernel_fpu_begin);
86
87void __kernel_fpu_end(void)
88{
89 if (use_eager_fpu())
90 math_state_restore();
91 else
89 if (use_eager_fpu()) {
90 /*
91 * For eager fpu, most of the time, tsk_used_math() is true.
92 * Restore the user math as we are done with the kernel usage.
93 * At few instances during thread exit, signal handling etc,
94 * tsk_used_math() is false. Those few places will take proper
95 * actions, so we don't need to restore the math here.
96 */
97 if (likely(tsk_used_math(current)))
98 math_state_restore();
99 } else {
92 stts();
101 }
93}
94EXPORT_SYMBOL(__kernel_fpu_end);
95
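For context, a typical caller of this pair looks like the following sketch (the begin side is unchanged by this hunk):

kernel_fpu_begin();	/* disables preemption, saves user FPU state */
/* ... touch SSE/AVX state, e.g. a vectorized checksum loop ... */
kernel_fpu_end();	/* with eager FPU, restores the user's math here */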
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
new file mode 100644
index 000000000000..c3aae6672843
--- /dev/null
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -0,0 +1,226 @@
1/*
2 * IOSF-SB MailBox Interface Driver
3 * Copyright (c) 2013, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 *
 15 * The IOSF-SB is a fabric bus available on Atom-based SoCs that uses a
 16 * mailbox interface (MBI) to communicate with multiple devices. This
17 * driver implements access to this interface for those platforms that can
18 * enumerate the device using PCI.
19 */
20
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/spinlock.h>
24#include <linux/pci.h>
25
26#include <asm/iosf_mbi.h>
27
28static DEFINE_SPINLOCK(iosf_mbi_lock);
29
30static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
31{
32 return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
33}
34
35static struct pci_dev *mbi_pdev; /* one mbi device */
36
37static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
38{
39 int result;
40
41 if (!mbi_pdev)
42 return -ENODEV;
43
44 if (mcrx) {
45 result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
46 mcrx);
47 if (result < 0)
48 goto fail_read;
49 }
50
51 result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
52 if (result < 0)
53 goto fail_read;
54
55 result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
56 if (result < 0)
57 goto fail_read;
58
59 return 0;
60
61fail_read:
62 dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
63 return result;
64}
65
66static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
67{
68 int result;
69
70 if (!mbi_pdev)
71 return -ENODEV;
72
73 result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
74 if (result < 0)
75 goto fail_write;
76
77 if (mcrx) {
78 result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
79 mcrx);
80 if (result < 0)
81 goto fail_write;
82 }
83
84 result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
85 if (result < 0)
86 goto fail_write;
87
88 return 0;
89
90fail_write:
91 dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
92 return result;
93}
94
95int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
96{
97 u32 mcr, mcrx;
98 unsigned long flags;
99 int ret;
100
 101 /* Access to the GFX unit is handled by GPU code */
102 if (port == BT_MBI_UNIT_GFX) {
103 WARN_ON(1);
104 return -EPERM;
105 }
106
107 mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
108 mcrx = offset & MBI_MASK_HI;
109
110 spin_lock_irqsave(&iosf_mbi_lock, flags);
111 ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr);
112 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
113
114 return ret;
115}
116EXPORT_SYMBOL(iosf_mbi_read);
117
118int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
119{
120 u32 mcr, mcrx;
121 unsigned long flags;
122 int ret;
123
 124 /* Access to the GFX unit is handled by GPU code */
125 if (port == BT_MBI_UNIT_GFX) {
126 WARN_ON(1);
127 return -EPERM;
128 }
129
130 mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
131 mcrx = offset & MBI_MASK_HI;
132
133 spin_lock_irqsave(&iosf_mbi_lock, flags);
134 ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr);
135 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
136
137 return ret;
138}
139EXPORT_SYMBOL(iosf_mbi_write);
140
141int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
142{
143 u32 mcr, mcrx;
144 u32 value;
145 unsigned long flags;
146 int ret;
147
 148 /* Access to the GFX unit is handled by GPU code */
149 if (port == BT_MBI_UNIT_GFX) {
150 WARN_ON(1);
151 return -EPERM;
152 }
153
154 mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
155 mcrx = offset & MBI_MASK_HI;
156
157 spin_lock_irqsave(&iosf_mbi_lock, flags);
158
159 /* Read current mdr value */
160 ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value);
161 if (ret < 0) {
162 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
163 return ret;
164 }
165
166 /* Apply mask */
167 value &= ~mask;
168 mdr &= mask;
169 value |= mdr;
170
171 /* Write back */
172 ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value);
173
174 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
175
176 return ret;
177}
178EXPORT_SYMBOL(iosf_mbi_modify);
179
180static int iosf_mbi_probe(struct pci_dev *pdev,
181 const struct pci_device_id *unused)
182{
183 int ret;
184
185 ret = pci_enable_device(pdev);
186 if (ret < 0) {
187 dev_err(&pdev->dev, "error: could not enable device\n");
188 return ret;
189 }
190
191 mbi_pdev = pci_dev_get(pdev);
192 return 0;
193}
194
195static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = {
196 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) },
197 { 0, },
198};
199MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
200
201static struct pci_driver iosf_mbi_pci_driver = {
202 .name = "iosf_mbi_pci",
203 .probe = iosf_mbi_probe,
204 .id_table = iosf_mbi_pci_ids,
205};
206
207static int __init iosf_mbi_init(void)
208{
209 return pci_register_driver(&iosf_mbi_pci_driver);
210}
211
212static void __exit iosf_mbi_exit(void)
213{
214 pci_unregister_driver(&iosf_mbi_pci_driver);
215 if (mbi_pdev) {
216 pci_dev_put(mbi_pdev);
217 mbi_pdev = NULL;
218 }
219}
220
221module_init(iosf_mbi_init);
222module_exit(iosf_mbi_exit);
223
224MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
225MODULE_DESCRIPTION("IOSF Mailbox Interface accessor");
226MODULE_LICENSE("GPL v2");
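
A hedged usage sketch for the accessors this file exports (kernel context). The unit and opcode names are assumed to come from the <asm/iosf_mbi.h> header introduced alongside this driver, and the 0x100 register offset is a made-up placeholder:

#include <linux/bitops.h>
#include <asm/iosf_mbi.h>

static int punit_set_bit0(void)
{
	u32 val;
	int ret;

	/* one-shot read through the mailbox */
	ret = iosf_mbi_read(BT_MBI_UNIT_PMC, BT_MBI_PMC_READ, 0x100, &val);
	if (ret)
		return ret;

	/* read-modify-write done atomically under iosf_mbi_lock */
	return iosf_mbi_modify(BT_MBI_UNIT_PMC, BT_MBI_PMC_READ, 0x100,
			       BIT(0) /* mdr */, BIT(0) /* mask */);
}
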
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 22d0687e7fda..283a76a9cc40 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -125,6 +125,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)
125 seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); 125 seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
126 seq_printf(p, " Machine check polls\n"); 126 seq_printf(p, " Machine check polls\n");
127#endif 127#endif
128#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
 129 seq_printf(p, "%*s: ", prec, "HYP");
130 for_each_online_cpu(j)
131 seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count);
132 seq_printf(p, " Hypervisor callback interrupts\n");
133#endif
128 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); 134 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
129#if defined(CONFIG_X86_IO_APIC) 135#if defined(CONFIG_X86_IO_APIC)
130 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); 136 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -193,9 +199,13 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
193 if (!handle_irq(irq, regs)) { 199 if (!handle_irq(irq, regs)) {
194 ack_APIC_irq(); 200 ack_APIC_irq();
195 201
196 if (printk_ratelimit()) 202 if (irq != VECTOR_RETRIGGERED) {
197 pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", 203 pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
198 __func__, smp_processor_id(), vector, irq); 204 __func__, smp_processor_id(),
205 vector, irq);
206 } else {
207 __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
208 }
199 } 209 }
200 210
201 irq_exit(); 211 irq_exit();
@@ -262,6 +272,83 @@ __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
262EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 272EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
263 273
264#ifdef CONFIG_HOTPLUG_CPU 274#ifdef CONFIG_HOTPLUG_CPU
275
276/* These two declarations are only used in check_irq_vectors_for_cpu_disable()
277 * below, which is protected by stop_machine(). Putting them on the stack
278 * results in a stack frame overflow. Dynamically allocating could result in a
279 * failure so declare these two cpumasks as global.
280 */
281static struct cpumask affinity_new, online_new;
282
283/*
284 * This cpu is going to be removed and its vectors migrated to the remaining
285 * online cpus. Check to see if there are enough vectors in the remaining cpus.
286 * This function is protected by stop_machine().
287 */
288int check_irq_vectors_for_cpu_disable(void)
289{
290 int irq, cpu;
291 unsigned int this_cpu, vector, this_count, count;
292 struct irq_desc *desc;
293 struct irq_data *data;
294
295 this_cpu = smp_processor_id();
296 cpumask_copy(&online_new, cpu_online_mask);
297 cpu_clear(this_cpu, online_new);
298
299 this_count = 0;
300 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
301 irq = __this_cpu_read(vector_irq[vector]);
302 if (irq >= 0) {
303 desc = irq_to_desc(irq);
304 data = irq_desc_get_irq_data(desc);
305 cpumask_copy(&affinity_new, data->affinity);
306 cpu_clear(this_cpu, affinity_new);
307
308 /* Do not count inactive or per-cpu irqs. */
309 if (!irq_has_action(irq) || irqd_is_per_cpu(data))
310 continue;
311
312 /*
313 * A single irq may be mapped to multiple
 314 * cpus' vector_irq[] (for example IOAPIC cluster
315 * mode). In this case we have two
316 * possibilities:
317 *
 318 * 1) the resulting affinity mask is empty; that is,
 319 * the down'd cpu is the last cpu in the irq's
320 * affinity mask, or
321 *
322 * 2) the resulting affinity mask is no longer
323 * a subset of the online cpus but the affinity
 324 * mask is not zero; that is, the down'd cpu is the
 325 * last online cpu in a user-set affinity mask.
326 */
327 if (cpumask_empty(&affinity_new) ||
328 !cpumask_subset(&affinity_new, &online_new))
329 this_count++;
330 }
331 }
332
333 count = 0;
334 for_each_online_cpu(cpu) {
335 if (cpu == this_cpu)
336 continue;
337 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
338 vector++) {
339 if (per_cpu(vector_irq, cpu)[vector] < 0)
340 count++;
341 }
342 }
343
344 if (count < this_count) {
345 pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n",
346 this_cpu, this_count, count);
347 return -ERANGE;
348 }
349 return 0;
350}
351
265/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 352/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
266void fixup_irqs(void) 353void fixup_irqs(void)
267{ 354{
@@ -344,7 +431,7 @@ void fixup_irqs(void)
344 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 431 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
345 unsigned int irr; 432 unsigned int irr;
346 433
347 if (__this_cpu_read(vector_irq[vector]) < 0) 434 if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
348 continue; 435 continue;
349 436
350 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); 437 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -355,11 +442,14 @@ void fixup_irqs(void)
355 data = irq_desc_get_irq_data(desc); 442 data = irq_desc_get_irq_data(desc);
356 chip = irq_data_get_irq_chip(data); 443 chip = irq_data_get_irq_chip(data);
357 raw_spin_lock(&desc->lock); 444 raw_spin_lock(&desc->lock);
358 if (chip->irq_retrigger) 445 if (chip->irq_retrigger) {
359 chip->irq_retrigger(data); 446 chip->irq_retrigger(data);
447 __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
448 }
360 raw_spin_unlock(&desc->lock); 449 raw_spin_unlock(&desc->lock);
361 } 450 }
362 __this_cpu_write(vector_irq[vector], -1); 451 if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
452 __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
363 } 453 }
364} 454}
365#endif 455#endif
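
The headroom check in check_irq_vectors_for_cpu_disable() is easiest to see with concrete numbers. A userspace toy model (a sketch only; it skips the per-cpu, inactive-irq and affinity-mask filtering the real function performs):

#include <stdio.h>

#define NCPUS		 4
#define NVECTORS	 16
#define VECTOR_UNDEFINED (-1)

int main(void)
{
	int vector_irq[NCPUS][NVECTORS];
	int dying = 0, need = 0, have = 0, cpu, v;

	/* fake layout: the dying CPU has 6 live vectors, the others 2 each */
	for (cpu = 0; cpu < NCPUS; cpu++)
		for (v = 0; v < NVECTORS; v++)
			vector_irq[cpu][v] = v < (cpu == dying ? 6 : 2)
						? 100 + v : VECTOR_UNDEFINED;

	for (v = 0; v < NVECTORS; v++)		/* vectors to migrate away */
		if (vector_irq[dying][v] >= 0)
			need++;
	for (cpu = 0; cpu < NCPUS; cpu++) {	/* free slots elsewhere */
		if (cpu == dying)
			continue;
		for (v = 0; v < NVECTORS; v++)
			if (vector_irq[cpu][v] < 0)
				have++;
	}

	/* prints: need 6, have 42 -> ok to offline */
	printf("need %d, have %d -> %s\n", need, have,
	       have >= need ? "ok to offline" : "-ERANGE");
	return 0;
}
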
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index d7fcbedc9c43..63ce838e5a54 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -55,16 +55,8 @@ static inline int check_stack_overflow(void) { return 0; }
55static inline void print_stack_overflow(void) { } 55static inline void print_stack_overflow(void) { }
56#endif 56#endif
57 57
58/* 58DEFINE_PER_CPU(struct irq_stack *, hardirq_stack);
59 * per-CPU IRQ handling contexts (thread information and stack) 59DEFINE_PER_CPU(struct irq_stack *, softirq_stack);
60 */
61union irq_ctx {
62 struct thread_info tinfo;
63 u32 stack[THREAD_SIZE/sizeof(u32)];
64} __attribute__((aligned(THREAD_SIZE)));
65
66static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
67static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
68 60
69static void call_on_stack(void *func, void *stack) 61static void call_on_stack(void *func, void *stack)
70{ 62{
@@ -77,14 +69,26 @@ static void call_on_stack(void *func, void *stack)
77 : "memory", "cc", "edx", "ecx", "eax"); 69 : "memory", "cc", "edx", "ecx", "eax");
78} 70}
79 71
72/* how to get the current stack pointer from C */
73#define current_stack_pointer ({ \
74 unsigned long sp; \
75 asm("mov %%esp,%0" : "=g" (sp)); \
76 sp; \
77})
78
79static inline void *current_stack(void)
80{
81 return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
82}
83
80static inline int 84static inline int
81execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) 85execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
82{ 86{
83 union irq_ctx *curctx, *irqctx; 87 struct irq_stack *curstk, *irqstk;
84 u32 *isp, arg1, arg2; 88 u32 *isp, *prev_esp, arg1, arg2;
85 89
86 curctx = (union irq_ctx *) current_thread_info(); 90 curstk = (struct irq_stack *) current_stack();
87 irqctx = __this_cpu_read(hardirq_ctx); 91 irqstk = __this_cpu_read(hardirq_stack);
88 92
89 /* 93 /*
90 * this is where we switch to the IRQ stack. However, if we are 94 * this is where we switch to the IRQ stack. However, if we are
@@ -92,13 +96,14 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
92 * handler) we can't do that and just have to keep using the 96 * handler) we can't do that and just have to keep using the
93 * current stack (which is the irq stack already after all) 97 * current stack (which is the irq stack already after all)
94 */ 98 */
95 if (unlikely(curctx == irqctx)) 99 if (unlikely(curstk == irqstk))
96 return 0; 100 return 0;
97 101
98 /* build the stack frame on the IRQ stack */ 102 isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
99 isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); 103
100 irqctx->tinfo.task = curctx->tinfo.task; 104 /* Save the next esp at the bottom of the stack */
101 irqctx->tinfo.previous_esp = current_stack_pointer; 105 prev_esp = (u32 *)irqstk;
106 *prev_esp = current_stack_pointer;
102 107
103 if (unlikely(overflow)) 108 if (unlikely(overflow))
104 call_on_stack(print_stack_overflow, isp); 109 call_on_stack(print_stack_overflow, isp);
@@ -118,46 +123,40 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
118 */ 123 */
119void irq_ctx_init(int cpu) 124void irq_ctx_init(int cpu)
120{ 125{
121 union irq_ctx *irqctx; 126 struct irq_stack *irqstk;
122 127
123 if (per_cpu(hardirq_ctx, cpu)) 128 if (per_cpu(hardirq_stack, cpu))
124 return; 129 return;
125 130
126 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 131 irqstk = page_address(alloc_pages_node(cpu_to_node(cpu),
127 THREADINFO_GFP, 132 THREADINFO_GFP,
128 THREAD_SIZE_ORDER)); 133 THREAD_SIZE_ORDER));
129 memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); 134 per_cpu(hardirq_stack, cpu) = irqstk;
130 irqctx->tinfo.cpu = cpu;
131 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
132
133 per_cpu(hardirq_ctx, cpu) = irqctx;
134 135
135 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 136 irqstk = page_address(alloc_pages_node(cpu_to_node(cpu),
136 THREADINFO_GFP, 137 THREADINFO_GFP,
137 THREAD_SIZE_ORDER)); 138 THREAD_SIZE_ORDER));
138 memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); 139 per_cpu(softirq_stack, cpu) = irqstk;
139 irqctx->tinfo.cpu = cpu;
140 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
141
142 per_cpu(softirq_ctx, cpu) = irqctx;
143 140
144 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", 141 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
145 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); 142 cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
146} 143}
147 144
148void do_softirq_own_stack(void) 145void do_softirq_own_stack(void)
149{ 146{
150 struct thread_info *curctx; 147 struct thread_info *curstk;
151 union irq_ctx *irqctx; 148 struct irq_stack *irqstk;
152 u32 *isp; 149 u32 *isp, *prev_esp;
153 150
154 curctx = current_thread_info(); 151 curstk = current_stack();
155 irqctx = __this_cpu_read(softirq_ctx); 152 irqstk = __this_cpu_read(softirq_stack);
156 irqctx->tinfo.task = curctx->task;
157 irqctx->tinfo.previous_esp = current_stack_pointer;
158 153
159 /* build the stack frame on the softirq stack */ 154 /* build the stack frame on the softirq stack */
160 isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); 155 isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
156
157 /* Push the previous esp onto the stack */
158 prev_esp = (u32 *)irqstk;
159 *prev_esp = current_stack_pointer;
161 160
162 call_on_stack(__do_softirq, isp); 161 call_on_stack(__do_softirq, isp);
163} 162}
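
The replacement convention is simply that the first word of each per-cpu irq stack stores the interrupted context's stack pointer. A userspace toy model of just that handoff (alignment and the real assembly switch in call_on_stack() are omitted):

#include <stdio.h>
#include <stdint.h>

#define THREAD_SIZE 8192

int main(void)
{
	static uint32_t irqstk[THREAD_SIZE / 4];  /* stands in for hardirq_stack */
	uint32_t task_esp = 0xc12345f0;           /* interrupted context's %esp */
	uint32_t *prev_esp = &irqstk[0];          /* bottom word of the irq stack */

	*prev_esp = task_esp;	/* what execute_on_irq_stack() stores */
	/* what kernel_stack_pointer() reads back when unwinding */
	printf("unwind resumes at %#x\n", (unsigned int)*prev_esp);
	return 0;
}
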
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a2a1fbc594ff..7f50156542fb 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -52,7 +52,7 @@ static struct irqaction irq2 = {
52}; 52};
53 53
54DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 54DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
55 [0 ... NR_VECTORS - 1] = -1, 55 [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
56}; 56};
57 57
58int vector_used_by_percpu_irq(unsigned int vector) 58int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
60 int cpu; 60 int cpu;
61 61
62 for_each_online_cpu(cpu) { 62 for_each_online_cpu(cpu) {
63 if (per_cpu(vector_irq, cpu)[vector] != -1) 63 if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
64 return 1; 64 return 1;
65 } 65 }
66 66
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 836f8322960e..7ec1d5f8d283 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -39,7 +39,6 @@
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/delay.h> 40#include <linux/delay.h>
41#include <linux/kgdb.h> 41#include <linux/kgdb.h>
42#include <linux/init.h>
43#include <linux/smp.h> 42#include <linux/smp.h>
44#include <linux/nmi.h> 43#include <linux/nmi.h>
45#include <linux/hw_breakpoint.h> 44#include <linux/hw_breakpoint.h>
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
new file mode 100644
index 000000000000..c2bedaea11f7
--- /dev/null
+++ b/arch/x86/kernel/ksysfs.c
@@ -0,0 +1,340 @@
1/*
2 * Architecture specific sysfs attributes in /sys/kernel
3 *
4 * Copyright (C) 2007, Intel Corp.
5 * Huang Ying <ying.huang@intel.com>
 6 * Copyright (C) 2013 Red Hat, Inc.
7 * Dave Young <dyoung@redhat.com>
8 *
9 * This file is released under the GPLv2
10 */
11
12#include <linux/kobject.h>
13#include <linux/string.h>
14#include <linux/sysfs.h>
15#include <linux/init.h>
16#include <linux/stat.h>
17#include <linux/slab.h>
18#include <linux/mm.h>
19
20#include <asm/io.h>
21#include <asm/setup.h>
22
23static ssize_t version_show(struct kobject *kobj,
24 struct kobj_attribute *attr, char *buf)
25{
26 return sprintf(buf, "0x%04x\n", boot_params.hdr.version);
27}
28
29static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
30
31static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
32 struct bin_attribute *bin_attr,
33 char *buf, loff_t off, size_t count)
34{
35 memcpy(buf, (void *)&boot_params + off, count);
36 return count;
37}
38
39static struct bin_attribute boot_params_data_attr = {
40 .attr = {
41 .name = "data",
42 .mode = S_IRUGO,
43 },
44 .read = boot_params_data_read,
45 .size = sizeof(boot_params),
46};
47
48static struct attribute *boot_params_version_attrs[] = {
49 &boot_params_version_attr.attr,
50 NULL,
51};
52
53static struct bin_attribute *boot_params_data_attrs[] = {
54 &boot_params_data_attr,
55 NULL,
56};
57
58static struct attribute_group boot_params_attr_group = {
59 .attrs = boot_params_version_attrs,
60 .bin_attrs = boot_params_data_attrs,
61};
62
63static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
64{
65 const char *name;
66
67 name = kobject_name(kobj);
68 return kstrtoint(name, 10, nr);
69}
70
71static int get_setup_data_paddr(int nr, u64 *paddr)
72{
73 int i = 0;
74 struct setup_data *data;
75 u64 pa_data = boot_params.hdr.setup_data;
76
77 while (pa_data) {
78 if (nr == i) {
79 *paddr = pa_data;
80 return 0;
81 }
82 data = ioremap_cache(pa_data, sizeof(*data));
83 if (!data)
84 return -ENOMEM;
85
86 pa_data = data->next;
87 iounmap(data);
88 i++;
89 }
90 return -EINVAL;
91}
92
93static int __init get_setup_data_size(int nr, size_t *size)
94{
95 int i = 0;
96 struct setup_data *data;
97 u64 pa_data = boot_params.hdr.setup_data;
98
99 while (pa_data) {
100 data = ioremap_cache(pa_data, sizeof(*data));
101 if (!data)
102 return -ENOMEM;
103 if (nr == i) {
104 *size = data->len;
105 iounmap(data);
106 return 0;
107 }
108
109 pa_data = data->next;
110 iounmap(data);
111 i++;
112 }
113 return -EINVAL;
114}
115
116static ssize_t type_show(struct kobject *kobj,
117 struct kobj_attribute *attr, char *buf)
118{
119 int nr, ret;
120 u64 paddr;
121 struct setup_data *data;
122
123 ret = kobj_to_setup_data_nr(kobj, &nr);
124 if (ret)
125 return ret;
126
127 ret = get_setup_data_paddr(nr, &paddr);
128 if (ret)
129 return ret;
130 data = ioremap_cache(paddr, sizeof(*data));
131 if (!data)
132 return -ENOMEM;
133
134 ret = sprintf(buf, "0x%x\n", data->type);
135 iounmap(data);
136 return ret;
137}
138
139static ssize_t setup_data_data_read(struct file *fp,
140 struct kobject *kobj,
141 struct bin_attribute *bin_attr,
142 char *buf,
143 loff_t off, size_t count)
144{
145 int nr, ret = 0;
146 u64 paddr;
147 struct setup_data *data;
148 void *p;
149
150 ret = kobj_to_setup_data_nr(kobj, &nr);
151 if (ret)
152 return ret;
153
154 ret = get_setup_data_paddr(nr, &paddr);
155 if (ret)
156 return ret;
157 data = ioremap_cache(paddr, sizeof(*data));
158 if (!data)
159 return -ENOMEM;
160
161 if (off > data->len) {
162 ret = -EINVAL;
163 goto out;
164 }
165
166 if (count > data->len - off)
167 count = data->len - off;
168
169 if (!count)
170 goto out;
171
172 ret = count;
173 p = ioremap_cache(paddr + sizeof(*data), data->len);
174 if (!p) {
175 ret = -ENOMEM;
176 goto out;
177 }
178 memcpy(buf, p + off, count);
179 iounmap(p);
180out:
181 iounmap(data);
182 return ret;
183}
184
185static struct kobj_attribute type_attr = __ATTR_RO(type);
186
187static struct bin_attribute data_attr = {
188 .attr = {
189 .name = "data",
190 .mode = S_IRUGO,
191 },
192 .read = setup_data_data_read,
193};
194
195static struct attribute *setup_data_type_attrs[] = {
196 &type_attr.attr,
197 NULL,
198};
199
200static struct bin_attribute *setup_data_data_attrs[] = {
201 &data_attr,
202 NULL,
203};
204
205static struct attribute_group setup_data_attr_group = {
206 .attrs = setup_data_type_attrs,
207 .bin_attrs = setup_data_data_attrs,
208};
209
210static int __init create_setup_data_node(struct kobject *parent,
211 struct kobject **kobjp, int nr)
212{
213 int ret = 0;
214 size_t size;
215 struct kobject *kobj;
 216 char name[16]; /* should be enough for setup_data node numbers */
217 snprintf(name, 16, "%d", nr);
218
219 kobj = kobject_create_and_add(name, parent);
220 if (!kobj)
221 return -ENOMEM;
222
223 ret = get_setup_data_size(nr, &size);
224 if (ret)
225 goto out_kobj;
226
227 data_attr.size = size;
228 ret = sysfs_create_group(kobj, &setup_data_attr_group);
229 if (ret)
230 goto out_kobj;
231 *kobjp = kobj;
232
233 return 0;
234out_kobj:
235 kobject_put(kobj);
236 return ret;
237}
238
239static void __init cleanup_setup_data_node(struct kobject *kobj)
240{
241 sysfs_remove_group(kobj, &setup_data_attr_group);
242 kobject_put(kobj);
243}
244
245static int __init get_setup_data_total_num(u64 pa_data, int *nr)
246{
247 int ret = 0;
248 struct setup_data *data;
249
250 *nr = 0;
251 while (pa_data) {
252 *nr += 1;
253 data = ioremap_cache(pa_data, sizeof(*data));
254 if (!data) {
255 ret = -ENOMEM;
256 goto out;
257 }
258 pa_data = data->next;
259 iounmap(data);
260 }
261
262out:
263 return ret;
264}
265
266static int __init create_setup_data_nodes(struct kobject *parent)
267{
268 struct kobject *setup_data_kobj, **kobjp;
269 u64 pa_data;
270 int i, j, nr, ret = 0;
271
272 pa_data = boot_params.hdr.setup_data;
273 if (!pa_data)
274 return 0;
275
276 setup_data_kobj = kobject_create_and_add("setup_data", parent);
277 if (!setup_data_kobj) {
278 ret = -ENOMEM;
279 goto out;
280 }
281
282 ret = get_setup_data_total_num(pa_data, &nr);
283 if (ret)
284 goto out_setup_data_kobj;
285
286 kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL);
287 if (!kobjp) {
288 ret = -ENOMEM;
289 goto out_setup_data_kobj;
290 }
291
292 for (i = 0; i < nr; i++) {
293 ret = create_setup_data_node(setup_data_kobj, kobjp + i, i);
294 if (ret)
295 goto out_clean_nodes;
296 }
297
298 kfree(kobjp);
299 return 0;
300
301out_clean_nodes:
 302 for (j = i - 1; j >= 0; j--)
303 cleanup_setup_data_node(*(kobjp + j));
304 kfree(kobjp);
305out_setup_data_kobj:
306 kobject_put(setup_data_kobj);
307out:
308 return ret;
309}
310
311static int __init boot_params_ksysfs_init(void)
312{
313 int ret;
314 struct kobject *boot_params_kobj;
315
316 boot_params_kobj = kobject_create_and_add("boot_params",
317 kernel_kobj);
318 if (!boot_params_kobj) {
319 ret = -ENOMEM;
320 goto out;
321 }
322
323 ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group);
324 if (ret)
325 goto out_boot_params_kobj;
326
327 ret = create_setup_data_nodes(boot_params_kobj);
328 if (ret)
329 goto out_create_group;
330
331 return 0;
332out_create_group:
333 sysfs_remove_group(boot_params_kobj, &boot_params_attr_group);
334out_boot_params_kobj:
335 kobject_put(boot_params_kobj);
336out:
337 return ret;
338}
339
340arch_initcall(boot_params_ksysfs_init);
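
With the group registered, the attributes read like any other sysfs file. A minimal userspace sketch (the path exists only on kernels carrying this patch; error handling trimmed):

#include <stdio.h>

int main(void)
{
	char buf[32];
	FILE *f = fopen("/sys/kernel/boot_params/version", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("boot protocol version: %s", buf);  /* e.g. "0x020c" */
	fclose(f);
	return 0;
}
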
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6dd802c6d780..0331cb389d68 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -417,7 +417,6 @@ void kvm_disable_steal_time(void)
417#ifdef CONFIG_SMP 417#ifdef CONFIG_SMP
418static void __init kvm_smp_prepare_boot_cpu(void) 418static void __init kvm_smp_prepare_boot_cpu(void)
419{ 419{
420 WARN_ON(kvm_register_clock("primary cpu clock"));
421 kvm_guest_cpu_init(); 420 kvm_guest_cpu_init();
422 native_smp_prepare_boot_cpu(); 421 native_smp_prepare_boot_cpu();
423 kvm_spinlock_init(); 422 kvm_spinlock_init();
@@ -500,6 +499,38 @@ void __init kvm_guest_init(void)
500#endif 499#endif
501} 500}
502 501
502static noinline uint32_t __kvm_cpuid_base(void)
503{
504 if (boot_cpu_data.cpuid_level < 0)
505 return 0; /* So we don't blow up on old processors */
506
507 if (cpu_has_hypervisor)
508 return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
509
510 return 0;
511}
512
513static inline uint32_t kvm_cpuid_base(void)
514{
515 static int kvm_cpuid_base = -1;
516
517 if (kvm_cpuid_base == -1)
518 kvm_cpuid_base = __kvm_cpuid_base();
519
520 return kvm_cpuid_base;
521}
522
523bool kvm_para_available(void)
524{
525 return kvm_cpuid_base() != 0;
526}
527EXPORT_SYMBOL_GPL(kvm_para_available);
528
529unsigned int kvm_arch_para_features(void)
530{
531 return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
532}
533
503static uint32_t __init kvm_detect(void) 534static uint32_t __init kvm_detect(void)
504{ 535{
505 return kvm_cpuid_base(); 536 return kvm_cpuid_base();
@@ -673,7 +704,7 @@ static cpumask_t waiting_cpus;
673/* Track spinlock on which a cpu is waiting */ 704/* Track spinlock on which a cpu is waiting */
674static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); 705static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);
675 706
676static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) 707__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
677{ 708{
678 struct kvm_lock_waiting *w; 709 struct kvm_lock_waiting *w;
679 int cpu; 710 int cpu;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e6041094ff26..d9156ceecdff 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -242,7 +242,7 @@ void __init kvmclock_init(void)
242 hv_clock = __va(mem); 242 hv_clock = __va(mem);
243 memset(hv_clock, 0, size); 243 memset(hv_clock, 0, size);
244 244
245 if (kvm_register_clock("boot clock")) { 245 if (kvm_register_clock("primary cpu clock")) {
246 hv_clock = NULL; 246 hv_clock = NULL;
247 memblock_free(mem, size); 247 memblock_free(mem, size);
248 return; 248 return;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ebc987398923..af1d14a9ebda 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -229,6 +229,17 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
229 } 229 }
230 } 230 }
231 231
232 /*
233 * On x86-64 we do not support 16-bit segments due to
234 * IRET leaking the high bits of the kernel stack address.
235 */
236#ifdef CONFIG_X86_64
237 if (!ldt_info.seg_32bit) {
238 error = -EINVAL;
239 goto out_unlock;
240 }
241#endif
242
232 fill_ldt(&ldt, &ldt_info); 243 fill_ldt(&ldt, &ldt_info);
233 if (oldmode) 244 if (oldmode)
234 ldt.avl = 0; 245 ldt.avl = 0;
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 5b19e4d78b00..1667b1de8d5d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -9,7 +9,6 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/kexec.h> 10#include <linux/kexec.h>
11#include <linux/delay.h> 11#include <linux/delay.h>
12#include <linux/init.h>
13#include <linux/numa.h> 12#include <linux/numa.h>
14#include <linux/ftrace.h> 13#include <linux/ftrace.h>
15#include <linux/suspend.h> 14#include <linux/suspend.h>
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 4eabc160696f..679cef0791cd 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -279,5 +279,7 @@ void arch_crash_save_vmcoreinfo(void)
279 VMCOREINFO_SYMBOL(node_data); 279 VMCOREINFO_SYMBOL(node_data);
280 VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); 280 VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
281#endif 281#endif
282 vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
283 (unsigned long)&_text - __START_KERNEL);
282} 284}
283 285
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 18be189368bb..e69f9882bf95 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -28,6 +28,7 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/gfp.h> 29#include <linux/gfp.h>
30#include <linux/jump_label.h> 30#include <linux/jump_label.h>
31#include <linux/random.h>
31 32
32#include <asm/page.h> 33#include <asm/page.h>
33#include <asm/pgtable.h> 34#include <asm/pgtable.h>
@@ -43,13 +44,52 @@ do { \
43} while (0) 44} while (0)
44#endif 45#endif
45 46
47#ifdef CONFIG_RANDOMIZE_BASE
48static unsigned long module_load_offset;
49static int randomize_modules = 1;
50
51/* Mutex protects the module_load_offset. */
52static DEFINE_MUTEX(module_kaslr_mutex);
53
54static int __init parse_nokaslr(char *p)
55{
56 randomize_modules = 0;
57 return 0;
58}
59early_param("nokaslr", parse_nokaslr);
60
61static unsigned long int get_module_load_offset(void)
62{
63 if (randomize_modules) {
64 mutex_lock(&module_kaslr_mutex);
65 /*
66 * Calculate the module_load_offset the first time this
67 * code is called. Once calculated it stays the same until
68 * reboot.
69 */
70 if (module_load_offset == 0)
71 module_load_offset =
72 (get_random_int() % 1024 + 1) * PAGE_SIZE;
73 mutex_unlock(&module_kaslr_mutex);
74 }
75 return module_load_offset;
76}
77#else
78static unsigned long int get_module_load_offset(void)
79{
80 return 0;
81}
82#endif
83
46void *module_alloc(unsigned long size) 84void *module_alloc(unsigned long size)
47{ 85{
48 if (PAGE_ALIGN(size) > MODULES_LEN) 86 if (PAGE_ALIGN(size) > MODULES_LEN)
49 return NULL; 87 return NULL;
50 return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, 88 return __vmalloc_node_range(size, 1,
51 GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 89 MODULES_VADDR + get_module_load_offset(),
52 NUMA_NO_NODE, __builtin_return_address(0)); 90 MODULES_END, GFP_KERNEL | __GFP_HIGHMEM,
91 PAGE_KERNEL_EXEC, NUMA_NO_NODE,
92 __builtin_return_address(0));
53} 93}
54 94
55#ifdef CONFIG_X86_32 95#ifdef CONFIG_X86_32
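
The formula above picks between 1 and 1024 pages, i.e. 4 KiB to 4 MiB with 4 KiB pages, computed once per boot and then reused for every module load. A userspace sketch of the arithmetic (rand() stands in for the kernel's get_random_int()):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long off;

	srand(time(NULL));
	off = ((unsigned long)rand() % 1024 + 1) * PAGE_SIZE;
	printf("module_load_offset = %#lx (%lu KiB past MODULES_VADDR)\n",
	       off, off >> 10);
	return 0;
}
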
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 05266b5aae22..c9603ac80de5 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -259,14 +259,15 @@ static int __init msr_init(void)
259 goto out_chrdev; 259 goto out_chrdev;
260 } 260 }
261 msr_class->devnode = msr_devnode; 261 msr_class->devnode = msr_devnode;
262 get_online_cpus(); 262
263 cpu_notifier_register_begin();
263 for_each_online_cpu(i) { 264 for_each_online_cpu(i) {
264 err = msr_device_create(i); 265 err = msr_device_create(i);
265 if (err != 0) 266 if (err != 0)
266 goto out_class; 267 goto out_class;
267 } 268 }
268 register_hotcpu_notifier(&msr_class_cpu_notifier); 269 __register_hotcpu_notifier(&msr_class_cpu_notifier);
269 put_online_cpus(); 270 cpu_notifier_register_done();
270 271
271 err = 0; 272 err = 0;
272 goto out; 273 goto out;
@@ -275,7 +276,7 @@ out_class:
275 i = 0; 276 i = 0;
276 for_each_online_cpu(i) 277 for_each_online_cpu(i)
277 msr_device_destroy(i); 278 msr_device_destroy(i);
278 put_online_cpus(); 279 cpu_notifier_register_done();
279 class_destroy(msr_class); 280 class_destroy(msr_class);
280out_chrdev: 281out_chrdev:
281 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); 282 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
@@ -286,13 +287,14 @@ out:
286static void __exit msr_exit(void) 287static void __exit msr_exit(void)
287{ 288{
288 int cpu = 0; 289 int cpu = 0;
289 get_online_cpus(); 290
291 cpu_notifier_register_begin();
290 for_each_online_cpu(cpu) 292 for_each_online_cpu(cpu)
291 msr_device_destroy(cpu); 293 msr_device_destroy(cpu);
292 class_destroy(msr_class); 294 class_destroy(msr_class);
293 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); 295 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
294 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 296 __unregister_hotcpu_notifier(&msr_class_cpu_notifier);
295 put_online_cpus(); 297 cpu_notifier_register_done();
296} 298}
297 299
298module_init(msr_init); 300module_init(msr_init);
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 6fcb49ce50a1..b4872b999a71 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -87,6 +87,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
87#define nmi_to_desc(type) (&nmi_desc[type]) 87#define nmi_to_desc(type) (&nmi_desc[type])
88 88
89static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC; 89static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
90
90static int __init nmi_warning_debugfs(void) 91static int __init nmi_warning_debugfs(void)
91{ 92{
92 debugfs_create_u64("nmi_longest_ns", 0644, 93 debugfs_create_u64("nmi_longest_ns", 0644,
@@ -95,6 +96,20 @@ static int __init nmi_warning_debugfs(void)
95} 96}
96fs_initcall(nmi_warning_debugfs); 97fs_initcall(nmi_warning_debugfs);
97 98
99static void nmi_max_handler(struct irq_work *w)
100{
101 struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
102 int remainder_ns, decimal_msecs;
103 u64 whole_msecs = ACCESS_ONCE(a->max_duration);
104
105 remainder_ns = do_div(whole_msecs, (1000 * 1000));
106 decimal_msecs = remainder_ns / 1000;
107
108 printk_ratelimited(KERN_INFO
109 "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
110 a->handler, whole_msecs, decimal_msecs);
111}
112
98static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) 113static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
99{ 114{
100 struct nmi_desc *desc = nmi_to_desc(type); 115 struct nmi_desc *desc = nmi_to_desc(type);
@@ -110,26 +125,20 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
110 * to handle those situations. 125 * to handle those situations.
111 */ 126 */
112 list_for_each_entry_rcu(a, &desc->head, list) { 127 list_for_each_entry_rcu(a, &desc->head, list) {
113 u64 before, delta, whole_msecs; 128 int thishandled;
114 int remainder_ns, decimal_msecs, thishandled; 129 u64 delta;
115 130
116 before = sched_clock(); 131 delta = sched_clock();
117 thishandled = a->handler(type, regs); 132 thishandled = a->handler(type, regs);
118 handled += thishandled; 133 handled += thishandled;
119 delta = sched_clock() - before; 134 delta = sched_clock() - delta;
120 trace_nmi_handler(a->handler, (int)delta, thishandled); 135 trace_nmi_handler(a->handler, (int)delta, thishandled);
121 136
122 if (delta < nmi_longest_ns) 137 if (delta < nmi_longest_ns || delta < a->max_duration)
123 continue; 138 continue;
124 139
125 nmi_longest_ns = delta; 140 a->max_duration = delta;
126 whole_msecs = delta; 141 irq_work_queue(&a->irq_work);
127 remainder_ns = do_div(whole_msecs, (1000 * 1000));
128 decimal_msecs = remainder_ns / 1000;
129 printk_ratelimited(KERN_INFO
130 "INFO: NMI handler (%ps) took too long to run: "
131 "%lld.%03d msecs\n", a->handler, whole_msecs,
132 decimal_msecs);
133 } 142 }
134 143
135 rcu_read_unlock(); 144 rcu_read_unlock();
@@ -146,6 +155,8 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
146 if (!action->handler) 155 if (!action->handler)
147 return -EINVAL; 156 return -EINVAL;
148 157
158 init_irq_work(&action->irq_work, nmi_max_handler);
159
149 spin_lock_irqsave(&desc->lock, flags); 160 spin_lock_irqsave(&desc->lock, flags);
150 161
151 /* 162 /*
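
The millisecond formatting that moved into nmi_max_handler() leans on do_div(), which divides a u64 in place and returns the remainder. The same split in plain userspace C (a sketch with a sample duration):

#include <stdio.h>

int main(void)
{
	unsigned long long duration_ns = 3456789;  /* sample max_duration */
	unsigned long long whole_msecs = duration_ns / (1000 * 1000);
	unsigned int decimal_msecs = (duration_ns % (1000 * 1000)) / 1000;

	/* prints: NMI handler took 3.456 msecs */
	printf("NMI handler took %llu.%03u msecs\n", whole_msecs, decimal_msecs);
	return 0;
}
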
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 299d49302e7d..0497f719977d 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1207,23 +1207,31 @@ error:
1207 return ret; 1207 return ret;
1208} 1208}
1209 1209
1210static inline int __init determine_tce_table_size(u64 ram) 1210static inline int __init determine_tce_table_size(void)
1211{ 1211{
1212 int ret; 1212 int ret;
1213 1213
1214 if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) 1214 if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
1215 return specified_table_size; 1215 return specified_table_size;
1216 1216
1217 /* 1217 if (is_kdump_kernel() && saved_max_pfn) {
1218 * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to 1218 /*
1219 * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each 1219 * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
1220 * larger table size has twice as many entries, so shift the 1220 * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
1221 * max ram address by 13 to divide by 8K and then look at the 1221 * larger table size has twice as many entries, so shift the
1222 * order of the result to choose between 0-7. 1222 * max ram address by 13 to divide by 8K and then look at the
1223 */ 1223 * order of the result to choose between 0-7.
1224 ret = get_order(ram >> 13); 1224 */
1225 if (ret > TCE_TABLE_SIZE_8M) 1225 ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13);
1226 if (ret > TCE_TABLE_SIZE_8M)
1227 ret = TCE_TABLE_SIZE_8M;
1228 } else {
1229 /*
 1230 * Use 8M by default (suggested by Muli) if it's not
 1231 * a kdump kernel or saved_max_pfn isn't set.
1232 */
1226 ret = TCE_TABLE_SIZE_8M; 1233 ret = TCE_TABLE_SIZE_8M;
1234 }
1227 1235
1228 return ret; 1236 return ret;
1229} 1237}
@@ -1418,8 +1426,7 @@ int __init detect_calgary(void)
1418 return -ENOMEM; 1426 return -ENOMEM;
1419 } 1427 }
1420 1428
1421 specified_table_size = determine_tce_table_size((is_kdump_kernel() ? 1429 specified_table_size = determine_tce_table_size();
1422 saved_max_pfn : max_pfn) * PAGE_SIZE);
1423 1430
1424 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 1431 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1425 struct calgary_bus_info *info = &bus_info[bus]; 1432 struct calgary_bus_info *info = &bus_info[bus];
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 872079a67e4d..f7d0672481fd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -100,8 +100,10 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
100 flag |= __GFP_ZERO; 100 flag |= __GFP_ZERO;
101again: 101again:
102 page = NULL; 102 page = NULL;
 103 if (!(flag & GFP_ATOMIC)) 103 /* CMA can be used only in a context that permits sleeping */
104 if (flag & __GFP_WAIT)
104 page = dma_alloc_from_contiguous(dev, count, get_order(size)); 105 page = dma_alloc_from_contiguous(dev, count, get_order(size));
106 /* fallback */
105 if (!page) 107 if (!page)
106 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); 108 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
107 if (!page) 109 if (!page)
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 871be4a84c7d..da15918d1c81 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -3,7 +3,6 @@
3#include <linux/dma-mapping.h> 3#include <linux/dma-mapping.h>
4#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <linux/init.h>
7#include <linux/gfp.h> 6#include <linux/gfp.h>
8#include <linux/pci.h> 7#include <linux/pci.h>
9#include <linux/mm.h> 8#include <linux/mm.h>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95ab8b5..4505e2a950d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -298,10 +298,7 @@ void arch_cpu_idle_dead(void)
298 */ 298 */
299void arch_cpu_idle(void) 299void arch_cpu_idle(void)
300{ 300{
301 if (cpuidle_idle_call()) 301 x86_idle();
302 x86_idle();
303 else
304 local_irq_enable();
305} 302}
306 303
307/* 304/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 6f1236c29c4b..7bc86bbe7485 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -24,7 +24,6 @@
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/delay.h> 25#include <linux/delay.h>
26#include <linux/reboot.h> 26#include <linux/reboot.h>
27#include <linux/init.h>
28#include <linux/mc146818rtc.h> 27#include <linux/mc146818rtc.h>
29#include <linux/module.h> 28#include <linux/module.h>
30#include <linux/kallsyms.h> 29#include <linux/kallsyms.h>
@@ -315,6 +314,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
315 */ 314 */
316 arch_end_context_switch(next_p); 315 arch_end_context_switch(next_p);
317 316
317 this_cpu_write(kernel_stack,
318 (unsigned long)task_stack_page(next_p) +
319 THREAD_SIZE - KERNEL_STACK_OFFSET);
320
318 /* 321 /*
319 * Restore %gs if needed (which is common) 322 * Restore %gs if needed (which is common)
320 */ 323 */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7461f50d5bb1..678c0ada3b3c 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -184,14 +184,14 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs)
184{ 184{
185 unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); 185 unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
186 unsigned long sp = (unsigned long)&regs->sp; 186 unsigned long sp = (unsigned long)&regs->sp;
187 struct thread_info *tinfo; 187 u32 *prev_esp;
188 188
189 if (context == (sp & ~(THREAD_SIZE - 1))) 189 if (context == (sp & ~(THREAD_SIZE - 1)))
190 return sp; 190 return sp;
191 191
192 tinfo = (struct thread_info *)context; 192 prev_esp = (u32 *)(context);
 193 if (tinfo->previous_esp) 193 if (*prev_esp)
 194 return tinfo->previous_esp; 194 return (unsigned long)*prev_esp;
195 195
196 return (unsigned long)regs; 196 return (unsigned long)regs;
197} 197}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 04ee1e2e4c02..ff898bbf579d 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -529,7 +529,7 @@ static void quirk_amd_nb_node(struct pci_dev *dev)
529 return; 529 return;
530 530
531 pci_read_config_dword(nb_ht, 0x60, &val); 531 pci_read_config_dword(nb_ht, 0x60, &val);
532 node = val & 7; 532 node = pcibus_to_node(dev->bus) | (val & 7);
533 /* 533 /*
534 * Some hardware may return an invalid node ID, 534 * Some hardware may return an invalid node ID,
535 * so check it first: 535 * so check it first:
@@ -571,3 +571,40 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5,
571 quirk_amd_nb_node); 571 quirk_amd_nb_node);
572 572
573#endif 573#endif
574
575#ifdef CONFIG_PCI
576/*
577 * Processor does not ensure DRAM scrub read/write sequence
578 * is atomic wrt accesses to CC6 save state area. Therefore
579 * if a concurrent scrub read/write access is to same address
580 * the entry may appear as if it is not written. This quirk
581 * applies to Fam16h models 00h-0Fh
582 *
583 * See "Revision Guide" for AMD F16h models 00h-0fh,
584 * document 51810 rev. 3.04, Nov 2013
585 */
586static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev)
587{
588 u32 val;
589
590 /*
591 * Suggested workaround:
592 * set D18F3x58[4:0] = 00h and set D18F3x5C[0] = 0b
593 */
594 pci_read_config_dword(dev, 0x58, &val);
595 if (val & 0x1F) {
596 val &= ~(0x1F);
597 pci_write_config_dword(dev, 0x58, val);
598 }
599
600 pci_read_config_dword(dev, 0x5C, &val);
601 if (val & BIT(0)) {
602 val &= ~BIT(0);
603 pci_write_config_dword(dev, 0x5c, val);
604 }
605}
606
607DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
608 amd_disable_seq_and_redirect_scrub);
609
610#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c752cb43e52f..654b46574b91 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -464,9 +464,12 @@ void __attribute__((weak)) mach_reboot_fixups(void)
464 * 2) If still alive, write to the keyboard controller 464 * 2) If still alive, write to the keyboard controller
465 * 3) If still alive, write to the ACPI reboot register again 465 * 3) If still alive, write to the ACPI reboot register again
466 * 4) If still alive, write to the keyboard controller again 466 * 4) If still alive, write to the keyboard controller again
467 * 5) If still alive, call the EFI runtime service to reboot
468 * 6) If still alive, write to the PCI IO port 0xCF9 to reboot
469 * 7) If still alive, inform BIOS to do a proper reboot
467 * 470 *
468 * If the machine is still alive at this stage, it gives up. We default to 471 * If the machine is still alive at this stage, it gives up. We default to
469 * following the same pattern, except that if we're still alive after (4) we'll 472 * following the same pattern, except that if we're still alive after (7) we'll
470 * try to force a triple fault and then cycle between hitting the keyboard 473 * try to force a triple fault and then cycle between hitting the keyboard
471 * controller and doing that 474 * controller and doing that
472 */ 475 */
@@ -502,7 +505,7 @@ static void native_machine_emergency_restart(void)
502 attempt = 1; 505 attempt = 1;
503 reboot_type = BOOT_ACPI; 506 reboot_type = BOOT_ACPI;
504 } else { 507 } else {
505 reboot_type = BOOT_TRIPLE; 508 reboot_type = BOOT_EFI;
506 } 509 }
507 break; 510 break;
508 511
@@ -510,13 +513,15 @@ static void native_machine_emergency_restart(void)
510 load_idt(&no_idt); 513 load_idt(&no_idt);
511 __asm__ __volatile__("int3"); 514 __asm__ __volatile__("int3");
512 515
516 /* We're probably dead after this, but... */
513 reboot_type = BOOT_KBD; 517 reboot_type = BOOT_KBD;
514 break; 518 break;
515 519
516 case BOOT_BIOS: 520 case BOOT_BIOS:
517 machine_real_restart(MRR_BIOS); 521 machine_real_restart(MRR_BIOS);
518 522
519 reboot_type = BOOT_KBD; 523 /* We're probably dead after this, but... */
524 reboot_type = BOOT_TRIPLE;
520 break; 525 break;
521 526
522 case BOOT_ACPI: 527 case BOOT_ACPI:
@@ -530,7 +535,7 @@ static void native_machine_emergency_restart(void)
530 EFI_RESET_WARM : 535 EFI_RESET_WARM :
531 EFI_RESET_COLD, 536 EFI_RESET_COLD,
532 EFI_SUCCESS, 0, NULL); 537 EFI_SUCCESS, 0, NULL);
533 reboot_type = BOOT_KBD; 538 reboot_type = BOOT_CF9_COND;
534 break; 539 break;
535 540
536 case BOOT_CF9: 541 case BOOT_CF9:
@@ -548,7 +553,7 @@ static void native_machine_emergency_restart(void)
548 outb(cf9|reboot_code, 0xcf9); 553 outb(cf9|reboot_code, 0xcf9);
549 udelay(50); 554 udelay(50);
550 } 555 }
551 reboot_type = BOOT_KBD; 556 reboot_type = BOOT_BIOS;
552 break; 557 break;
553 } 558 }
554 } 559 }
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cb233bc9dee3..09c76d265550 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -295,6 +295,8 @@ static void __init reserve_brk(void)
295 _brk_start = 0; 295 _brk_start = 0;
296} 296}
297 297
298u64 relocated_ramdisk;
299
298#ifdef CONFIG_BLK_DEV_INITRD 300#ifdef CONFIG_BLK_DEV_INITRD
299 301
300static u64 __init get_ramdisk_image(void) 302static u64 __init get_ramdisk_image(void)
@@ -321,25 +323,24 @@ static void __init relocate_initrd(void)
321 u64 ramdisk_image = get_ramdisk_image(); 323 u64 ramdisk_image = get_ramdisk_image();
322 u64 ramdisk_size = get_ramdisk_size(); 324 u64 ramdisk_size = get_ramdisk_size();
323 u64 area_size = PAGE_ALIGN(ramdisk_size); 325 u64 area_size = PAGE_ALIGN(ramdisk_size);
324 u64 ramdisk_here;
325 unsigned long slop, clen, mapaddr; 326 unsigned long slop, clen, mapaddr;
326 char *p, *q; 327 char *p, *q;
327 328
328 /* We need to move the initrd down into directly mapped mem */ 329 /* We need to move the initrd down into directly mapped mem */
329 ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), 330 relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
330 area_size, PAGE_SIZE); 331 area_size, PAGE_SIZE);
331 332
332 if (!ramdisk_here) 333 if (!relocated_ramdisk)
333 panic("Cannot find place for new RAMDISK of size %lld\n", 334 panic("Cannot find place for new RAMDISK of size %lld\n",
334 ramdisk_size); 335 ramdisk_size);
335 336
336 /* Note: this includes all the mem currently occupied by 337 /* Note: this includes all the mem currently occupied by
337 the initrd, we rely on that fact to keep the data intact. */ 338 the initrd, we rely on that fact to keep the data intact. */
338 memblock_reserve(ramdisk_here, area_size); 339 memblock_reserve(relocated_ramdisk, area_size);
339 initrd_start = ramdisk_here + PAGE_OFFSET; 340 initrd_start = relocated_ramdisk + PAGE_OFFSET;
340 initrd_end = initrd_start + ramdisk_size; 341 initrd_end = initrd_start + ramdisk_size;
341 printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", 342 printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
342 ramdisk_here, ramdisk_here + ramdisk_size - 1); 343 relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
343 344
344 q = (char *)initrd_start; 345 q = (char *)initrd_start;
345 346
@@ -363,7 +364,7 @@ static void __init relocate_initrd(void)
363 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" 364 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
364 " [mem %#010llx-%#010llx]\n", 365 " [mem %#010llx-%#010llx]\n",
365 ramdisk_image, ramdisk_image + ramdisk_size - 1, 366 ramdisk_image, ramdisk_image + ramdisk_size - 1,
366 ramdisk_here, ramdisk_here + ramdisk_size - 1); 367 relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
367} 368}
368 369
369static void __init early_reserve_initrd(void) 370static void __init early_reserve_initrd(void)
@@ -447,6 +448,9 @@ static void __init parse_setup_data(void)
447 case SETUP_DTB: 448 case SETUP_DTB:
448 add_dtb(pa_data); 449 add_dtb(pa_data);
449 break; 450 break;
451 case SETUP_EFI:
452 parse_efi_setup(pa_data, data_len);
453 break;
450 default: 454 default:
451 break; 455 break;
452 } 456 }
@@ -824,6 +828,20 @@ static void __init trim_low_memory_range(void)
824} 828}
825 829
826/* 830/*
831 * Dump out kernel offset information on panic.
832 */
833static int
834dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
835{
836 pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
837 "(relocation range: 0x%lx-0x%lx)\n",
838 (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
839 __START_KERNEL_map, MODULES_VADDR-1);
840
841 return 0;
842}
843
844/*
827 * Determine if we were loaded by an EFI loader. If so, then we have also been 845 * Determine if we were loaded by an EFI loader. If so, then we have also been
828 * passed the efi memmap, systab, etc., so we should use these data structures 846 * passed the efi memmap, systab, etc., so we should use these data structures
829 * for initialization. Note, the efi init code path is determined by the 847 * for initialization. Note, the efi init code path is determined by the
@@ -851,7 +869,6 @@ void __init setup_arch(char **cmdline_p)
851 869
852#ifdef CONFIG_X86_32 870#ifdef CONFIG_X86_32
853 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 871 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
854 visws_early_detect();
855 872
856 /* 873 /*
857 * copy kernel address range established so far and switch 874 * copy kernel address range established so far and switch
@@ -908,11 +925,11 @@ void __init setup_arch(char **cmdline_p)
908#ifdef CONFIG_EFI 925#ifdef CONFIG_EFI
909 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, 926 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
910 "EL32", 4)) { 927 "EL32", 4)) {
911 set_bit(EFI_BOOT, &x86_efi_facility); 928 set_bit(EFI_BOOT, &efi.flags);
912 } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, 929 } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
913 "EL64", 4)) { 930 "EL64", 4)) {
914 set_bit(EFI_BOOT, &x86_efi_facility); 931 set_bit(EFI_BOOT, &efi.flags);
915 set_bit(EFI_64BIT, &x86_efi_facility); 932 set_bit(EFI_64BIT, &efi.flags);
916 } 933 }
917 934
918 if (efi_enabled(EFI_BOOT)) 935 if (efi_enabled(EFI_BOOT))
@@ -924,8 +941,6 @@ void __init setup_arch(char **cmdline_p)
924 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; 941 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
925 setup_memory_map(); 942 setup_memory_map();
926 parse_setup_data(); 943 parse_setup_data();
927 /* update the e820_saved too */
928 e820_reserve_setup_data();
929 944
930 copy_edd(); 945 copy_edd();
931 946
@@ -987,6 +1002,8 @@ void __init setup_arch(char **cmdline_p)
987 early_dump_pci_devices(); 1002 early_dump_pci_devices();
988#endif 1003#endif
989 1004
1005 /* update the e820_saved too */
1006 e820_reserve_setup_data();
990 finish_e820_parsing(); 1007 finish_e820_parsing();
991 1008
992 if (efi_enabled(EFI_BOOT)) 1009 if (efi_enabled(EFI_BOOT))
@@ -1221,14 +1238,8 @@ void __init setup_arch(char **cmdline_p)
1221 register_refined_jiffies(CLOCK_TICK_RATE); 1238 register_refined_jiffies(CLOCK_TICK_RATE);
1222 1239
1223#ifdef CONFIG_EFI 1240#ifdef CONFIG_EFI
1224 /* Once setup is done above, unmap the EFI memory map on 1241 if (efi_enabled(EFI_BOOT))
1225 * mismatched firmware/kernel archtectures since there is no 1242 efi_apply_memmap_quirks();
1226 * support for runtime services.
1227 */
1228 if (efi_enabled(EFI_BOOT) && !efi_is_native()) {
1229 pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
1230 efi_unmap_memmap();
1231 }
1232#endif 1243#endif
1233} 1244}
1234 1245
@@ -1248,3 +1259,15 @@ void __init i386_reserve_resources(void)
1248} 1259}
1249 1260
1250#endif /* CONFIG_X86_32 */ 1261#endif /* CONFIG_X86_32 */
1262
1263static struct notifier_block kernel_offset_notifier = {
1264 .notifier_call = dump_kernel_offset
1265};
1266
1267static int __init register_kernel_offset_dumper(void)
1268{
1269 atomic_notifier_chain_register(&panic_notifier_list,
1270 &kernel_offset_notifier);
1271 return 0;
1272}
1273__initcall(register_kernel_offset_dumper);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85dc05a3aa02..34826934d4a7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -122,8 +122,9 @@ static void smp_callin(void)
 122 * Since CPU0 is not woken up by INIT, it doesn't wait for the IPI. 122 * Since CPU0 is not woken up by INIT, it doesn't wait for the IPI.
123 */ 123 */
124 cpuid = smp_processor_id(); 124 cpuid = smp_processor_id();
125 if (apic->wait_for_init_deassert && cpuid != 0) 125 if (apic->wait_for_init_deassert && cpuid)
126 apic->wait_for_init_deassert(&init_deasserted); 126 while (!atomic_read(&init_deasserted))
127 cpu_relax();
127 128
128 /* 129 /*
129 * (This works even if the APIC is not enabled.) 130 * (This works even if the APIC is not enabled.)
@@ -701,11 +702,15 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
701 int id; 702 int id;
702 int boot_error; 703 int boot_error;
703 704
705 preempt_disable();
706
704 /* 707 /*
705 * Wake up AP by INIT, INIT, STARTUP sequence. 708 * Wake up AP by INIT, INIT, STARTUP sequence.
706 */ 709 */
707 if (cpu) 710 if (cpu) {
708 return wakeup_secondary_cpu_via_init(apicid, start_ip); 711 boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
712 goto out;
713 }
709 714
710 /* 715 /*
711 * Wake up BSP by nmi. 716 * Wake up BSP by nmi.
@@ -725,6 +730,9 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
725 boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); 730 boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
726 } 731 }
727 732
733out:
734 preempt_enable();
735
728 return boot_error; 736 return boot_error;
729} 737}
730 738
@@ -758,10 +766,10 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
758#else 766#else
759 clear_tsk_thread_flag(idle, TIF_FORK); 767 clear_tsk_thread_flag(idle, TIF_FORK);
760 initial_gs = per_cpu_offset(cpu); 768 initial_gs = per_cpu_offset(cpu);
769#endif
761 per_cpu(kernel_stack, cpu) = 770 per_cpu(kernel_stack, cpu) =
762 (unsigned long)task_stack_page(idle) - 771 (unsigned long)task_stack_page(idle) -
763 KERNEL_STACK_OFFSET + THREAD_SIZE; 772 KERNEL_STACK_OFFSET + THREAD_SIZE;
764#endif
765 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 773 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
766 initial_code = (unsigned long)start_secondary; 774 initial_code = (unsigned long)start_secondary;
767 stack_start = idle->thread.sp; 775 stack_start = idle->thread.sp;
@@ -1312,6 +1320,12 @@ void cpu_disable_common(void)
1312 1320
1313int native_cpu_disable(void) 1321int native_cpu_disable(void)
1314{ 1322{
1323 int ret;
1324
1325 ret = check_irq_vectors_for_cpu_disable();
1326 if (ret)
1327 return ret;
1328
1315 clear_local_APIC(); 1329 clear_local_APIC();
1316 1330
1317 cpu_disable_common(); 1331 cpu_disable_common();
@@ -1373,7 +1387,7 @@ static inline void mwait_play_dead(void)
1373 1387
1374 if (!this_cpu_has(X86_FEATURE_MWAIT)) 1388 if (!this_cpu_has(X86_FEATURE_MWAIT))
1375 return; 1389 return;
1376 if (!this_cpu_has(X86_FEATURE_CLFLSH)) 1390 if (!this_cpu_has(X86_FEATURE_CLFLUSH))
1377 return; 1391 return;
1378 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) 1392 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1379 return; 1393 return;
@@ -1417,7 +1431,9 @@ static inline void mwait_play_dead(void)
1417 * The WBINVD is insufficient due to the spurious-wakeup 1431 * The WBINVD is insufficient due to the spurious-wakeup
1418 * case where we return around the loop. 1432 * case where we return around the loop.
1419 */ 1433 */
1434 mb();
1420 clflush(mwait_ptr); 1435 clflush(mwait_ptr);
1436 mb();
1421 __monitor(mwait_ptr, 0, 0); 1437 __monitor(mwait_ptr, 0, 0);
1422 mb(); 1438 mb();
1423 __mwait(eax, 0); 1439 __mwait(eax, 0);
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 24d3c91e9812..bf7ef5ce29df 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -23,7 +23,7 @@
23#include <asm/time.h> 23#include <asm/time.h>
24 24
25#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
26DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; 26__visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
27#endif 27#endif
28 28
29unsigned long profile_pc(struct pt_regs *regs) 29unsigned long profile_pc(struct pt_regs *regs)
@@ -62,7 +62,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
62 62
63static struct irqaction irq0 = { 63static struct irqaction irq0 = {
64 .handler = timer_interrupt, 64 .handler = timer_interrupt,
65 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, 65 .flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
66 .name = "timer" 66 .name = "timer"
67}; 67};
68 68
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b857ed890b4c..57409f6b8c62 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -211,21 +211,17 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
211 exception_exit(prev_state); \ 211 exception_exit(prev_state); \
212} 212}
213 213
214DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, 214DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip )
215 regs->ip) 215DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow )
216DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) 216DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds )
217DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) 217DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip )
218DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, 218DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun )
219 regs->ip) 219DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS )
220DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", 220DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present )
221 coprocessor_segment_overrun)
222DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
223DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
224#ifdef CONFIG_X86_32 221#ifdef CONFIG_X86_32
225DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) 222DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment )
226#endif 223#endif
227DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, 224DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0 )
228 BUS_ADRALN, 0)
229 225
230#ifdef CONFIG_X86_64 226#ifdef CONFIG_X86_64
231/* Runs on IST stack */ 227/* Runs on IST stack */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 930e5d48f560..57e5ce126d5a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
11#include <linux/clocksource.h> 11#include <linux/clocksource.h>
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/timex.h> 13#include <linux/timex.h>
14#include <linux/static_key.h>
14 15
15#include <asm/hpet.h> 16#include <asm/hpet.h>
16#include <asm/timer.h> 17#include <asm/timer.h>
@@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable;
37 erroneous rdtsc usage on !cpu_has_tsc processors */ 38 erroneous rdtsc usage on !cpu_has_tsc processors */
38static int __read_mostly tsc_disabled = -1; 39static int __read_mostly tsc_disabled = -1;
39 40
41static struct static_key __use_tsc = STATIC_KEY_INIT;
42
40int tsc_clocksource_reliable; 43int tsc_clocksource_reliable;
44
45/*
46 * Use a ring-buffer-like data structure, where a writer advances the head by
47 * writing a new data entry and a reader advances the tail when it observes a
48 * new entry.
49 *
50 * Writers are made to wait on readers until there's space to write a new
51 * entry.
52 *
53 * This means that we can always use an {offset, mul} pair to compute a ns
54 * value that is 'roughly' in the right direction, even if we're writing a new
55 * {offset, mul} pair during the clock read.
56 *
57 * The downside is that we can no longer guarantee strict monotonicity
58 * (assuming the TSC was monotonic to begin with), because while we compute the
59 * intersection point of the two clock slopes and make sure the time is
60 * continuous at the point of switching, we can no longer guarantee a reader is
61 * strictly before or after the switch point.
62 *
63 * It does mean a reader no longer needs to disable IRQs in order to avoid
64 * CPU-Freq updates messing with its readings, and similarly an NMI reader will
65 * no longer run the risk of hitting half-written state.
66 */
67
68struct cyc2ns {
69 struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
70 struct cyc2ns_data *head; /* 48 + 8 = 56 */
71 struct cyc2ns_data *tail; /* 56 + 8 = 64 */
72}; /* exactly fits one cacheline */
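
As a rough illustration of the two-slot protocol described above, here is a
stand-alone, single-threaded user-space sketch; the names (slot, read_ns,
write_slot) are hypothetical and not part of the patch, and the real code
additionally disables preemption, pairs smp_read_barrier_depends() with
smp_wmb(), and uses mul_u64_u32_shr() so the 64x32-bit multiply cannot
overflow:

    #include <stdio.h>

    /* One {mul, offset} pair plus a reader refcount, as in cyc2ns_data. */
    struct slot { unsigned long long offset; unsigned int mul, count; };

    static struct slot data[2];
    static struct slot *head = &data[0];  /* last published entry */
    static struct slot *tail = &data[0];  /* oldest entry a reader may hold */

    static unsigned long long read_ns(unsigned long long cyc)
    {
            struct slot *s = head;        /* pin the published slot */
            unsigned long long ns;

            s->count++;
            /* plain 64-bit multiply; the kernel uses mul_u64_u32_shr() */
            ns = s->offset + ((cyc * s->mul) >> 10);
            if (!--s->count)              /* outermost reader is done */
                    tail = s;             /* release the slot to writers */
            return ns;
    }

    static void write_slot(unsigned int mul, unsigned long long offset)
    {
            struct slot *s = (head == &data[0]) ? &data[1] : &data[0];

            while (tail == s)             /* wait until no reader pins it */
                    ;
            s->mul = mul;
            s->offset = offset;
            head = s;                     /* publish; new readers see it */
    }

    int main(void)
    {
            write_slot(512, 0);           /* e.g. 2 GHz: ns = cyc * 512 >> 10 */
            printf("%llu ns\n", read_ns(2000000));  /* prints "1000000 ns" */
            return 0;
    }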
73
74static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
75
76struct cyc2ns_data *cyc2ns_read_begin(void)
77{
78 struct cyc2ns_data *head;
79
80 preempt_disable();
81
82 head = this_cpu_read(cyc2ns.head);
83 /*
84 * Ensure we observe the entry when we observe the pointer to it.
85 * Matches the smp_wmb() in cyc2ns_write_end().
86 */
87 smp_read_barrier_depends();
88 head->__count++;
89 barrier();
90
91 return head;
92}
93
94void cyc2ns_read_end(struct cyc2ns_data *head)
95{
96 barrier();
97 /*
98 * If we're the outermost nested read, update the tail pointer
99 * when we're done. This notifies possible pending writers
100 * that we've observed the head pointer and that the other
101 * entry is now free.
102 */
103 if (!--head->__count) {
104 /*
105 * x86-TSO does not reorder writes with older reads;
106 * therefore once this write becomes visible to another
107 * cpu, we must be finished reading the cyc2ns_data.
108 *
109 * matches with cyc2ns_write_begin().
110 */
111 this_cpu_write(cyc2ns.tail, head);
112 }
113 preempt_enable();
114}
115
116/*
117 * Begin writing a new @data entry for @cpu.
118 *
119 * Assumes some sort of write side lock; currently 'provided' by the assumption
120 * that cpufreq will call its notifiers sequentially.
121 */
122static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
123{
124 struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
125 struct cyc2ns_data *data = c2n->data;
126
127 if (data == c2n->head)
128 data++;
129
130 /* XXX send an IPI to @cpu in order to guarantee a read? */
131
132 /*
133 * When we observe the tail write from cyc2ns_read_end(),
134 * the CPU must be done with that entry and it's safe
135 * to start writing to it.
136 */
137 while (c2n->tail == data)
138 cpu_relax();
139
140 return data;
141}
142
143static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
144{
145 struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
146
147 /*
148 * Ensure the @data writes are visible before we publish the
149 * entry. Matches the data dependency in cyc2ns_read_begin().
150 */
151 smp_wmb();
152
153 ACCESS_ONCE(c2n->head) = data;
154}
155
156/*
157 * Accelerators for sched_clock()
158 * convert from cycles(64bits) => nanoseconds (64bits)
159 * basic equation:
160 * ns = cycles / (freq / ns_per_sec)
161 * ns = cycles * (ns_per_sec / freq)
162 * ns = cycles * (10^9 / (cpu_khz * 10^3))
163 * ns = cycles * (10^6 / cpu_khz)
164 *
165 * Then we use scaling math (suggested by george@mvista.com) to get:
166 * ns = cycles * (10^6 * SC / cpu_khz) / SC
167 * ns = cycles * cyc2ns_scale / SC
168 *
169 * And since SC is a constant power of two, we can convert the div
170 * into a shift.
171 *
172 * We can use a khz divisor instead of mhz to keep better precision, since
173 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
174 * (mathieu.desnoyers@polymtl.ca)
175 *
176 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
177 */
178
179#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
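
A worked example of the scaling (numbers chosen for illustration, not taken
from the patch): with cpu_khz = 2000000, i.e. a 2 GHz TSC, set_cyc2ns_scale()
below computes cyc2ns_mul = DIV_ROUND(10^6 << 10, 2000000) = 512, so
cycles_2_ns() evaluates ns = cyc * 512 >> 10 = cyc / 2, half a nanosecond per
cycle, using one multiply and one shift instead of a division.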
180
181static void cyc2ns_data_init(struct cyc2ns_data *data)
182{
183 data->cyc2ns_mul = 0;
184 data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
185 data->cyc2ns_offset = 0;
186 data->__count = 0;
187}
188
189static void cyc2ns_init(int cpu)
190{
191 struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
192
193 cyc2ns_data_init(&c2n->data[0]);
194 cyc2ns_data_init(&c2n->data[1]);
195
196 c2n->head = c2n->data;
197 c2n->tail = c2n->data;
198}
199
200static inline unsigned long long cycles_2_ns(unsigned long long cyc)
201{
202 struct cyc2ns_data *data, *tail;
203 unsigned long long ns;
204
205 /*
206 * See cyc2ns_read_*() for details; replicated in order to avoid
207 * an extra few instructions that came with the abstraction.
208 * Notably, it allows us to only do the __count and tail update
209 * dance when it's actually needed.
210 */
211
212 preempt_disable_notrace();
213 data = this_cpu_read(cyc2ns.head);
214 tail = this_cpu_read(cyc2ns.tail);
215
216 if (likely(data == tail)) {
217 ns = data->cyc2ns_offset;
218 ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
219 } else {
220 data->__count++;
221
222 barrier();
223
224 ns = data->cyc2ns_offset;
225 ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
226
227 barrier();
228
229 if (!--data->__count)
230 this_cpu_write(cyc2ns.tail, data);
231 }
232 preempt_enable_notrace();
233
234 return ns;
235}
236
237/* XXX surely we already have this someplace in the kernel?! */
238#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
239
240static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
241{
242 unsigned long long tsc_now, ns_now;
243 struct cyc2ns_data *data;
244 unsigned long flags;
245
246 local_irq_save(flags);
247 sched_clock_idle_sleep_event();
248
249 if (!cpu_khz)
250 goto done;
251
252 data = cyc2ns_write_begin(cpu);
253
254 rdtscll(tsc_now);
255 ns_now = cycles_2_ns(tsc_now);
256
257 /*
258 * Compute a new multiplier as per the above comment and ensure our
259 * time function is continuous; see the comment near struct
260 * cyc2ns_data.
261 */
262 data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
263 data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
264 data->cyc2ns_offset = ns_now -
265 mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
266
267 cyc2ns_write_end(cpu, data);
268
269done:
270 sched_clock_idle_wakeup_event(0);
271 local_irq_restore(flags);
272}
41/* 273/*
42 * Scheduler clock - returns current time in nanosec units. 274 * Scheduler clock - returns current time in nanosec units.
43 */ 275 */
44u64 native_sched_clock(void) 276u64 native_sched_clock(void)
45{ 277{
46 u64 this_offset; 278 u64 tsc_now;
47 279
48 /* 280 /*
49 * Fall back to jiffies if there's no TSC available: 281 * Fall back to jiffies if there's no TSC available:
@@ -53,16 +285,16 @@ u64 native_sched_clock(void)
53 * very important for it to be as fast as the platform 285 * very important for it to be as fast as the platform
54 * can achieve it. ) 286 * can achieve it. )
55 */ 287 */
56 if (unlikely(tsc_disabled)) { 288 if (!static_key_false(&__use_tsc)) {
57 /* No locking but a rare wrong value is not a big deal: */ 289 /* No locking but a rare wrong value is not a big deal: */
58 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); 290 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
59 } 291 }
60 292
61 /* read the Time Stamp Counter: */ 293 /* read the Time Stamp Counter: */
62 rdtscll(this_offset); 294 rdtscll(tsc_now);
63 295
64 /* return the value in ns */ 296 /* return the value in ns */
65 return __cycles_2_ns(this_offset); 297 return cycles_2_ns(tsc_now);
66} 298}
67 299
68/* We need to define a real function for sched_clock, to override the 300/* We need to define a real function for sched_clock, to override the
@@ -419,6 +651,13 @@ unsigned long native_calibrate_tsc(void)
419 unsigned long flags, latch, ms, fast_calibrate; 651 unsigned long flags, latch, ms, fast_calibrate;
420 int hpet = is_hpet_enabled(), i, loopmin; 652 int hpet = is_hpet_enabled(), i, loopmin;
421 653
654 /* Calibrate TSC using MSR for Intel Atom SoCs */
655 local_irq_save(flags);
656 fast_calibrate = try_msr_calibrate_tsc();
657 local_irq_restore(flags);
658 if (fast_calibrate)
659 return fast_calibrate;
660
422 local_irq_save(flags); 661 local_irq_save(flags);
423 fast_calibrate = quick_pit_calibrate(); 662 fast_calibrate = quick_pit_calibrate();
424 local_irq_restore(flags); 663 local_irq_restore(flags);
@@ -589,61 +828,11 @@ int recalibrate_cpu_khz(void)
589EXPORT_SYMBOL(recalibrate_cpu_khz); 828EXPORT_SYMBOL(recalibrate_cpu_khz);
590 829
591 830
592/* Accelerators for sched_clock()
593 * convert from cycles(64bits) => nanoseconds (64bits)
594 * basic equation:
595 * ns = cycles / (freq / ns_per_sec)
596 * ns = cycles * (ns_per_sec / freq)
597 * ns = cycles * (10^9 / (cpu_khz * 10^3))
598 * ns = cycles * (10^6 / cpu_khz)
599 *
600 * Then we use scaling math (suggested by george@mvista.com) to get:
601 * ns = cycles * (10^6 * SC / cpu_khz) / SC
602 * ns = cycles * cyc2ns_scale / SC
603 *
604 * And since SC is a constant power of two, we can convert the div
605 * into a shift.
606 *
607 * We can use khz divisor instead of mhz to keep a better precision, since
608 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
609 * (mathieu.desnoyers@polymtl.ca)
610 *
611 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
612 */
613
614DEFINE_PER_CPU(unsigned long, cyc2ns);
615DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
616
617static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
618{
619 unsigned long long tsc_now, ns_now, *offset;
620 unsigned long flags, *scale;
621
622 local_irq_save(flags);
623 sched_clock_idle_sleep_event();
624
625 scale = &per_cpu(cyc2ns, cpu);
626 offset = &per_cpu(cyc2ns_offset, cpu);
627
628 rdtscll(tsc_now);
629 ns_now = __cycles_2_ns(tsc_now);
630
631 if (cpu_khz) {
632 *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
633 cpu_khz / 2) / cpu_khz;
634 *offset = ns_now - mult_frac(tsc_now, *scale,
635 (1UL << CYC2NS_SCALE_FACTOR));
636 }
637
638 sched_clock_idle_wakeup_event(0);
639 local_irq_restore(flags);
640}
641
642static unsigned long long cyc2ns_suspend; 831static unsigned long long cyc2ns_suspend;
643 832
644void tsc_save_sched_clock_state(void) 833void tsc_save_sched_clock_state(void)
645{ 834{
646 if (!sched_clock_stable) 835 if (!sched_clock_stable())
647 return; 836 return;
648 837
649 cyc2ns_suspend = sched_clock(); 838 cyc2ns_suspend = sched_clock();
@@ -663,16 +852,26 @@ void tsc_restore_sched_clock_state(void)
663 unsigned long flags; 852 unsigned long flags;
664 int cpu; 853 int cpu;
665 854
666 if (!sched_clock_stable) 855 if (!sched_clock_stable())
667 return; 856 return;
668 857
669 local_irq_save(flags); 858 local_irq_save(flags);
670 859
671 __this_cpu_write(cyc2ns_offset, 0); 860 /*
861 * We're coming out of suspend, there's no concurrency yet; don't
862 * bother being nice about the RCU stuff, just write to both
863 * data fields.
864 */
865
866 this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
867 this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
868
672 offset = cyc2ns_suspend - sched_clock(); 869 offset = cyc2ns_suspend - sched_clock();
673 870
674 for_each_possible_cpu(cpu) 871 for_each_possible_cpu(cpu) {
675 per_cpu(cyc2ns_offset, cpu) = offset; 872 per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
873 per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
874 }
676 875
677 local_irq_restore(flags); 876 local_irq_restore(flags);
678} 877}
@@ -715,8 +914,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
715 tsc_khz_ref = tsc_khz; 914 tsc_khz_ref = tsc_khz;
716 } 915 }
717 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || 916 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
718 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || 917 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
719 (val == CPUFREQ_RESUMECHANGE)) {
720 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); 918 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
721 919
722 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 920 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
@@ -786,16 +984,14 @@ static struct clocksource clocksource_tsc = {
786 .mask = CLOCKSOURCE_MASK(64), 984 .mask = CLOCKSOURCE_MASK(64),
787 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 985 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
788 CLOCK_SOURCE_MUST_VERIFY, 986 CLOCK_SOURCE_MUST_VERIFY,
789#ifdef CONFIG_X86_64
790 .archdata = { .vclock_mode = VCLOCK_TSC }, 987 .archdata = { .vclock_mode = VCLOCK_TSC },
791#endif
792}; 988};
793 989
794void mark_tsc_unstable(char *reason) 990void mark_tsc_unstable(char *reason)
795{ 991{
796 if (!tsc_unstable) { 992 if (!tsc_unstable) {
797 tsc_unstable = 1; 993 tsc_unstable = 1;
798 sched_clock_stable = 0; 994 clear_sched_clock_stable();
799 disable_sched_clock_irqtime(); 995 disable_sched_clock_irqtime();
800 pr_info("Marking TSC unstable due to %s\n", reason); 996 pr_info("Marking TSC unstable due to %s\n", reason);
801 /* Change only the rating, when not registered */ 997 /* Change only the rating, when not registered */
@@ -995,14 +1191,18 @@ void __init tsc_init(void)
995 * speed as the bootup CPU. (cpufreq notifiers will fix this 1191 * speed as the bootup CPU. (cpufreq notifiers will fix this
996 * up if their speed diverges) 1192 * up if their speed diverges)
997 */ 1193 */
998 for_each_possible_cpu(cpu) 1194 for_each_possible_cpu(cpu) {
1195 cyc2ns_init(cpu);
999 set_cyc2ns_scale(cpu_khz, cpu); 1196 set_cyc2ns_scale(cpu_khz, cpu);
1197 }
1000 1198
1001 if (tsc_disabled > 0) 1199 if (tsc_disabled > 0)
1002 return; 1200 return;
1003 1201
1004 /* now allow native_sched_clock() to use rdtsc */ 1202 /* now allow native_sched_clock() to use rdtsc */
1203
1005 tsc_disabled = 0; 1204 tsc_disabled = 0;
1205 static_key_slow_inc(&__use_tsc);
1006 1206
1007 if (!no_sched_irq_time) 1207 if (!no_sched_irq_time)
1008 enable_sched_clock_irqtime(); 1208 enable_sched_clock_irqtime();
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
new file mode 100644
index 000000000000..92ae6acac8a7
--- /dev/null
+++ b/arch/x86/kernel/tsc_msr.c
@@ -0,0 +1,127 @@
1/*
2 * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms.
3 *
4 * The TSC on Intel Atom SoCs runs at a constant rate which can be computed
5 * by this formula:
6 * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency>
7 * See Intel 64 and IA-32 System Programming Guide sections 16.12 and 30.11.5
8 * for details.
9 * Notably, some Intel Atom SoCs don't have a PIT (i8254) or HPET, so MSR
10 * based calibration is the only option.
11 *
12 *
13 * Copyright (C) 2013 Intel Corporation
14 * Author: Bin Gao <bin.gao@intel.com>
15 *
16 * This file is released under the GPLv2.
17 */
18
19#include <linux/kernel.h>
20#include <asm/processor.h>
21#include <asm/setup.h>
22#include <asm/apic.h>
23#include <asm/param.h>
24
25/* CPU reference clock frequency: in KHz */
26#define FREQ_83 83200
27#define FREQ_100 99840
28#define FREQ_133 133200
29#define FREQ_166 166400
30
31#define MAX_NUM_FREQS 8
32
33/*
34 * According to Intel 64 and IA-32 System Programming Guide,
35 * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
36 * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40].
37 * Unfortunately some Intel Atom SoCs aren't quite compliant with this,
38 * so we need to manually differentiate SoC families. This is what the
39 * field msr_plat does.
40 */
41struct freq_desc {
42 u8 x86_family; /* CPU family */
43 u8 x86_model; /* model */
44 u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */
45 u32 freqs[MAX_NUM_FREQS];
46};
47
48static struct freq_desc freq_desc_tables[] = {
49 /* PNW */
50 { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
51 /* CLV+ */
52 { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
53 /* TNG */
54 { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } },
55 /* VLV2 */
56 { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } },
57 /* ANN */
58 { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } },
59};
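
To make the formula above concrete (register values invented for
illustration): on a VLV2 part (family 6, model 0x37), an FSB frequency ID of
2 in MSR_FSB_FREQ[2:0] selects FREQ_133 = 133200 kHz from the table, and a
maximum bus ratio of 16 in MSR_PLATFORM_INFO[12:8] would make
try_msr_calibrate_tsc() below return 133200 * 16 = 2131200 kHz, i.e. a TSC
running at about 2.13 GHz.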
60
61static int match_cpu(u8 family, u8 model)
62{
63 int i;
64
65 for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) {
66 if ((family == freq_desc_tables[i].x86_family) &&
67 (model == freq_desc_tables[i].x86_model))
68 return i;
69 }
70
71 return -1;
72}
73
74/* Map CPU reference clock freq ID (0-7) to CPU reference clock freq (KHz) */
75#define id_to_freq(cpu_index, freq_id) \
76 (freq_desc_tables[cpu_index].freqs[freq_id])
77
78/*
79 * Do MSR calibration only for known/supported CPUs.
80 *
81 * Returns the calibration value or 0 if MSR calibration failed.
82 */
83unsigned long try_msr_calibrate_tsc(void)
84{
85 u32 lo, hi, ratio, freq_id, freq;
86 unsigned long res;
87 int cpu_index;
88
89 cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model);
90 if (cpu_index < 0)
91 return 0;
92
93 if (freq_desc_tables[cpu_index].msr_plat) {
94 rdmsr(MSR_PLATFORM_INFO, lo, hi);
95 ratio = (lo >> 8) & 0x1f;
96 } else {
97 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
98 ratio = (hi >> 8) & 0x1f;
99 }
100 pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio);
101
102 if (!ratio)
103 goto fail;
104
105 /* Get FSB FREQ ID */
106 rdmsr(MSR_FSB_FREQ, lo, hi);
107 freq_id = lo & 0x7;
108 freq = id_to_freq(cpu_index, freq_id);
109 pr_info("Resolved frequency ID: %u, frequency: %u KHz\n",
110 freq_id, freq);
111 if (!freq)
112 goto fail;
113
114 /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
115 res = freq * ratio;
116 pr_info("TSC runs at %lu KHz\n", res);
117
118#ifdef CONFIG_X86_LOCAL_APIC
119 lapic_timer_frequency = (freq * 1000) / HZ;
120 pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency);
121#endif
122 return res;
123
124fail:
125 pr_warn("Fast TSC calibration using MSR failed\n");
126 return 0;
127}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index adfdf56a3714..26488487bc61 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -16,7 +16,6 @@
16 */ 16 */
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/smp.h> 19#include <linux/smp.h>
21#include <linux/nmi.h> 20#include <linux/nmi.h>
22#include <asm/tsc.h> 21#include <asm/tsc.h>
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index da6b35a98260..49edf2dd3613 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -147,7 +147,6 @@ SECTIONS
147 _edata = .; 147 _edata = .;
148 } :data 148 } :data
149 149
150#ifdef CONFIG_X86_64
151 150
152 . = ALIGN(PAGE_SIZE); 151 . = ALIGN(PAGE_SIZE);
153 __vvar_page = .; 152 __vvar_page = .;
@@ -165,12 +164,15 @@ SECTIONS
165#undef __VVAR_KERNEL_LDS 164#undef __VVAR_KERNEL_LDS
166#undef EMIT_VVAR 165#undef EMIT_VVAR
167 166
167 /*
168 * Pad the rest of the page with zeros. Otherwise the loader
169 * can leave garbage here.
170 */
171 . = __vvar_beginning_hack + PAGE_SIZE;
168 } :data 172 } :data
169 173
170 . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); 174 . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
171 175
172#endif /* CONFIG_X86_64 */
173
174 /* Init code and data - will be freed after init */ 176 /* Init code and data - will be freed after init */
175 . = ALIGN(PAGE_SIZE); 177 . = ALIGN(PAGE_SIZE);
176 .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { 178 .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 992f890283e9..f6584a90aba3 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -33,7 +33,7 @@
33 * and vice versa. 33 * and vice versa.
34 */ 34 */
35 35
36static unsigned long vsmp_save_fl(void) 36asmlinkage unsigned long vsmp_save_fl(void)
37{ 37{
38 unsigned long flags = native_save_fl(); 38 unsigned long flags = native_save_fl();
39 39
@@ -43,7 +43,7 @@ static unsigned long vsmp_save_fl(void)
43} 43}
44PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl); 44PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
45 45
46static void vsmp_restore_fl(unsigned long flags) 46__visible void vsmp_restore_fl(unsigned long flags)
47{ 47{
48 if (flags & X86_EFLAGS_IF) 48 if (flags & X86_EFLAGS_IF)
49 flags &= ~X86_EFLAGS_AC; 49 flags &= ~X86_EFLAGS_AC;
@@ -53,7 +53,7 @@ static void vsmp_restore_fl(unsigned long flags)
53} 53}
54PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl); 54PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
55 55
56static void vsmp_irq_disable(void) 56asmlinkage void vsmp_irq_disable(void)
57{ 57{
58 unsigned long flags = native_save_fl(); 58 unsigned long flags = native_save_fl();
59 59
@@ -61,7 +61,7 @@ static void vsmp_irq_disable(void)
61} 61}
62PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable); 62PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
63 63
64static void vsmp_irq_enable(void) 64asmlinkage void vsmp_irq_enable(void)
65{ 65{
66 unsigned long flags = native_save_fl(); 66 unsigned long flags = native_save_fl();
67 67
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1f96f9347ed9..8b3b3eb3cead 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -47,14 +47,12 @@
47#include <asm/segment.h> 47#include <asm/segment.h>
48#include <asm/desc.h> 48#include <asm/desc.h>
49#include <asm/topology.h> 49#include <asm/topology.h>
50#include <asm/vgtod.h>
51#include <asm/traps.h> 50#include <asm/traps.h>
52 51
53#define CREATE_TRACE_POINTS 52#define CREATE_TRACE_POINTS
54#include "vsyscall_trace.h" 53#include "vsyscall_trace.h"
55 54
56DEFINE_VVAR(int, vgetcpu_mode); 55DEFINE_VVAR(int, vgetcpu_mode);
57DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
58 56
59static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; 57static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
60 58
@@ -77,48 +75,6 @@ static int __init vsyscall_setup(char *str)
77} 75}
78early_param("vsyscall", vsyscall_setup); 76early_param("vsyscall", vsyscall_setup);
79 77
80void update_vsyscall_tz(void)
81{
82 vsyscall_gtod_data.sys_tz = sys_tz;
83}
84
85void update_vsyscall(struct timekeeper *tk)
86{
87 struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
88
89 write_seqcount_begin(&vdata->seq);
90
91 /* copy vsyscall data */
92 vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
93 vdata->clock.cycle_last = tk->clock->cycle_last;
94 vdata->clock.mask = tk->clock->mask;
95 vdata->clock.mult = tk->mult;
96 vdata->clock.shift = tk->shift;
97
98 vdata->wall_time_sec = tk->xtime_sec;
99 vdata->wall_time_snsec = tk->xtime_nsec;
100
101 vdata->monotonic_time_sec = tk->xtime_sec
102 + tk->wall_to_monotonic.tv_sec;
103 vdata->monotonic_time_snsec = tk->xtime_nsec
104 + (tk->wall_to_monotonic.tv_nsec
105 << tk->shift);
106 while (vdata->monotonic_time_snsec >=
107 (((u64)NSEC_PER_SEC) << tk->shift)) {
108 vdata->monotonic_time_snsec -=
109 ((u64)NSEC_PER_SEC) << tk->shift;
110 vdata->monotonic_time_sec++;
111 }
112
113 vdata->wall_time_coarse.tv_sec = tk->xtime_sec;
114 vdata->wall_time_coarse.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
115
116 vdata->monotonic_time_coarse = timespec_add(vdata->wall_time_coarse,
117 tk->wall_to_monotonic);
118
119 write_seqcount_end(&vdata->seq);
120}
121
122static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, 78static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
123 const char *message) 79 const char *message)
124{ 80{
@@ -374,7 +330,6 @@ void __init map_vsyscall(void)
374{ 330{
375 extern char __vsyscall_page; 331 extern char __vsyscall_page;
376 unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); 332 unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
377 extern char __vvar_page;
378 unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); 333 unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
379 334
380 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, 335 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
@@ -393,9 +348,13 @@ static int __init vsyscall_init(void)
393{ 348{
394 BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); 349 BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
395 350
351 cpu_notifier_register_begin();
352
396 on_each_cpu(cpu_vsyscall_init, NULL, 1); 353 on_each_cpu(cpu_vsyscall_init, NULL, 1);
397 /* notifier priority > KVM */ 354 /* notifier priority > KVM */
398 hotcpu_notifier(cpu_vsyscall_notifier, 30); 355 __hotcpu_notifier(cpu_vsyscall_notifier, 30);
356
357 cpu_notifier_register_done();
399 358
400 return 0; 359 return 0;
401} 360}
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
new file mode 100644
index 000000000000..f9c6e56e14b5
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -0,0 +1,69 @@
1/*
2 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
3 * Copyright 2003 Andi Kleen, SuSE Labs.
4 *
5 * Modified for x86 32 bit architecture by
6 * Stefani Seibold <stefani@seibold.net>
7 * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
8 *
9 * Thanks to hpa@transmeta.com for some useful hint.
10 * Special thanks to Ingo Molnar for his early experience with
11 * a different vsyscall implementation for Linux/IA32 and for the name.
12 *
13 */
14
15#include <linux/timekeeper_internal.h>
16#include <asm/vgtod.h>
17#include <asm/vvar.h>
18
19DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
20
21void update_vsyscall_tz(void)
22{
23 vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest;
24 vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime;
25}
26
27void update_vsyscall(struct timekeeper *tk)
28{
29 struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
30
31 gtod_write_begin(vdata);
32
33 /* copy vsyscall data */
34 vdata->vclock_mode = tk->clock->archdata.vclock_mode;
35 vdata->cycle_last = tk->clock->cycle_last;
36 vdata->mask = tk->clock->mask;
37 vdata->mult = tk->mult;
38 vdata->shift = tk->shift;
39
40 vdata->wall_time_sec = tk->xtime_sec;
41 vdata->wall_time_snsec = tk->xtime_nsec;
42
43 vdata->monotonic_time_sec = tk->xtime_sec
44 + tk->wall_to_monotonic.tv_sec;
45 vdata->monotonic_time_snsec = tk->xtime_nsec
46 + (tk->wall_to_monotonic.tv_nsec
47 << tk->shift);
48 while (vdata->monotonic_time_snsec >=
49 (((u64)NSEC_PER_SEC) << tk->shift)) {
50 vdata->monotonic_time_snsec -=
51 ((u64)NSEC_PER_SEC) << tk->shift;
52 vdata->monotonic_time_sec++;
53 }
54
55 vdata->wall_time_coarse_sec = tk->xtime_sec;
56 vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift);
57
58 vdata->monotonic_time_coarse_sec =
59 vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
60 vdata->monotonic_time_coarse_nsec =
61 vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
62
63 while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) {
64 vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC;
65 vdata->monotonic_time_coarse_sec++;
66 }
67
68 gtod_write_end(vdata);
69}
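
A note on the snsec fields (inferred from the surrounding timekeeping code,
not stated in the patch): tk->xtime_nsec holds nanoseconds left-shifted by
tk->shift, so wall_time_snsec and monotonic_time_snsec are "shifted
nanoseconds" as well. The vDSO side can therefore add the shifted cycle delta
(delta * mult) to snsec and perform a single right shift at the very end,
preserving sub-nanosecond precision; the while loop above just carries whole
seconds out of the shifted remainder whenever it reaches
NSEC_PER_SEC << tk->shift.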
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 021783b1f46a..e48b674639cc 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -136,9 +136,9 @@ void arch_teardown_msi_irq(unsigned int irq)
136 x86_msi.teardown_msi_irq(irq); 136 x86_msi.teardown_msi_irq(irq);
137} 137}
138 138
139void arch_restore_msi_irqs(struct pci_dev *dev, int irq) 139void arch_restore_msi_irqs(struct pci_dev *dev)
140{ 140{
141 x86_msi.restore_msi_irqs(dev, irq); 141 x86_msi.restore_msi_irqs(dev);
142} 142}
143u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) 143u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
144{ 144{
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 422fd8223470..a4b451c6addf 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -562,6 +562,16 @@ static void __init xstate_enable_boot_cpu(void)
562 if (cpu_has_xsaveopt && eagerfpu != DISABLE) 562 if (cpu_has_xsaveopt && eagerfpu != DISABLE)
563 eagerfpu = ENABLE; 563 eagerfpu = ENABLE;
564 564
565 if (pcntxt_mask & XSTATE_EAGER) {
566 if (eagerfpu == DISABLE) {
567 pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n",
568 pcntxt_mask & XSTATE_EAGER);
569 pcntxt_mask &= ~XSTATE_EAGER;
570 } else {
571 eagerfpu = ENABLE;
572 }
573 }
574
565 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", 575 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
566 pcntxt_mask, xstate_size); 576 pcntxt_mask, xstate_size);
567} 577}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b89c5db2b832..287e4c85fff9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -80,7 +80,7 @@ config KVM_MMU_AUDIT
80 depends on KVM && TRACEPOINTS 80 depends on KVM && TRACEPOINTS
81 ---help--- 81 ---help---
82 This option adds an R/W KVM module parameter 'mmu_audit', which allows 82 This option adds an R/W KVM module parameter 'mmu_audit', which allows
83 audit KVM MMU at runtime. 83 auditing of KVM MMU events at runtime.
84 84
85config KVM_DEVICE_ASSIGNMENT 85config KVM_DEVICE_ASSIGNMENT
86 bool "KVM legacy PCI device assignment support" 86 bool "KVM legacy PCI device assignment support"
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c6976257eff5..bea60671ef8a 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -28,7 +28,7 @@ static u32 xstate_required_size(u64 xstate_bv)
28 int feature_bit = 0; 28 int feature_bit = 0;
29 u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; 29 u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
30 30
31 xstate_bv &= ~XSTATE_FPSSE; 31 xstate_bv &= XSTATE_EXTEND_MASK;
32 while (xstate_bv) { 32 while (xstate_bv) {
33 if (xstate_bv & 0x1) { 33 if (xstate_bv & 0x1) {
34 u32 eax, ebx, ecx, edx; 34 u32 eax, ebx, ecx, edx;
@@ -43,6 +43,16 @@ static u32 xstate_required_size(u64 xstate_bv)
43 return ret; 43 return ret;
44} 44}
45 45
46u64 kvm_supported_xcr0(void)
47{
48 u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
49
50 if (!kvm_x86_ops->mpx_supported())
51 xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR);
52
53 return xcr0;
54}
55
46void kvm_update_cpuid(struct kvm_vcpu *vcpu) 56void kvm_update_cpuid(struct kvm_vcpu *vcpu)
47{ 57{
48 struct kvm_cpuid_entry2 *best; 58 struct kvm_cpuid_entry2 *best;
@@ -73,9 +83,9 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu)
73 } else { 83 } else {
74 vcpu->arch.guest_supported_xcr0 = 84 vcpu->arch.guest_supported_xcr0 =
75 (best->eax | ((u64)best->edx << 32)) & 85 (best->eax | ((u64)best->edx << 32)) &
76 host_xcr0 & KVM_SUPPORTED_XCR0; 86 kvm_supported_xcr0();
77 vcpu->arch.guest_xstate_size = 87 vcpu->arch.guest_xstate_size = best->ebx =
78 xstate_required_size(vcpu->arch.guest_supported_xcr0); 88 xstate_required_size(vcpu->arch.xcr0);
79 } 89 }
80 90
81 kvm_pmu_cpuid_update(vcpu); 91 kvm_pmu_cpuid_update(vcpu);
@@ -210,13 +220,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
210 entry->flags = 0; 220 entry->flags = 0;
211} 221}
212 222
213static bool supported_xcr0_bit(unsigned bit)
214{
215 u64 mask = ((u64)1 << bit);
216
217 return mask & KVM_SUPPORTED_XCR0 & host_xcr0;
218}
219
220#define F(x) bit(X86_FEATURE_##x) 223#define F(x) bit(X86_FEATURE_##x)
221 224
222static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, 225static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
@@ -256,6 +259,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
256#endif 259#endif
257 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; 260 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
258 unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; 261 unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
262 unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0;
259 263
260 /* cpuid 1.edx */ 264 /* cpuid 1.edx */
261 const u32 kvm_supported_word0_x86_features = 265 const u32 kvm_supported_word0_x86_features =
@@ -263,7 +267,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
263 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 267 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
264 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | 268 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
265 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 269 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
266 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | 270 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
267 0 /* Reserved, DS, ACPI */ | F(MMX) | 271 0 /* Reserved, DS, ACPI */ | F(MMX) |
268 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 272 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
269 0 /* HTT, TM, Reserved, PBE */; 273 0 /* HTT, TM, Reserved, PBE */;
@@ -303,7 +307,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
303 /* cpuid 7.0.ebx */ 307 /* cpuid 7.0.ebx */
304 const u32 kvm_supported_word9_x86_features = 308 const u32 kvm_supported_word9_x86_features =
305 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | 309 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
306 F(BMI2) | F(ERMS) | f_invpcid | F(RTM); 310 F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
311 F(ADX);
307 312
308 /* all calls to cpuid_count() should be made on the same cpu */ 313 /* all calls to cpuid_count() should be made on the same cpu */
309 get_cpu(); 314 get_cpu();
@@ -436,16 +441,18 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
436 } 441 }
437 case 0xd: { 442 case 0xd: {
438 int idx, i; 443 int idx, i;
444 u64 supported = kvm_supported_xcr0();
439 445
440 entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0; 446 entry->eax &= supported;
441 entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32; 447 entry->edx &= supported >> 32;
442 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 448 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
443 for (idx = 1, i = 1; idx < 64; ++idx) { 449 for (idx = 1, i = 1; idx < 64; ++idx) {
450 u64 mask = ((u64)1 << idx);
444 if (*nent >= maxnent) 451 if (*nent >= maxnent)
445 goto out; 452 goto out;
446 453
447 do_cpuid_1_ent(&entry[i], function, idx); 454 do_cpuid_1_ent(&entry[i], function, idx);
448 if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) 455 if (entry[i].eax == 0 || !(supported & mask))
449 continue; 456 continue;
450 entry[i].flags |= 457 entry[i].flags |=
451 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 458 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index f1e4895174b2..a2a1bb7ed8c1 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -72,4 +72,12 @@ static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu)
72 return best && (best->ecx & bit(X86_FEATURE_PCID)); 72 return best && (best->ecx & bit(X86_FEATURE_PCID));
73} 73}
74 74
75static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu)
76{
77 struct kvm_cpuid_entry2 *best;
78
79 best = kvm_find_cpuid_entry(vcpu, 1, 0);
80 return best && (best->ecx & bit(X86_FEATURE_X2APIC));
81}
82
75#endif 83#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 07ffca0a89e9..205b17eed93c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3668,6 +3668,10 @@ static const struct gprefix pfx_vmovntpx = {
3668 I(0, em_mov), N, N, N, 3668 I(0, em_mov), N, N, N,
3669}; 3669};
3670 3670
3671static const struct gprefix pfx_0f_28_0f_29 = {
3672 I(Aligned, em_mov), I(Aligned, em_mov), N, N,
3673};
3674
3671static const struct escape escape_d9 = { { 3675static const struct escape escape_d9 = { {
3672 N, N, N, N, N, N, N, I(DstMem, em_fnstcw), 3676 N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
3673}, { 3677}, {
@@ -3870,7 +3874,9 @@ static const struct opcode twobyte_table[256] = {
3870 IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), 3874 IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
3871 IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), 3875 IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
3872 N, N, N, N, 3876 N, N, N, N,
3873 N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), 3877 GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
3878 GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
3879 N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
3874 N, N, N, N, 3880 N, N, N, N,
3875 /* 0x30 - 0x3F */ 3881 /* 0x30 - 0x3F */
3876 II(ImplicitOps | Priv, em_wrmsr, wrmsr), 3882 II(ImplicitOps | Priv, em_wrmsr, wrmsr),
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 412a5aa0ef94..518d86471b76 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -37,6 +37,7 @@
37 37
38#include "irq.h" 38#include "irq.h"
39#include "i8254.h" 39#include "i8254.h"
40#include "x86.h"
40 41
41#ifndef CONFIG_X86_64 42#ifndef CONFIG_X86_64
42#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) 43#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -349,6 +350,23 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
349 atomic_set(&ps->pending, 0); 350 atomic_set(&ps->pending, 0);
350 ps->irq_ack = 1; 351 ps->irq_ack = 1;
351 352
353 /*
354 * Do not allow the guest to program periodic timers with a small
355 * interval, since the hrtimers are not throttled by the host
356 * scheduler.
357 */
358 if (ps->is_periodic) {
359 s64 min_period = min_timer_period_us * 1000LL;
360
361 if (ps->period < min_period) {
362 pr_info_ratelimited(
363 "kvm: requested %lld ns "
364 "i8254 timer period limited to %lld ns\n",
365 ps->period, min_period);
366 ps->period = min_period;
367 }
368 }
369
352 hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval), 370 hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
353 HRTIMER_MODE_ABS); 371 HRTIMER_MODE_ABS);
354} 372}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 775702f649ca..9736529ade08 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -71,9 +71,6 @@
71#define VEC_POS(v) ((v) & (32 - 1)) 71#define VEC_POS(v) ((v) & (32 - 1))
72#define REG_POS(v) (((v) >> 5) << 4) 72#define REG_POS(v) (((v) >> 5) << 4)
73 73
74static unsigned int min_timer_period_us = 500;
75module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
76
77static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) 74static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
78{ 75{
79 *((u32 *) (apic->regs + reg_off)) = val; 76 *((u32 *) (apic->regs + reg_off)) = val;
@@ -435,7 +432,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
435 u8 val; 432 u8 val;
436 if (pv_eoi_get_user(vcpu, &val) < 0) 433 if (pv_eoi_get_user(vcpu, &val) < 0)
437 apic_debug("Can't read EOI MSR value: 0x%llx\n", 434 apic_debug("Can't read EOI MSR value: 0x%llx\n",
438 (unsigned long long)vcpi->arch.pv_eoi.msr_val); 435 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
439 return val & 0x1; 436 return val & 0x1;
440} 437}
441 438
@@ -443,7 +440,7 @@ static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
443{ 440{
444 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { 441 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
445 apic_debug("Can't set EOI MSR value: 0x%llx\n", 442 apic_debug("Can't set EOI MSR value: 0x%llx\n",
446 (unsigned long long)vcpi->arch.pv_eoi.msr_val); 443 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
447 return; 444 return;
448 } 445 }
449 __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 446 __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
@@ -453,7 +450,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
453{ 450{
454 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { 451 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
455 apic_debug("Can't clear EOI MSR value: 0x%llx\n", 452 apic_debug("Can't clear EOI MSR value: 0x%llx\n",
456 (unsigned long long)vcpi->arch.pv_eoi.msr_val); 453 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
457 return; 454 return;
458 } 455 }
459 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 456 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c8b0d0d2da5c..6a11845fd8b9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -65,7 +65,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
65 struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map); 65 struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
66 66
67u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); 67u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
68void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); 68int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
69void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, 69void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
70 struct kvm_lapic_state *s); 70 struct kvm_lapic_state *s);
71int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); 71int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 40772ef0f2b1..f5704d9e5ddc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2659,6 +2659,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2659 int emulate = 0; 2659 int emulate = 0;
2660 gfn_t pseudo_gfn; 2660 gfn_t pseudo_gfn;
2661 2661
2662 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2663 return 0;
2664
2662 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2665 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2663 if (iterator.level == level) { 2666 if (iterator.level == level) {
2664 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, 2667 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
@@ -2669,6 +2672,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2669 break; 2672 break;
2670 } 2673 }
2671 2674
2675 drop_large_spte(vcpu, iterator.sptep);
2672 if (!is_shadow_present_pte(*iterator.sptep)) { 2676 if (!is_shadow_present_pte(*iterator.sptep)) {
2673 u64 base_addr = iterator.addr; 2677 u64 base_addr = iterator.addr;
2674 2678
@@ -2829,6 +2833,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2829 bool ret = false; 2833 bool ret = false;
2830 u64 spte = 0ull; 2834 u64 spte = 0ull;
2831 2835
2836 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2837 return false;
2838
2832 if (!page_fault_can_be_fast(error_code)) 2839 if (!page_fault_can_be_fast(error_code))
2833 return false; 2840 return false;
2834 2841
@@ -3224,6 +3231,9 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
3224 struct kvm_shadow_walk_iterator iterator; 3231 struct kvm_shadow_walk_iterator iterator;
3225 u64 spte = 0ull; 3232 u64 spte = 0ull;
3226 3233
3234 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3235 return spte;
3236
3227 walk_shadow_page_lockless_begin(vcpu); 3237 walk_shadow_page_lockless_begin(vcpu);
3228 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) 3238 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
3229 if (!is_shadow_present_pte(spte)) 3239 if (!is_shadow_present_pte(spte))
@@ -3319,7 +3329,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
3319 arch.direct_map = vcpu->arch.mmu.direct_map; 3329 arch.direct_map = vcpu->arch.mmu.direct_map;
3320 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); 3330 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
3321 3331
3322 return kvm_setup_async_pf(vcpu, gva, gfn, &arch); 3332 return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch);
3323} 3333}
3324 3334
3325static bool can_do_async_pf(struct kvm_vcpu *vcpu) 3335static bool can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -4510,6 +4520,9 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
4510 u64 spte; 4520 u64 spte;
4511 int nr_sptes = 0; 4521 int nr_sptes = 0;
4512 4522
4523 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
4524 return nr_sptes;
4525
4513 walk_shadow_page_lockless_begin(vcpu); 4526 walk_shadow_page_lockless_begin(vcpu);
4514 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { 4527 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
4515 sptes[iterator.level-1] = spte; 4528 sptes[iterator.level-1] = spte;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ad75d77999d0..b1e6c1bf68d3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -569,6 +569,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
569 if (FNAME(gpte_changed)(vcpu, gw, top_level)) 569 if (FNAME(gpte_changed)(vcpu, gw, top_level))
570 goto out_gpte_changed; 570 goto out_gpte_changed;
571 571
572 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
573 goto out_gpte_changed;
574
572 for (shadow_walk_init(&it, vcpu, addr); 575 for (shadow_walk_init(&it, vcpu, addr);
573 shadow_walk_okay(&it) && it.level > gw->level; 576 shadow_walk_okay(&it) && it.level > gw->level;
574 shadow_walk_next(&it)) { 577 shadow_walk_next(&it)) {
@@ -820,6 +823,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
820 */ 823 */
821 mmu_topup_memory_caches(vcpu); 824 mmu_topup_memory_caches(vcpu);
822 825
826 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
827 WARN_ON(1);
828 return;
829 }
830
823 spin_lock(&vcpu->kvm->mmu_lock); 831 spin_lock(&vcpu->kvm->mmu_lock);
824 for_each_shadow_entry(vcpu, gva, iterator) { 832 for_each_shadow_entry(vcpu, gva, iterator) {
825 level = iterator.level; 833 level = iterator.level;
@@ -905,7 +913,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
905 * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't 913 * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
906 * used by guest then tlbs are not flushed, so guest is allowed to access the 914 * used by guest then tlbs are not flushed, so guest is allowed to access the
907 * freed pages. 915 * freed pages.
908 * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. 916 * We set tlbs_dirty to let the notifier know about this change and delay the flush
917 * until such a case actually happens.
909 */ 918 */
910static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 919static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
911{ 920{
@@ -934,7 +943,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
934 return -EINVAL; 943 return -EINVAL;
935 944
936 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { 945 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
937 vcpu->kvm->tlbs_dirty++; 946 vcpu->kvm->tlbs_dirty = true;
938 continue; 947 continue;
939 } 948 }
940 949
@@ -949,7 +958,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
949 958
950 if (gfn != sp->gfns[i]) { 959 if (gfn != sp->gfns[i]) {
951 drop_spte(vcpu->kvm, &sp->spt[i]); 960 drop_spte(vcpu->kvm, &sp->spt[i]);
952 vcpu->kvm->tlbs_dirty++; 961 vcpu->kvm->tlbs_dirty = true;
953 continue; 962 continue;
954 } 963 }
955 964
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c7168a5cff1b..7f4f9c2badae 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -34,6 +34,7 @@
 #include <asm/perf_event.h>
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
+#include <asm/debugreg.h>
 #include <asm/kvm_para.h>
 
 #include <asm/virtext.h>
@@ -303,20 +304,35 @@ static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
 	return vmcb->control.intercept_cr & (1U << bit);
 }
 
-static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
+static inline void set_dr_intercepts(struct vcpu_svm *svm)
 {
 	struct vmcb *vmcb = get_host_vmcb(svm);
 
-	vmcb->control.intercept_dr |= (1U << bit);
+	vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
+		| (1 << INTERCEPT_DR1_READ)
+		| (1 << INTERCEPT_DR2_READ)
+		| (1 << INTERCEPT_DR3_READ)
+		| (1 << INTERCEPT_DR4_READ)
+		| (1 << INTERCEPT_DR5_READ)
+		| (1 << INTERCEPT_DR6_READ)
+		| (1 << INTERCEPT_DR7_READ)
+		| (1 << INTERCEPT_DR0_WRITE)
+		| (1 << INTERCEPT_DR1_WRITE)
+		| (1 << INTERCEPT_DR2_WRITE)
+		| (1 << INTERCEPT_DR3_WRITE)
+		| (1 << INTERCEPT_DR4_WRITE)
+		| (1 << INTERCEPT_DR5_WRITE)
+		| (1 << INTERCEPT_DR6_WRITE)
+		| (1 << INTERCEPT_DR7_WRITE);
 
 	recalc_intercepts(svm);
 }
 
-static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
+static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 {
 	struct vmcb *vmcb = get_host_vmcb(svm);
 
-	vmcb->control.intercept_dr &= ~(1U << bit);
+	vmcb->control.intercept_dr = 0;
 
 	recalc_intercepts(svm);
 }
@@ -1080,23 +1096,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
 	set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 
-	set_dr_intercept(svm, INTERCEPT_DR0_READ);
-	set_dr_intercept(svm, INTERCEPT_DR1_READ);
-	set_dr_intercept(svm, INTERCEPT_DR2_READ);
-	set_dr_intercept(svm, INTERCEPT_DR3_READ);
-	set_dr_intercept(svm, INTERCEPT_DR4_READ);
-	set_dr_intercept(svm, INTERCEPT_DR5_READ);
-	set_dr_intercept(svm, INTERCEPT_DR6_READ);
-	set_dr_intercept(svm, INTERCEPT_DR7_READ);
-
-	set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
-	set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
-	set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
-	set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
-	set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
-	set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
-	set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
-	set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
+	set_dr_intercepts(svm);
 
 	set_exception_intercept(svm, PF_VECTOR);
 	set_exception_intercept(svm, UD_VECTOR);
@@ -1671,6 +1671,34 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
 	mark_dirty(svm->vmcb, VMCB_ASID);
 }
 
+static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
+{
+	return to_svm(vcpu)->vmcb->save.dr6;
+}
+
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->save.dr6 = value;
+	mark_dirty(svm->vmcb, VMCB_DR);
+}
+
+static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	get_debugreg(vcpu->arch.db[0], 0);
+	get_debugreg(vcpu->arch.db[1], 1);
+	get_debugreg(vcpu->arch.db[2], 2);
+	get_debugreg(vcpu->arch.db[3], 3);
+	vcpu->arch.dr6 = svm_get_dr6(vcpu);
+	vcpu->arch.dr7 = svm->vmcb->save.dr7;
+
+	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+	set_dr_intercepts(svm);
+}
+
 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -2829,6 +2857,7 @@ static int iret_interception(struct vcpu_svm *svm)
 	clr_intercept(svm, INTERCEPT_IRET);
 	svm->vcpu.arch.hflags |= HF_IRET_MASK;
 	svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 	return 1;
 }
 
@@ -2961,6 +2990,17 @@ static int dr_interception(struct vcpu_svm *svm)
 	unsigned long val;
 	int err;
 
+	if (svm->vcpu.guest_debug == 0) {
+		/*
+		 * No more DR vmexits; force a reload of the debug registers
+		 * and reenter on this instruction. The next vmexit will
+		 * retrieve the full state of the debug registers.
+		 */
+		clr_dr_intercepts(svm);
+		svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+		return 1;
+	}
+
 	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
 		return emulate_on_interception(svm);
 
@@ -2989,10 +3029,8 @@ static int cr8_write_interception(struct vcpu_svm *svm)
 	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
 	/* instruction emulation calls kvm_set_cr8() */
 	r = cr_interception(svm);
-	if (irqchip_in_kernel(svm->vcpu.kvm)) {
-		clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+	if (irqchip_in_kernel(svm->vcpu.kvm))
 		return r;
-	}
 	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
 		return r;
 	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
@@ -3554,6 +3592,8 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
 		return;
 
+	clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+
 	if (irr == -1)
 		return;
 
@@ -3636,7 +3676,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
-static int enable_irq_window(struct kvm_vcpu *vcpu)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -3650,16 +3690,15 @@ static int enable_irq_window(struct kvm_vcpu *vcpu)
 		svm_set_vintr(svm);
 		svm_inject_irq(svm, 0x0);
 	}
-	return 0;
 }
 
-static int enable_nmi_window(struct kvm_vcpu *vcpu)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
 	    == HF_NMI_MASK)
-		return 0; /* IRET will cause a vm exit */
+		return; /* IRET will cause a vm exit */
 
 	/*
 	 * Something prevents NMI from been injected. Single step over possible
@@ -3668,7 +3707,6 @@ static int enable_nmi_window(struct kvm_vcpu *vcpu)
 	svm->nmi_singlestep = true;
 	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
 	update_db_bp_intercept(vcpu);
-	return 0;
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -4051,6 +4089,11 @@ static bool svm_invpcid_supported(void)
 	return false;
 }
 
+static bool svm_mpx_supported(void)
+{
+	return false;
+}
+
 static bool svm_has_wbinvd_exit(void)
 {
 	return true;
@@ -4286,7 +4329,10 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_idt = svm_set_idt,
 	.get_gdt = svm_get_gdt,
 	.set_gdt = svm_set_gdt,
+	.get_dr6 = svm_get_dr6,
+	.set_dr6 = svm_set_dr6,
 	.set_dr7 = svm_set_dr7,
+	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
 	.cache_reg = svm_cache_reg,
 	.get_rflags = svm_get_rflags,
 	.set_rflags = svm_set_rflags,
@@ -4330,6 +4376,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 
 	.rdtscp_supported = svm_rdtscp_supported,
 	.invpcid_supported = svm_invpcid_supported,
+	.mpx_supported = svm_mpx_supported,
 
 	.set_supported_cpuid = svm_set_supported_cpuid,
 
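Taken together, the svm.c hunks implement lazy debug-register switching: the first MOV-DR exit with no host debugger attached drops all DR intercepts and sets KVM_DEBUGREG_WONT_EXIT, and the next vmexit of any kind resyncs the registers and re-arms the intercepts via sync_dirty_debug_regs. A stand-alone sketch of that state machine, using invented names (dr_state, on_dr_access, on_vmexit) rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

struct dr_state {
	bool intercepting;	/* do guest DR accesses trap? */
	bool wont_exit;		/* analogue of KVM_DEBUGREG_WONT_EXIT */
	unsigned long traps;
};

/* First trapped DR access: stop trapping and let the guest run freely. */
static void on_dr_access(struct dr_state *s)
{
	s->traps++;
	s->intercepting = false;	/* clr_dr_intercepts() analogue */
	s->wont_exit = true;
}

/* Next vmexit: pull the real DR values back and re-arm the traps. */
static void on_vmexit(struct dr_state *s)
{
	if (!s->wont_exit)
		return;
	/* here KVM reads DR0-DR3, DR6 and DR7 with get_debugreg() */
	s->wont_exit = false;
	s->intercepting = true;		/* set_dr_intercepts() analogue */
}

int main(void)
{
	struct dr_state s = { .intercepting = true };

	on_dr_access(&s);	/* guest touches a DR: exactly one trap... */
	on_vmexit(&s);		/* ...then interception is restored */
	printf("traps=%lu intercepting=%d\n", s.traps, s.intercepting);
	return 0;
}

A guest that reads or writes debug registers in a tight loop thus pays for one exit instead of one per access.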
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index da7837e1349d..1320e0f8e611 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -31,6 +31,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 #include <linux/tboot.h>
+#include <linux/hrtimer.h>
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
@@ -42,6 +43,7 @@
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/perf_event.h>
+#include <asm/debugreg.h>
 #include <asm/kexec.h>
 
 #include "trace.h"
@@ -110,6 +112,8 @@ module_param(nested, bool, S_IRUGO);
 
 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
 
+#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
+
 /*
  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
  * ple_gap: upper bound on the amount of time between two successive
@@ -202,6 +206,7 @@ struct __packed vmcs12 {
 	u64 guest_pdptr1;
 	u64 guest_pdptr2;
 	u64 guest_pdptr3;
+	u64 guest_bndcfgs;
 	u64 host_ia32_pat;
 	u64 host_ia32_efer;
 	u64 host_ia32_perf_global_ctrl;
@@ -374,6 +379,9 @@ struct nested_vmx {
 	 */
 	struct page *apic_access_page;
 	u64 msr_ia32_feature_control;
+
+	struct hrtimer preemption_timer;
+	bool preemption_timer_expired;
 };
 
 #define POSTED_INTR_ON 0
@@ -418,6 +426,8 @@ struct vcpu_vmx {
 	u64 msr_host_kernel_gs_base;
 	u64 msr_guest_kernel_gs_base;
 #endif
+	u32 vm_entry_controls_shadow;
+	u32 vm_exit_controls_shadow;
 	/*
	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
	 * non-nested (L1) guest, it always points to vmcs01. For a nested
@@ -439,6 +449,7 @@ struct vcpu_vmx {
 #endif
 		int gs_ldt_reload_needed;
 		int fs_reload_needed;
+		u64 msr_host_bndcfgs;
 	} host_state;
 	struct {
 		int vm86_active;
@@ -531,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = {
 	GUEST_CS_LIMIT,
 	GUEST_CS_BASE,
 	GUEST_ES_BASE,
+	GUEST_BNDCFGS,
 	CR0_GUEST_HOST_MASK,
 	CR0_READ_SHADOW,
 	CR4_READ_SHADOW,
@@ -586,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(GUEST_PDPTR1, guest_pdptr1),
 	FIELD64(GUEST_PDPTR2, guest_pdptr2),
 	FIELD64(GUEST_PDPTR3, guest_pdptr3),
+	FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
 	FIELD64(HOST_IA32_PAT, host_ia32_pat),
 	FIELD64(HOST_IA32_EFER, host_ia32_efer),
 	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
@@ -716,6 +729,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
+static bool vmx_mpx_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
@@ -726,6 +740,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
 static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
+static bool vmx_mpx_supported(void);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1045,6 +1060,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
+{
+	return vmcs12->pin_based_vm_exec_control &
+		PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
 {
 	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
@@ -1056,7 +1077,9 @@ static inline bool is_exception(u32 intr_info)
 		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
 }
 
-static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+			      u32 exit_intr_info,
+			      unsigned long exit_qualification);
 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
 			struct vmcs12 *vmcs12,
 			u32 reason, unsigned long qualification);
@@ -1326,6 +1349,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
 	vmcs_writel(field, vmcs_readl(field) | mask);
 }
 
+static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
+{
+	vmcs_write32(VM_ENTRY_CONTROLS, val);
+	vmx->vm_entry_controls_shadow = val;
+}
+
+static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
+{
+	if (vmx->vm_entry_controls_shadow != val)
+		vm_entry_controls_init(vmx, val);
+}
+
+static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
+{
+	return vmx->vm_entry_controls_shadow;
+}
+
+
+static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
+{
+	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
+}
+
+static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
+{
+	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
+}
+
+static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
+{
+	vmcs_write32(VM_EXIT_CONTROLS, val);
+	vmx->vm_exit_controls_shadow = val;
+}
+
+static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
+{
+	if (vmx->vm_exit_controls_shadow != val)
+		vm_exit_controls_init(vmx, val);
+}
+
+static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
+{
+	return vmx->vm_exit_controls_shadow;
+}
+
+
+static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
+{
+	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
+}
+
+static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
+{
+	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
+}
+
 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
 {
 	vmx->segment_cache.bitmask = 0;
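The helpers added in the hunk above keep a software shadow of the last value written to VM_ENTRY_CONTROLS/VM_EXIT_CONTROLS, so a write of an unchanged value never reaches the (comparatively slow) VMWRITE, and reads come from the shadow instead of a VMREAD. The same write-through-shadow idea in isolation; shadowed_reg and its accessors are illustrative names, and the counter stands in for the hardware access cost:

#include <stdint.h>
#include <stdio.h>

struct shadowed_reg {
	uint32_t shadow;	/* last value pushed to "hardware" */
	unsigned long writes;	/* stand-in for vmcs_write32() cost */
};

static void reg_init(struct shadowed_reg *r, uint32_t val)
{
	r->writes++;		/* unconditional, like *_controls_init() */
	r->shadow = val;
}

static void reg_set(struct shadowed_reg *r, uint32_t val)
{
	if (r->shadow != val)	/* elide no-op writes, like *_controls_set() */
		reg_init(r, val);
}

static void reg_setbit(struct shadowed_reg *r, uint32_t bit)
{
	reg_set(r, r->shadow | bit);
}

static void reg_clearbit(struct shadowed_reg *r, uint32_t bit)
{
	reg_set(r, r->shadow & ~bit);
}

int main(void)
{
	struct shadowed_reg r = { 0 };

	reg_init(&r, 0x11);
	reg_setbit(&r, 0x01);	/* bit already set: no hardware write */
	reg_clearbit(&r, 0x10);	/* actually changes: one write */
	printf("writes=%lu value=%#x\n", r.writes, r.shadow);	/* writes=2 */
	return 0;
}

The one constraint this caching imposes is that every writer must go through the accessors; the later hunks in this diff convert the remaining raw vmcs_write32()/vmcs_set_bits() call sites accordingly.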
@@ -1410,11 +1489,11 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
-static void clear_atomic_switch_msr_special(unsigned long entry,
-		unsigned long exit)
+static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+		unsigned long entry, unsigned long exit)
 {
-	vmcs_clear_bits(VM_ENTRY_CONTROLS, entry);
-	vmcs_clear_bits(VM_EXIT_CONTROLS, exit);
+	vm_entry_controls_clearbit(vmx, entry);
+	vm_exit_controls_clearbit(vmx, exit);
 }
 
 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
@@ -1425,14 +1504,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 	switch (msr) {
 	case MSR_EFER:
 		if (cpu_has_load_ia32_efer) {
-			clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
+			clear_atomic_switch_msr_special(vmx,
+					VM_ENTRY_LOAD_IA32_EFER,
 					VM_EXIT_LOAD_IA32_EFER);
 			return;
 		}
 		break;
 	case MSR_CORE_PERF_GLOBAL_CTRL:
 		if (cpu_has_load_perf_global_ctrl) {
-			clear_atomic_switch_msr_special(
+			clear_atomic_switch_msr_special(vmx,
 					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
 					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
 			return;
@@ -1453,14 +1533,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
 }
 
-static void add_atomic_switch_msr_special(unsigned long entry,
-		unsigned long exit, unsigned long guest_val_vmcs,
-		unsigned long host_val_vmcs, u64 guest_val, u64 host_val)
+static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+		unsigned long entry, unsigned long exit,
+		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
+		u64 guest_val, u64 host_val)
 {
 	vmcs_write64(guest_val_vmcs, guest_val);
 	vmcs_write64(host_val_vmcs, host_val);
-	vmcs_set_bits(VM_ENTRY_CONTROLS, entry);
-	vmcs_set_bits(VM_EXIT_CONTROLS, exit);
+	vm_entry_controls_setbit(vmx, entry);
+	vm_exit_controls_setbit(vmx, exit);
 }
 
 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
@@ -1472,7 +1553,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 	switch (msr) {
 	case MSR_EFER:
 		if (cpu_has_load_ia32_efer) {
-			add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
+			add_atomic_switch_msr_special(vmx,
+					VM_ENTRY_LOAD_IA32_EFER,
 					VM_EXIT_LOAD_IA32_EFER,
 					GUEST_IA32_EFER,
 					HOST_IA32_EFER,
@@ -1482,7 +1564,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 		break;
 	case MSR_CORE_PERF_GLOBAL_CTRL:
 		if (cpu_has_load_perf_global_ctrl) {
-			add_atomic_switch_msr_special(
+			add_atomic_switch_msr_special(vmx,
 					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
 					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
 					GUEST_IA32_PERF_GLOBAL_CTRL,
@@ -1647,6 +1729,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 	if (is_long_mode(&vmx->vcpu))
 		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 #endif
+	if (boot_cpu_has(X86_FEATURE_MPX))
+		rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
 	for (i = 0; i < vmx->save_nmsrs; ++i)
 		kvm_set_shared_msr(vmx->guest_msrs[i].index,
 				   vmx->guest_msrs[i].data,
@@ -1684,6 +1768,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
+	if (vmx->host_state.msr_host_bndcfgs)
+		wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
 	/*
 	 * If the FPU is not active (through the host task or
 	 * the guest vcpu), then restore the cr0.TS bit.
@@ -1906,7 +1992,9 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
 	if (!(vmcs12->exception_bitmap & (1u << nr)))
 		return 0;
 
-	nested_vmx_vmexit(vcpu);
+	nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
+			  vmcs_read32(VM_EXIT_INTR_INFO),
+			  vmcs_readl(EXIT_QUALIFICATION));
 	return 1;
 }
 
@@ -2183,9 +2271,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	 */
 	nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 	nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
-		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
+		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
+	nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 		PIN_BASED_VMX_PREEMPTION_TIMER;
-	nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 
 	/*
 	 * Exit controls
@@ -2200,15 +2288,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #ifdef CONFIG_X86_64
 		VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+	nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
 		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-	if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
-	    !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
-		nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-		nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
-	}
-	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
-		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
+	if (vmx_mpx_supported())
+		nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 
 	/* entry controls */
 	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2222,6 +2307,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 		VM_ENTRY_LOAD_IA32_PAT;
 	nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
 				       VM_ENTRY_LOAD_IA32_EFER);
+	if (vmx_mpx_supported())
+		nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 
 	/* cpu-based controls */
 	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2277,8 +2364,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 
 	/* miscellaneous data */
 	rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
-	nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
-		VMX_MISC_SAVE_EFER_LMA;
+	nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
+	nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+		VMX_MISC_ACTIVITY_HLT;
 	nested_vmx_misc_high = 0;
 }
 
@@ -2295,32 +2383,10 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
 	return low | ((u64)high << 32);
 }
 
-/*
- * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
- * also let it use VMX-specific MSRs.
- * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
- * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
- * like all other MSRs).
- */
+/* Returns 0 on success, non-0 otherwise. */
 static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 {
-	if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
-	    msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
-		/*
-		 * According to the spec, processors which do not support VMX
-		 * should throw a #GP(0) when VMX capability MSRs are read.
-		 */
-		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
-		return 1;
-	}
-
 	switch (msr_index) {
-	case MSR_IA32_FEATURE_CONTROL:
-		if (nested_vmx_allowed(vcpu)) {
-			*pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
-			break;
-		}
-		return 0;
 	case MSR_IA32_VMX_BASIC:
 		/*
 		 * This MSR reports some information about VMX support. We
@@ -2387,34 +2453,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		*pdata = nested_vmx_ept_caps;
 		break;
 	default:
-		return 0;
-	}
-
-	return 1;
-}
-
-static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
-	u32 msr_index = msr_info->index;
-	u64 data = msr_info->data;
-	bool host_initialized = msr_info->host_initiated;
-
-	if (!nested_vmx_allowed(vcpu))
-		return 0;
-
-	if (msr_index == MSR_IA32_FEATURE_CONTROL) {
-		if (!host_initialized &&
-			to_vmx(vcpu)->nested.msr_ia32_feature_control
-				& FEATURE_CONTROL_LOCKED)
-			return 0;
-		to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
 		return 1;
 	}
 
-	/*
-	 * No need to treat VMX capability MSRs specially: If we don't handle
-	 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
-	 */
 	return 0;
 }
 
@@ -2460,13 +2501,25 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 	case MSR_IA32_SYSENTER_ESP:
 		data = vmcs_readl(GUEST_SYSENTER_ESP);
 		break;
+	case MSR_IA32_BNDCFGS:
+		if (!vmx_mpx_supported())
+			return 1;
+		data = vmcs_read64(GUEST_BNDCFGS);
+		break;
+	case MSR_IA32_FEATURE_CONTROL:
+		if (!nested_vmx_allowed(vcpu))
+			return 1;
+		data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+		break;
+	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+		if (!nested_vmx_allowed(vcpu))
+			return 1;
+		return vmx_get_vmx_msr(vcpu, msr_index, pdata);
 	case MSR_TSC_AUX:
 		if (!to_vmx(vcpu)->rdtscp_enabled)
 			return 1;
 		/* Otherwise falls through */
 	default:
-		if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
-			return 0;
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
 			data = msr->data;
@@ -2479,6 +2532,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 	return 0;
 }
 
+static void vmx_leave_nested(struct kvm_vcpu *vcpu);
+
 /*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -2519,6 +2574,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_SYSENTER_ESP:
 		vmcs_writel(GUEST_SYSENTER_ESP, data);
 		break;
+	case MSR_IA32_BNDCFGS:
+		if (!vmx_mpx_supported())
+			return 1;
+		vmcs_write64(GUEST_BNDCFGS, data);
+		break;
 	case MSR_IA32_TSC:
 		kvm_write_tsc(vcpu, msr_info);
 		break;
@@ -2533,6 +2593,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_TSC_ADJUST:
 		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
+	case MSR_IA32_FEATURE_CONTROL:
+		if (!nested_vmx_allowed(vcpu) ||
+		    (to_vmx(vcpu)->nested.msr_ia32_feature_control &
+		     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
+			return 1;
+		vmx->nested.msr_ia32_feature_control = data;
+		if (msr_info->host_initiated && data == 0)
+			vmx_leave_nested(vcpu);
+		break;
+	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+		return 1; /* they are read-only */
 	case MSR_TSC_AUX:
 		if (!vmx->rdtscp_enabled)
 			return 1;
@@ -2541,8 +2612,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		/* Otherwise falls through */
 	default:
-		if (vmx_set_vmx_msr(vcpu, msr_info))
-			break;
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
 			msr->data = data;
@@ -2795,12 +2864,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 			vmx_capability.ept, vmx_capability.vpid);
 	}
 
-	min = 0;
+	min = VM_EXIT_SAVE_DEBUG_CONTROLS;
#ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
 	opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
-		VM_EXIT_ACK_INTR_ON_EXIT;
+		VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
 				&_vmexit_control) < 0)
 		return -EIO;
@@ -2816,8 +2885,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	    !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
 		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
 
-	min = 0;
-	opt = VM_ENTRY_LOAD_IA32_PAT;
+	min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
+	opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
 				&_vmentry_control) < 0)
 		return -EIO;
@@ -3182,14 +3251,10 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	vmx_load_host_state(to_vmx(vcpu));
 	vcpu->arch.efer = efer;
 	if (efer & EFER_LMA) {
-		vmcs_write32(VM_ENTRY_CONTROLS,
-			     vmcs_read32(VM_ENTRY_CONTROLS) |
-			     VM_ENTRY_IA32E_MODE);
+		vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
 		msr->data = efer;
 	} else {
-		vmcs_write32(VM_ENTRY_CONTROLS,
-			     vmcs_read32(VM_ENTRY_CONTROLS) &
-			     ~VM_ENTRY_IA32E_MODE);
+		vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
 
 		msr->data = efer & ~EFER_LME;
 	}
@@ -3217,9 +3282,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
 
 static void exit_lmode(struct kvm_vcpu *vcpu)
 {
-	vmcs_write32(VM_ENTRY_CONTROLS,
-		     vmcs_read32(VM_ENTRY_CONTROLS)
-		     & ~VM_ENTRY_IA32E_MODE);
+	vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
 	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
 }
 
@@ -4192,6 +4255,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 {
 	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+
+	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
+		exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+
 	if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
 		exec_control &= ~CPU_BASED_TPR_SHADOW;
 #ifdef CONFIG_X86_64
@@ -4346,10 +4413,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		++vmx->nmsrs;
 	}
 
-	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+	vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
 
 	/* 22.2.1, 20.8.1 */
-	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
+	vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
 
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
 	set_cr4_guest_host_mask(vmx);
@@ -4360,7 +4428,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 msr;
+	struct msr_data apic_base_msr;
 
 	vmx->rmode.vm86_active = 0;
 
@@ -4368,10 +4436,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
 	kvm_set_cr8(&vmx->vcpu, 0);
-	msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+	apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
 	if (kvm_vcpu_is_bsp(&vmx->vcpu))
-		msr |= MSR_IA32_APICBASE_BSP;
-	kvm_set_apic_base(&vmx->vcpu, msr);
+		apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
+	apic_base_msr.host_initiated = true;
+	kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
 
 	vmx_segment_cache_clear(vmx);
 
@@ -4463,39 +4532,28 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
 		PIN_BASED_NMI_EXITING;
 }
 
-static int enable_irq_window(struct kvm_vcpu *vcpu)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
 
-	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
-		/*
-		 * We get here if vmx_interrupt_allowed() said we can't
-		 * inject to L1 now because L2 must run. The caller will have
-		 * to make L2 exit right after entry, so we can inject to L1
-		 * more promptly.
-		 */
-		return -EBUSY;
-
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-	return 0;
 }
 
-static int enable_nmi_window(struct kvm_vcpu *vcpu)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
 
-	if (!cpu_has_virtual_nmis())
-		return enable_irq_window(vcpu);
-
-	if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI)
-		return enable_irq_window(vcpu);
+	if (!cpu_has_virtual_nmis() ||
+	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+		enable_irq_window(vcpu);
+		return;
+	}
 
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-	return 0;
 }
 
 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4587,25 +4645,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu)) {
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
-		if (to_vmx(vcpu)->nested.nested_run_pending)
-			return 0;
-		if (nested_exit_on_nmi(vcpu)) {
-			nested_vmx_vmexit(vcpu);
-			vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI;
-			vmcs12->vm_exit_intr_info = NMI_VECTOR |
-				INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK;
-			/*
-			 * The NMI-triggered VM exit counts as injection:
-			 * clear this one and block further NMIs.
-			 */
-			vcpu->arch.nmi_pending = 0;
-			vmx_set_nmi_mask(vcpu, true);
-			return 0;
-		}
-	}
+	if (to_vmx(vcpu)->nested.nested_run_pending)
+		return 0;
 
 	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
 		return 0;
@@ -4617,23 +4658,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu)) {
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
-		if (to_vmx(vcpu)->nested.nested_run_pending)
-			return 0;
-		if (nested_exit_on_intr(vcpu)) {
-			nested_vmx_vmexit(vcpu);
-			vmcs12->vm_exit_reason =
-				EXIT_REASON_EXTERNAL_INTERRUPT;
-			vmcs12->vm_exit_intr_info = 0;
-			/*
-			 * fall through to normal code, but now in L1, not L2
-			 */
-		}
-	}
-
-	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+	return (!to_vmx(vcpu)->nested.nested_run_pending &&
+		vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
@@ -4812,7 +4838,8 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 		dr6 = vmcs_readl(EXIT_QUALIFICATION);
 		if (!(vcpu->guest_debug &
 		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
-			vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
+			vcpu->arch.dr6 &= ~15;
+			vcpu->arch.dr6 |= dr6;
 			kvm_queue_exception(vcpu, DB_VECTOR);
 			return 1;
 		}
@@ -5075,19 +5102,66 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 		}
 	}
 
+	if (vcpu->guest_debug == 0) {
+		u32 cpu_based_vm_exec_control;
+
+		cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+		cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+
+		/*
+		 * No more DR vmexits; force a reload of the debug registers
+		 * and reenter on this instruction. The next vmexit will
+		 * retrieve the full state of the debug registers.
+		 */
+		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+		return 1;
+	}
+
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
 	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
 	if (exit_qualification & TYPE_MOV_FROM_DR) {
 		unsigned long val;
-		if (!kvm_get_dr(vcpu, dr, &val))
-			kvm_register_write(vcpu, reg, val);
+
+		if (kvm_get_dr(vcpu, dr, &val))
+			return 1;
+		kvm_register_write(vcpu, reg, val);
 	} else
-		kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
+		if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]))
+			return 1;
+
 	skip_emulated_instruction(vcpu);
 	return 1;
 }
 
+static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.dr6;
+}
+
+static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+}
+
+static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+	u32 cpu_based_vm_exec_control;
+
+	get_debugreg(vcpu->arch.db[0], 0);
+	get_debugreg(vcpu->arch.db[1], 1);
+	get_debugreg(vcpu->arch.db[2], 2);
+	get_debugreg(vcpu->arch.db[3], 3);
+	get_debugreg(vcpu->arch.dr6, 6);
+	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
+
+	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+
+	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
 {
 	vmcs_writel(GUEST_DR7, val);
@@ -5687,6 +5761,18 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
 	 */
 }
 
+static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
+{
+	struct vcpu_vmx *vmx =
+		container_of(timer, struct vcpu_vmx, nested.preemption_timer);
+
+	vmx->nested.preemption_timer_expired = true;
+	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+	kvm_vcpu_kick(&vmx->vcpu);
+
+	return HRTIMER_NORESTART;
+}
+
 /*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
@@ -5751,6 +5837,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
 	vmx->nested.vmcs02_num = 0;
 
+	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL);
+	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
 	vmx->nested.vmxon = true;
 
 	skip_emulated_instruction(vcpu);
@@ -6460,11 +6550,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
 	int size;
 	u8 b;
 
-	if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING))
-		return 1;
-
 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
-		return 0;
+		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
@@ -6628,6 +6715,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	u32 exit_reason = vmx->exit_reason;
 
+	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
+				vmcs_readl(EXIT_QUALIFICATION),
+				vmx->idt_vectoring_info,
+				intr_info,
+				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+				KVM_ISA_VMX);
+
 	if (vmx->nested.nested_run_pending)
 		return 0;
 
@@ -6644,7 +6738,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	else if (is_page_fault(intr_info))
 		return enable_ept;
 	else if (is_no_device(intr_info) &&
-		 !(nested_read_cr0(vmcs12) & X86_CR0_TS))
+		 !(vmcs12->guest_cr0 & X86_CR0_TS))
 		return 0;
 	return vmcs12->exception_bitmap &
 			(1u << (intr_info & INTR_INFO_VECTOR_MASK));
@@ -6723,9 +6817,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		 * table is L0's fault.
 		 */
 		return 0;
-	case EXIT_REASON_PREEMPTION_TIMER:
-		return vmcs12->pin_based_vm_exec_control &
-			PIN_BASED_VMX_PREEMPTION_TIMER;
 	case EXIT_REASON_WBINVD:
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
 	case EXIT_REASON_XSETBV:
@@ -6741,27 +6832,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
-static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
-{
-	u64 delta_tsc_l1;
-	u32 preempt_val_l1, preempt_val_l2, preempt_scale;
-
-	if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control &
-			PIN_BASED_VMX_PREEMPTION_TIMER))
-		return;
-	preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
-			MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
-	preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
-	delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc())
-		- vcpu->arch.last_guest_tsc;
-	preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
-	if (preempt_val_l2 <= preempt_val_l1)
-		preempt_val_l2 = 0;
-	else
-		preempt_val_l2 -= preempt_val_l1;
-	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
-}
-
 /*
  * The guest has exited. See if we can fix it or if we need userspace
  * assistance.
@@ -6777,7 +6847,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		return handle_invalid_guest_state(vcpu);
 
 	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
-		nested_vmx_vmexit(vcpu);
+		nested_vmx_vmexit(vcpu, exit_reason,
+				  vmcs_read32(VM_EXIT_INTR_INFO),
+				  vmcs_readl(EXIT_QUALIFICATION));
 		return 1;
 	}
 
@@ -7006,6 +7078,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 		local_irq_enable();
 }
 
+static bool vmx_mpx_supported(void)
+{
+	return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
+		(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
@@ -7172,8 +7250,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	atomic_switch_perf_msrs(vmx);
 	debugctlmsr = get_debugctlmsr();
 
-	if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending)
-		nested_adjust_preemption_timer(vcpu);
 	vmx->__launched = vmx->loaded_vmcs->launched;
 	asm(
 		/* Store host registers */
@@ -7332,8 +7408,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	free_vpid(vmx);
-	free_nested(vmx);
 	free_loaded_vmcs(vmx->loaded_vmcs);
+	free_nested(vmx);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -7518,15 +7594,14 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
 					 struct x86_exception *fault)
 {
-	struct vmcs12 *vmcs12;
-	nested_vmx_vmexit(vcpu);
-	vmcs12 = get_vmcs12(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	u32 exit_reason;
 
 	if (fault->error_code & PFERR_RSVD_MASK)
-		vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+		exit_reason = EXIT_REASON_EPT_MISCONFIG;
 	else
-		vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
-	vmcs12->exit_qualification = vcpu->arch.exit_qualification;
+		exit_reason = EXIT_REASON_EPT_VIOLATION;
+	nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification);
 	vmcs12->guest_physical_address = fault->address;
 }
 
@@ -7564,11 +7639,35 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 
 	/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
 	if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
-		nested_vmx_vmexit(vcpu);
+		nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
+				  vmcs_read32(VM_EXIT_INTR_INFO),
+				  vmcs_readl(EXIT_QUALIFICATION));
 	else
 		kvm_inject_page_fault(vcpu, fault);
 }
 
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
+{
+	u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (vcpu->arch.virtual_tsc_khz == 0)
+		return;
+
+	/* Make sure short timeouts reliably trigger an immediate vmexit.
+	 * hrtimer_start does not guarantee this. */
+	if (preemption_timeout <= 1) {
+		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
+		return;
+	}
+
+	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+	preemption_timeout *= 1000000;
+	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
+	hrtimer_start(&vmx->nested.preemption_timer,
+		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
+}
+
 /*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
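vmx_start_preemption_timer() above converts the guest's preemption-timer value into an hrtimer deadline: with the advertised rate of 5, one timer tick corresponds to 2^5 TSC cycles, so ns = (value << 5) * 1e6 / tsc_khz. A self-contained check of that arithmetic; the rate constant mirrors the diff, while the sample timer value and TSC frequency are chosen here purely for illustration:

#include <stdint.h>
#include <stdio.h>

#define EMULATED_PREEMPTION_TIMER_RATE 5	/* one tick per 2^5 TSC cycles */

/* Mirrors the conversion in vmx_start_preemption_timer(). */
static uint64_t timer_value_to_ns(uint64_t value, uint64_t tsc_khz)
{
	uint64_t ns = value << EMULATED_PREEMPTION_TIMER_RATE;	/* TSC cycles */

	ns *= 1000000;		/* cycles * 1e6 / kHz == cycles * 1e9 / Hz */
	return ns / tsc_khz;	/* do_div() in the kernel; plain / here */
}

int main(void)
{
	/* 1000 ticks at an assumed 2 GHz virtual TSC: 32000 cycles = 16 us */
	printf("%llu ns\n",
	       (unsigned long long)timer_value_to_ns(1000, 2000000));
	return 0;
}

Values of 0 or 1 skip this conversion entirely and fire the callback synchronously, since an hrtimer armed for a few nanoseconds is not guaranteed to expire promptly.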
@@ -7582,7 +7681,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7582{ 7681{
7583 struct vcpu_vmx *vmx = to_vmx(vcpu); 7682 struct vcpu_vmx *vmx = to_vmx(vcpu);
7584 u32 exec_control; 7683 u32 exec_control;
7585 u32 exit_control;
7586 7684
7587 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 7685 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
7588 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 7686 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -7640,13 +7738,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7640 7738
7641 vmcs_write64(VMCS_LINK_POINTER, -1ull); 7739 vmcs_write64(VMCS_LINK_POINTER, -1ull);
7642 7740
7643 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, 7741 exec_control = vmcs12->pin_based_vm_exec_control;
7644 (vmcs_config.pin_based_exec_ctrl | 7742 exec_control |= vmcs_config.pin_based_exec_ctrl;
7645 vmcs12->pin_based_vm_exec_control)); 7743 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
7744 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
7646 7745
7647 if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) 7746 vmx->nested.preemption_timer_expired = false;
7648 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 7747 if (nested_cpu_has_preemption_timer(vmcs12))
7649 vmcs12->vmx_preemption_timer_value); 7748 vmx_start_preemption_timer(vcpu);
7650 7749
7651 /* 7750 /*
7652 * Whether page-faults are trapped is determined by a combination of 7751 * Whether page-faults are trapped is determined by a combination of
@@ -7674,7 +7773,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7674 enable_ept ? vmcs12->page_fault_error_code_match : 0); 7773 enable_ept ? vmcs12->page_fault_error_code_match : 0);
7675 7774
7676 if (cpu_has_secondary_exec_ctrls()) { 7775 if (cpu_has_secondary_exec_ctrls()) {
7677 u32 exec_control = vmx_secondary_exec_control(vmx); 7776 exec_control = vmx_secondary_exec_control(vmx);
7678 if (!vmx->rdtscp_enabled) 7777 if (!vmx->rdtscp_enabled)
7679 exec_control &= ~SECONDARY_EXEC_RDTSCP; 7778 exec_control &= ~SECONDARY_EXEC_RDTSCP;
7680 /* Take the following fields only from vmcs12 */ 7779 /* Take the following fields only from vmcs12 */
@@ -7706,6 +7805,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7706 else 7805 else
7707 vmcs_write64(APIC_ACCESS_ADDR, 7806 vmcs_write64(APIC_ACCESS_ADDR,
7708 page_to_phys(vmx->nested.apic_access_page)); 7807 page_to_phys(vmx->nested.apic_access_page));
7808 } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
7809 exec_control |=
7810 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
7811 vmcs_write64(APIC_ACCESS_ADDR,
7812 page_to_phys(vcpu->kvm->arch.apic_access_page));
7709 } 7813 }
7710 7814
7711 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 7815 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
@@ -7756,15 +7860,12 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
 	 * bits are further modified by vmx_set_efer() below.
 	 */
-	exit_control = vmcs_config.vmexit_ctrl;
-	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
-		exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-	vmcs_write32(VM_EXIT_CONTROLS, exit_control);
+	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
 
 	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
 	 * emulated by vmx_set_efer(), below.
 	 */
-	vmcs_write32(VM_ENTRY_CONTROLS,
+	vm_entry_controls_init(vmx,
 		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
 			~VM_ENTRY_IA32E_MODE) |
 		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
@@ -7778,6 +7879,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	set_cr4_guest_host_mask(vmx);
 
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
+		vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
 		vmcs_write64(TSC_OFFSET,
 			vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
@@ -7882,7 +7986,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return 1;
 	}
 
-	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) {
+	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
+	    vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
 	}
@@ -7994,8 +8099,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
 	enter_guest_mode(vcpu);
 
-	vmx->nested.nested_run_pending = 1;
-
 	vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
 
 	cpu = get_cpu();
@@ -8011,6 +8114,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
 	prepare_vmcs02(vcpu, vmcs12);
 
+	if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
+		return kvm_emulate_halt(vcpu);
+
+	vmx->nested.nested_run_pending = 1;
+
 	/*
 	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
 	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
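Note where nested_run_pending ends up: the previous hunk removed the early assignment, and here it is set only after the GUEST_ACTIVITY_HLT check. The flag tells the new vmx_check_nested_events() below that a VMLAUNCH/VMRESUME is still in flight, making it return -EBUSY instead of synthesizing a nested vmexit. When L2 is entered halted no VM entry actually happens, so the flag must stay clear; otherwise the very event that should wake the halted L2 (an interrupt, NMI, or the preemption timer) would be deferred. The consumer pattern, as it appears in the next hunk:

	if (vmx->nested.nested_run_pending)
		return -EBUSY;	/* entry in flight; retry after it completes */
	nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);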
@@ -8099,6 +8207,58 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
 	}
 }
 
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+	    vmx->nested.preemption_timer_expired) {
+		if (vmx->nested.nested_run_pending)
+			return -EBUSY;
+		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
+		return 0;
+	}
+
+	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
+		if (vmx->nested.nested_run_pending ||
+		    vcpu->arch.interrupt.pending)
+			return -EBUSY;
+		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
+				  INTR_INFO_VALID_MASK, 0);
+		/*
+		 * The NMI-triggered VM exit counts as injection:
+		 * clear this one and block further NMIs.
+		 */
+		vcpu->arch.nmi_pending = 0;
+		vmx_set_nmi_mask(vcpu, true);
+		return 0;
+	}
+
+	if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
+	    nested_exit_on_intr(vcpu)) {
+		if (vmx->nested.nested_run_pending)
+			return -EBUSY;
+		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+	}
+
+	return 0;
+}
+
+static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+	ktime_t remaining =
+		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
+	u64 value;
+
+	if (ktime_to_ns(remaining) <= 0)
+		return 0;
+
+	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
+	do_div(value, 1000000);
+	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+}
+
 /*
  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
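To put numbers on vmx_get_preemption_timer_value() above: the remaining hrtimer nanoseconds are scaled back to guest TSC cycles (ns * virtual_tsc_khz / 10^6), then to timer units by the right shift. A worked example with invented values, assuming the emulated rate is 2^5 cycles per unit:

	/* Hypothetical: 1 ms left on the hrtimer, guest TSC at 2 GHz. */
	u64 value = 1000000ULL * 2000000;	/* ns * virtual_tsc_khz */
	do_div(value, 1000000);			/* = 2000000 TSC cycles */
	value >>= 5;				/* = 62500 timer units  */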
@@ -8110,7 +8270,9 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
  * which already writes to vmcs12 directly.
  */
-static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+			   u32 exit_reason, u32 exit_intr_info,
+			   unsigned long exit_qualification)
 {
 	/* update guest state fields: */
 	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
@@ -8162,11 +8324,18 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 	vmcs12->guest_pending_dbg_exceptions =
 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+		vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
+	else
+		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
 
-	if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
-	    (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
-		vmcs12->vmx_preemption_timer_value =
-			vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+	if (nested_cpu_has_preemption_timer(vmcs12)) {
+		if (vmcs12->vm_exit_controls &
+		    VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+			vmcs12->vmx_preemption_timer_value =
+				vmx_get_preemption_timer_value(vcpu);
+		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+	}
 
 	/*
 	 * In some cases (usually, nested EPT), L2 is allowed to change its
@@ -8186,7 +8355,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	vmcs12->vm_entry_controls =
 		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
-		(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
+		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
 
 	/* TODO: These cannot have changed unless we have MSR bitmaps and
 	 * the relevant bit asks not to trap the change */
@@ -8198,13 +8367,15 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
 	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
 	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+	if (vmx_mpx_supported())
+		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 
 	/* update exit information fields: */
 
-	vmcs12->vm_exit_reason = to_vmx(vcpu)->exit_reason;
-	vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	vmcs12->vm_exit_reason = exit_reason;
+	vmcs12->exit_qualification = exit_qualification;
 
-	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	vmcs12->vm_exit_intr_info = exit_intr_info;
 	if ((vmcs12->vm_exit_intr_info &
 	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
 	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
@@ -8307,6 +8478,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
 	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
 
+	/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
+	if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
+		vmcs_write64(GUEST_BNDCFGS, 0);
+
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
 		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
 		vcpu->arch.pat = vmcs12->host_ia32_pat;
@@ -8370,7 +8545,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
  * and modify vmcs12 to make it see what it would expect to see there if
  * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
  */
-static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+			      u32 exit_intr_info,
+			      unsigned long exit_qualification)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int cpu;
@@ -8380,7 +8557,15 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 	WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
 	leave_guest_mode(vcpu);
-	prepare_vmcs12(vcpu, vmcs12);
+	prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+		       exit_qualification);
+
+	trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+				       vmcs12->exit_qualification,
+				       vmcs12->idt_vectoring_info_field,
+				       vmcs12->vm_exit_intr_info,
+				       vmcs12->vm_exit_intr_error_code,
+				       KVM_ISA_VMX);
 
 	cpu = get_cpu();
 	vmx->loaded_vmcs = &vmx->vmcs01;
@@ -8389,6 +8574,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 	vcpu->cpu = cpu;
 	put_cpu();
 
+	vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
+	vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
 	vmx_segment_cache_clear(vmx);
 
 	/* if no vmcs02 cache requested, remove the one we used */
@@ -8421,6 +8608,19 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 	nested_vmx_succeed(vcpu);
 	if (enable_shadow_vmcs)
 		vmx->nested.sync_shadow_vmcs = true;
+
+	/* in case we halted in L2 */
+	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+}
+
+/*
+ * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+ */
+static void vmx_leave_nested(struct kvm_vcpu *vcpu)
+{
+	if (is_guest_mode(vcpu))
+		nested_vmx_vmexit(vcpu, -1, 0, 0);
+	free_nested(to_vmx(vcpu));
 }
 
 /*
@@ -8486,7 +8686,10 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_idt = vmx_set_idt,
 	.get_gdt = vmx_get_gdt,
 	.set_gdt = vmx_set_gdt,
+	.get_dr6 = vmx_get_dr6,
+	.set_dr6 = vmx_set_dr6,
 	.set_dr7 = vmx_set_dr7,
+	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
 	.cache_reg = vmx_cache_reg,
 	.get_rflags = vmx_get_rflags,
 	.set_rflags = vmx_set_rflags,
@@ -8548,6 +8751,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 	.check_intercept = vmx_check_intercept,
 	.handle_external_intr = vmx_handle_external_intr,
+	.mpx_supported = vmx_mpx_supported,
+
+	.check_nested_events = vmx_check_nested_events,
 };
 
 static int __init vmx_init(void)
@@ -8635,6 +8841,8 @@ static int __init vmx_init(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+	vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
+
 	memcpy(vmx_msr_bitmap_legacy_x2apic,
 			vmx_msr_bitmap_legacy, PAGE_SIZE);
 	memcpy(vmx_msr_bitmap_longmode_x2apic,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5d004da1e35d..9d1b5cd4d34c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -94,6 +94,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
94static bool ignore_msrs = 0; 94static bool ignore_msrs = 0;
95module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); 95module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
96 96
97unsigned int min_timer_period_us = 500;
98module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
99
97bool kvm_has_tsc_control; 100bool kvm_has_tsc_control;
98EXPORT_SYMBOL_GPL(kvm_has_tsc_control); 101EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
99u32 kvm_max_guest_tsc_khz; 102u32 kvm_max_guest_tsc_khz;
@@ -254,10 +257,26 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
254} 257}
255EXPORT_SYMBOL_GPL(kvm_get_apic_base); 258EXPORT_SYMBOL_GPL(kvm_get_apic_base);
256 259
257void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 260int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
258{ 261{
259 /* TODO: reserve bits check */ 262 u64 old_state = vcpu->arch.apic_base &
260 kvm_lapic_set_base(vcpu, data); 263 (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
264 u64 new_state = msr_info->data &
265 (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
266 u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) |
267 0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE);
268
269 if (!msr_info->host_initiated &&
270 ((msr_info->data & reserved_bits) != 0 ||
271 new_state == X2APIC_ENABLE ||
272 (new_state == MSR_IA32_APICBASE_ENABLE &&
273 old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
274 (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
275 old_state == 0)))
276 return 1;
277
278 kvm_lapic_set_base(vcpu, msr_info->data);
279 return 0;
261} 280}
262EXPORT_SYMBOL_GPL(kvm_set_apic_base); 281EXPORT_SYMBOL_GPL(kvm_set_apic_base);
263 282
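The new check encodes the architectural APIC mode state machine (EN is MSR_IA32_APICBASE_ENABLE, EXTD is X2APIC_ENABLE), plus a reserved-bits test derived from the guest's MAXPHYADDR. A reconstruction of the allowed transitions, inferred from the conditions above rather than quoted from the patch:

	/*
	 *   (EN,EXTD) old -> new       result
	 *   disabled (0,0) -> (1,0)    ok      (enable xAPIC)
	 *   xAPIC    (1,0) -> (1,1)    ok      (enter x2APIC)
	 *   x2APIC   (1,1) -> (0,0)    ok      (disable)
	 *   x2APIC   (1,1) -> (1,0)    reject  (must disable first)
	 *   disabled (0,0) -> (1,1)    reject  (must enable xAPIC first)
	 *   any            -> (0,1)    reject  (EXTD without EN)
	 */

host_initiated writes bypass the check so that userspace can restore any previously saved state wholesale.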
@@ -576,13 +595,13 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
576 595
577int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 596int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
578{ 597{
579 u64 xcr0; 598 u64 xcr0 = xcr;
599 u64 old_xcr0 = vcpu->arch.xcr0;
580 u64 valid_bits; 600 u64 valid_bits;
581 601
582 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ 602 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
583 if (index != XCR_XFEATURE_ENABLED_MASK) 603 if (index != XCR_XFEATURE_ENABLED_MASK)
584 return 1; 604 return 1;
585 xcr0 = xcr;
586 if (!(xcr0 & XSTATE_FP)) 605 if (!(xcr0 & XSTATE_FP))
587 return 1; 606 return 1;
588 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) 607 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
@@ -597,8 +616,14 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
597 if (xcr0 & ~valid_bits) 616 if (xcr0 & ~valid_bits)
598 return 1; 617 return 1;
599 618
619 if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
620 return 1;
621
600 kvm_put_guest_xcr0(vcpu); 622 kvm_put_guest_xcr0(vcpu);
601 vcpu->arch.xcr0 = xcr0; 623 vcpu->arch.xcr0 = xcr0;
624
625 if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
626 kvm_update_cpuid(vcpu);
602 return 0; 627 return 0;
603} 628}
604 629
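Two MPX-related rules appear here: XSTATE_BNDREGS and XSTATE_BNDCSR may only be enabled or disabled together (hardware enforces the same pairing with a #GP on XSETBV), and any change to the extended XCR0 bits re-runs kvm_update_cpuid() so the xsave area size reported through CPUID leaf 0xD stays in sync. For example, assuming the usual XSTATE_* bit definitions:

	xcr0 = XSTATE_FP | XSTATE_SSE | XSTATE_BNDREGS | XSTATE_BNDCSR;	/* accepted */
	xcr0 = XSTATE_FP | XSTATE_SSE | XSTATE_BNDREGS;			/* rejected, #GP */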
@@ -719,6 +744,12 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
719} 744}
720EXPORT_SYMBOL_GPL(kvm_get_cr8); 745EXPORT_SYMBOL_GPL(kvm_get_cr8);
721 746
747static void kvm_update_dr6(struct kvm_vcpu *vcpu)
748{
749 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
750 kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
751}
752
722static void kvm_update_dr7(struct kvm_vcpu *vcpu) 753static void kvm_update_dr7(struct kvm_vcpu *vcpu)
723{ 754{
724 unsigned long dr7; 755 unsigned long dr7;
@@ -728,7 +759,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
728 else 759 else
729 dr7 = vcpu->arch.dr7; 760 dr7 = vcpu->arch.dr7;
730 kvm_x86_ops->set_dr7(vcpu, dr7); 761 kvm_x86_ops->set_dr7(vcpu, dr7);
731 vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK); 762 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
763 if (dr7 & DR7_BP_EN_MASK)
764 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
732} 765}
733 766
734static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 767static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
@@ -747,6 +780,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
747 if (val & 0xffffffff00000000ULL) 780 if (val & 0xffffffff00000000ULL)
748 return -1; /* #GP */ 781 return -1; /* #GP */
749 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 782 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
783 kvm_update_dr6(vcpu);
750 break; 784 break;
751 case 5: 785 case 5:
752 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 786 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
@@ -788,7 +822,10 @@ static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
788 return 1; 822 return 1;
789 /* fall through */ 823 /* fall through */
790 case 6: 824 case 6:
791 *val = vcpu->arch.dr6; 825 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
826 *val = vcpu->arch.dr6;
827 else
828 *val = kvm_x86_ops->get_dr6(vcpu);
792 break; 829 break;
793 case 5: 830 case 5:
794 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 831 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
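The indirection through the new get_dr6/set_dr6 hooks exists because on SVM the guest's DR6 lives in the VMCB and can change across a VM entry, while VMX has no VMCS field for it. When userspace owns the breakpoint registers (KVM_GUESTDBG_USE_HW_BP) the software copy stays authoritative, hence the branch above. A sketch of the VMX side, whose actual bodies are elsewhere in this patch and not shown in this excerpt:

	static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
	{
		/* no VMCS field for DR6; the software copy is the truth */
		return vcpu->arch.dr6;
	}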
@@ -836,11 +873,12 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
836 * kvm-specific. Those are put in the beginning of the list. 873 * kvm-specific. Those are put in the beginning of the list.
837 */ 874 */
838 875
839#define KVM_SAVE_MSRS_BEGIN 10 876#define KVM_SAVE_MSRS_BEGIN 12
840static u32 msrs_to_save[] = { 877static u32 msrs_to_save[] = {
841 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 878 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
842 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 879 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
843 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 880 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
881 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
844 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 882 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
845 MSR_KVM_PV_EOI_EN, 883 MSR_KVM_PV_EOI_EN,
846 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 884 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
@@ -849,7 +887,7 @@ static u32 msrs_to_save[] = {
849 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 887 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
850#endif 888#endif
851 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, 889 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
852 MSR_IA32_FEATURE_CONTROL 890 MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS
853}; 891};
854 892
855static unsigned num_msrs_to_save; 893static unsigned num_msrs_to_save;
@@ -1275,8 +1313,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1275 kvm->arch.last_tsc_write = data; 1313 kvm->arch.last_tsc_write = data;
1276 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; 1314 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1277 1315
1278 /* Reset of TSC must disable overshoot protection below */
1279 vcpu->arch.hv_clock.tsc_timestamp = 0;
1280 vcpu->arch.last_guest_tsc = data; 1316 vcpu->arch.last_guest_tsc = data;
1281 1317
1282 /* Keep track of which generation this VCPU has synchronized to */ 1318 /* Keep track of which generation this VCPU has synchronized to */
@@ -1484,7 +1520,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1484 unsigned long flags, this_tsc_khz; 1520 unsigned long flags, this_tsc_khz;
1485 struct kvm_vcpu_arch *vcpu = &v->arch; 1521 struct kvm_vcpu_arch *vcpu = &v->arch;
1486 struct kvm_arch *ka = &v->kvm->arch; 1522 struct kvm_arch *ka = &v->kvm->arch;
1487 s64 kernel_ns, max_kernel_ns; 1523 s64 kernel_ns;
1488 u64 tsc_timestamp, host_tsc; 1524 u64 tsc_timestamp, host_tsc;
1489 struct pvclock_vcpu_time_info guest_hv_clock; 1525 struct pvclock_vcpu_time_info guest_hv_clock;
1490 u8 pvclock_flags; 1526 u8 pvclock_flags;
@@ -1543,37 +1579,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1543 if (!vcpu->pv_time_enabled) 1579 if (!vcpu->pv_time_enabled)
1544 return 0; 1580 return 0;
1545 1581
1546 /*
1547 * Time as measured by the TSC may go backwards when resetting the base
1548 * tsc_timestamp. The reason for this is that the TSC resolution is
1549 * higher than the resolution of the other clock scales. Thus, many
1550 * possible measurments of the TSC correspond to one measurement of any
1551 * other clock, and so a spread of values is possible. This is not a
1552 * problem for the computation of the nanosecond clock; with TSC rates
1553 * around 1GHZ, there can only be a few cycles which correspond to one
1554 * nanosecond value, and any path through this code will inevitably
1555 * take longer than that. However, with the kernel_ns value itself,
1556 * the precision may be much lower, down to HZ granularity. If the
1557 * first sampling of TSC against kernel_ns ends in the low part of the
1558 * range, and the second in the high end of the range, we can get:
1559 *
1560 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
1561 *
1562 * As the sampling errors potentially range in the thousands of cycles,
1563 * it is possible such a time value has already been observed by the
1564 * guest. To protect against this, we must compute the system time as
1565 * observed by the guest and ensure the new system time is greater.
1566 */
1567 max_kernel_ns = 0;
1568 if (vcpu->hv_clock.tsc_timestamp) {
1569 max_kernel_ns = vcpu->last_guest_tsc -
1570 vcpu->hv_clock.tsc_timestamp;
1571 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
1572 vcpu->hv_clock.tsc_to_system_mul,
1573 vcpu->hv_clock.tsc_shift);
1574 max_kernel_ns += vcpu->last_kernel_ns;
1575 }
1576
1577 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { 1582 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1578 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, 1583 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1579 &vcpu->hv_clock.tsc_shift, 1584 &vcpu->hv_clock.tsc_shift,
@@ -1581,18 +1586,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1581 vcpu->hw_tsc_khz = this_tsc_khz; 1586 vcpu->hw_tsc_khz = this_tsc_khz;
1582 } 1587 }
1583 1588
1584 /* with a master <monotonic time, tsc value> tuple,
1585 * pvclock clock reads always increase at the (scaled) rate
1586 * of guest TSC - no need to deal with sampling errors.
1587 */
1588 if (!use_master_clock) {
1589 if (max_kernel_ns > kernel_ns)
1590 kernel_ns = max_kernel_ns;
1591 }
1592 /* With all the info we got, fill in the values */ 1589 /* With all the info we got, fill in the values */
1593 vcpu->hv_clock.tsc_timestamp = tsc_timestamp; 1590 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1594 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1591 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1595 vcpu->last_kernel_ns = kernel_ns;
1596 vcpu->last_guest_tsc = tsc_timestamp; 1592 vcpu->last_guest_tsc = tsc_timestamp;
1597 1593
1598 /* 1594 /*
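The deleted clamp existed because kernel_ns and the TSC were sampled independently, so resetting tsc_timestamp could let the guest observe time moving backwards. It is safe to drop now: a kvmclock read is system_time plus a delta scaled from a single TSC sample, so within one (tsc_timestamp, system_time) generation the result is monotonic by construction, and generation changes are either driven by the master clock or rate-limited (see kvmclock_update_work below). For reference, the guest side computes roughly:

	/* Sketch of a guest-side pvclock read, not code from this file. */
	u64 tsc_delta = rdtsc() - hv_clock->tsc_timestamp;
	u64 now_ns = hv_clock->system_time +
		     pvclock_scale_delta(tsc_delta,
					 hv_clock->tsc_to_system_mul,
					 hv_clock->tsc_shift);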
@@ -1634,14 +1630,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1634 * the others. 1630 * the others.
1635 * 1631 *
1636 * So in those cases, request a kvmclock update for all vcpus. 1632 * So in those cases, request a kvmclock update for all vcpus.
1637 * The worst case for a remote vcpu to update its kvmclock 1633 * We need to rate-limit these requests though, as they can
1638 * is then bounded by maximum nohz sleep latency. 1634 * considerably slow guests that have a large number of vcpus.
1635 * The time for a remote vcpu to update its kvmclock is bound
1636 * by the delay we use to rate-limit the updates.
1639 */ 1637 */
1640 1638
1641static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) 1639#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
1640
1641static void kvmclock_update_fn(struct work_struct *work)
1642{ 1642{
1643 int i; 1643 int i;
1644 struct kvm *kvm = v->kvm; 1644 struct delayed_work *dwork = to_delayed_work(work);
1645 struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
1646 kvmclock_update_work);
1647 struct kvm *kvm = container_of(ka, struct kvm, arch);
1645 struct kvm_vcpu *vcpu; 1648 struct kvm_vcpu *vcpu;
1646 1649
1647 kvm_for_each_vcpu(i, vcpu, kvm) { 1650 kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -1650,6 +1653,29 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
1650 } 1653 }
1651} 1654}
1652 1655
1656static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
1657{
1658 struct kvm *kvm = v->kvm;
1659
1660 set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests);
1661 schedule_delayed_work(&kvm->arch.kvmclock_update_work,
1662 KVMCLOCK_UPDATE_DELAY);
1663}
1664
1665#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
1666
1667static void kvmclock_sync_fn(struct work_struct *work)
1668{
1669 struct delayed_work *dwork = to_delayed_work(work);
1670 struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
1671 kvmclock_sync_work);
1672 struct kvm *kvm = container_of(ka, struct kvm, arch);
1673
1674 schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
1675 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
1676 KVMCLOCK_SYNC_PERIOD);
1677}
1678
1653static bool msr_mtrr_valid(unsigned msr) 1679static bool msr_mtrr_valid(unsigned msr)
1654{ 1680{
1655 switch (msr) { 1681 switch (msr) {
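kvm_gen_kvmclock_update() now flags only the triggering vcpu immediately and lets kvmclock_update_work fan the KVM_REQ_CLOCK_UPDATE request out to the other vcpus 100 ms later, so a burst of triggers costs one pass instead of one per trigger; kvmclock_sync_work additionally re-arms itself every 300 s. The self-rearming pattern, reduced to a minimal standalone sketch (my_ctx and its fields are invented names):

	static void sync_fn(struct work_struct *work)
	{
		struct delayed_work *dwork = to_delayed_work(work);
		struct my_ctx *ctx = container_of(dwork, struct my_ctx,
						  sync_work);

		schedule_delayed_work(&ctx->update_work, 0);	  /* run now */
		schedule_delayed_work(&ctx->sync_work, 300 * HZ); /* re-arm  */
	}

Both works are cancelled with cancel_delayed_work_sync() in kvm_arch_sync_events() further down, before the vcpus are torn down.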
@@ -1826,6 +1852,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
1826 switch (msr) { 1852 switch (msr) {
1827 case HV_X64_MSR_GUEST_OS_ID: 1853 case HV_X64_MSR_GUEST_OS_ID:
1828 case HV_X64_MSR_HYPERCALL: 1854 case HV_X64_MSR_HYPERCALL:
1855 case HV_X64_MSR_REFERENCE_TSC:
1856 case HV_X64_MSR_TIME_REF_COUNT:
1829 r = true; 1857 r = true;
1830 break; 1858 break;
1831 } 1859 }
@@ -1865,6 +1893,21 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1865 if (__copy_to_user((void __user *)addr, instructions, 4)) 1893 if (__copy_to_user((void __user *)addr, instructions, 4))
1866 return 1; 1894 return 1;
1867 kvm->arch.hv_hypercall = data; 1895 kvm->arch.hv_hypercall = data;
1896 mark_page_dirty(kvm, gfn);
1897 break;
1898 }
1899 case HV_X64_MSR_REFERENCE_TSC: {
1900 u64 gfn;
1901 HV_REFERENCE_TSC_PAGE tsc_ref;
1902 memset(&tsc_ref, 0, sizeof(tsc_ref));
1903 kvm->arch.hv_tsc_page = data;
1904 if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
1905 break;
1906 gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
1907 if (kvm_write_guest(kvm, data,
1908 &tsc_ref, sizeof(tsc_ref)))
1909 return 1;
1910 mark_page_dirty(kvm, gfn);
1868 break; 1911 break;
1869 } 1912 }
1870 default: 1913 default:
@@ -1879,19 +1922,21 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1879{ 1922{
1880 switch (msr) { 1923 switch (msr) {
1881 case HV_X64_MSR_APIC_ASSIST_PAGE: { 1924 case HV_X64_MSR_APIC_ASSIST_PAGE: {
1925 u64 gfn;
1882 unsigned long addr; 1926 unsigned long addr;
1883 1927
1884 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { 1928 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1885 vcpu->arch.hv_vapic = data; 1929 vcpu->arch.hv_vapic = data;
1886 break; 1930 break;
1887 } 1931 }
1888 addr = gfn_to_hva(vcpu->kvm, data >> 1932 gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
1889 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); 1933 addr = gfn_to_hva(vcpu->kvm, gfn);
1890 if (kvm_is_error_hva(addr)) 1934 if (kvm_is_error_hva(addr))
1891 return 1; 1935 return 1;
1892 if (__clear_user((void __user *)addr, PAGE_SIZE)) 1936 if (__clear_user((void __user *)addr, PAGE_SIZE))
1893 return 1; 1937 return 1;
1894 vcpu->arch.hv_vapic = data; 1938 vcpu->arch.hv_vapic = data;
1939 mark_page_dirty(vcpu->kvm, gfn);
1895 break; 1940 break;
1896 } 1941 }
1897 case HV_X64_MSR_EOI: 1942 case HV_X64_MSR_EOI:
@@ -2017,8 +2062,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2017 case 0x200 ... 0x2ff: 2062 case 0x200 ... 0x2ff:
2018 return set_msr_mtrr(vcpu, msr, data); 2063 return set_msr_mtrr(vcpu, msr, data);
2019 case MSR_IA32_APICBASE: 2064 case MSR_IA32_APICBASE:
2020 kvm_set_apic_base(vcpu, data); 2065 return kvm_set_apic_base(vcpu, msr_info);
2021 break;
2022 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 2066 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2023 return kvm_x2apic_msr_write(vcpu, msr, data); 2067 return kvm_x2apic_msr_write(vcpu, msr, data);
2024 case MSR_IA32_TSCDEADLINE: 2068 case MSR_IA32_TSCDEADLINE:
@@ -2291,6 +2335,14 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2291 case HV_X64_MSR_HYPERCALL: 2335 case HV_X64_MSR_HYPERCALL:
2292 data = kvm->arch.hv_hypercall; 2336 data = kvm->arch.hv_hypercall;
2293 break; 2337 break;
2338 case HV_X64_MSR_TIME_REF_COUNT: {
2339 data =
2340 div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
2341 break;
2342 }
2343 case HV_X64_MSR_REFERENCE_TSC:
2344 data = kvm->arch.hv_tsc_page;
2345 break;
2294 default: 2346 default:
2295 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 2347 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
2296 return 1; 2348 return 1;
@@ -2308,9 +2360,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2308 case HV_X64_MSR_VP_INDEX: { 2360 case HV_X64_MSR_VP_INDEX: {
2309 int r; 2361 int r;
2310 struct kvm_vcpu *v; 2362 struct kvm_vcpu *v;
2311 kvm_for_each_vcpu(r, v, vcpu->kvm) 2363 kvm_for_each_vcpu(r, v, vcpu->kvm) {
2312 if (v == vcpu) 2364 if (v == vcpu) {
2313 data = r; 2365 data = r;
2366 break;
2367 }
2368 }
2314 break; 2369 break;
2315 } 2370 }
2316 case HV_X64_MSR_EOI: 2371 case HV_X64_MSR_EOI:
@@ -2601,6 +2656,8 @@ int kvm_dev_ioctl_check_extension(long ext)
2601 case KVM_CAP_GET_TSC_KHZ: 2656 case KVM_CAP_GET_TSC_KHZ:
2602 case KVM_CAP_KVMCLOCK_CTRL: 2657 case KVM_CAP_KVMCLOCK_CTRL:
2603 case KVM_CAP_READONLY_MEM: 2658 case KVM_CAP_READONLY_MEM:
2659 case KVM_CAP_HYPERV_TIME:
2660 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
2604#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 2661#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2605 case KVM_CAP_ASSIGN_DEV_IRQ: 2662 case KVM_CAP_ASSIGN_DEV_IRQ:
2606 case KVM_CAP_PCI_2_3: 2663 case KVM_CAP_PCI_2_3:
@@ -2972,8 +3029,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2972static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 3029static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2973 struct kvm_debugregs *dbgregs) 3030 struct kvm_debugregs *dbgregs)
2974{ 3031{
3032 unsigned long val;
3033
2975 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 3034 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2976 dbgregs->dr6 = vcpu->arch.dr6; 3035 _kvm_get_dr(vcpu, 6, &val);
3036 dbgregs->dr6 = val;
2977 dbgregs->dr7 = vcpu->arch.dr7; 3037 dbgregs->dr7 = vcpu->arch.dr7;
2978 dbgregs->flags = 0; 3038 dbgregs->flags = 0;
2979 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); 3039 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
@@ -2987,7 +3047,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2987 3047
2988 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 3048 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2989 vcpu->arch.dr6 = dbgregs->dr6; 3049 vcpu->arch.dr6 = dbgregs->dr6;
3050 kvm_update_dr6(vcpu);
2990 vcpu->arch.dr7 = dbgregs->dr7; 3051 vcpu->arch.dr7 = dbgregs->dr7;
3052 kvm_update_dr7(vcpu);
2991 3053
2992 return 0; 3054 return 0;
2993} 3055}
@@ -3022,9 +3084,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
3022 * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility 3084 * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
3023 * with old userspace. 3085 * with old userspace.
3024 */ 3086 */
3025 if (xstate_bv & ~KVM_SUPPORTED_XCR0) 3087 if (xstate_bv & ~kvm_supported_xcr0())
3026 return -EINVAL;
3027 if (xstate_bv & ~host_xcr0)
3028 return -EINVAL; 3088 return -EINVAL;
3029 memcpy(&vcpu->arch.guest_fpu.state->xsave, 3089 memcpy(&vcpu->arch.guest_fpu.state->xsave,
3030 guest_xsave->region, vcpu->arch.guest_xstate_size); 3090 guest_xsave->region, vcpu->arch.guest_xstate_size);
@@ -3877,6 +3937,23 @@ static void kvm_init_msr_list(void)
3877 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { 3937 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
3878 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 3938 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
3879 continue; 3939 continue;
3940
3941 /*
3942 * Even MSRs that are valid in the host may not be exposed
3943 * to the guests in some cases. We could work around this
3944 * in VMX with the generic MSR save/load machinery, but it
3945 * is not really worthwhile since it will really only
3946 * happen with nested virtualization.
3947 */
3948 switch (msrs_to_save[i]) {
3949 case MSR_IA32_BNDCFGS:
3950 if (!kvm_x86_ops->mpx_supported())
3951 continue;
3952 break;
3953 default:
3954 break;
3955 }
3956
3880 if (j < i) 3957 if (j < i)
3881 msrs_to_save[j] = msrs_to_save[i]; 3958 msrs_to_save[j] = msrs_to_save[i];
3882 j++; 3959 j++;
@@ -4373,6 +4450,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4373 if (!exchanged) 4450 if (!exchanged)
4374 return X86EMUL_CMPXCHG_FAILED; 4451 return X86EMUL_CMPXCHG_FAILED;
4375 4452
4453 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
4376 kvm_mmu_pte_write(vcpu, gpa, new, bytes); 4454 kvm_mmu_pte_write(vcpu, gpa, new, bytes);
4377 4455
4378 return X86EMUL_CONTINUE; 4456 return X86EMUL_CONTINUE;
@@ -5344,7 +5422,8 @@ static void kvm_timer_init(void)
5344 int cpu; 5422 int cpu;
5345 5423
5346 max_tsc_khz = tsc_khz; 5424 max_tsc_khz = tsc_khz;
5347 register_hotcpu_notifier(&kvmclock_cpu_notifier_block); 5425
5426 cpu_notifier_register_begin();
5348 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 5427 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5349#ifdef CONFIG_CPU_FREQ 5428#ifdef CONFIG_CPU_FREQ
5350 struct cpufreq_policy policy; 5429 struct cpufreq_policy policy;
@@ -5361,6 +5440,10 @@ static void kvm_timer_init(void)
5361 pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); 5440 pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
5362 for_each_online_cpu(cpu) 5441 for_each_online_cpu(cpu)
5363 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); 5442 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
5443
5444 __register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5445 cpu_notifier_register_done();
5446
5364} 5447}
5365 5448
5366static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 5449static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -5516,9 +5599,10 @@ int kvm_arch_init(void *opaque)
5516 goto out_free_percpu; 5599 goto out_free_percpu;
5517 5600
5518 kvm_set_mmio_spte_mask(); 5601 kvm_set_mmio_spte_mask();
5519 kvm_init_msr_list();
5520 5602
5521 kvm_x86_ops = ops; 5603 kvm_x86_ops = ops;
5604 kvm_init_msr_list();
5605
5522 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 5606 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
5523 PT_DIRTY_MASK, PT64_NX_MASK, 0); 5607 PT_DIRTY_MASK, PT64_NX_MASK, 0);
5524 5608
@@ -5761,8 +5845,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
5761 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 5845 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
5762} 5846}
5763 5847
5764static void inject_pending_event(struct kvm_vcpu *vcpu) 5848static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
5765{ 5849{
5850 int r;
5851
5766 /* try to reinject previous events if any */ 5852 /* try to reinject previous events if any */
5767 if (vcpu->arch.exception.pending) { 5853 if (vcpu->arch.exception.pending) {
5768 trace_kvm_inj_exception(vcpu->arch.exception.nr, 5854 trace_kvm_inj_exception(vcpu->arch.exception.nr,
@@ -5772,17 +5858,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
5772 vcpu->arch.exception.has_error_code, 5858 vcpu->arch.exception.has_error_code,
5773 vcpu->arch.exception.error_code, 5859 vcpu->arch.exception.error_code,
5774 vcpu->arch.exception.reinject); 5860 vcpu->arch.exception.reinject);
5775 return; 5861 return 0;
5776 } 5862 }
5777 5863
5778 if (vcpu->arch.nmi_injected) { 5864 if (vcpu->arch.nmi_injected) {
5779 kvm_x86_ops->set_nmi(vcpu); 5865 kvm_x86_ops->set_nmi(vcpu);
5780 return; 5866 return 0;
5781 } 5867 }
5782 5868
5783 if (vcpu->arch.interrupt.pending) { 5869 if (vcpu->arch.interrupt.pending) {
5784 kvm_x86_ops->set_irq(vcpu); 5870 kvm_x86_ops->set_irq(vcpu);
5785 return; 5871 return 0;
5872 }
5873
5874 if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
5875 r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
5876 if (r != 0)
5877 return r;
5786 } 5878 }
5787 5879
5788 /* try to inject new event if pending */ 5880 /* try to inject new event if pending */
@@ -5799,6 +5891,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
5799 kvm_x86_ops->set_irq(vcpu); 5891 kvm_x86_ops->set_irq(vcpu);
5800 } 5892 }
5801 } 5893 }
5894 return 0;
5802} 5895}
5803 5896
5804static void process_nmi(struct kvm_vcpu *vcpu) 5897static void process_nmi(struct kvm_vcpu *vcpu)
@@ -5834,6 +5927,11 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
5834 kvm_apic_update_tmr(vcpu, tmr); 5927 kvm_apic_update_tmr(vcpu, tmr);
5835} 5928}
5836 5929
5930/*
5931 * Returns 1 to let __vcpu_run() continue the guest execution loop without
5932 * exiting to the userspace. Otherwise, the value will be returned to the
5933 * userspace.
5934 */
5837static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5935static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5838{ 5936{
5839 int r; 5937 int r;
@@ -5898,15 +5996,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5898 goto out; 5996 goto out;
5899 } 5997 }
5900 5998
5901 inject_pending_event(vcpu); 5999 if (inject_pending_event(vcpu, req_int_win) != 0)
5902 6000 req_immediate_exit = true;
5903 /* enable NMI/IRQ window open exits if needed */ 6001 /* enable NMI/IRQ window open exits if needed */
5904 if (vcpu->arch.nmi_pending) 6002 else if (vcpu->arch.nmi_pending)
5905 req_immediate_exit = 6003 kvm_x86_ops->enable_nmi_window(vcpu);
5906 kvm_x86_ops->enable_nmi_window(vcpu) != 0;
5907 else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) 6004 else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
5908 req_immediate_exit = 6005 kvm_x86_ops->enable_irq_window(vcpu);
5909 kvm_x86_ops->enable_irq_window(vcpu) != 0;
5910 6006
5911 if (kvm_lapic_enabled(vcpu)) { 6007 if (kvm_lapic_enabled(vcpu)) {
5912 /* 6008 /*
@@ -5966,12 +6062,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5966 set_debugreg(vcpu->arch.eff_db[1], 1); 6062 set_debugreg(vcpu->arch.eff_db[1], 1);
5967 set_debugreg(vcpu->arch.eff_db[2], 2); 6063 set_debugreg(vcpu->arch.eff_db[2], 2);
5968 set_debugreg(vcpu->arch.eff_db[3], 3); 6064 set_debugreg(vcpu->arch.eff_db[3], 3);
6065 set_debugreg(vcpu->arch.dr6, 6);
5969 } 6066 }
5970 6067
5971 trace_kvm_entry(vcpu->vcpu_id); 6068 trace_kvm_entry(vcpu->vcpu_id);
5972 kvm_x86_ops->run(vcpu); 6069 kvm_x86_ops->run(vcpu);
5973 6070
5974 /* 6071 /*
6072 * Do this here before restoring debug registers on the host. And
6073 * since we do this before handling the vmexit, a DR access vmexit
6074 * can (a) read the correct value of the debug registers, (b) set
6075 * KVM_DEBUGREG_WONT_EXIT again.
6076 */
6077 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
6078 int i;
6079
6080 WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
6081 kvm_x86_ops->sync_dirty_debug_regs(vcpu);
6082 for (i = 0; i < KVM_NR_DB_REGS; i++)
6083 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
6084 }
6085
6086 /*
5975 * If the guest has used debug registers, at least dr7 6087 * If the guest has used debug registers, at least dr7
5976 * will be disabled while returning to the host. 6088 * will be disabled while returning to the host.
5977 * If we don't have active breakpoints in the host, we don't 6089 * If we don't have active breakpoints in the host, we don't
@@ -6089,7 +6201,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
6089 } 6201 }
6090 if (need_resched()) { 6202 if (need_resched()) {
6091 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 6203 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6092 kvm_resched(vcpu); 6204 cond_resched();
6093 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 6205 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6094 } 6206 }
6095 } 6207 }
@@ -6160,7 +6272,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
6160 frag->len -= len; 6272 frag->len -= len;
6161 } 6273 }
6162 6274
6163 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { 6275 if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
6164 vcpu->mmio_needed = 0; 6276 vcpu->mmio_needed = 0;
6165 6277
6166 /* FIXME: return into emulator if single-stepping. */ 6278 /* FIXME: return into emulator if single-stepping. */
@@ -6401,6 +6513,7 @@ EXPORT_SYMBOL_GPL(kvm_task_switch);
6401int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 6513int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
6402 struct kvm_sregs *sregs) 6514 struct kvm_sregs *sregs)
6403{ 6515{
6516 struct msr_data apic_base_msr;
6404 int mmu_reset_needed = 0; 6517 int mmu_reset_needed = 0;
6405 int pending_vec, max_bits, idx; 6518 int pending_vec, max_bits, idx;
6406 struct desc_ptr dt; 6519 struct desc_ptr dt;
@@ -6424,7 +6537,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
6424 6537
6425 mmu_reset_needed |= vcpu->arch.efer != sregs->efer; 6538 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
6426 kvm_x86_ops->set_efer(vcpu, sregs->efer); 6539 kvm_x86_ops->set_efer(vcpu, sregs->efer);
6427 kvm_set_apic_base(vcpu, sregs->apic_base); 6540 apic_base_msr.data = sregs->apic_base;
6541 apic_base_msr.host_initiated = true;
6542 kvm_set_apic_base(vcpu, &apic_base_msr);
6428 6543
6429 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; 6544 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
6430 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 6545 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
@@ -6682,6 +6797,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
6682{ 6797{
6683 int r; 6798 int r;
6684 struct msr_data msr; 6799 struct msr_data msr;
6800 struct kvm *kvm = vcpu->kvm;
6685 6801
6686 r = vcpu_load(vcpu); 6802 r = vcpu_load(vcpu);
6687 if (r) 6803 if (r)
@@ -6692,6 +6808,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
6692 kvm_write_tsc(vcpu, &msr); 6808 kvm_write_tsc(vcpu, &msr);
6693 vcpu_put(vcpu); 6809 vcpu_put(vcpu);
6694 6810
6811 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
6812 KVMCLOCK_SYNC_PERIOD);
6813
6695 return r; 6814 return r;
6696} 6815}
6697 6816
@@ -6717,6 +6836,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
6717 6836
6718 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 6837 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
6719 vcpu->arch.dr6 = DR6_FIXED_1; 6838 vcpu->arch.dr6 = DR6_FIXED_1;
6839 kvm_update_dr6(vcpu);
6720 vcpu->arch.dr7 = DR7_FIXED_1; 6840 vcpu->arch.dr7 = DR7_FIXED_1;
6721 kvm_update_dr7(vcpu); 6841 kvm_update_dr7(vcpu);
6722 6842
@@ -6983,6 +7103,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6983 7103
6984 pvclock_update_vm_gtod_copy(kvm); 7104 pvclock_update_vm_gtod_copy(kvm);
6985 7105
7106 INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
7107 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
7108
6986 return 0; 7109 return 0;
6987} 7110}
6988 7111
@@ -7020,6 +7143,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
7020 7143
7021void kvm_arch_sync_events(struct kvm *kvm) 7144void kvm_arch_sync_events(struct kvm *kvm)
7022{ 7145{
7146 cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
7147 cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
7023 kvm_free_all_assigned_devices(kvm); 7148 kvm_free_all_assigned_devices(kvm);
7024 kvm_free_pit(kvm); 7149 kvm_free_pit(kvm);
7025} 7150}
@@ -7218,6 +7343,9 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
7218 7343
7219int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 7344int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
7220{ 7345{
7346 if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
7347 kvm_x86_ops->check_nested_events(vcpu, false);
7348
7221 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 7349 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
7222 !vcpu->arch.apf.halted) 7350 !vcpu->arch.apf.halted)
7223 || !list_empty_careful(&vcpu->async_pf.done) 7351 || !list_empty_careful(&vcpu->async_pf.done)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 587fb9ede436..8c97bac9a895 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -122,8 +122,13 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
122 gva_t addr, void *val, unsigned int bytes, 122 gva_t addr, void *val, unsigned int bytes,
123 struct x86_exception *exception); 123 struct x86_exception *exception);
124 124
125#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) 125#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
126 | XSTATE_BNDREGS | XSTATE_BNDCSR)
126extern u64 host_xcr0; 127extern u64 host_xcr0;
127 128
129extern u64 kvm_supported_xcr0(void);
130
131extern unsigned int min_timer_period_us;
132
128extern struct static_key kvm_no_apic_vcpu; 133extern struct static_key kvm_no_apic_vcpu;
129#endif 134#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index bdf8532494fe..ad1fb5f53925 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -233,13 +233,13 @@ static void lguest_end_context_switch(struct task_struct *next)
233 * flags word contains all kind of stuff, but in practice Linux only cares 233 * flags word contains all kind of stuff, but in practice Linux only cares
234 * about the interrupt flag. Our "save_flags()" just returns that. 234 * about the interrupt flag. Our "save_flags()" just returns that.
235 */ 235 */
236static unsigned long save_fl(void) 236asmlinkage unsigned long lguest_save_fl(void)
237{ 237{
238 return lguest_data.irq_enabled; 238 return lguest_data.irq_enabled;
239} 239}
240 240
241/* Interrupts go off... */ 241/* Interrupts go off... */
242static void irq_disable(void) 242asmlinkage void lguest_irq_disable(void)
243{ 243{
244 lguest_data.irq_enabled = 0; 244 lguest_data.irq_enabled = 0;
245} 245}
@@ -253,8 +253,8 @@ static void irq_disable(void)
253 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the 253 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
254 * C function, then restores it. 254 * C function, then restores it.
255 */ 255 */
256PV_CALLEE_SAVE_REGS_THUNK(save_fl); 256PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
257PV_CALLEE_SAVE_REGS_THUNK(irq_disable); 257PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
258/*:*/ 258/*:*/
259 259
260/* These are in i386_head.S */ 260/* These are in i386_head.S */
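The renames are not cosmetic: PV_CALLEE_SAVE_REGS_THUNK() emits an assembler thunk that calls the C function by symbol name, so the function must be a visible global; a static function can be dropped or renamed by the compiler, which breaks the asm reference, notably under LTO. The lguest_ prefix keeps the now-global names unambiguous. Simplified shape of what the macro emits (the real PV_SAVE_ALL_CALLER_REGS/PV_RESTORE_ALL_CALLER_REGS sequences live in asm/paravirt.h):

	asm(".pushsection .text;"
	    ".globl __raw_callee_save_lguest_save_fl;"
	    "__raw_callee_save_lguest_save_fl:"
	    "push %ecx; push %edx;"	/* save caller-clobbered regs */
	    "call lguest_save_fl;"	/* reference by name */
	    "pop %edx; pop %ecx;"
	    "ret;"
	    ".popsection");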
@@ -1291,9 +1291,9 @@ __init void lguest_init(void)
1291 */ 1291 */
1292 1292
1293 /* Interrupt-related operations */ 1293 /* Interrupt-related operations */
1294 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1294 pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl);
1295 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); 1295 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
1296 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); 1296 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable);
1297 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); 1297 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
1298 pv_irq_ops.safe_halt = lguest_safe_halt; 1298 pv_irq_ops.safe_halt = lguest_safe_halt;
1299 1299
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 992d63bb154f..eabcb6e6a900 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -24,7 +24,7 @@ lib-$(CONFIG_SMP) += rwlock.o
24lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o 24lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
25lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o 25lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
26 26
27obj-y += msr.o msr-reg.o msr-reg-export.o 27obj-y += msr.o msr-reg.o msr-reg-export.o hash.o
28 28
29ifeq ($(CONFIG_X86_32),y) 29ifeq ($(CONFIG_X86_32),y)
30 obj-y += atomic64_32.o 30 obj-y += atomic64_32.o
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index a30ca15be21c..dee945d55594 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -186,7 +186,7 @@ ENTRY(copy_user_generic_unrolled)
18630: shll $6,%ecx 18630: shll $6,%ecx
187 addl %ecx,%edx 187 addl %ecx,%edx
188 jmp 60f 188 jmp 60f
18940: lea (%rdx,%rcx,8),%rdx 18940: leal (%rdx,%rcx,8),%edx
190 jmp 60f 190 jmp 60f
19150: movl %ecx,%edx 19150: movl %ecx,%edx
19260: jmp copy_user_handle_tail /* ecx is zerorest also */ 19260: jmp copy_user_handle_tail /* ecx is zerorest also */
@@ -236,8 +236,6 @@ ENDPROC(copy_user_generic_unrolled)
236ENTRY(copy_user_generic_string) 236ENTRY(copy_user_generic_string)
237 CFI_STARTPROC 237 CFI_STARTPROC
238 ASM_STAC 238 ASM_STAC
239 andl %edx,%edx
240 jz 4f
241 cmpl $8,%edx 239 cmpl $8,%edx
242 jb 2f /* less than 8 bytes, go to byte copy loop */ 240 jb 2f /* less than 8 bytes, go to byte copy loop */
243 ALIGN_DESTINATION 241 ALIGN_DESTINATION
@@ -249,12 +247,12 @@ ENTRY(copy_user_generic_string)
2492: movl %edx,%ecx 2472: movl %edx,%ecx
2503: rep 2483: rep
251 movsb 249 movsb
2524: xorl %eax,%eax 250 xorl %eax,%eax
253 ASM_CLAC 251 ASM_CLAC
254 ret 252 ret
255 253
256 .section .fixup,"ax" 254 .section .fixup,"ax"
25711: lea (%rdx,%rcx,8),%rcx 25511: leal (%rdx,%rcx,8),%ecx
25812: movl %ecx,%edx /* ecx is zerorest also */ 25612: movl %ecx,%edx /* ecx is zerorest also */
259 jmp copy_user_handle_tail 257 jmp copy_user_handle_tail
260 .previous 258 .previous
@@ -279,12 +277,10 @@ ENDPROC(copy_user_generic_string)
279ENTRY(copy_user_enhanced_fast_string) 277ENTRY(copy_user_enhanced_fast_string)
280 CFI_STARTPROC 278 CFI_STARTPROC
281 ASM_STAC 279 ASM_STAC
282 andl %edx,%edx
283 jz 2f
284 movl %edx,%ecx 280 movl %edx,%ecx
2851: rep 2811: rep
286 movsb 282 movsb
2872: xorl %eax,%eax 283 xorl %eax,%eax
288 ASM_CLAC 284 ASM_CLAC
289 ret 285 ret
290 286
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 7c3bee636e2f..39d6a3db0b96 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -16,7 +16,6 @@
16#include <linux/timex.h> 16#include <linux/timex.h>
17#include <linux/preempt.h> 17#include <linux/preempt.h>
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/init.h>
20 19
21#include <asm/processor.h> 20#include <asm/processor.h>
22#include <asm/delay.h> 21#include <asm/delay.h>
diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c
new file mode 100644
index 000000000000..ff4fa51a5b1f
--- /dev/null
+++ b/arch/x86/lib/hash.c
@@ -0,0 +1,92 @@
1/*
2 * Some portions derived from code covered by the following notice:
3 *
4 * Copyright (c) 2010-2013 Intel Corporation. All rights reserved.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <linux/hash.h>
35#include <linux/init.h>
36
37#include <asm/processor.h>
38#include <asm/cpufeature.h>
39#include <asm/hash.h>
40
41static inline u32 crc32_u32(u32 crc, u32 val)
42{
43#ifdef CONFIG_AS_CRC32
44 asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val));
45#else
46 asm (".byte 0xf2, 0x0f, 0x38, 0xf1, 0xc1" : "+a" (crc) : "c" (val));
47#endif
48 return crc;
49}
50
51static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
52{
53 const u32 *p32 = (const u32 *) data;
54 u32 i, tmp = 0;
55
56 for (i = 0; i < len / 4; i++)
57 seed = crc32_u32(seed, *p32++);
58
59 switch (len & 3) {
60 case 3:
61 tmp |= *((const u8 *) p32 + 2) << 16;
62 /* fallthrough */
63 case 2:
64 tmp |= *((const u8 *) p32 + 1) << 8;
65 /* fallthrough */
66 case 1:
67 tmp |= *((const u8 *) p32);
68 seed = crc32_u32(seed, tmp);
69 break;
70 }
71
72 return seed;
73}
74
75static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
76{
77 const u32 *p32 = (const u32 *) data;
78 u32 i;
79
80 for (i = 0; i < len; i++)
81 seed = crc32_u32(seed, *p32++);
82
83 return seed;
84}
85
86void __init setup_arch_fast_hash(struct fast_hash_ops *ops)
87{
88 if (cpu_has_xmm4_2) {
89 ops->hash = intel_crc4_2_hash;
90 ops->hash2 = intel_crc4_2_hash2;
91 }
92}
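setup_arch_fast_hash() overrides the generic jhash-based ops only when SSE4.2 is present, since crc32l gives a fast non-cryptographic mix; intel_crc4_2_hash() consumes whole words and folds the one to three trailing bytes through the tmp accumulator, while the hash2 variant skips the tail handling for word-sized input. A hypothetical caller, through the generic arch_fast_hash()/arch_fast_hash2() wrappers this series adds alongside:

	u32 h;

	h = arch_fast_hash(key, key_len, 0xdeadbeef);	/* arbitrary byte buffer */
	h = arch_fast_hash2(words, n_words, h);		/* u32-sized elements    */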
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index e78761d6b7f8..a404b4b75533 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -4,7 +4,7 @@
4#undef memcpy 4#undef memcpy
5#undef memset 5#undef memset
6 6
7void *memcpy(void *to, const void *from, size_t n) 7__visible void *memcpy(void *to, const void *from, size_t n)
8{ 8{
9#ifdef CONFIG_X86_USE_3DNOW 9#ifdef CONFIG_X86_USE_3DNOW
10 return __memcpy3d(to, from, n); 10 return __memcpy3d(to, from, n);
@@ -14,13 +14,13 @@ void *memcpy(void *to, const void *from, size_t n)
14} 14}
15EXPORT_SYMBOL(memcpy); 15EXPORT_SYMBOL(memcpy);
16 16
17void *memset(void *s, int c, size_t count) 17__visible void *memset(void *s, int c, size_t count)
18{ 18{
19 return __memset(s, c, count); 19 return __memset(s, c, count);
20} 20}
21EXPORT_SYMBOL(memset); 21EXPORT_SYMBOL(memset);
22 22
23void *memmove(void *dest, const void *src, size_t n) 23__visible void *memmove(void *dest, const void *src, size_t n)
24{ 24{
25 int d0,d1,d2,d3,d4,d5; 25 int d0,d1,d2,d3,d4,d5;
26 char *ret = dest; 26 char *ret = dest;
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 8f8eebdca7d4..db9db446b71a 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -8,7 +8,7 @@ struct msr *msrs_alloc(void)
8 8
9 msrs = alloc_percpu(struct msr); 9 msrs = alloc_percpu(struct msr);
10 if (!msrs) { 10 if (!msrs) {
11 pr_warning("%s: error allocating msrs\n", __func__); 11 pr_warn("%s: error allocating msrs\n", __func__);
12 return NULL; 12 return NULL;
13 } 13 }
14 14
@@ -21,3 +21,90 @@ void msrs_free(struct msr *msrs)
21 free_percpu(msrs); 21 free_percpu(msrs);
22} 22}
23EXPORT_SYMBOL(msrs_free); 23EXPORT_SYMBOL(msrs_free);
24
25/**
26 * msr_read() - Read an MSR with error handling
27 *
28 * @msr: MSR to read
29 * @m: value to read into
30 *
31 * Returns the read data in @m only on success; on failure the
32 * output argument @m is left unchanged.
33 *
34 */
35int msr_read(u32 msr, struct msr *m)
36{
37 int err;
38 u64 val;
39
40 err = rdmsrl_safe(msr, &val);
41 if (!err)
42 m->q = val;
43
44 return err;
45}
46
47/**
48 * msr_write() - Write an MSR with error handling
49 *
50 * @msr: MSR to write
51 * @m: value to write
52 */
53int msr_write(u32 msr, struct msr *m)
54{
55 return wrmsrl_safe(msr, m->q);
56}
57
58static inline int __flip_bit(u32 msr, u8 bit, bool set)
59{
60 struct msr m, m1;
61 int err = -EINVAL;
62
63 if (bit > 63)
64 return err;
65
66 err = msr_read(msr, &m);
67 if (err)
68 return err;
69
70 m1 = m;
71 if (set)
72 m1.q |= BIT_64(bit);
73 else
74 m1.q &= ~BIT_64(bit);
75
76 if (m1.q == m.q)
77 return 0;
78
79 err = msr_write(msr, &m1);
80 if (err)
81 return err;
82
83 return 1;
84}
85
86/**
87 * msr_set_bit - Set @bit in an MSR @msr.
88 *
89 * Retval:
90 * < 0: An error was encountered.
91 * = 0: Bit was already set.
92 * > 0: Hardware accepted the MSR write.
93 */
94int msr_set_bit(u32 msr, u8 bit)
95{
96 return __flip_bit(msr, bit, true);
97}
98
99/**
100 * msr_clear_bit - Clear @bit in an MSR @msr.
101 *
102 * Retval:
103 * < 0: An error was encountered.
104 * = 0: Bit was already cleared.
105 * > 0: Hardware accepted the MSR write.
106 */
107int msr_clear_bit(u32 msr, u8 bit)
108{
109 return __flip_bit(msr, bit, false);
110}
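
A hedged sketch of how a caller is meant to consume the tri-state return of the msr_set_bit()/msr_clear_bit() helpers added above; MSR_EXAMPLE and the bit number are made up for illustration and do not exist in the tree:

/* Hypothetical caller: MSR_EXAMPLE is not a real MSR. */
static void example_enable_feature(void)
{
        int ret = msr_set_bit(MSR_EXAMPLE, 5);

        if (ret < 0)
                pr_err("MSR write failed: %d\n", ret);
        else if (ret == 0)
                pr_debug("bit was already set, nothing written\n");
        else
                pr_debug("hardware accepted the new value\n");
}
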
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 533a85e3a07e..1a2be7c6895d 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -346,8 +346,8 @@ AVXcode: 1
34617: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1) 34617: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
34718: Grp16 (1A) 34718: Grp16 (1A)
34819: 34819:
3491a: 3491a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv
3501b: 3501b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,Gv,Gv
3511c: 3511c:
3521d: 3521d:
3531e: 3531e:
diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c
index 59d353d2c599..a5449089cd9f 100644
--- a/arch/x86/math-emu/errors.c
+++ b/arch/x86/math-emu/errors.c
@@ -330,11 +330,6 @@ asmlinkage void FPU_exception(int n)
330 330
331 RE_ENTRANT_CHECK_OFF; 331 RE_ENTRANT_CHECK_OFF;
332 if ((~control_word & n & CW_Exceptions) || (n == EX_INTERNAL)) { 332 if ((~control_word & n & CW_Exceptions) || (n == EX_INTERNAL)) {
333#ifdef PRINT_MESSAGES
334 /* My message from the sponsor */
335 printk(FPU_VERSION " " __DATE__ " (C) W. Metzenthen.\n");
336#endif /* PRINT_MESSAGES */
337
338 /* Get a name string for error reporting */ 333 /* Get a name string for error reporting */
339 for (i = 0; exception_names[i].type; i++) 334 for (i = 0; exception_names[i].type; i++)
340 if ((exception_names[i].type & n) == 335 if ((exception_names[i].type & n) ==
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0002a3a33081..20621d753d5f 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -30,6 +30,7 @@ struct pg_state {
30 unsigned long start_address; 30 unsigned long start_address;
31 unsigned long current_address; 31 unsigned long current_address;
32 const struct addr_marker *marker; 32 const struct addr_marker *marker;
33 bool to_dmesg;
33}; 34};
34 35
35struct addr_marker { 36struct addr_marker {
@@ -88,10 +89,28 @@ static struct addr_marker address_markers[] = {
88#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) 89#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
89#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) 90#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
90 91
92#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
93({ \
94 if (to_dmesg) \
95 printk(KERN_INFO fmt, ##args); \
96 else \
97 if (m) \
98 seq_printf(m, fmt, ##args); \
99})
100
101#define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \
102({ \
103 if (to_dmesg) \
104 printk(KERN_CONT fmt, ##args); \
105 else \
106 if (m) \
107 seq_printf(m, fmt, ##args); \
108})
109
91/* 110/*
92 * Print a readable form of a pgprot_t to the seq_file 111 * Print a readable form of a pgprot_t to the seq_file
93 */ 112 */
94static void printk_prot(struct seq_file *m, pgprot_t prot, int level) 113static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
95{ 114{
96 pgprotval_t pr = pgprot_val(prot); 115 pgprotval_t pr = pgprot_val(prot);
97 static const char * const level_name[] = 116 static const char * const level_name[] =
@@ -99,47 +118,47 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
99 118
100 if (!pgprot_val(prot)) { 119 if (!pgprot_val(prot)) {
101 /* Not present */ 120 /* Not present */
102 seq_printf(m, " "); 121 pt_dump_cont_printf(m, dmsg, " ");
103 } else { 122 } else {
104 if (pr & _PAGE_USER) 123 if (pr & _PAGE_USER)
105 seq_printf(m, "USR "); 124 pt_dump_cont_printf(m, dmsg, "USR ");
106 else 125 else
107 seq_printf(m, " "); 126 pt_dump_cont_printf(m, dmsg, " ");
108 if (pr & _PAGE_RW) 127 if (pr & _PAGE_RW)
109 seq_printf(m, "RW "); 128 pt_dump_cont_printf(m, dmsg, "RW ");
110 else 129 else
111 seq_printf(m, "ro "); 130 pt_dump_cont_printf(m, dmsg, "ro ");
112 if (pr & _PAGE_PWT) 131 if (pr & _PAGE_PWT)
113 seq_printf(m, "PWT "); 132 pt_dump_cont_printf(m, dmsg, "PWT ");
114 else 133 else
115 seq_printf(m, " "); 134 pt_dump_cont_printf(m, dmsg, " ");
116 if (pr & _PAGE_PCD) 135 if (pr & _PAGE_PCD)
117 seq_printf(m, "PCD "); 136 pt_dump_cont_printf(m, dmsg, "PCD ");
118 else 137 else
119 seq_printf(m, " "); 138 pt_dump_cont_printf(m, dmsg, " ");
120 139
121 /* Bit 9 has a different meaning on level 3 vs 4 */ 140 /* Bit 9 has a different meaning on level 3 vs 4 */
122 if (level <= 3) { 141 if (level <= 3) {
123 if (pr & _PAGE_PSE) 142 if (pr & _PAGE_PSE)
124 seq_printf(m, "PSE "); 143 pt_dump_cont_printf(m, dmsg, "PSE ");
125 else 144 else
126 seq_printf(m, " "); 145 pt_dump_cont_printf(m, dmsg, " ");
127 } else { 146 } else {
128 if (pr & _PAGE_PAT) 147 if (pr & _PAGE_PAT)
129 seq_printf(m, "pat "); 148 pt_dump_cont_printf(m, dmsg, "pat ");
130 else 149 else
131 seq_printf(m, " "); 150 pt_dump_cont_printf(m, dmsg, " ");
132 } 151 }
133 if (pr & _PAGE_GLOBAL) 152 if (pr & _PAGE_GLOBAL)
134 seq_printf(m, "GLB "); 153 pt_dump_cont_printf(m, dmsg, "GLB ");
135 else 154 else
136 seq_printf(m, " "); 155 pt_dump_cont_printf(m, dmsg, " ");
137 if (pr & _PAGE_NX) 156 if (pr & _PAGE_NX)
138 seq_printf(m, "NX "); 157 pt_dump_cont_printf(m, dmsg, "NX ");
139 else 158 else
140 seq_printf(m, "x "); 159 pt_dump_cont_printf(m, dmsg, "x ");
141 } 160 }
142 seq_printf(m, "%s\n", level_name[level]); 161 pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
143} 162}
144 163
145/* 164/*
@@ -178,7 +197,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
178 st->current_prot = new_prot; 197 st->current_prot = new_prot;
179 st->level = level; 198 st->level = level;
180 st->marker = address_markers; 199 st->marker = address_markers;
181 seq_printf(m, "---[ %s ]---\n", st->marker->name); 200 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
201 st->marker->name);
182 } else if (prot != cur || level != st->level || 202 } else if (prot != cur || level != st->level ||
183 st->current_address >= st->marker[1].start_address) { 203 st->current_address >= st->marker[1].start_address) {
184 const char *unit = units; 204 const char *unit = units;
@@ -188,17 +208,17 @@ static void note_page(struct seq_file *m, struct pg_state *st,
188 /* 208 /*
189 * Now print the actual finished series 209 * Now print the actual finished series
190 */ 210 */
191 seq_printf(m, "0x%0*lx-0x%0*lx ", 211 pt_dump_seq_printf(m, st->to_dmesg, "0x%0*lx-0x%0*lx ",
192 width, st->start_address, 212 width, st->start_address,
193 width, st->current_address); 213 width, st->current_address);
194 214
195 delta = (st->current_address - st->start_address) >> 10; 215 delta = (st->current_address - st->start_address) >> 10;
196 while (!(delta & 1023) && unit[1]) { 216 while (!(delta & 1023) && unit[1]) {
197 delta >>= 10; 217 delta >>= 10;
198 unit++; 218 unit++;
199 } 219 }
200 seq_printf(m, "%9lu%c ", delta, *unit); 220 pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", delta, *unit);
201 printk_prot(m, st->current_prot, st->level); 221 printk_prot(m, st->current_prot, st->level, st->to_dmesg);
202 222
203 /* 223 /*
204 * We print markers for special areas of address space, 224 * We print markers for special areas of address space,
@@ -207,7 +227,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
207 */ 227 */
208 if (st->current_address >= st->marker[1].start_address) { 228 if (st->current_address >= st->marker[1].start_address) {
209 st->marker++; 229 st->marker++;
210 seq_printf(m, "---[ %s ]---\n", st->marker->name); 230 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
231 st->marker->name);
211 } 232 }
212 233
213 st->start_address = st->current_address; 234 st->start_address = st->current_address;
@@ -296,7 +317,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
296#define pgd_none(a) pud_none(__pud(pgd_val(a))) 317#define pgd_none(a) pud_none(__pud(pgd_val(a)))
297#endif 318#endif
298 319
299static void walk_pgd_level(struct seq_file *m) 320void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
300{ 321{
301#ifdef CONFIG_X86_64 322#ifdef CONFIG_X86_64
302 pgd_t *start = (pgd_t *) &init_level4_pgt; 323 pgd_t *start = (pgd_t *) &init_level4_pgt;
@@ -304,9 +325,12 @@ static void walk_pgd_level(struct seq_file *m)
304 pgd_t *start = swapper_pg_dir; 325 pgd_t *start = swapper_pg_dir;
305#endif 326#endif
306 int i; 327 int i;
307 struct pg_state st; 328 struct pg_state st = {};
308 329
309 memset(&st, 0, sizeof(st)); 330 if (pgd) {
331 start = pgd;
332 st.to_dmesg = true;
333 }
310 334
311 for (i = 0; i < PTRS_PER_PGD; i++) { 335 for (i = 0; i < PTRS_PER_PGD; i++) {
312 st.current_address = normalize_addr(i * PGD_LEVEL_MULT); 336 st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
@@ -331,7 +355,7 @@ static void walk_pgd_level(struct seq_file *m)
331 355
332static int ptdump_show(struct seq_file *m, void *v) 356static int ptdump_show(struct seq_file *m, void *v)
333{ 357{
334 walk_pgd_level(m); 358 ptdump_walk_pgd_level(m, NULL);
335 return 0; 359 return 0;
336} 360}
337 361
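
The pt_dump_*_printf() macros above let one call site feed either dmesg or a seq_file. A minimal user-space analog of that dual-sink pattern, with stderr standing in for dmesg and an optional FILE * for the seq_file (dual_printf() is illustrative):

#include <stdio.h>

#define dual_printf(fp, to_stderr, fmt, ...)                    \
do {                                                            \
        if (to_stderr)                                          \
                fprintf(stderr, fmt, ##__VA_ARGS__);            \
        else if (fp)                                            \
                fprintf(fp, fmt, ##__VA_ARGS__);                \
} while (0)

int main(void)
{
        dual_printf(stdout, 0, "to the stream: %d\n", 42);
        dual_printf(NULL, 1, "to stderr: %d\n", 42);
        return 0;
}
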
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9d591c895803..8e5722992677 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -584,8 +584,13 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
584 584
585 if (error_code & PF_INSTR) { 585 if (error_code & PF_INSTR) {
586 unsigned int level; 586 unsigned int level;
587 pgd_t *pgd;
588 pte_t *pte;
587 589
588 pte_t *pte = lookup_address(address, &level); 590 pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
591 pgd += pgd_index(address);
592
593 pte = lookup_address_in_pgd(pgd, address, &level);
589 594
590 if (pte && pte_present(*pte) && !pte_exec(*pte)) 595 if (pte && pte_present(*pte) && !pte_exec(*pte))
591 printk(nx_warning, from_kuid(&init_user_ns, current_uid())); 596 printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
@@ -1001,6 +1006,12 @@ static int fault_in_kernel_space(unsigned long address)
1001 1006
1002static inline bool smap_violation(int error_code, struct pt_regs *regs) 1007static inline bool smap_violation(int error_code, struct pt_regs *regs)
1003{ 1008{
1009 if (!IS_ENABLED(CONFIG_X86_SMAP))
1010 return false;
1011
1012 if (!static_cpu_has(X86_FEATURE_SMAP))
1013 return false;
1014
1004 if (error_code & PF_USER) 1015 if (error_code & PF_USER)
1005 return false; 1016 return false;
1006 1017
@@ -1014,13 +1025,17 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
1014 * This routine handles page faults. It determines the address, 1025 * This routine handles page faults. It determines the address,
1015 * and the problem, and then passes it off to one of the appropriate 1026 * and the problem, and then passes it off to one of the appropriate
1016 * routines. 1027 * routines.
1028 *
1029 * This function must have noinline because both callers
1030 * {,trace_}do_page_fault() have notrace on. Having this an actual function
1031 * guarantees there's a function trace entry.
1017 */ 1032 */
1018static void __kprobes 1033static void __kprobes noinline
1019__do_page_fault(struct pt_regs *regs, unsigned long error_code) 1034__do_page_fault(struct pt_regs *regs, unsigned long error_code,
1035 unsigned long address)
1020{ 1036{
1021 struct vm_area_struct *vma; 1037 struct vm_area_struct *vma;
1022 struct task_struct *tsk; 1038 struct task_struct *tsk;
1023 unsigned long address;
1024 struct mm_struct *mm; 1039 struct mm_struct *mm;
1025 int fault; 1040 int fault;
1026 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 1041 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -1028,9 +1043,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
1028 tsk = current; 1043 tsk = current;
1029 mm = tsk->mm; 1044 mm = tsk->mm;
1030 1045
1031 /* Get the faulting address: */
1032 address = read_cr2();
1033
1034 /* 1046 /*
1035 * Detect and handle instructions that would cause a page fault for 1047 * Detect and handle instructions that would cause a page fault for
1036 * both a tracked kernel page and a userspace page. 1048 * both a tracked kernel page and a userspace page.
@@ -1087,11 +1099,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
1087 if (unlikely(error_code & PF_RSVD)) 1099 if (unlikely(error_code & PF_RSVD))
1088 pgtable_bad(regs, error_code, address); 1100 pgtable_bad(regs, error_code, address);
1089 1101
1090 if (static_cpu_has(X86_FEATURE_SMAP)) { 1102 if (unlikely(smap_violation(error_code, regs))) {
1091 if (unlikely(smap_violation(error_code, regs))) { 1103 bad_area_nosemaphore(regs, error_code, address);
1092 bad_area_nosemaphore(regs, error_code, address); 1104 return;
1093 return;
1094 }
1095 } 1105 }
1096 1106
1097 /* 1107 /*
@@ -1244,32 +1254,50 @@ good_area:
1244 up_read(&mm->mmap_sem); 1254 up_read(&mm->mmap_sem);
1245} 1255}
1246 1256
1247dotraplinkage void __kprobes 1257dotraplinkage void __kprobes notrace
1248do_page_fault(struct pt_regs *regs, unsigned long error_code) 1258do_page_fault(struct pt_regs *regs, unsigned long error_code)
1249{ 1259{
1260 unsigned long address = read_cr2(); /* Get the faulting address */
1250 enum ctx_state prev_state; 1261 enum ctx_state prev_state;
1251 1262
1263 /*
1264 * This function must be tagged with __kprobes and notrace, and must
1265 * call read_cr2() before anything else, to avoid invoking any kind
1266 * of tracing machinery before we have observed the CR2 value.
1267 *
1268 * exception_{enter,exit}() contain all sorts of tracepoints.
1269 */
1270
1252 prev_state = exception_enter(); 1271 prev_state = exception_enter();
1253 __do_page_fault(regs, error_code); 1272 __do_page_fault(regs, error_code, address);
1254 exception_exit(prev_state); 1273 exception_exit(prev_state);
1255} 1274}
1256 1275
1257static void trace_page_fault_entries(struct pt_regs *regs, 1276#ifdef CONFIG_TRACING
1277static void trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
1258 unsigned long error_code) 1278 unsigned long error_code)
1259{ 1279{
1260 if (user_mode(regs)) 1280 if (user_mode(regs))
1261 trace_page_fault_user(read_cr2(), regs, error_code); 1281 trace_page_fault_user(address, regs, error_code);
1262 else 1282 else
1263 trace_page_fault_kernel(read_cr2(), regs, error_code); 1283 trace_page_fault_kernel(address, regs, error_code);
1264} 1284}
1265 1285
1266dotraplinkage void __kprobes 1286dotraplinkage void __kprobes notrace
1267trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) 1287trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
1268{ 1288{
1289 /*
1290 * The exception_enter() and tracepoint processing could
1291 * trigger further page faults (e.g. user-space callchain
1292 * reading) and clobber the original cr2 value, so read
1293 * the faulting address now.
1294 */
1295 unsigned long address = read_cr2();
1269 enum ctx_state prev_state; 1296 enum ctx_state prev_state;
1270 1297
1271 prev_state = exception_enter(); 1298 prev_state = exception_enter();
1272 trace_page_fault_entries(regs, error_code); 1299 trace_page_fault_entries(address, regs, error_code);
1273 __do_page_fault(regs, error_code); 1300 __do_page_fault(regs, error_code, address);
1274 exception_exit(prev_state); 1301 exception_exit(prev_state);
1275} 1302}
1303#endif /* CONFIG_TRACING */
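
The ordering rule these fault.c hunks enforce, condensed into a hypothetical sketch: capture the volatile fault state before any call that might itself fault or trace. read_cr2(), trace_entry() and handle_fault() stand in for the real entry points:

void fault_entry(struct pt_regs *regs, unsigned long error_code)
{
        unsigned long address = read_cr2();     /* first, before anything traced */

        trace_entry(regs, error_code);          /* may fault and clobber CR2 */
        handle_fault(regs, error_code, address);/* works on the saved copy */
}
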
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 0596e8e0cc19..207d9aef662d 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -108,8 +108,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
108 108
109static inline void get_head_page_multiple(struct page *page, int nr) 109static inline void get_head_page_multiple(struct page *page, int nr)
110{ 110{
111 VM_BUG_ON(page != compound_head(page)); 111 VM_BUG_ON_PAGE(page != compound_head(page), page);
112 VM_BUG_ON(page_count(page) == 0); 112 VM_BUG_ON_PAGE(page_count(page) == 0, page);
113 atomic_add(nr, &page->_count); 113 atomic_add(nr, &page->_count);
114 SetPageReferenced(page); 114 SetPageReferenced(page);
115} 115}
@@ -135,7 +135,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
135 head = pte_page(pte); 135 head = pte_page(pte);
136 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 136 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
137 do { 137 do {
138 VM_BUG_ON(compound_head(page) != head); 138 VM_BUG_ON_PAGE(compound_head(page) != head, page);
139 pages[*nr] = page; 139 pages[*nr] = page;
140 if (PageTail(page)) 140 if (PageTail(page))
141 get_huge_page_tail(page); 141 get_huge_page_tail(page);
@@ -212,7 +212,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
212 head = pte_page(pte); 212 head = pte_page(pte);
213 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 213 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
214 do { 214 do {
215 VM_BUG_ON(compound_head(page) != head); 215 VM_BUG_ON_PAGE(compound_head(page) != head, page);
216 pages[*nr] = page; 216 pages[*nr] = page;
217 if (PageTail(page)) 217 if (PageTail(page))
218 get_huge_page_tail(page); 218 get_huge_page_tail(page);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 9d980d88b747..8c9f647ff9e1 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -87,9 +87,7 @@ int pmd_huge_support(void)
87} 87}
88#endif 88#endif
89 89
90/* x86_64 also uses this file */ 90#ifdef CONFIG_HUGETLB_PAGE
91
92#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
93static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, 91static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
94 unsigned long addr, unsigned long len, 92 unsigned long addr, unsigned long len,
95 unsigned long pgoff, unsigned long flags) 93 unsigned long pgoff, unsigned long flags)
@@ -99,7 +97,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
99 97
100 info.flags = 0; 98 info.flags = 0;
101 info.length = len; 99 info.length = len;
102 info.low_limit = TASK_UNMAPPED_BASE; 100 info.low_limit = current->mm->mmap_legacy_base;
103 info.high_limit = TASK_SIZE; 101 info.high_limit = TASK_SIZE;
104 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 102 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
105 info.align_offset = 0; 103 info.align_offset = 0;
@@ -172,8 +170,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
172 return hugetlb_get_unmapped_area_topdown(file, addr, len, 170 return hugetlb_get_unmapped_area_topdown(file, addr, len,
173 pgoff, flags); 171 pgoff, flags);
174} 172}
175 173#endif /* CONFIG_HUGETLB_PAGE */
176#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
177 174
178#ifdef CONFIG_X86_64 175#ifdef CONFIG_X86_64
179static __init int setup_hugepagesz(char *opt) 176static __init int setup_hugepagesz(char *opt)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 4287f1ffba7e..e39504878aec 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -665,7 +665,7 @@ void __init initmem_init(void)
665 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 665 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
666#endif 666#endif
667 667
668 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); 668 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
669 sparse_memory_present_with_active_regions(0); 669 sparse_memory_present_with_active_regions(0);
670 670
671#ifdef CONFIG_FLATMEM 671#ifdef CONFIG_FLATMEM
@@ -806,6 +806,9 @@ void __init mem_init(void)
806 BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); 806 BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
807#undef high_memory 807#undef high_memory
808#undef __FIXADDR_TOP 808#undef __FIXADDR_TOP
809#ifdef CONFIG_RANDOMIZE_BASE
810 BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
811#endif
809 812
810#ifdef CONFIG_HIGHMEM 813#ifdef CONFIG_HIGHMEM
811 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); 814 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 104d56a9245f..f35c66c5959a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -643,7 +643,7 @@ kernel_physical_mapping_init(unsigned long start,
643#ifndef CONFIG_NUMA 643#ifndef CONFIG_NUMA
644void __init initmem_init(void) 644void __init initmem_init(void)
645{ 645{
646 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); 646 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
647} 647}
648#endif 648#endif
649 649
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 799580cabc78..597ac155c91c 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -328,17 +328,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
328 return; 328 return;
329} 329}
330 330
331static int __initdata early_ioremap_debug;
332
333static int __init early_ioremap_debug_setup(char *str)
334{
335 early_ioremap_debug = 1;
336
337 return 0;
338}
339early_param("early_ioremap_debug", early_ioremap_debug_setup);
340
341static __initdata int after_paging_init;
342static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; 331static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
343 332
344static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 333static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
@@ -362,18 +351,11 @@ bool __init is_early_ioremap_ptep(pte_t *ptep)
362 return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; 351 return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
363} 352}
364 353
365static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
366
367void __init early_ioremap_init(void) 354void __init early_ioremap_init(void)
368{ 355{
369 pmd_t *pmd; 356 pmd_t *pmd;
370 int i;
371 357
372 if (early_ioremap_debug) 358 early_ioremap_setup();
373 printk(KERN_INFO "early_ioremap_init()\n");
374
375 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
376 slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
377 359
378 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); 360 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
379 memset(bm_pte, 0, sizeof(bm_pte)); 361 memset(bm_pte, 0, sizeof(bm_pte));
@@ -402,13 +384,8 @@ void __init early_ioremap_init(void)
402 } 384 }
403} 385}
404 386
405void __init early_ioremap_reset(void) 387void __init __early_set_fixmap(enum fixed_addresses idx,
406{ 388 phys_addr_t phys, pgprot_t flags)
407 after_paging_init = 1;
408}
409
410static void __init __early_set_fixmap(enum fixed_addresses idx,
411 phys_addr_t phys, pgprot_t flags)
412{ 389{
413 unsigned long addr = __fix_to_virt(idx); 390 unsigned long addr = __fix_to_virt(idx);
414 pte_t *pte; 391 pte_t *pte;
@@ -425,198 +402,3 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
425 pte_clear(&init_mm, addr, pte); 402 pte_clear(&init_mm, addr, pte);
426 __flush_tlb_one(addr); 403 __flush_tlb_one(addr);
427} 404}
428
429static inline void __init early_set_fixmap(enum fixed_addresses idx,
430 phys_addr_t phys, pgprot_t prot)
431{
432 if (after_paging_init)
433 __set_fixmap(idx, phys, prot);
434 else
435 __early_set_fixmap(idx, phys, prot);
436}
437
438static inline void __init early_clear_fixmap(enum fixed_addresses idx)
439{
440 if (after_paging_init)
441 clear_fixmap(idx);
442 else
443 __early_set_fixmap(idx, 0, __pgprot(0));
444}
445
446static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
447static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
448
449void __init fixup_early_ioremap(void)
450{
451 int i;
452
453 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
454 if (prev_map[i]) {
455 WARN_ON(1);
456 break;
457 }
458 }
459
460 early_ioremap_init();
461}
462
463static int __init check_early_ioremap_leak(void)
464{
465 int count = 0;
466 int i;
467
468 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
469 if (prev_map[i])
470 count++;
471
472 if (!count)
473 return 0;
474 WARN(1, KERN_WARNING
475 "Debug warning: early ioremap leak of %d areas detected.\n",
476 count);
477 printk(KERN_WARNING
478 "please boot with early_ioremap_debug and report the dmesg.\n");
479
480 return 1;
481}
482late_initcall(check_early_ioremap_leak);
483
484static void __init __iomem *
485__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
486{
487 unsigned long offset;
488 resource_size_t last_addr;
489 unsigned int nrpages;
490 enum fixed_addresses idx;
491 int i, slot;
492
493 WARN_ON(system_state != SYSTEM_BOOTING);
494
495 slot = -1;
496 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
497 if (!prev_map[i]) {
498 slot = i;
499 break;
500 }
501 }
502
503 if (slot < 0) {
504 printk(KERN_INFO "%s(%08llx, %08lx) not found slot\n",
505 __func__, (u64)phys_addr, size);
506 WARN_ON(1);
507 return NULL;
508 }
509
510 if (early_ioremap_debug) {
511 printk(KERN_INFO "%s(%08llx, %08lx) [%d] => ",
512 __func__, (u64)phys_addr, size, slot);
513 dump_stack();
514 }
515
516 /* Don't allow wraparound or zero size */
517 last_addr = phys_addr + size - 1;
518 if (!size || last_addr < phys_addr) {
519 WARN_ON(1);
520 return NULL;
521 }
522
523 prev_size[slot] = size;
524 /*
525 * Mappings have to be page-aligned
526 */
527 offset = phys_addr & ~PAGE_MASK;
528 phys_addr &= PAGE_MASK;
529 size = PAGE_ALIGN(last_addr + 1) - phys_addr;
530
531 /*
532 * Mappings have to fit in the FIX_BTMAP area.
533 */
534 nrpages = size >> PAGE_SHIFT;
535 if (nrpages > NR_FIX_BTMAPS) {
536 WARN_ON(1);
537 return NULL;
538 }
539
540 /*
541 * Ok, go for it..
542 */
543 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
544 while (nrpages > 0) {
545 early_set_fixmap(idx, phys_addr, prot);
546 phys_addr += PAGE_SIZE;
547 --idx;
548 --nrpages;
549 }
550 if (early_ioremap_debug)
551 printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
552
553 prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
554 return prev_map[slot];
555}
556
557/* Remap an IO device */
558void __init __iomem *
559early_ioremap(resource_size_t phys_addr, unsigned long size)
560{
561 return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
562}
563
564/* Remap memory */
565void __init __iomem *
566early_memremap(resource_size_t phys_addr, unsigned long size)
567{
568 return __early_ioremap(phys_addr, size, PAGE_KERNEL);
569}
570
571void __init early_iounmap(void __iomem *addr, unsigned long size)
572{
573 unsigned long virt_addr;
574 unsigned long offset;
575 unsigned int nrpages;
576 enum fixed_addresses idx;
577 int i, slot;
578
579 slot = -1;
580 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
581 if (prev_map[i] == addr) {
582 slot = i;
583 break;
584 }
585 }
586
587 if (slot < 0) {
588 printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
589 addr, size);
590 WARN_ON(1);
591 return;
592 }
593
594 if (prev_size[slot] != size) {
595 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
596 addr, size, slot, prev_size[slot]);
597 WARN_ON(1);
598 return;
599 }
600
601 if (early_ioremap_debug) {
602 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
603 size, slot);
604 dump_stack();
605 }
606
607 virt_addr = (unsigned long)addr;
608 if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
609 WARN_ON(1);
610 return;
611 }
612 offset = virt_addr & ~PAGE_MASK;
613 nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
614
615 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
616 while (nrpages > 0) {
617 early_clear_fixmap(idx);
618 --idx;
619 --nrpages;
620 }
621 prev_map[slot] = NULL;
622}
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index d87dd6d042d6..dd89a13f1051 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -78,10 +78,16 @@ early_initcall(kmemcheck_init);
78 */ 78 */
79static int __init param_kmemcheck(char *str) 79static int __init param_kmemcheck(char *str)
80{ 80{
81 int val;
82 int ret;
83
81 if (!str) 84 if (!str)
82 return -EINVAL; 85 return -EINVAL;
83 86
84 sscanf(str, "%d", &kmemcheck_enabled); 87 ret = kstrtoint(str, 0, &val);
88 if (ret)
89 return ret;
90 kmemcheck_enabled = val;
85 return 0; 91 return 0;
86} 92}
87 93
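
kstrtoint() fails on trailing junk and overflow, both of which the sscanf("%d") it replaces silently tolerated. A user-space analog of that strict conversion using strtol(); parse_int_strict() is illustrative:

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_int_strict(const char *s, int *out)
{
        char *end;
        long v;

        errno = 0;
        v = strtol(s, &end, 0);         /* base 0, like kstrtoint(str, 0, ...) */
        if (errno || end == s || *end != '\0' || v < INT_MIN || v > INT_MAX)
                return -1;
        *out = (int)v;
        return 0;
}

int main(void)
{
        int val;

        printf("\"42\"  -> %s\n", parse_int_strict("42", &val) ? "rejected" : "ok");
        printf("\"42x\" -> %s\n", parse_int_strict("42x", &val) ? "rejected" : "ok");
        return 0;
}
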
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index e5d5e2ce9f77..637ab34ed632 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -11,7 +11,6 @@
11#include <linux/rculist.h> 11#include <linux/rculist.h>
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/hash.h> 13#include <linux/hash.h>
14#include <linux/init.h>
15#include <linux/module.h> 14#include <linux/module.h>
16#include <linux/kernel.h> 15#include <linux/kernel.h>
17#include <linux/uaccess.h> 16#include <linux/uaccess.h>
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 8dabbed409ee..1e9da795767a 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -74,7 +74,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
74 u64 i; 74 u64 i;
75 phys_addr_t this_start, this_end; 75 phys_addr_t this_start, this_end;
76 76
77 for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) { 77 for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
78 this_start = clamp_t(phys_addr_t, this_start, start, end); 78 this_start = clamp_t(phys_addr_t, this_start, start, end);
79 this_end = clamp_t(phys_addr_t, this_end, start, end); 79 this_end = clamp_t(phys_addr_t, this_end, start, end);
80 if (this_start < this_end) { 80 if (this_start < this_end) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 24aec58d6afd..1d045f9c390f 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -211,9 +211,13 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
211 */ 211 */
212 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); 212 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
213 if (!nd_pa) { 213 if (!nd_pa) {
214 pr_err("Cannot find %zu bytes in node %d\n", 214 nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
215 nd_size, nid); 215 MEMBLOCK_ALLOC_ACCESSIBLE);
216 return; 216 if (!nd_pa) {
217 pr_err("Cannot find %zu bytes in node %d\n",
218 nd_size, nid);
219 return;
220 }
217 } 221 }
218 nd = __va(nd_pa); 222 nd = __va(nd_pa);
219 223
@@ -487,7 +491,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
487 491
488 for (i = 0; i < mi->nr_blks; i++) { 492 for (i = 0; i < mi->nr_blks; i++) {
489 struct numa_memblk *mb = &mi->blk[i]; 493 struct numa_memblk *mb = &mi->blk[i];
490 memblock_set_node(mb->start, mb->end - mb->start, mb->nid); 494 memblock_set_node(mb->start, mb->end - mb->start,
495 &memblock.memory, mb->nid);
491 } 496 }
492 497
493 /* 498 /*
@@ -549,6 +554,41 @@ static void __init numa_init_array(void)
549 } 554 }
550} 555}
551 556
557static void __init numa_clear_kernel_node_hotplug(void)
558{
559 int i, nid;
560 nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
561 unsigned long start, end;
562 struct memblock_type *type = &memblock.reserved;
563
564 /*
565 * At this point, all memory regions reserved by memblock are
566 * used by the kernel. Setting the nid in memblock.reserved
567 * marks out all the nodes the kernel resides in.
568 */
569 for (i = 0; i < numa_meminfo.nr_blks; i++) {
570 struct numa_memblk *mb = &numa_meminfo.blk[i];
571 memblock_set_node(mb->start, mb->end - mb->start,
572 &memblock.reserved, mb->nid);
573 }
574
575 /* Mark all kernel nodes. */
576 for (i = 0; i < type->cnt; i++)
577 node_set(type->regions[i].nid, numa_kernel_nodes);
578
579 /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
580 for (i = 0; i < numa_meminfo.nr_blks; i++) {
581 nid = numa_meminfo.blk[i].nid;
582 if (!node_isset(nid, numa_kernel_nodes))
583 continue;
584
585 start = numa_meminfo.blk[i].start;
586 end = numa_meminfo.blk[i].end;
587
588 memblock_clear_hotplug(start, end - start);
589 }
590}
591
552static int __init numa_init(int (*init_func)(void)) 592static int __init numa_init(int (*init_func)(void))
553{ 593{
554 int i; 594 int i;
@@ -561,7 +601,12 @@ static int __init numa_init(int (*init_func)(void))
561 nodes_clear(node_possible_map); 601 nodes_clear(node_possible_map);
562 nodes_clear(node_online_map); 602 nodes_clear(node_online_map);
563 memset(&numa_meminfo, 0, sizeof(numa_meminfo)); 603 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
564 WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); 604 WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
605 MAX_NUMNODES));
606 WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
607 MAX_NUMNODES));
608 /* In case that parsing SRAT failed. */
609 WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
565 numa_reset_distance(); 610 numa_reset_distance();
566 611
567 ret = init_func(); 612 ret = init_func();
@@ -597,6 +642,16 @@ static int __init numa_init(int (*init_func)(void))
597 numa_clear_node(i); 642 numa_clear_node(i);
598 } 643 }
599 numa_init_array(); 644 numa_init_array();
645
646 /*
647 * Very early during boot the kernel has to use some memory, for
648 * example to load the kernel image. We cannot prevent this anyway,
649 * so any node the kernel resides in must be un-hotpluggable.
650 *
651 * And by the time we get here, numa_init() won't fail.
652 */
653 numa_clear_kernel_node_hotplug();
654
600 return 0; 655 return 0;
601} 656}
602 657
@@ -632,10 +687,6 @@ static int __init dummy_numa_init(void)
632void __init x86_numa_init(void) 687void __init x86_numa_init(void)
633{ 688{
634 if (!numa_off) { 689 if (!numa_off) {
635#ifdef CONFIG_X86_NUMAQ
636 if (!numa_init(numaq_numa_init))
637 return;
638#endif
639#ifdef CONFIG_ACPI_NUMA 690#ifdef CONFIG_ACPI_NUMA
640 if (!numa_init(x86_acpi_numa_init)) 691 if (!numa_init(x86_acpi_numa_init))
641 return; 692 return;
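
A toy model of what numa_clear_kernel_node_hotplug() above does: collect the set of nodes that own reserved (kernel-used) regions into a node mask, then strip the hotplug flag from every region on those nodes. All types and values here are made up:

#include <stdio.h>

struct region { int nid; int hotplug; };

int main(void)
{
        struct region mem[] = { { 0, 1 }, { 1, 1 }, { 2, 1 } };
        const int kernel_nid[] = { 0, 2 };      /* nodes holding reserved regions */
        unsigned long kernel_nodes = 0;         /* stands in for nodemask_t */
        unsigned int i;

        for (i = 0; i < 2; i++)                 /* mark kernel nodes */
                kernel_nodes |= 1UL << kernel_nid[i];

        for (i = 0; i < 3; i++)                 /* clear hotplug on them */
                if (kernel_nodes & (1UL << mem[i].nid))
                        mem[i].hotplug = 0;

        for (i = 0; i < 3; i++)
                printf("node %d hotplug=%d\n", mem[i].nid, mem[i].hotplug);
        return 0;
}
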
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 0342d27ca798..47b6436e41c2 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -52,6 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
52 nid, start, end); 52 nid, start, end);
53 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); 53 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
54 printk(KERN_DEBUG " "); 54 printk(KERN_DEBUG " ");
55 start = round_down(start, PAGES_PER_SECTION);
56 end = round_up(end, PAGES_PER_SECTION);
55 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { 57 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
56 physnode_map[pfn / PAGES_PER_SECTION] = nid; 58 physnode_map[pfn / PAGES_PER_SECTION] = nid;
57 printk(KERN_CONT "%lx ", pfn); 59 printk(KERN_CONT "%lx ", pfn);
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index d0b1773d9d2e..461bc8289024 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -8,7 +8,6 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/random.h> 9#include <linux/random.h>
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/init.h>
12#include <linux/mm.h> 11#include <linux/mm.h>
13 12
14#include <asm/cacheflush.h> 13#include <asm/cacheflush.h>
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index bb32480c2d71..ae242a7c11c7 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -30,6 +30,7 @@
30 */ 30 */
31struct cpa_data { 31struct cpa_data {
32 unsigned long *vaddr; 32 unsigned long *vaddr;
33 pgd_t *pgd;
33 pgprot_t mask_set; 34 pgprot_t mask_set;
34 pgprot_t mask_clr; 35 pgprot_t mask_clr;
35 int numpages; 36 int numpages;
@@ -125,8 +126,8 @@ within(unsigned long addr, unsigned long start, unsigned long end)
125 * @vaddr: virtual start address 126 * @vaddr: virtual start address
126 * @size: number of bytes to flush 127 * @size: number of bytes to flush
127 * 128 *
128 * clflush is an unordered instruction which needs fencing with mfence 129 * clflushopt is an unordered instruction which needs fencing with mfence or
129 * to avoid ordering issues. 130 * sfence to avoid ordering issues.
130 */ 131 */
131void clflush_cache_range(void *vaddr, unsigned int size) 132void clflush_cache_range(void *vaddr, unsigned int size)
132{ 133{
@@ -135,11 +136,11 @@ void clflush_cache_range(void *vaddr, unsigned int size)
135 mb(); 136 mb();
136 137
137 for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) 138 for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
138 clflush(vaddr); 139 clflushopt(vaddr);
139 /* 140 /*
140 * Flush any possible final partial cacheline: 141 * Flush any possible final partial cacheline:
141 */ 142 */
142 clflush(vend); 143 clflushopt(vend);
143 144
144 mb(); 145 mb();
145} 146}
@@ -323,16 +324,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
323} 324}
324 325
325/* 326/*
326 * Lookup the page table entry for a virtual address. Return a pointer 327 * Lookup the page table entry for a virtual address in a specific pgd.
327 * to the entry and the level of the mapping. 328 * Return a pointer to the entry and the level of the mapping.
328 *
329 * Note: We return pud and pmd either when the entry is marked large
330 * or when the present bit is not set. Otherwise we would return a
331 * pointer to a nonexisting mapping.
332 */ 329 */
333pte_t *lookup_address(unsigned long address, unsigned int *level) 330pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
331 unsigned int *level)
334{ 332{
335 pgd_t *pgd = pgd_offset_k(address);
336 pud_t *pud; 333 pud_t *pud;
337 pmd_t *pmd; 334 pmd_t *pmd;
338 335
@@ -361,8 +358,31 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
361 358
362 return pte_offset_kernel(pmd, address); 359 return pte_offset_kernel(pmd, address);
363} 360}
361
362/*
363 * Lookup the page table entry for a virtual address. Return a pointer
364 * to the entry and the level of the mapping.
365 *
366 * Note: We return pud and pmd either when the entry is marked large
367 * or when the present bit is not set. Otherwise we would return a
368 * pointer to a nonexisting mapping.
369 */
370pte_t *lookup_address(unsigned long address, unsigned int *level)
371{
372 return lookup_address_in_pgd(pgd_offset_k(address), address, level);
373}
364EXPORT_SYMBOL_GPL(lookup_address); 374EXPORT_SYMBOL_GPL(lookup_address);
365 375
376static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
377 unsigned int *level)
378{
379 if (cpa->pgd)
380 return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
381 address, level);
382
383 return lookup_address(address, level);
384}
385
366/* 386/*
367 * This is necessary because __pa() does not work on some 387 * This is necessary because __pa() does not work on some
368 * kinds of memory, like vmalloc() or the alloc_remap() 388 * kinds of memory, like vmalloc() or the alloc_remap()
@@ -437,7 +457,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
437 * Check for races, another CPU might have split this page 457 * Check for races, another CPU might have split this page
438 * up already: 458 * up already:
439 */ 459 */
440 tmp = lookup_address(address, &level); 460 tmp = _lookup_address_cpa(cpa, address, &level);
441 if (tmp != kpte) 461 if (tmp != kpte)
442 goto out_unlock; 462 goto out_unlock;
443 463
@@ -543,7 +563,8 @@ out_unlock:
543} 563}
544 564
545static int 565static int
546__split_large_page(pte_t *kpte, unsigned long address, struct page *base) 566__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
567 struct page *base)
547{ 568{
548 pte_t *pbase = (pte_t *)page_address(base); 569 pte_t *pbase = (pte_t *)page_address(base);
549 unsigned long pfn, pfninc = 1; 570 unsigned long pfn, pfninc = 1;
@@ -556,7 +577,7 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)
556 * Check for races, another CPU might have split this page 577 * Check for races, another CPU might have split this page
557 * up for us already: 578 * up for us already:
558 */ 579 */
559 tmp = lookup_address(address, &level); 580 tmp = _lookup_address_cpa(cpa, address, &level);
560 if (tmp != kpte) { 581 if (tmp != kpte) {
561 spin_unlock(&pgd_lock); 582 spin_unlock(&pgd_lock);
562 return 1; 583 return 1;
@@ -632,7 +653,8 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)
632 return 0; 653 return 0;
633} 654}
634 655
635static int split_large_page(pte_t *kpte, unsigned long address) 656static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
657 unsigned long address)
636{ 658{
637 struct page *base; 659 struct page *base;
638 660
@@ -644,15 +666,402 @@ static int split_large_page(pte_t *kpte, unsigned long address)
644 if (!base) 666 if (!base)
645 return -ENOMEM; 667 return -ENOMEM;
646 668
647 if (__split_large_page(kpte, address, base)) 669 if (__split_large_page(cpa, kpte, address, base))
648 __free_page(base); 670 __free_page(base);
649 671
650 return 0; 672 return 0;
651} 673}
652 674
675static bool try_to_free_pte_page(pte_t *pte)
676{
677 int i;
678
679 for (i = 0; i < PTRS_PER_PTE; i++)
680 if (!pte_none(pte[i]))
681 return false;
682
683 free_page((unsigned long)pte);
684 return true;
685}
686
687static bool try_to_free_pmd_page(pmd_t *pmd)
688{
689 int i;
690
691 for (i = 0; i < PTRS_PER_PMD; i++)
692 if (!pmd_none(pmd[i]))
693 return false;
694
695 free_page((unsigned long)pmd);
696 return true;
697}
698
699static bool try_to_free_pud_page(pud_t *pud)
700{
701 int i;
702
703 for (i = 0; i < PTRS_PER_PUD; i++)
704 if (!pud_none(pud[i]))
705 return false;
706
707 free_page((unsigned long)pud);
708 return true;
709}
710
711static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
712{
713 pte_t *pte = pte_offset_kernel(pmd, start);
714
715 while (start < end) {
716 set_pte(pte, __pte(0));
717
718 start += PAGE_SIZE;
719 pte++;
720 }
721
722 if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
723 pmd_clear(pmd);
724 return true;
725 }
726 return false;
727}
728
729static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
730 unsigned long start, unsigned long end)
731{
732 if (unmap_pte_range(pmd, start, end))
733 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
734 pud_clear(pud);
735}
736
737static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
738{
739 pmd_t *pmd = pmd_offset(pud, start);
740
741 /*
742 * Not on a 2MB page boundary?
743 */
744 if (start & (PMD_SIZE - 1)) {
745 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
746 unsigned long pre_end = min_t(unsigned long, end, next_page);
747
748 __unmap_pmd_range(pud, pmd, start, pre_end);
749
750 start = pre_end;
751 pmd++;
752 }
753
754 /*
755 * Try to unmap in 2M chunks.
756 */
757 while (end - start >= PMD_SIZE) {
758 if (pmd_large(*pmd))
759 pmd_clear(pmd);
760 else
761 __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
762
763 start += PMD_SIZE;
764 pmd++;
765 }
766
767 /*
768 * 4K leftovers?
769 */
770 if (start < end)
771 return __unmap_pmd_range(pud, pmd, start, end);
772
773 /*
774 * Try again to free the PMD page if we haven't succeeded above.
775 */
776 if (!pud_none(*pud))
777 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
778 pud_clear(pud);
779}
780
781static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
782{
783 pud_t *pud = pud_offset(pgd, start);
784
785 /*
786 * Not on a GB page boundary?
787 */
788 if (start & (PUD_SIZE - 1)) {
789 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
790 unsigned long pre_end = min_t(unsigned long, end, next_page);
791
792 unmap_pmd_range(pud, start, pre_end);
793
794 start = pre_end;
795 pud++;
796 }
797
798 /*
799 * Try to unmap in 1G chunks.
800 */
801 while (end - start >= PUD_SIZE) {
802
803 if (pud_large(*pud))
804 pud_clear(pud);
805 else
806 unmap_pmd_range(pud, start, start + PUD_SIZE);
807
808 start += PUD_SIZE;
809 pud++;
810 }
811
812 /*
813 * 2M leftovers?
814 */
815 if (start < end)
816 unmap_pmd_range(pud, start, end);
817
818 /*
819 * No need to try to free the PUD page because we'll free it in
820 * populate_pgd's error path
821 */
822}
823
824static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
825{
826 pgd_t *pgd_entry = root + pgd_index(addr);
827
828 unmap_pud_range(pgd_entry, addr, end);
829
830 if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
831 pgd_clear(pgd_entry);
832}
833
834static int alloc_pte_page(pmd_t *pmd)
835{
836 pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
837 if (!pte)
838 return -1;
839
840 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
841 return 0;
842}
843
844static int alloc_pmd_page(pud_t *pud)
845{
846 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
847 if (!pmd)
848 return -1;
849
850 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
851 return 0;
852}
853
854static void populate_pte(struct cpa_data *cpa,
855 unsigned long start, unsigned long end,
856 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
857{
858 pte_t *pte;
859
860 pte = pte_offset_kernel(pmd, start);
861
862 while (num_pages-- && start < end) {
863
864 /* deal with the NX bit */
865 if (!(pgprot_val(pgprot) & _PAGE_NX))
866 cpa->pfn &= ~_PAGE_NX;
867
868 set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
869
870 start += PAGE_SIZE;
871 cpa->pfn += PAGE_SIZE;
872 pte++;
873 }
874}
875
876static int populate_pmd(struct cpa_data *cpa,
877 unsigned long start, unsigned long end,
878 unsigned num_pages, pud_t *pud, pgprot_t pgprot)
879{
880 unsigned int cur_pages = 0;
881 pmd_t *pmd;
882
883 /*
884 * Not on a 2M boundary?
885 */
886 if (start & (PMD_SIZE - 1)) {
887 unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
888 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
889
890 pre_end = min_t(unsigned long, pre_end, next_page);
891 cur_pages = (pre_end - start) >> PAGE_SHIFT;
892 cur_pages = min_t(unsigned int, num_pages, cur_pages);
893
894 /*
895 * Need a PTE page?
896 */
897 pmd = pmd_offset(pud, start);
898 if (pmd_none(*pmd))
899 if (alloc_pte_page(pmd))
900 return -1;
901
902 populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
903
904 start = pre_end;
905 }
906
907 /*
908 * We mapped them all?
909 */
910 if (num_pages == cur_pages)
911 return cur_pages;
912
913 while (end - start >= PMD_SIZE) {
914
915 /*
916 * We cannot use a 1G page so allocate a PMD page if needed.
917 */
918 if (pud_none(*pud))
919 if (alloc_pmd_page(pud))
920 return -1;
921
922 pmd = pmd_offset(pud, start);
923
924 set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
925
926 start += PMD_SIZE;
927 cpa->pfn += PMD_SIZE;
928 cur_pages += PMD_SIZE >> PAGE_SHIFT;
929 }
930
931 /*
932 * Map trailing 4K pages.
933 */
934 if (start < end) {
935 pmd = pmd_offset(pud, start);
936 if (pmd_none(*pmd))
937 if (alloc_pte_page(pmd))
938 return -1;
939
940 populate_pte(cpa, start, end, num_pages - cur_pages,
941 pmd, pgprot);
942 }
943 return num_pages;
944}
945
946static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
947 pgprot_t pgprot)
948{
949 pud_t *pud;
950 unsigned long end;
951 int cur_pages = 0;
952
953 end = start + (cpa->numpages << PAGE_SHIFT);
954
955 /*
956 * Not on a GB page boundary? => map everything up to it with
957 * smaller pages.
958 */
959 if (start & (PUD_SIZE - 1)) {
960 unsigned long pre_end;
961 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
962
963 pre_end = min_t(unsigned long, end, next_page);
964 cur_pages = (pre_end - start) >> PAGE_SHIFT;
965 cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
966
967 pud = pud_offset(pgd, start);
968
969 /*
970 * Need a PMD page?
971 */
972 if (pud_none(*pud))
973 if (alloc_pmd_page(pud))
974 return -1;
975
976 cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
977 pud, pgprot);
978 if (cur_pages < 0)
979 return cur_pages;
980
981 start = pre_end;
982 }
983
984 /* We mapped them all? */
985 if (cpa->numpages == cur_pages)
986 return cur_pages;
987
988 pud = pud_offset(pgd, start);
989
990 /*
991 * Map everything starting from the GB boundary, possibly with 1G pages
992 */
993 while (end - start >= PUD_SIZE) {
994 set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
995
996 start += PUD_SIZE;
997 cpa->pfn += PUD_SIZE;
998 cur_pages += PUD_SIZE >> PAGE_SHIFT;
999 pud++;
1000 }
1001
1002 /* Map trailing leftover */
1003 if (start < end) {
1004 int tmp;
1005
1006 pud = pud_offset(pgd, start);
1007 if (pud_none(*pud))
1008 if (alloc_pmd_page(pud))
1009 return -1;
1010
1011 tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
1012 pud, pgprot);
1013 if (tmp < 0)
1014 return cur_pages;
1015
1016 cur_pages += tmp;
1017 }
1018 return cur_pages;
1019}
1020
1021/*
1022 * Restrictions for the kernel page table do not necessarily apply when
1023 * mapping in an alternate PGD.
1024 */
1025static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1026{
1027 pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1028 pud_t *pud = NULL; /* shut up gcc */
1029 pgd_t *pgd_entry;
1030 int ret;
1031
1032 pgd_entry = cpa->pgd + pgd_index(addr);
1033
1034 /*
1035 * Allocate a PUD page and hand it down for mapping.
1036 */
1037 if (pgd_none(*pgd_entry)) {
1038 pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
1039 if (!pud)
1040 return -1;
1041
1042 set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
1043 }
1044
1045 pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1046 pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
1047
1048 ret = populate_pud(cpa, addr, pgd_entry, pgprot);
1049 if (ret < 0) {
1050 unmap_pgd_range(cpa->pgd, addr,
1051 addr + (cpa->numpages << PAGE_SHIFT));
1052 return ret;
1053 }
1054
1055 cpa->numpages = ret;
1056 return 0;
1057}
1058
653static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, 1059static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
654 int primary) 1060 int primary)
655{ 1061{
1062 if (cpa->pgd)
1063 return populate_pgd(cpa, vaddr);
1064
656 /* 1065 /*
657 * Ignore all non primary paths. 1066 * Ignore all non primary paths.
658 */ 1067 */
@@ -697,7 +1106,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
697 else 1106 else
698 address = *cpa->vaddr; 1107 address = *cpa->vaddr;
699repeat: 1108repeat:
700 kpte = lookup_address(address, &level); 1109 kpte = _lookup_address_cpa(cpa, address, &level);
701 if (!kpte) 1110 if (!kpte)
702 return __cpa_process_fault(cpa, address, primary); 1111 return __cpa_process_fault(cpa, address, primary);
703 1112
@@ -761,7 +1170,7 @@ repeat:
761 /* 1170 /*
762 * We have to split the large page: 1171 * We have to split the large page:
763 */ 1172 */
764 err = split_large_page(kpte, address); 1173 err = split_large_page(cpa, kpte, address);
765 if (!err) { 1174 if (!err) {
766 /* 1175 /*
767 * Do a global flush tlb after splitting the large page 1176 * Do a global flush tlb after splitting the large page
@@ -910,6 +1319,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
910 int ret, cache, checkalias; 1319 int ret, cache, checkalias;
911 unsigned long baddr = 0; 1320 unsigned long baddr = 0;
912 1321
1322 memset(&cpa, 0, sizeof(cpa));
1323
913 /* 1324 /*
914 * Check, if we are requested to change a not supported 1325 * Check, if we are requested to change a not supported
915 * feature: 1326 * feature:
@@ -982,10 +1393,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
982 cache = cache_attr(mask_set); 1393 cache = cache_attr(mask_set);
983 1394
984 /* 1395 /*
985 * On success we use clflush, when the CPU supports it to 1396 * On success we use CLFLUSH, when the CPU supports it to
986 * avoid the wbindv. If the CPU does not support it and in the 1397 * avoid the WBINVD. If the CPU does not support it and in the
987 * error case we fall back to cpa_flush_all (which uses 1398 * error case we fall back to cpa_flush_all (which uses
988 * wbindv): 1399 * WBINVD):
989 */ 1400 */
990 if (!ret && cpu_has_clflush) { 1401 if (!ret && cpu_has_clflush) {
991 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { 1402 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
@@ -1356,6 +1767,7 @@ static int __set_pages_p(struct page *page, int numpages)
1356{ 1767{
1357 unsigned long tempaddr = (unsigned long) page_address(page); 1768 unsigned long tempaddr = (unsigned long) page_address(page);
1358 struct cpa_data cpa = { .vaddr = &tempaddr, 1769 struct cpa_data cpa = { .vaddr = &tempaddr,
1770 .pgd = NULL,
1359 .numpages = numpages, 1771 .numpages = numpages,
1360 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1772 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1361 .mask_clr = __pgprot(0), 1773 .mask_clr = __pgprot(0),
@@ -1374,6 +1786,7 @@ static int __set_pages_np(struct page *page, int numpages)
1374{ 1786{
1375 unsigned long tempaddr = (unsigned long) page_address(page); 1787 unsigned long tempaddr = (unsigned long) page_address(page);
1376 struct cpa_data cpa = { .vaddr = &tempaddr, 1788 struct cpa_data cpa = { .vaddr = &tempaddr,
1789 .pgd = NULL,
1377 .numpages = numpages, 1790 .numpages = numpages,
1378 .mask_set = __pgprot(0), 1791 .mask_set = __pgprot(0),
1379 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1792 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
@@ -1434,6 +1847,42 @@ bool kernel_page_present(struct page *page)
1434 1847
1435#endif /* CONFIG_DEBUG_PAGEALLOC */ 1848#endif /* CONFIG_DEBUG_PAGEALLOC */
1436 1849
1850int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
1851 unsigned numpages, unsigned long page_flags)
1852{
1853 int retval = -EINVAL;
1854
1855 struct cpa_data cpa = {
1856 .vaddr = &address,
1857 .pfn = pfn,
1858 .pgd = pgd,
1859 .numpages = numpages,
1860 .mask_set = __pgprot(0),
1861 .mask_clr = __pgprot(0),
1862 .flags = 0,
1863 };
1864
1865 if (!(__supported_pte_mask & _PAGE_NX))
1866 goto out;
1867
1868 if (!(page_flags & _PAGE_NX))
1869 cpa.mask_clr = __pgprot(_PAGE_NX);
1870
1871 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
1872
1873 retval = __change_page_attr_set_clr(&cpa, 0);
1874 __flush_tlb_all();
1875
1876out:
1877 return retval;
1878}
1879
1880void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
1881 unsigned numpages)
1882{
1883 unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
1884}
1885
1437/* 1886/*
1438 * The testcases use internal knowledge of the implementation that shouldn't 1887 * The testcases use internal knowledge of the implementation that shouldn't
1439 * be exposed to the rest of the kernel. Include these directly here. 1888 * be exposed to the rest of the kernel. Include these directly here.
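
populate_pmd(), unmap_pmd_range() and their PUD-level counterparts above all share one shape: peel off the unaligned head with smaller pages, process full-size chunks, then handle the tail. A generic sketch of that head/bulk/tail pattern, with CHUNK standing in for PMD_SIZE:

#include <stdio.h>

#define CHUNK 8UL       /* stands in for PMD_SIZE */

static void walk_range(unsigned long start, unsigned long end)
{
        if (start & (CHUNK - 1)) {              /* unaligned head */
                unsigned long next = (start + CHUNK) & ~(CHUNK - 1);
                unsigned long pre_end = next < end ? next : end;

                printf("small: [%lu, %lu)\n", start, pre_end);
                start = pre_end;
        }
        while (end - start >= CHUNK) {          /* aligned bulk */
                printf("big:   [%lu, %lu)\n", start, start + CHUNK);
                start += CHUNK;
        }
        if (start < end)                        /* tail */
                printf("small: [%lu, %lu)\n", start, end);
}

int main(void)
{
        walk_range(3, 30);
        return 0;
}
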
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index a69bcb8c7621..4dd8cf652579 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -127,7 +127,7 @@ static int __init parse_reservetop(char *arg)
127 127
128 address = memparse(arg, &arg); 128 address = memparse(arg, &arg);
129 reserve_top_address(address); 129 reserve_top_address(address);
130 fixup_early_ioremap(); 130 early_ioremap_init();
131 return 0; 131 return 0;
132} 132}
133early_param("reservetop", parse_reservetop); 133early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 266ca912f62e..66338a60aa6e 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -42,15 +42,31 @@ static __init inline int srat_disabled(void)
42 return acpi_numa < 0; 42 return acpi_numa < 0;
43} 43}
44 44
45/* Callback for SLIT parsing */ 45/*
46 * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for
47 * I/O localities since SRAT does not list them. I/O localities are
48 * not supported at this point.
49 */
46void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 50void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
47{ 51{
48 int i, j; 52 int i, j;
49 53
50 for (i = 0; i < slit->locality_count; i++) 54 for (i = 0; i < slit->locality_count; i++) {
51 for (j = 0; j < slit->locality_count; j++) 55 const int from_node = pxm_to_node(i);
52 numa_set_distance(pxm_to_node(i), pxm_to_node(j), 56
57 if (from_node == NUMA_NO_NODE)
58 continue;
59
60 for (j = 0; j < slit->locality_count; j++) {
61 const int to_node = pxm_to_node(j);
62
63 if (to_node == NUMA_NO_NODE)
64 continue;
65
66 numa_set_distance(from_node, to_node,
53 slit->entry[slit->locality_count * i + j]); 67 slit->entry[slit->locality_count * i + j]);
68 }
69 }
54} 70}
55 71
56/* Callback for Proximity Domain -> x2APIC mapping */ 72/* Callback for Proximity Domain -> x2APIC mapping */
@@ -181,6 +197,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
181 (unsigned long long) start, (unsigned long long) end - 1, 197 (unsigned long long) start, (unsigned long long) end - 1,
182 hotpluggable ? " hotplug" : ""); 198 hotpluggable ? " hotplug" : "");
183 199
200 /* Mark hotplug range in memblock. */
201 if (hotpluggable && memblock_mark_hotplug(start, ma->length))
202 pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
203 (unsigned long long)start, (unsigned long long)end - 1);
204
184 return 0; 205 return 0;
185out_err_bad_srat: 206out_err_bad_srat:
186 bad_srat(); 207 bad_srat();
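Context for the loop above: the SLIT carries one byte per (from, to) locality pair in a flat row-major array, and pxm_to_node() returns NUMA_NO_NODE for I/O-only proximity domains, which must not be fed into numa_set_distance(). The lookup the loop performs is equivalent to this sketch (helper name illustrative):

    /* Sketch: row-major lookup into the ACPI SLIT distance matrix. */
    static u8 slit_distance(struct acpi_table_slit *slit, int i, int j)
    {
            return slit->entry[slit->locality_count * i + j];
    }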
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ae699b3bbac8..dd8dda167a24 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
103 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) 103 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
104 return; 104 return;
105 105
106 count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 106 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
107 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { 107 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
108 if (f->flush_end == TLB_FLUSH_ALL) 108 if (f->flush_end == TLB_FLUSH_ALL)
109 local_flush_tlb(); 109 local_flush_tlb();
@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
131 info.flush_start = start; 131 info.flush_start = start;
132 info.flush_end = end; 132 info.flush_end = end;
133 133
134 count_vm_event(NR_TLB_REMOTE_FLUSH); 134 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
135 if (is_uv_system()) { 135 if (is_uv_system()) {
136 unsigned int cpu; 136 unsigned int cpu;
137 137
@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
151 151
152 preempt_disable(); 152 preempt_disable();
153 153
154 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); 154 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
155 local_flush_tlb(); 155 local_flush_tlb();
156 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 156 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
157 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); 157 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
158 preempt_enable(); 158 preempt_enable();
159} 159}
160 160
161/*
162 * It can find out the THP large page, or
163 * HUGETLB page in tlb_flush when THP disabled
164 */
165static inline unsigned long has_large_page(struct mm_struct *mm,
166 unsigned long start, unsigned long end)
167{
168 pgd_t *pgd;
169 pud_t *pud;
170 pmd_t *pmd;
171 unsigned long addr = ALIGN(start, HPAGE_SIZE);
172 for (; addr < end; addr += HPAGE_SIZE) {
173 pgd = pgd_offset(mm, addr);
174 if (likely(!pgd_none(*pgd))) {
175 pud = pud_offset(pgd, addr);
176 if (likely(!pud_none(*pud))) {
177 pmd = pmd_offset(pud, addr);
178 if (likely(!pmd_none(*pmd)))
179 if (pmd_large(*pmd))
180 return addr;
181 }
182 }
183 }
184 return 0;
185}
186
187void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, 161void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
188 unsigned long end, unsigned long vmflag) 162 unsigned long end, unsigned long vmflag)
189{ 163{
190 unsigned long addr; 164 unsigned long addr;
191 unsigned act_entries, tlb_entries = 0; 165 unsigned act_entries, tlb_entries = 0;
166 unsigned long nr_base_pages;
192 167
193 preempt_disable(); 168 preempt_disable();
194 if (current->active_mm != mm) 169 if (current->active_mm != mm)
@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
210 tlb_entries = tlb_lli_4k[ENTRIES]; 185 tlb_entries = tlb_lli_4k[ENTRIES];
211 else 186 else
212 tlb_entries = tlb_lld_4k[ENTRIES]; 187 tlb_entries = tlb_lld_4k[ENTRIES];
188
 213 /* Assume all TLB entries were occupied by this task */ 189 /* Assume all TLB entries were occupied by this task */
214 act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm; 190 act_entries = tlb_entries >> tlb_flushall_shift;
191 act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
192 nr_base_pages = (end - start) >> PAGE_SHIFT;
215 193
216 /* tlb_flushall_shift is on balance point, details in commit log */ 194 /* tlb_flushall_shift is on balance point, details in commit log */
217 if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { 195 if (nr_base_pages > act_entries) {
218 count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); 196 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
219 local_flush_tlb(); 197 local_flush_tlb();
220 } else { 198 } else {
221 if (has_large_page(mm, start, end)) {
222 local_flush_tlb();
223 goto flush_all;
224 }
225 /* flush range by one by one 'invlpg' */ 199 /* flush range by one by one 'invlpg' */
226 for (addr = start; addr < end; addr += PAGE_SIZE) { 200 for (addr = start; addr < end; addr += PAGE_SIZE) {
227 count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); 201 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
228 __flush_tlb_single(addr); 202 __flush_tlb_single(addr);
229 } 203 }
230 204
@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
262 236
263static void do_flush_tlb_all(void *info) 237static void do_flush_tlb_all(void *info)
264{ 238{
265 count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 239 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
266 __flush_tlb_all(); 240 __flush_tlb_all();
267 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) 241 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
268 leave_mm(smp_processor_id()); 242 leave_mm(smp_processor_id());
@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
270 244
271void flush_tlb_all(void) 245void flush_tlb_all(void)
272{ 246{
273 count_vm_event(NR_TLB_REMOTE_FLUSH); 247 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
274 on_each_cpu(do_flush_tlb_all, NULL, 1); 248 on_each_cpu(do_flush_tlb_all, NULL, 1);
275} 249}
276 250
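The rewritten heuristic flushes everything once the range covers more 4K pages than the TLB can plausibly hold for this task: act_entries is the TLB size scaled down by tlb_flushall_shift and clamped to mm->total_vm. A worked sketch with made-up numbers (real values come from CPUID and the per-vendor tlb_flushall_shift tables):

    /* Sketch: illustrative sizes only. */
    unsigned tlb_entries = 512;                    /* 4K TLB entries */
    unsigned act_entries = tlb_entries >> 4;       /* shift 4 -> 32 */
    unsigned long nr_base_pages = (end - start) >> PAGE_SHIFT;

    if (nr_base_pages > act_entries)       /* 33+ pages in the range */
            local_flush_tlb();             /* cheaper than 33 invlpgs */

The separate has_large_page() walk is dropped along with it: probing the page tables for huge pages added overhead to every ranged flush, and invlpg works on huge-page-backed addresses anyway.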
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
index 877b9a1b2152..01495755701b 100644
--- a/arch/x86/net/bpf_jit.S
+++ b/arch/x86/net/bpf_jit.S
@@ -140,7 +140,7 @@ bpf_slow_path_byte_msh:
140 push %r9; \ 140 push %r9; \
141 push SKBDATA; \ 141 push SKBDATA; \
142/* rsi already has offset */ \ 142/* rsi already has offset */ \
143 mov $SIZE,%ecx; /* size */ \ 143 mov $SIZE,%edx; /* size */ \
144 call bpf_internal_load_pointer_neg_helper; \ 144 call bpf_internal_load_pointer_neg_helper; \
145 test %rax,%rax; \ 145 test %rax,%rax; \
146 pop SKBDATA; \ 146 pop SKBDATA; \
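The one-register fix above is an ABI correction: under the SysV AMD64 calling convention, integer arguments travel in %rdi, %rsi, %rdx, %rcx, %r8, %r9, and the helper being called takes the load size as its third parameter, so it must arrive in %edx; loading it into %ecx left %edx holding stale data as the size argument. For reference, the helper's C prototype (net/core/filter.c):

    /* Third argument ('size') is passed in %rdx/%edx, not %rcx/%ecx. */
    void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb,
                                               int k, unsigned int size);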
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4ed75dd81d05..dc017735bb91 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -553,13 +553,13 @@ void bpf_jit_compile(struct sk_filter *fp)
553 } 553 }
554 break; 554 break;
555 case BPF_S_ANC_RXHASH: 555 case BPF_S_ANC_RXHASH:
556 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4); 556 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
557 if (is_imm8(offsetof(struct sk_buff, rxhash))) { 557 if (is_imm8(offsetof(struct sk_buff, hash))) {
558 /* mov off8(%rdi),%eax */ 558 /* mov off8(%rdi),%eax */
559 EMIT3(0x8b, 0x47, offsetof(struct sk_buff, rxhash)); 559 EMIT3(0x8b, 0x47, offsetof(struct sk_buff, hash));
560 } else { 560 } else {
561 EMIT2(0x8b, 0x87); 561 EMIT2(0x8b, 0x87);
562 EMIT(offsetof(struct sk_buff, rxhash), 4); 562 EMIT(offsetof(struct sk_buff, hash), 4);
563 } 563 }
564 break; 564 break;
565 case BPF_S_ANC_QUEUE: 565 case BPF_S_ANC_QUEUE:
@@ -772,6 +772,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
772 bpf_flush_icache(header, image + proglen); 772 bpf_flush_icache(header, image + proglen);
773 set_memory_ro((unsigned long)header, header->pages); 773 set_memory_ro((unsigned long)header, header->pages);
774 fp->bpf_func = (void *)image; 774 fp->bpf_func = (void *)image;
775 fp->jited = 1;
775 } 776 }
776out: 777out:
777 kfree(addrs); 778 kfree(addrs);
@@ -791,7 +792,7 @@ static void bpf_jit_free_deferred(struct work_struct *work)
791 792
792void bpf_jit_free(struct sk_filter *fp) 793void bpf_jit_free(struct sk_filter *fp)
793{ 794{
794 if (fp->bpf_func != sk_run_filter) { 795 if (fp->jited) {
795 INIT_WORK(&fp->work, bpf_jit_free_deferred); 796 INIT_WORK(&fp->work, bpf_jit_free_deferred);
796 schedule_work(&fp->work); 797 schedule_work(&fp->work);
797 } else { 798 } else {
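The new fp->jited bit replaces the old trick of deducing "this filter was JITed" from fp->bpf_func != sk_run_filter. A sketch of the resulting free path; the non-JIT branch is assumed from the surrounding code, not shown in this hunk:

    void bpf_jit_free(struct sk_filter *fp)
    {
            if (fp->jited) {        /* JIT image: free via deferred work */
                    INIT_WORK(&fp->work, bpf_jit_free_deferred);
                    schedule_work(&fp->work);
            } else {                /* plain interpreted filter */
                    kfree(fp);
            }
    }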
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 6890d8498e0b..379e8bd0deea 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -494,14 +494,19 @@ static int nmi_setup(void)
494 if (err) 494 if (err)
495 goto fail; 495 goto fail;
496 496
497 cpu_notifier_register_begin();
498
499 /* Use get/put_online_cpus() to protect 'nmi_enabled' */
497 get_online_cpus(); 500 get_online_cpus();
498 register_cpu_notifier(&oprofile_cpu_nb);
499 nmi_enabled = 1; 501 nmi_enabled = 1;
500 /* make nmi_enabled visible to the nmi handler: */ 502 /* make nmi_enabled visible to the nmi handler: */
501 smp_mb(); 503 smp_mb();
502 on_each_cpu(nmi_cpu_setup, NULL, 1); 504 on_each_cpu(nmi_cpu_setup, NULL, 1);
505 __register_cpu_notifier(&oprofile_cpu_nb);
503 put_online_cpus(); 506 put_online_cpus();
504 507
508 cpu_notifier_register_done();
509
505 return 0; 510 return 0;
506fail: 511fail:
507 free_msrs(); 512 free_msrs();
@@ -512,12 +517,18 @@ static void nmi_shutdown(void)
512{ 517{
513 struct op_msrs *msrs; 518 struct op_msrs *msrs;
514 519
520 cpu_notifier_register_begin();
521
522 /* Use get/put_online_cpus() to protect 'nmi_enabled' & 'ctr_running' */
515 get_online_cpus(); 523 get_online_cpus();
516 unregister_cpu_notifier(&oprofile_cpu_nb);
517 on_each_cpu(nmi_cpu_shutdown, NULL, 1); 524 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
518 nmi_enabled = 0; 525 nmi_enabled = 0;
519 ctr_running = 0; 526 ctr_running = 0;
527 __unregister_cpu_notifier(&oprofile_cpu_nb);
520 put_online_cpus(); 528 put_online_cpus();
529
530 cpu_notifier_register_done();
531
521 /* make variables visible to the nmi handler: */ 532 /* make variables visible to the nmi handler: */
522 smp_mb(); 533 smp_mb();
523 unregister_nmi_handler(NMI_LOCAL, "oprofile"); 534 unregister_nmi_handler(NMI_LOCAL, "oprofile");
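Both hunks in this file follow the same conversion: cpu_notifier_register_begin()/cpu_notifier_register_done() hold the CPU-hotplug registration lock so that priming the currently-online CPUs and (un)registering the notifier happen atomically with respect to hotplug, and the underscore-prefixed __register_cpu_notifier()/__unregister_cpu_notifier() variants are used because that lock is already held. Schematically (my_nb and my_cpu_setup are placeholders):

    cpu_notifier_register_begin();
    get_online_cpus();
    on_each_cpu(my_cpu_setup, NULL, 1);     /* prime online CPUs */
    __register_cpu_notifier(&my_nb);        /* lock already held */
    put_online_cpus();
    cpu_notifier_register_done();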
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index e063eed0f912..5c6fc3577a49 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -13,9 +13,6 @@ obj-y += legacy.o irq.o
13 13
14obj-$(CONFIG_STA2X11) += sta2x11-fixup.o 14obj-$(CONFIG_STA2X11) += sta2x11-fixup.o
15 15
16obj-$(CONFIG_X86_VISWS) += visws.o
17
18obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
19obj-$(CONFIG_X86_NUMACHIP) += numachip.o 16obj-$(CONFIG_X86_NUMACHIP) += numachip.o
20 17
21obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o 18obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 4f25ec077552..01edac6c5e18 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -218,9 +218,8 @@ static void teardown_mcfg_map(struct pci_root_info *info)
218} 218}
219#endif 219#endif
220 220
221static acpi_status 221static acpi_status resource_to_addr(struct acpi_resource *resource,
222resource_to_addr(struct acpi_resource *resource, 222 struct acpi_resource_address64 *addr)
223 struct acpi_resource_address64 *addr)
224{ 223{
225 acpi_status status; 224 acpi_status status;
226 struct acpi_resource_memory24 *memory24; 225 struct acpi_resource_memory24 *memory24;
@@ -265,8 +264,7 @@ resource_to_addr(struct acpi_resource *resource,
265 return AE_ERROR; 264 return AE_ERROR;
266} 265}
267 266
268static acpi_status 267static acpi_status count_resource(struct acpi_resource *acpi_res, void *data)
269count_resource(struct acpi_resource *acpi_res, void *data)
270{ 268{
271 struct pci_root_info *info = data; 269 struct pci_root_info *info = data;
272 struct acpi_resource_address64 addr; 270 struct acpi_resource_address64 addr;
@@ -278,8 +276,7 @@ count_resource(struct acpi_resource *acpi_res, void *data)
278 return AE_OK; 276 return AE_OK;
279} 277}
280 278
281static acpi_status 279static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data)
282setup_resource(struct acpi_resource *acpi_res, void *data)
283{ 280{
284 struct pci_root_info *info = data; 281 struct pci_root_info *info = data;
285 struct resource *res; 282 struct resource *res;
@@ -435,9 +432,9 @@ static void release_pci_root_info(struct pci_host_bridge *bridge)
435 __release_pci_root_info(info); 432 __release_pci_root_info(info);
436} 433}
437 434
438static void 435static void probe_pci_root_info(struct pci_root_info *info,
439probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, 436 struct acpi_device *device,
440 int busnum, int domain) 437 int busnum, int domain)
441{ 438{
442 size_t size; 439 size_t size;
443 440
@@ -473,16 +470,13 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,
473struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) 470struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
474{ 471{
475 struct acpi_device *device = root->device; 472 struct acpi_device *device = root->device;
476 struct pci_root_info *info = NULL; 473 struct pci_root_info *info;
477 int domain = root->segment; 474 int domain = root->segment;
478 int busnum = root->secondary.start; 475 int busnum = root->secondary.start;
479 LIST_HEAD(resources); 476 LIST_HEAD(resources);
480 struct pci_bus *bus = NULL; 477 struct pci_bus *bus;
481 struct pci_sysdata *sd; 478 struct pci_sysdata *sd;
482 int node; 479 int node;
483#ifdef CONFIG_ACPI_NUMA
484 int pxm;
485#endif
486 480
487 if (pci_ignore_seg) 481 if (pci_ignore_seg)
488 domain = 0; 482 domain = 0;
@@ -494,19 +488,12 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
494 return NULL; 488 return NULL;
495 } 489 }
496 490
497 node = -1; 491 node = acpi_get_node(device->handle);
498#ifdef CONFIG_ACPI_NUMA 492 if (node == NUMA_NO_NODE)
499 pxm = acpi_get_pxm(device->handle); 493 node = x86_pci_root_bus_node(busnum);
500 if (pxm >= 0)
501 node = pxm_to_node(pxm);
502 if (node != -1)
503 set_mp_bus_to_node(busnum, node);
504 else
505#endif
506 node = get_mp_bus_to_node(busnum);
507 494
508 if (node != -1 && !node_online(node)) 495 if (node != NUMA_NO_NODE && !node_online(node))
509 node = -1; 496 node = NUMA_NO_NODE;
510 497
511 info = kzalloc(sizeof(*info), GFP_KERNEL); 498 info = kzalloc(sizeof(*info), GFP_KERNEL);
512 if (!info) { 499 if (!info) {
@@ -519,15 +506,12 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
519 sd->domain = domain; 506 sd->domain = domain;
520 sd->node = node; 507 sd->node = node;
521 sd->companion = device; 508 sd->companion = device;
522 /* 509
523 * Maybe the desired pci bus has been already scanned. In such case
524 * it is unnecessary to scan the pci bus with the given domain,busnum.
525 */
526 bus = pci_find_bus(domain, busnum); 510 bus = pci_find_bus(domain, busnum);
527 if (bus) { 511 if (bus) {
528 /* 512 /*
529 * If the desired bus exits, the content of bus->sysdata will 513 * If the desired bus has been scanned already, replace
530 * be replaced by sd. 514 * its bus->sysdata.
531 */ 515 */
532 memcpy(bus->sysdata, sd, sizeof(*sd)); 516 memcpy(bus->sysdata, sd, sizeof(*sd));
533 kfree(info); 517 kfree(info);
@@ -572,15 +556,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
572 pcie_bus_configure_settings(child); 556 pcie_bus_configure_settings(child);
573 } 557 }
574 558
575 if (bus && node != -1) { 559 if (bus && node != NUMA_NO_NODE)
576#ifdef CONFIG_ACPI_NUMA
577 if (pxm >= 0)
578 dev_printk(KERN_DEBUG, &bus->dev,
579 "on NUMA node %d (pxm %d)\n", node, pxm);
580#else
581 dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node); 560 dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node);
582#endif
583 }
584 561
585 return bus; 562 return bus;
586} 563}
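acpi_get_node() bundles the _PXM lookup and the pxm_to_node() translation that the removed CONFIG_ACPI_NUMA block open-coded, returning NUMA_NO_NODE on any failure, so the whole node-selection dance collapses to:

    /* Node affinity: ACPI first, then the root-bus info, then none. */
    node = acpi_get_node(device->handle);
    if (node == NUMA_NO_NODE)
            node = x86_pci_root_bus_node(busnum);
    if (node != NUMA_NO_NODE && !node_online(node))
            node = NUMA_NO_NODE;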
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index a48be98e9ded..e88f4c53d7f6 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -44,15 +44,6 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link)
44 return NULL; 44 return NULL;
45} 45}
46 46
47static void __init set_mp_bus_range_to_node(int min_bus, int max_bus, int node)
48{
49#ifdef CONFIG_NUMA
50 int j;
51
52 for (j = min_bus; j <= max_bus; j++)
53 set_mp_bus_to_node(j, node);
54#endif
55}
56/** 47/**
57 * early_fill_mp_bus_to_node() 48 * early_fill_mp_bus_to_node()
58 * called before pcibios_scan_root and pci_scan_bus 49 * called before pcibios_scan_root and pci_scan_bus
@@ -117,7 +108,6 @@ static int __init early_fill_mp_bus_info(void)
117 min_bus = (reg >> 16) & 0xff; 108 min_bus = (reg >> 16) & 0xff;
118 max_bus = (reg >> 24) & 0xff; 109 max_bus = (reg >> 24) & 0xff;
119 node = (reg >> 4) & 0x07; 110 node = (reg >> 4) & 0x07;
120 set_mp_bus_range_to_node(min_bus, max_bus, node);
121 link = (reg >> 8) & 0x03; 111 link = (reg >> 8) & 0x03;
122 112
123 info = alloc_pci_root_info(min_bus, max_bus, node, link); 113 info = alloc_pci_root_info(min_bus, max_bus, node, link);
@@ -380,10 +370,13 @@ static int __init pci_io_ecs_init(void)
380 if (early_pci_allowed()) 370 if (early_pci_allowed())
381 pci_enable_pci_io_ecs(); 371 pci_enable_pci_io_ecs();
382 372
383 register_cpu_notifier(&amd_cpu_notifier); 373 cpu_notifier_register_begin();
384 for_each_online_cpu(cpu) 374 for_each_online_cpu(cpu)
385 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, 375 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
386 (void *)(long)cpu); 376 (void *)(long)cpu);
377 __register_cpu_notifier(&amd_cpu_notifier);
378 cpu_notifier_register_done();
379
387 pci_probe |= PCI_HAS_IO_ECS; 380 pci_probe |= PCI_HAS_IO_ECS;
388 381
389 return 0; 382 return 0;
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index c2735feb2508..f3a2cfc14125 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -10,9 +10,6 @@ static struct pci_root_info *x86_find_pci_root_info(int bus)
10{ 10{
11 struct pci_root_info *info; 11 struct pci_root_info *info;
12 12
13 if (list_empty(&pci_root_infos))
14 return NULL;
15
16 list_for_each_entry(info, &pci_root_infos, list) 13 list_for_each_entry(info, &pci_root_infos, list)
17 if (info->busn.start == bus) 14 if (info->busn.start == bus)
18 return info; 15 return info;
@@ -20,6 +17,16 @@ static struct pci_root_info *x86_find_pci_root_info(int bus)
20 return NULL; 17 return NULL;
21} 18}
22 19
20int x86_pci_root_bus_node(int bus)
21{
22 struct pci_root_info *info = x86_find_pci_root_info(bus);
23
24 if (!info)
25 return NUMA_NO_NODE;
26
27 return info->node;
28}
29
23void x86_pci_root_bus_resources(int bus, struct list_head *resources) 30void x86_pci_root_bus_resources(int bus, struct list_head *resources)
24{ 31{
25 struct pci_root_info *info = x86_find_pci_root_info(bus); 32 struct pci_root_info *info = x86_find_pci_root_info(bus);
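x86_pci_root_bus_node() replaces the old 256-entry mp_bus_to_node[] side table: the node is read from the pci_root_info list that already describes each root bus, with NUMA_NO_NODE for buses that have no entry. (The list_empty() pre-check could go because list_for_each_entry() on an empty list never enters its body.) Typical caller, as in the acpi.c and common.c hunks:

    /* Sketch: consumers ask the root-bus info directly. */
    sd->node = x86_pci_root_bus_node(busnum);   /* NUMA_NO_NODE if unknown */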
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 981c2dbd72cc..059a76c29739 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -456,19 +456,25 @@ void __init dmi_check_pciprobe(void)
456 dmi_check_system(pciprobe_dmi_table); 456 dmi_check_system(pciprobe_dmi_table);
457} 457}
458 458
459struct pci_bus *pcibios_scan_root(int busnum) 459void pcibios_scan_root(int busnum)
460{ 460{
461 struct pci_bus *bus = NULL; 461 struct pci_bus *bus;
462 struct pci_sysdata *sd;
463 LIST_HEAD(resources);
462 464
463 while ((bus = pci_find_next_bus(bus)) != NULL) { 465 sd = kzalloc(sizeof(*sd), GFP_KERNEL);
464 if (bus->number == busnum) { 466 if (!sd) {
465 /* Already scanned */ 467 printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busnum);
466 return bus; 468 return;
467 } 469 }
470 sd->node = x86_pci_root_bus_node(busnum);
471 x86_pci_root_bus_resources(busnum, &resources);
472 printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
473 bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources);
474 if (!bus) {
475 pci_free_resource_list(&resources);
476 kfree(sd);
468 } 477 }
469
470 return pci_scan_bus_on_node(busnum, &pci_root_ops,
471 get_mp_bus_to_node(busnum));
472} 478}
473 479
474void __init pcibios_set_cache_line_size(void) 480void __init pcibios_set_cache_line_size(void)
@@ -561,7 +567,6 @@ char * __init pcibios_setup(char *str)
561 pci_probe |= PCI_PROBE_NOEARLY; 567 pci_probe |= PCI_PROBE_NOEARLY;
562 return NULL; 568 return NULL;
563 } 569 }
564#ifndef CONFIG_X86_VISWS
565 else if (!strcmp(str, "usepirqmask")) { 570 else if (!strcmp(str, "usepirqmask")) {
566 pci_probe |= PCI_USE_PIRQ_MASK; 571 pci_probe |= PCI_USE_PIRQ_MASK;
567 return NULL; 572 return NULL;
@@ -571,9 +576,7 @@ char * __init pcibios_setup(char *str)
571 } else if (!strncmp(str, "lastbus=", 8)) { 576 } else if (!strncmp(str, "lastbus=", 8)) {
572 pcibios_last_bus = simple_strtol(str+8, NULL, 0); 577 pcibios_last_bus = simple_strtol(str+8, NULL, 0);
573 return NULL; 578 return NULL;
574 } 579 } else if (!strcmp(str, "rom")) {
575#endif
576 else if (!strcmp(str, "rom")) {
577 pci_probe |= PCI_ASSIGN_ROMS; 580 pci_probe |= PCI_ASSIGN_ROMS;
578 return NULL; 581 return NULL;
579 } else if (!strcmp(str, "norom")) { 582 } else if (!strcmp(str, "norom")) {
@@ -677,105 +680,3 @@ int pci_ext_cfg_avail(void)
677 else 680 else
678 return 0; 681 return 0;
679} 682}
680
681struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
682{
683 LIST_HEAD(resources);
684 struct pci_bus *bus = NULL;
685 struct pci_sysdata *sd;
686
687 /*
688 * Allocate per-root-bus (not per bus) arch-specific data.
689 * TODO: leak; this memory is never freed.
690 * It's arguable whether it's worth the trouble to care.
691 */
692 sd = kzalloc(sizeof(*sd), GFP_KERNEL);
693 if (!sd) {
694 printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busno);
695 return NULL;
696 }
697 sd->node = node;
698 x86_pci_root_bus_resources(busno, &resources);
699 printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busno);
700 bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources);
701 if (!bus) {
702 pci_free_resource_list(&resources);
703 kfree(sd);
704 }
705
706 return bus;
707}
708
709struct pci_bus *pci_scan_bus_with_sysdata(int busno)
710{
711 return pci_scan_bus_on_node(busno, &pci_root_ops, -1);
712}
713
714/*
715 * NUMA info for PCI busses
716 *
717 * Early arch code is responsible for filling in reasonable values here.
718 * A node id of "-1" means "use current node". In other words, if a bus
719 * has a -1 node id, it's not tightly coupled to any particular chunk
720 * of memory (as is the case on some Nehalem systems).
721 */
722#ifdef CONFIG_NUMA
723
724#define BUS_NR 256
725
726#ifdef CONFIG_X86_64
727
728static int mp_bus_to_node[BUS_NR] = {
729 [0 ... BUS_NR - 1] = -1
730};
731
732void set_mp_bus_to_node(int busnum, int node)
733{
734 if (busnum >= 0 && busnum < BUS_NR)
735 mp_bus_to_node[busnum] = node;
736}
737
738int get_mp_bus_to_node(int busnum)
739{
740 int node = -1;
741
742 if (busnum < 0 || busnum > (BUS_NR - 1))
743 return node;
744
745 node = mp_bus_to_node[busnum];
746
747 /*
748 * let numa_node_id to decide it later in dma_alloc_pages
749 * if there is no ram on that node
750 */
751 if (node != -1 && !node_online(node))
752 node = -1;
753
754 return node;
755}
756
757#else /* CONFIG_X86_32 */
758
759static int mp_bus_to_node[BUS_NR] = {
760 [0 ... BUS_NR - 1] = -1
761};
762
763void set_mp_bus_to_node(int busnum, int node)
764{
765 if (busnum >= 0 && busnum < BUS_NR)
766 mp_bus_to_node[busnum] = (unsigned char) node;
767}
768
769int get_mp_bus_to_node(int busnum)
770{
771 int node;
772
773 if (busnum < 0 || busnum > (BUS_NR - 1))
774 return 0;
775 node = mp_bus_to_node[busnum];
776 return node;
777}
778
779#endif /* CONFIG_X86_32 */
780
781#endif /* CONFIG_NUMA */
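pcibios_scan_root() now does what pci_scan_bus_on_node() and pci_scan_bus_with_sysdata() used to: allocate the per-root sysdata, resolve the node via x86_pci_root_bus_node(), collect the root-bus resources, and scan. That is why both helpers and the mp_bus_to_node machinery above could be deleted, and why the function no longer returns the bus. Callers that must not rescan an existing bus check first, as the legacy.c and irq.c hunks do:

    /* Sketch: the rescan guard moved into the callers. */
    if (!pci_find_bus(0, busn))
            pcibios_scan_root(busn);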
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index b046e070e088..94ae9ae9574f 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -5,7 +5,6 @@
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/pci.h> 7#include <linux/pci.h>
8#include <linux/init.h>
9#include <linux/vgaarb.h> 8#include <linux/vgaarb.h>
10#include <asm/pci_x86.h> 9#include <asm/pci_x86.h>
11 10
@@ -26,9 +25,9 @@ static void pci_fixup_i450nx(struct pci_dev *d)
26 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, 25 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno,
27 suba, subb); 26 suba, subb);
28 if (busno) 27 if (busno)
29 pci_scan_bus_with_sysdata(busno); /* Bus A */ 28 pcibios_scan_root(busno); /* Bus A */
30 if (suba < subb) 29 if (suba < subb)
31 pci_scan_bus_with_sysdata(suba+1); /* Bus B */ 30 pcibios_scan_root(suba+1); /* Bus B */
32 } 31 }
33 pcibios_last_bus = -1; 32 pcibios_last_bus = -1;
34} 33}
@@ -43,7 +42,7 @@ static void pci_fixup_i450gx(struct pci_dev *d)
43 u8 busno; 42 u8 busno;
44 pci_read_config_byte(d, 0x4a, &busno); 43 pci_read_config_byte(d, 0x4a, &busno);
45 dev_info(&d->dev, "i440KX/GX host bridge; secondary bus %02x\n", busno); 44 dev_info(&d->dev, "i440KX/GX host bridge; secondary bus %02x\n", busno);
46 pci_scan_bus_with_sysdata(busno); 45 pcibios_scan_root(busno);
47 pcibios_last_bus = -1; 46 pcibios_last_bus = -1;
48} 47}
49DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454GX, pci_fixup_i450gx); 48DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454GX, pci_fixup_i450gx);
@@ -314,9 +313,10 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_r
314 * IORESOURCE_ROM_SHADOW is used to associate the boot video 313 * IORESOURCE_ROM_SHADOW is used to associate the boot video
315 * card with this copy. On laptops this copy has to be used since 314 * card with this copy. On laptops this copy has to be used since
316 * the main ROM may be compressed or combined with another image. 315 * the main ROM may be compressed or combined with another image.
317 * See pci_map_rom() for use of this flag. IORESOURCE_ROM_SHADOW 316 * See pci_map_rom() for use of this flag. Before marking the device
318 * is marked here since the boot video device will be the only enabled 317 * with IORESOURCE_ROM_SHADOW check if a vga_default_device is already set
 319 * video device at this point. 318 * by either arch code or vga arbitration, if so only apply the fixup to this
319 * already determined primary video card.
320 */ 320 */
321 321
322static void pci_fixup_video(struct pci_dev *pdev) 322static void pci_fixup_video(struct pci_dev *pdev)
@@ -347,12 +347,13 @@ static void pci_fixup_video(struct pci_dev *pdev)
347 } 347 }
348 bus = bus->parent; 348 bus = bus->parent;
349 } 349 }
350 pci_read_config_word(pdev, PCI_COMMAND, &config); 350 if (!vga_default_device() || pdev == vga_default_device()) {
351 if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { 351 pci_read_config_word(pdev, PCI_COMMAND, &config);
352 pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; 352 if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
353 dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n"); 353 pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW;
354 if (!vga_default_device()) 354 dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n");
355 vga_set_default_device(pdev); 355 vga_set_default_device(pdev);
356 }
356 } 357 }
357} 358}
358DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, 359DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
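The restructured fixup applies IORESOURCE_ROM_SHADOW only when no default VGA device exists yet or when this device already is the default, so a secondary card can no longer claim the shadow ROM meant for the primary; inside that guard, vga_set_default_device() becomes safe to call unconditionally. Condensed (the PCI_COMMAND IO/MEM enable check from the hunk is elided here):

    struct pci_dev *def = vga_default_device();

    if (!def || pdev == def) {
            pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW;
            vga_set_default_device(pdev);
    }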
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 51384ca727ad..84b9d672843d 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -31,6 +31,7 @@
31#include <asm/pci_x86.h> 31#include <asm/pci_x86.h>
32#include <asm/hw_irq.h> 32#include <asm/hw_irq.h>
33#include <asm/io_apic.h> 33#include <asm/io_apic.h>
34#include <asm/intel-mid.h>
34 35
35#define PCIE_CAP_OFFSET 0x100 36#define PCIE_CAP_OFFSET 0x100
36 37
@@ -219,7 +220,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
219 irq_attr.ioapic = mp_find_ioapic(dev->irq); 220 irq_attr.ioapic = mp_find_ioapic(dev->irq);
220 irq_attr.ioapic_pin = dev->irq; 221 irq_attr.ioapic_pin = dev->irq;
221 irq_attr.trigger = 1; /* level */ 222 irq_attr.trigger = 1; /* level */
222 irq_attr.polarity = 1; /* active low */ 223 if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
224 irq_attr.polarity = 0; /* active high */
225 else
226 irq_attr.polarity = 1; /* active low */
223 io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr); 227 io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr);
224 228
225 return 0; 229 return 0;
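Tangier parts signal these PCI interrupts active high, unlike the earlier MID SoCs, so the routing code now derives the polarity from the detected SoC. The change compresses to:

    /* 0 = active high (Tangier), 1 = active low (other MID SoCs) */
    irq_attr.polarity =
            (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) ? 0 : 1;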
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 372e9b8989b3..84112f55dd7a 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -136,13 +136,9 @@ static void __init pirq_peer_trick(void)
136 busmap[e->bus] = 1; 136 busmap[e->bus] = 1;
137 } 137 }
138 for (i = 1; i < 256; i++) { 138 for (i = 1; i < 256; i++) {
139 int node;
140 if (!busmap[i] || pci_find_bus(0, i)) 139 if (!busmap[i] || pci_find_bus(0, i))
141 continue; 140 continue;
142 node = get_mp_bus_to_node(i); 141 pcibios_scan_root(i);
143 if (pci_scan_bus_on_node(i, &pci_root_ops, node))
144 printk(KERN_INFO "PCI: Discovered primary peer "
145 "bus %02x [IRQ]\n", i);
146 } 142 }
147 pcibios_last_bus = -1; 143 pcibios_last_bus = -1;
148} 144}
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 4db96fb1c232..5b662c0faf8c 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -37,19 +37,17 @@ int __init pci_legacy_init(void)
37void pcibios_scan_specific_bus(int busn) 37void pcibios_scan_specific_bus(int busn)
38{ 38{
39 int devfn; 39 int devfn;
40 long node;
41 u32 l; 40 u32 l;
42 41
43 if (pci_find_bus(0, busn)) 42 if (pci_find_bus(0, busn))
44 return; 43 return;
45 44
46 node = get_mp_bus_to_node(busn);
47 for (devfn = 0; devfn < 256; devfn += 8) { 45 for (devfn = 0; devfn < 256; devfn += 8) {
48 if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) && 46 if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) &&
49 l != 0x0000 && l != 0xffff) { 47 l != 0x0000 && l != 0xffff) {
50 DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l); 48 DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l);
51 printk(KERN_INFO "PCI: Discovered peer bus %02x\n", busn); 49 printk(KERN_INFO "PCI: Discovered peer bus %02x\n", busn);
52 pci_scan_bus_on_node(busn, &pci_root_ops, node); 50 pcibios_scan_root(busn);
53 return; 51 return;
54 } 52 }
55 } 53 }
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 082e88129712..248642f4bab7 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -12,7 +12,6 @@
12 12
13#include <linux/pci.h> 13#include <linux/pci.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/acpi.h>
16#include <linux/sfi_acpi.h> 15#include <linux/sfi_acpi.h>
17#include <linux/bitmap.h> 16#include <linux/bitmap.h>
18#include <linux/dmi.h> 17#include <linux/dmi.h>
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 5c90975cdf0f..43984bc1665a 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -14,7 +14,6 @@
14#include <linux/rcupdate.h> 14#include <linux/rcupdate.h>
15#include <asm/e820.h> 15#include <asm/e820.h>
16#include <asm/pci_x86.h> 16#include <asm/pci_x86.h>
17#include <acpi/acpi.h>
18 17
19/* Assume systems with more busses have correct MCFG */ 18/* Assume systems with more busses have correct MCFG */
20#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) 19#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
deleted file mode 100644
index 72c229f9ebcf..000000000000
--- a/arch/x86/pci/numaq_32.c
+++ /dev/null
@@ -1,165 +0,0 @@
1/*
2 * numaq_32.c - Low-level PCI access for NUMA-Q machines
3 */
4
5#include <linux/pci.h>
6#include <linux/init.h>
7#include <linux/nodemask.h>
8#include <asm/apic.h>
9#include <asm/mpspec.h>
10#include <asm/pci_x86.h>
11#include <asm/numaq.h>
12
13#define BUS2QUAD(global) (mp_bus_id_to_node[global])
14
15#define BUS2LOCAL(global) (mp_bus_id_to_local[global])
16
17#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
18
19#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
20 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
21
22static void write_cf8(unsigned bus, unsigned devfn, unsigned reg)
23{
24 unsigned val = PCI_CONF1_MQ_ADDRESS(bus, devfn, reg);
25 if (xquad_portio)
26 writel(val, XQUAD_PORT_ADDR(0xcf8, BUS2QUAD(bus)));
27 else
28 outl(val, 0xCF8);
29}
30
31static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
32 unsigned int devfn, int reg, int len, u32 *value)
33{
34 unsigned long flags;
35 void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
36
37 WARN_ON(seg);
38 if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
39 return -EINVAL;
40
41 raw_spin_lock_irqsave(&pci_config_lock, flags);
42
43 write_cf8(bus, devfn, reg);
44
45 switch (len) {
46 case 1:
47 if (xquad_portio)
48 *value = readb(adr + (reg & 3));
49 else
50 *value = inb(0xCFC + (reg & 3));
51 break;
52 case 2:
53 if (xquad_portio)
54 *value = readw(adr + (reg & 2));
55 else
56 *value = inw(0xCFC + (reg & 2));
57 break;
58 case 4:
59 if (xquad_portio)
60 *value = readl(adr);
61 else
62 *value = inl(0xCFC);
63 break;
64 }
65
66 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
67
68 return 0;
69}
70
71static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
72 unsigned int devfn, int reg, int len, u32 value)
73{
74 unsigned long flags;
75 void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
76
77 WARN_ON(seg);
78 if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
79 return -EINVAL;
80
81 raw_spin_lock_irqsave(&pci_config_lock, flags);
82
83 write_cf8(bus, devfn, reg);
84
85 switch (len) {
86 case 1:
87 if (xquad_portio)
88 writeb(value, adr + (reg & 3));
89 else
90 outb((u8)value, 0xCFC + (reg & 3));
91 break;
92 case 2:
93 if (xquad_portio)
94 writew(value, adr + (reg & 2));
95 else
96 outw((u16)value, 0xCFC + (reg & 2));
97 break;
98 case 4:
99 if (xquad_portio)
100 writel(value, adr + reg);
101 else
102 outl((u32)value, 0xCFC);
103 break;
104 }
105
106 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
107
108 return 0;
109}
110
111#undef PCI_CONF1_MQ_ADDRESS
112
113static const struct pci_raw_ops pci_direct_conf1_mq = {
114 .read = pci_conf1_mq_read,
115 .write = pci_conf1_mq_write
116};
117
118
119static void pci_fixup_i450nx(struct pci_dev *d)
120{
121 /*
122 * i450NX -- Find and scan all secondary buses on all PXB's.
123 */
124 int pxb, reg;
125 u8 busno, suba, subb;
126 int quad = BUS2QUAD(d->bus->number);
127
128 dev_info(&d->dev, "searching for i450NX host bridges\n");
129 reg = 0xd0;
130 for(pxb=0; pxb<2; pxb++) {
131 pci_read_config_byte(d, reg++, &busno);
132 pci_read_config_byte(d, reg++, &suba);
133 pci_read_config_byte(d, reg++, &subb);
134 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n",
135 pxb, busno, suba, subb);
136 if (busno) {
137 /* Bus A */
138 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno));
139 }
140 if (suba < subb) {
141 /* Bus B */
142 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, suba+1));
143 }
144 }
145 pcibios_last_bus = -1;
146}
147DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
148
149int __init pci_numaq_init(void)
150{
151 int quad;
152
153 raw_pci_ops = &pci_direct_conf1_mq;
154
155 pcibios_scan_root(0);
156 if (num_online_nodes() > 1)
157 for_each_online_node(quad) {
158 if (quad == 0)
159 continue;
160 printk("Scanning PCI bus %d for quad %d\n",
161 QUADLOCAL2BUS(quad,0), quad);
162 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, 0));
163 }
164 return 0;
165}
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
deleted file mode 100644
index 3e6d2a6db866..000000000000
--- a/arch/x86/pci/visws.c
+++ /dev/null
@@ -1,87 +0,0 @@
1/*
2 * Low-Level PCI Support for SGI Visual Workstation
3 *
4 * (c) 1999--2000 Martin Mares <mj@ucw.cz>
5 */
6
7#include <linux/kernel.h>
8#include <linux/pci.h>
9#include <linux/init.h>
10
11#include <asm/setup.h>
12#include <asm/pci_x86.h>
13#include <asm/visws/cobalt.h>
14#include <asm/visws/lithium.h>
15
16static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
17static void pci_visws_disable_irq(struct pci_dev *dev) { }
18
19/* int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; */
20/* void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq; */
21
22/* void __init pcibios_penalize_isa_irq(int irq, int active) {} */
23
24
25unsigned int pci_bus0, pci_bus1;
26
27static int __init visws_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
28{
29 int irq, bus = dev->bus->number;
30
31 pin--;
32
33 /* Nothing useful at PIIX4 pin 1 */
34 if (bus == pci_bus0 && slot == 4 && pin == 0)
35 return -1;
36
37 /* PIIX4 USB is on Bus 0, Slot 4, Line 3 */
38 if (bus == pci_bus0 && slot == 4 && pin == 3) {
39 irq = CO_IRQ(CO_APIC_PIIX4_USB);
40 goto out;
41 }
42
43 /* First pin spread down 1 APIC entry per slot */
44 if (pin == 0) {
45 irq = CO_IRQ((bus == pci_bus0 ? CO_APIC_PCIB_BASE0 :
46 CO_APIC_PCIA_BASE0) + slot);
47 goto out;
48 }
49
50 /* lines 1,2,3 from any slot is shared in this twirly pattern */
51 if (bus == pci_bus1) {
52 /* lines 1-3 from devices 0 1 rotate over 2 apic entries */
53 irq = CO_IRQ(CO_APIC_PCIA_BASE123 + ((slot + (pin - 1)) % 2));
54 } else { /* bus == pci_bus0 */
55 /* lines 1-3 from devices 0-3 rotate over 3 apic entries */
56 if (slot == 0)
57 slot = 3; /* same pattern */
58 irq = CO_IRQ(CO_APIC_PCIA_BASE123 + ((3 - slot) + (pin - 1) % 3));
59 }
60out:
61 printk(KERN_DEBUG "PCI: Bus %d Slot %d Line %d -> IRQ %d\n", bus, slot, pin, irq);
62 return irq;
63}
64
65int __init pci_visws_init(void)
66{
67 pcibios_enable_irq = &pci_visws_enable_irq;
68 pcibios_disable_irq = &pci_visws_disable_irq;
69
70 /* The VISWS supports configuration access type 1 only */
71 pci_probe = (pci_probe | PCI_PROBE_CONF1) &
72 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
73
74 pci_bus0 = li_pcib_read16(LI_PCI_BUSNUM) & 0xff;
75 pci_bus1 = li_pcia_read16(LI_PCI_BUSNUM) & 0xff;
76
77 printk(KERN_INFO "PCI: Lithium bridge A bus: %u, "
78 "bridge B (PIIX4) bus: %u\n", pci_bus1, pci_bus0);
79
80 raw_pci_ops = &pci_direct_conf1;
81 pci_scan_bus_with_sysdata(pci_bus0);
82 pci_scan_bus_with_sysdata(pci_bus1);
83 pci_fixup_irqs(pci_common_swizzle, visws_map_irq);
84 pcibios_resource_survey();
85 /* Request bus scan */
86 return 1;
87}
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 5eee4959785d..905956f16465 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -178,6 +178,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
178 i = 0; 178 i = 0;
179 list_for_each_entry(msidesc, &dev->msi_list, list) { 179 list_for_each_entry(msidesc, &dev->msi_list, list) {
180 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 180 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i],
181 (type == PCI_CAP_ID_MSI) ? nvec : 1,
181 (type == PCI_CAP_ID_MSIX) ? 182 (type == PCI_CAP_ID_MSIX) ?
182 "pcifront-msi-x" : 183 "pcifront-msi-x" :
183 "pcifront-msi", 184 "pcifront-msi",
@@ -245,6 +246,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
245 "xen: msi already bound to pirq=%d\n", pirq); 246 "xen: msi already bound to pirq=%d\n", pirq);
246 } 247 }
247 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 248 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq,
249 (type == PCI_CAP_ID_MSI) ? nvec : 1,
248 (type == PCI_CAP_ID_MSIX) ? 250 (type == PCI_CAP_ID_MSIX) ?
249 "msi-x" : "msi", 251 "msi-x" : "msi",
250 DOMID_SELF); 252 DOMID_SELF);
@@ -269,9 +271,6 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
269 int ret = 0; 271 int ret = 0;
270 struct msi_desc *msidesc; 272 struct msi_desc *msidesc;
271 273
272 if (type == PCI_CAP_ID_MSI && nvec > 1)
273 return 1;
274
275 list_for_each_entry(msidesc, &dev->msi_list, list) { 274 list_for_each_entry(msidesc, &dev->msi_list, list) {
276 struct physdev_map_pirq map_irq; 275 struct physdev_map_pirq map_irq;
277 domid_t domid; 276 domid_t domid;
@@ -291,7 +290,10 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
291 (pci_domain_nr(dev->bus) << 16); 290 (pci_domain_nr(dev->bus) << 16);
292 map_irq.devfn = dev->devfn; 291 map_irq.devfn = dev->devfn;
293 292
294 if (type == PCI_CAP_ID_MSIX) { 293 if (type == PCI_CAP_ID_MSI && nvec > 1) {
294 map_irq.type = MAP_PIRQ_TYPE_MULTI_MSI;
295 map_irq.entry_nr = nvec;
296 } else if (type == PCI_CAP_ID_MSIX) {
295 int pos; 297 int pos;
296 u32 table_offset, bir; 298 u32 table_offset, bir;
297 299
@@ -308,6 +310,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
308 if (pci_seg_supported) 310 if (pci_seg_supported)
309 ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, 311 ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq,
310 &map_irq); 312 &map_irq);
313 if (type == PCI_CAP_ID_MSI && nvec > 1 && ret) {
314 /*
315 * If MAP_PIRQ_TYPE_MULTI_MSI is not available
316 * there's nothing else we can do in this case.
317 * Just set ret > 0 so driver can retry with
318 * single MSI.
319 */
320 ret = 1;
321 goto out;
322 }
311 if (ret == -EINVAL && !pci_domain_nr(dev->bus)) { 323 if (ret == -EINVAL && !pci_domain_nr(dev->bus)) {
312 map_irq.type = MAP_PIRQ_TYPE_MSI; 324 map_irq.type = MAP_PIRQ_TYPE_MSI;
313 map_irq.index = -1; 325 map_irq.index = -1;
@@ -324,11 +336,10 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
324 goto out; 336 goto out;
325 } 337 }
326 338
327 ret = xen_bind_pirq_msi_to_irq(dev, msidesc, 339 ret = xen_bind_pirq_msi_to_irq(dev, msidesc, map_irq.pirq,
328 map_irq.pirq, 340 (type == PCI_CAP_ID_MSI) ? nvec : 1,
329 (type == PCI_CAP_ID_MSIX) ? 341 (type == PCI_CAP_ID_MSIX) ? "msi-x" : "msi",
330 "msi-x" : "msi", 342 domid);
331 domid);
332 if (ret < 0) 343 if (ret < 0)
333 goto out; 344 goto out;
334 } 345 }
@@ -337,7 +348,7 @@ out:
337 return ret; 348 return ret;
338} 349}
339 350
340static void xen_initdom_restore_msi_irqs(struct pci_dev *dev, int irq) 351static void xen_initdom_restore_msi_irqs(struct pci_dev *dev)
341{ 352{
342 int ret = 0; 353 int ret = 0;
343 354
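For the dom0 path, multi-vector MSI is requested by mapping the pirq as MAP_PIRQ_TYPE_MULTI_MSI with entry_nr = nvec; when the hypervisor is too old to understand that type, the function returns a value greater than zero, which the comment in the hunk documents as "driver can retry with single MSI". Schematically:

    if (type == PCI_CAP_ID_MSI && nvec > 1) {
            map_irq.type = MAP_PIRQ_TYPE_MULTI_MSI;
            map_irq.entry_nr = nvec;
            ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
            if (ret)
                    return 1;       /* old Xen: fall back to single MSI */
    }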
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index 20342d4c82ce..85afde1fa3e5 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -9,5 +9,4 @@ obj-y += olpc/
9obj-y += scx200/ 9obj-y += scx200/
10obj-y += sfi/ 10obj-y += sfi/
11obj-y += ts5500/ 11obj-y += ts5500/
12obj-y += visws/
13obj-y += uv/ 12obj-y += uv/
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index b7b0b35c1981..d51045afcaaf 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,3 +1,4 @@
1obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o 1obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
2obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o 2obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o
3obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o 3obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
4obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index 7145ec63c520..f15103dff4b4 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -42,14 +42,15 @@ void __init efi_bgrt_init(void)
42 42
43 if (bgrt_tab->header.length < sizeof(*bgrt_tab)) 43 if (bgrt_tab->header.length < sizeof(*bgrt_tab))
44 return; 44 return;
45 if (bgrt_tab->version != 1) 45 if (bgrt_tab->version != 1 || bgrt_tab->status != 1)
46 return; 46 return;
47 if (bgrt_tab->image_type != 0 || !bgrt_tab->image_address) 47 if (bgrt_tab->image_type != 0 || !bgrt_tab->image_address)
48 return; 48 return;
49 49
50 image = efi_lookup_mapped_addr(bgrt_tab->image_address); 50 image = efi_lookup_mapped_addr(bgrt_tab->image_address);
51 if (!image) { 51 if (!image) {
52 image = ioremap(bgrt_tab->image_address, sizeof(bmp_header)); 52 image = early_memremap(bgrt_tab->image_address,
53 sizeof(bmp_header));
53 ioremapped = true; 54 ioremapped = true;
54 if (!image) 55 if (!image)
55 return; 56 return;
@@ -57,7 +58,7 @@ void __init efi_bgrt_init(void)
57 58
58 memcpy_fromio(&bmp_header, image, sizeof(bmp_header)); 59 memcpy_fromio(&bmp_header, image, sizeof(bmp_header));
59 if (ioremapped) 60 if (ioremapped)
60 iounmap(image); 61 early_iounmap(image, sizeof(bmp_header));
61 bgrt_image_size = bmp_header.size; 62 bgrt_image_size = bmp_header.size;
62 63
63 bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL); 64 bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL);
@@ -65,7 +66,8 @@ void __init efi_bgrt_init(void)
65 return; 66 return;
66 67
67 if (ioremapped) { 68 if (ioremapped) {
68 image = ioremap(bgrt_tab->image_address, bmp_header.size); 69 image = early_memremap(bgrt_tab->image_address,
70 bmp_header.size);
69 if (!image) { 71 if (!image) {
70 kfree(bgrt_image); 72 kfree(bgrt_image);
71 bgrt_image = NULL; 73 bgrt_image = NULL;
@@ -75,5 +77,5 @@ void __init efi_bgrt_init(void)
75 77
76 memcpy_fromio(bgrt_image, image, bgrt_image_size); 78 memcpy_fromio(bgrt_image, image, bgrt_image_size);
77 if (ioremapped) 79 if (ioremapped)
78 iounmap(image); 80 early_iounmap(image, bmp_header.size);
79} 81}
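efi_bgrt_init() runs early enough in boot that regular ioremap() is apparently not yet usable here, hence the switch to the fixmap-backed early_memremap(); unlike iounmap(), the matching early_iounmap() must be passed the same length that was mapped. The pattern in isolation (a sketch; phys and len stand in for the BGRT fields):

    void *image;

    image = early_memremap(phys, len);      /* temporary early mapping */
    if (image) {
            memcpy_fromio(&bmp_header, image, len);
            early_iounmap(image, len);      /* same length as the map */
    }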
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index cceb813044ef..3781dd39e8bd 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -12,6 +12,8 @@
12 * Bibo Mao <bibo.mao@intel.com> 12 * Bibo Mao <bibo.mao@intel.com>
13 * Chandramouli Narayanan <mouli@linux.intel.com> 13 * Chandramouli Narayanan <mouli@linux.intel.com>
14 * Huang Ying <ying.huang@intel.com> 14 * Huang Ying <ying.huang@intel.com>
15 * Copyright (C) 2013 SuSE Labs
16 * Borislav Petkov <bp@suse.de> - runtime services VA mapping
15 * 17 *
16 * Copied from efi_32.c to eliminate the duplicated code between EFI 18 * Copied from efi_32.c to eliminate the duplicated code between EFI
17 * 32/64 support code. --ying 2007-10-26 19 * 32/64 support code. --ying 2007-10-26
@@ -50,8 +52,9 @@
50#include <asm/tlbflush.h> 52#include <asm/tlbflush.h>
51#include <asm/x86_init.h> 53#include <asm/x86_init.h>
52#include <asm/rtc.h> 54#include <asm/rtc.h>
55#include <asm/uv/uv.h>
53 56
54#define EFI_DEBUG 1 57#define EFI_DEBUG
55 58
56#define EFI_MIN_RESERVE 5120 59#define EFI_MIN_RESERVE 5120
57 60
@@ -65,25 +68,16 @@ struct efi_memory_map memmap;
65static struct efi efi_phys __initdata; 68static struct efi efi_phys __initdata;
66static efi_system_table_t efi_systab __initdata; 69static efi_system_table_t efi_systab __initdata;
67 70
68unsigned long x86_efi_facility; 71static efi_config_table_type_t arch_tables[] __initdata = {
69
70static __initdata efi_config_table_type_t arch_tables[] = {
71#ifdef CONFIG_X86_UV 72#ifdef CONFIG_X86_UV
72 {UV_SYSTEM_TABLE_GUID, "UVsystab", &efi.uv_systab}, 73 {UV_SYSTEM_TABLE_GUID, "UVsystab", &efi.uv_systab},
73#endif 74#endif
74 {NULL_GUID, NULL, NULL}, 75 {NULL_GUID, NULL, NULL},
75}; 76};
76 77
77/* 78u64 efi_setup; /* efi setup_data physical address */
78 * Returns 1 if 'facility' is enabled, 0 otherwise.
79 */
80int efi_enabled(int facility)
81{
82 return test_bit(facility, &x86_efi_facility) != 0;
83}
84EXPORT_SYMBOL(efi_enabled);
85 79
86static bool __initdata disable_runtime = false; 80static bool disable_runtime __initdata = false;
87static int __init setup_noefi(char *arg) 81static int __init setup_noefi(char *arg)
88{ 82{
89 disable_runtime = true; 83 disable_runtime = true;
@@ -110,7 +104,6 @@ static int __init setup_storage_paranoia(char *arg)
110} 104}
111early_param("efi_no_storage_paranoia", setup_storage_paranoia); 105early_param("efi_no_storage_paranoia", setup_storage_paranoia);
112 106
113
114static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) 107static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
115{ 108{
116 unsigned long flags; 109 unsigned long flags;
@@ -253,27 +246,12 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
253 return status; 246 return status;
254} 247}
255 248
256static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
257 efi_time_cap_t *tc)
258{
259 unsigned long flags;
260 efi_status_t status;
261
262 spin_lock_irqsave(&rtc_lock, flags);
263 efi_call_phys_prelog();
264 status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm),
265 virt_to_phys(tc));
266 efi_call_phys_epilog();
267 spin_unlock_irqrestore(&rtc_lock, flags);
268 return status;
269}
270
271int efi_set_rtc_mmss(const struct timespec *now) 249int efi_set_rtc_mmss(const struct timespec *now)
272{ 250{
273 unsigned long nowtime = now->tv_sec; 251 unsigned long nowtime = now->tv_sec;
274 efi_status_t status; 252 efi_status_t status;
275 efi_time_t eft; 253 efi_time_t eft;
276 efi_time_cap_t cap; 254 efi_time_cap_t cap;
277 struct rtc_time tm; 255 struct rtc_time tm;
278 256
279 status = efi.get_time(&eft, &cap); 257 status = efi.get_time(&eft, &cap);
@@ -291,9 +269,8 @@ int efi_set_rtc_mmss(const struct timespec *now)
291 eft.second = tm.tm_sec; 269 eft.second = tm.tm_sec;
292 eft.nanosecond = 0; 270 eft.nanosecond = 0;
293 } else { 271 } else {
294 printk(KERN_ERR 272 pr_err("%s: Invalid EFI RTC value: write of %lx to EFI RTC failed\n",
295 "%s: Invalid EFI RTC value: write of %lx to EFI RTC failed\n", 273 __func__, nowtime);
296 __FUNCTION__, nowtime);
297 return -1; 274 return -1;
298 } 275 }
299 276
@@ -398,9 +375,9 @@ int __init efi_memblock_x86_reserve_range(void)
398 return 0; 375 return 0;
399} 376}
400 377
401#if EFI_DEBUG
402static void __init print_efi_memmap(void) 378static void __init print_efi_memmap(void)
403{ 379{
380#ifdef EFI_DEBUG
404 efi_memory_desc_t *md; 381 efi_memory_desc_t *md;
405 void *p; 382 void *p;
406 int i; 383 int i;
@@ -409,14 +386,13 @@ static void __init print_efi_memmap(void)
409 p < memmap.map_end; 386 p < memmap.map_end;
410 p += memmap.desc_size, i++) { 387 p += memmap.desc_size, i++) {
411 md = p; 388 md = p;
412 pr_info("mem%02u: type=%u, attr=0x%llx, " 389 pr_info("mem%02u: type=%u, attr=0x%llx, range=[0x%016llx-0x%016llx) (%lluMB)\n",
413 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
414 i, md->type, md->attribute, md->phys_addr, 390 i, md->type, md->attribute, md->phys_addr,
415 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), 391 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
416 (md->num_pages >> (20 - EFI_PAGE_SHIFT))); 392 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
417 } 393 }
418}
419#endif /* EFI_DEBUG */ 394#endif /* EFI_DEBUG */
395}
420 396
421void __init efi_reserve_boot_services(void) 397void __init efi_reserve_boot_services(void)
422{ 398{
@@ -436,15 +412,14 @@ void __init efi_reserve_boot_services(void)
436 * - Not within any part of the kernel 412 * - Not within any part of the kernel
437 * - Not the bios reserved area 413 * - Not the bios reserved area
438 */ 414 */
439 if ((start+size >= __pa_symbol(_text) 415 if ((start + size > __pa_symbol(_text)
440 && start <= __pa_symbol(_end)) || 416 && start <= __pa_symbol(_end)) ||
441 !e820_all_mapped(start, start+size, E820_RAM) || 417 !e820_all_mapped(start, start+size, E820_RAM) ||
442 memblock_is_region_reserved(start, size)) { 418 memblock_is_region_reserved(start, size)) {
443 /* Could not reserve, skip it */ 419 /* Could not reserve, skip it */
444 md->num_pages = 0; 420 md->num_pages = 0;
445 memblock_dbg("Could not reserve boot range " 421 memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n",
446 "[0x%010llx-0x%010llx]\n", 422 start, start+size-1);
447 start, start+size-1);
448 } else 423 } else
449 memblock_reserve(start, size); 424 memblock_reserve(start, size);
450 } 425 }
@@ -452,7 +427,7 @@ void __init efi_reserve_boot_services(void)
452 427
453void __init efi_unmap_memmap(void) 428void __init efi_unmap_memmap(void)
454{ 429{
455 clear_bit(EFI_MEMMAP, &x86_efi_facility); 430 clear_bit(EFI_MEMMAP, &efi.flags);
456 if (memmap.map) { 431 if (memmap.map) {
457 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); 432 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
458 memmap.map = NULL; 433 memmap.map = NULL;
@@ -463,9 +438,6 @@ void __init efi_free_boot_services(void)
463{ 438{
464 void *p; 439 void *p;
465 440
466 if (!efi_is_native())
467 return;
468
469 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 441 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
470 efi_memory_desc_t *md = p; 442 efi_memory_desc_t *md = p;
471 unsigned long long start = md->phys_addr; 443 unsigned long long start = md->phys_addr;
@@ -489,18 +461,27 @@ static int __init efi_systab_init(void *phys)
489{ 461{
490 if (efi_enabled(EFI_64BIT)) { 462 if (efi_enabled(EFI_64BIT)) {
491 efi_system_table_64_t *systab64; 463 efi_system_table_64_t *systab64;
464 struct efi_setup_data *data = NULL;
492 u64 tmp = 0; 465 u64 tmp = 0;
493 466
467 if (efi_setup) {
468 data = early_memremap(efi_setup, sizeof(*data));
469 if (!data)
470 return -ENOMEM;
471 }
494 systab64 = early_ioremap((unsigned long)phys, 472 systab64 = early_ioremap((unsigned long)phys,
495 sizeof(*systab64)); 473 sizeof(*systab64));
496 if (systab64 == NULL) { 474 if (systab64 == NULL) {
497 pr_err("Couldn't map the system table!\n"); 475 pr_err("Couldn't map the system table!\n");
476 if (data)
477 early_iounmap(data, sizeof(*data));
498 return -ENOMEM; 478 return -ENOMEM;
499 } 479 }
500 480
501 efi_systab.hdr = systab64->hdr; 481 efi_systab.hdr = systab64->hdr;
502 efi_systab.fw_vendor = systab64->fw_vendor; 482 efi_systab.fw_vendor = data ? (unsigned long)data->fw_vendor :
503 tmp |= systab64->fw_vendor; 483 systab64->fw_vendor;
484 tmp |= data ? data->fw_vendor : systab64->fw_vendor;
504 efi_systab.fw_revision = systab64->fw_revision; 485 efi_systab.fw_revision = systab64->fw_revision;
505 efi_systab.con_in_handle = systab64->con_in_handle; 486 efi_systab.con_in_handle = systab64->con_in_handle;
506 tmp |= systab64->con_in_handle; 487 tmp |= systab64->con_in_handle;
@@ -514,15 +495,20 @@ static int __init efi_systab_init(void *phys)
514 tmp |= systab64->stderr_handle; 495 tmp |= systab64->stderr_handle;
515 efi_systab.stderr = systab64->stderr; 496 efi_systab.stderr = systab64->stderr;
516 tmp |= systab64->stderr; 497 tmp |= systab64->stderr;
517 efi_systab.runtime = (void *)(unsigned long)systab64->runtime; 498 efi_systab.runtime = data ?
518 tmp |= systab64->runtime; 499 (void *)(unsigned long)data->runtime :
500 (void *)(unsigned long)systab64->runtime;
501 tmp |= data ? data->runtime : systab64->runtime;
519 efi_systab.boottime = (void *)(unsigned long)systab64->boottime; 502 efi_systab.boottime = (void *)(unsigned long)systab64->boottime;
520 tmp |= systab64->boottime; 503 tmp |= systab64->boottime;
521 efi_systab.nr_tables = systab64->nr_tables; 504 efi_systab.nr_tables = systab64->nr_tables;
522 efi_systab.tables = systab64->tables; 505 efi_systab.tables = data ? (unsigned long)data->tables :
523 tmp |= systab64->tables; 506 systab64->tables;
507 tmp |= data ? data->tables : systab64->tables;
524 508
525 early_iounmap(systab64, sizeof(*systab64)); 509 early_iounmap(systab64, sizeof(*systab64));
510 if (data)
511 early_iounmap(data, sizeof(*data));
526#ifdef CONFIG_X86_32 512#ifdef CONFIG_X86_32
527 if (tmp >> 32) { 513 if (tmp >> 32) {
528 pr_err("EFI data located above 4GB, disabling EFI.\n"); 514 pr_err("EFI data located above 4GB, disabling EFI.\n");
@@ -566,45 +552,82 @@ static int __init efi_systab_init(void *phys)
566 return -EINVAL; 552 return -EINVAL;
567 } 553 }
568 if ((efi.systab->hdr.revision >> 16) == 0) 554 if ((efi.systab->hdr.revision >> 16) == 0)
569 pr_err("Warning: System table version " 555 pr_err("Warning: System table version %d.%02d, expected 1.00 or greater!\n",
570 "%d.%02d, expected 1.00 or greater!\n",
571 efi.systab->hdr.revision >> 16, 556 efi.systab->hdr.revision >> 16,
572 efi.systab->hdr.revision & 0xffff); 557 efi.systab->hdr.revision & 0xffff);
573 558
559 set_bit(EFI_SYSTEM_TABLES, &efi.flags);
560
574 return 0; 561 return 0;
575} 562}
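For reference, a minimal sketch of the setup_data payload the hunk above consumes. The field set is inferred from the accesses shown (fw_vendor, runtime, tables, plus the smbios member used further down); the reserved tail is an assumption:

	/* physical addresses saved by the first kernel for a kexec boot (sketch) */
	struct efi_setup_data {
		u64 fw_vendor;		/* physical address of the firmware vendor string */
		u64 runtime;		/* physical address of the runtime services table */
		u64 tables;		/* physical address of the config table array */
		u64 smbios;		/* physical address of the SMBIOS table */
		u64 reserved[8];	/* assumed spare room */
	};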
576 563
577static int __init efi_runtime_init(void) 564static int __init efi_runtime_init32(void)
578{ 565{
579 efi_runtime_services_t *runtime; 566 efi_runtime_services_32_t *runtime;
567
568 runtime = early_ioremap((unsigned long)efi.systab->runtime,
569 sizeof(efi_runtime_services_32_t));
570 if (!runtime) {
571 pr_err("Could not map the runtime service table!\n");
572 return -ENOMEM;
573 }
580 574
581 /* 575 /*
582 * Check out the runtime services table. We need to map 576 * We will only need *early* access to the following two
583 * the runtime services table so that we can grab the physical 577 * EFI runtime services before set_virtual_address_map
584 * address of several of the EFI runtime functions, needed to 578 * is invoked.
585 * set the firmware into virtual mode.
586 */ 579 */
580 efi_phys.set_virtual_address_map =
581 (efi_set_virtual_address_map_t *)
582 (unsigned long)runtime->set_virtual_address_map;
583 early_iounmap(runtime, sizeof(efi_runtime_services_32_t));
584
585 return 0;
586}
587
588static int __init efi_runtime_init64(void)
589{
590 efi_runtime_services_64_t *runtime;
591
587 runtime = early_ioremap((unsigned long)efi.systab->runtime, 592 runtime = early_ioremap((unsigned long)efi.systab->runtime,
588 sizeof(efi_runtime_services_t)); 593 sizeof(efi_runtime_services_64_t));
589 if (!runtime) { 594 if (!runtime) {
590 pr_err("Could not map the runtime service table!\n"); 595 pr_err("Could not map the runtime service table!\n");
591 return -ENOMEM; 596 return -ENOMEM;
592 } 597 }
598
593 /* 599 /*
594 * We will only need *early* access to the following 600 * We will only need *early* access to the following two
595 * two EFI runtime services before set_virtual_address_map 601 * EFI runtime services before set_virtual_address_map
596 * is invoked. 602 * is invoked.
597 */ 603 */
598 efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
599 efi_phys.set_virtual_address_map = 604 efi_phys.set_virtual_address_map =
600 (efi_set_virtual_address_map_t *) 605 (efi_set_virtual_address_map_t *)
601 runtime->set_virtual_address_map; 606 (unsigned long)runtime->set_virtual_address_map;
607 early_iounmap(runtime, sizeof(efi_runtime_services_64_t));
608
609 return 0;
610}
611
612static int __init efi_runtime_init(void)
613{
614 int rv;
615
602 /* 616 /*
603 * Make efi_get_time can be called before entering 617 * Check out the runtime services table. We need to map
604 * virtual mode. 618 * the runtime services table so that we can grab the physical
619 * address of several of the EFI runtime functions, needed to
620 * set the firmware into virtual mode.
605 */ 621 */
606 efi.get_time = phys_efi_get_time; 622 if (efi_enabled(EFI_64BIT))
607 early_iounmap(runtime, sizeof(efi_runtime_services_t)); 623 rv = efi_runtime_init64();
624 else
625 rv = efi_runtime_init32();
626
627 if (rv)
628 return rv;
629
630 set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
608 631
609 return 0; 632 return 0;
610} 633}
@@ -623,9 +646,67 @@ static int __init efi_memmap_init(void)
623 if (add_efi_memmap) 646 if (add_efi_memmap)
624 do_add_efi_memmap(); 647 do_add_efi_memmap();
625 648
649 set_bit(EFI_MEMMAP, &efi.flags);
650
626 return 0; 651 return 0;
627} 652}
628 653
654/*
655 * A number of config table entries get remapped to virtual addresses
656 * after entering EFI virtual mode. However, the kexec kernel requires
 657 * their physical addresses, so we pass them via setup_data and
658 * correct those entries to their respective physical addresses here.
659 *
 660 * Currently this only handles smbios, which is necessary for some
 661 * firmware implementations.
662 */
663static int __init efi_reuse_config(u64 tables, int nr_tables)
664{
665 int i, sz, ret = 0;
666 void *p, *tablep;
667 struct efi_setup_data *data;
668
669 if (!efi_setup)
670 return 0;
671
672 if (!efi_enabled(EFI_64BIT))
673 return 0;
674
675 data = early_memremap(efi_setup, sizeof(*data));
676 if (!data) {
677 ret = -ENOMEM;
678 goto out;
679 }
680
681 if (!data->smbios)
682 goto out_memremap;
683
684 sz = sizeof(efi_config_table_64_t);
685
686 p = tablep = early_memremap(tables, nr_tables * sz);
687 if (!p) {
688 pr_err("Could not map Configuration table!\n");
689 ret = -ENOMEM;
690 goto out_memremap;
691 }
692
693 for (i = 0; i < efi.systab->nr_tables; i++) {
694 efi_guid_t guid;
695
696 guid = ((efi_config_table_64_t *)p)->guid;
697
698 if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID))
699 ((efi_config_table_64_t *)p)->table = data->smbios;
700 p += sz;
701 }
702 early_iounmap(tablep, nr_tables * sz);
703
704out_memremap:
705 early_iounmap(data, sizeof(*data));
706out:
707 return ret;
708}
709
629void __init efi_init(void) 710void __init efi_init(void)
630{ 711{
631 efi_char16_t *c16; 712 efi_char16_t *c16;
@@ -649,7 +730,11 @@ void __init efi_init(void)
649 if (efi_systab_init(efi_phys.systab)) 730 if (efi_systab_init(efi_phys.systab))
650 return; 731 return;
651 732
652 set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility); 733 set_bit(EFI_SYSTEM_TABLES, &efi.flags);
734
735 efi.config_table = (unsigned long)efi.systab->tables;
736 efi.fw_vendor = (unsigned long)efi.systab->fw_vendor;
737 efi.runtime = (unsigned long)efi.systab->runtime;
653 738
654 /* 739 /*
655 * Show what we know for posterity 740 * Show what we know for posterity
@@ -667,32 +752,29 @@ void __init efi_init(void)
667 efi.systab->hdr.revision >> 16, 752 efi.systab->hdr.revision >> 16,
668 efi.systab->hdr.revision & 0xffff, vendor); 753 efi.systab->hdr.revision & 0xffff, vendor);
669 754
670 if (efi_config_init(arch_tables)) 755 if (efi_reuse_config(efi.systab->tables, efi.systab->nr_tables))
671 return; 756 return;
672 757
673 set_bit(EFI_CONFIG_TABLES, &x86_efi_facility); 758 if (efi_config_init(arch_tables))
759 return;
674 760
675 /* 761 /*
676 * Note: We currently don't support runtime services on an EFI 762 * Note: We currently don't support runtime services on an EFI
677 * that doesn't match the kernel 32/64-bit mode. 763 * that doesn't match the kernel 32/64-bit mode.
678 */ 764 */
679 765
680 if (!efi_is_native()) 766 if (!efi_runtime_supported())
681 pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); 767 pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n");
682 else { 768 else {
683 if (disable_runtime || efi_runtime_init()) 769 if (disable_runtime || efi_runtime_init())
684 return; 770 return;
685 set_bit(EFI_RUNTIME_SERVICES, &x86_efi_facility);
686 } 771 }
687
688 if (efi_memmap_init()) 772 if (efi_memmap_init())
689 return; 773 return;
690 774
691 set_bit(EFI_MEMMAP, &x86_efi_facility); 775 set_bit(EFI_MEMMAP, &efi.flags);
692 776
693#if EFI_DEBUG
694 print_efi_memmap(); 777 print_efi_memmap();
695#endif
696} 778}
697 779
698void __init efi_late_init(void) 780void __init efi_late_init(void)
@@ -715,7 +797,7 @@ void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
715 set_memory_nx(addr, npages); 797 set_memory_nx(addr, npages);
716} 798}
717 799
718static void __init runtime_code_page_mkexec(void) 800void __init runtime_code_page_mkexec(void)
719{ 801{
720 efi_memory_desc_t *md; 802 efi_memory_desc_t *md;
721 void *p; 803 void *p;
@@ -741,36 +823,54 @@ void efi_memory_uc(u64 addr, unsigned long size)
741 set_memory_uc(addr, npages); 823 set_memory_uc(addr, npages);
742} 824}
743 825
744/* 826void __init old_map_region(efi_memory_desc_t *md)
745 * This function will switch the EFI runtime services to virtual mode.
746 * Essentially, look through the EFI memmap and map every region that
747 * has the runtime attribute bit set in its memory descriptor and update
748 * that memory descriptor with the virtual address obtained from ioremap().
749 * This enables the runtime services to be called without having to
750 * thunk back into physical mode for every invocation.
751 */
752void __init efi_enter_virtual_mode(void)
753{ 827{
754 efi_memory_desc_t *md, *prev_md = NULL; 828 u64 start_pfn, end_pfn, end;
755 efi_status_t status;
756 unsigned long size; 829 unsigned long size;
757 u64 end, systab, start_pfn, end_pfn; 830 void *va;
758 void *p, *va, *new_memmap = NULL;
759 int count = 0;
760 831
761 efi.systab = NULL; 832 start_pfn = PFN_DOWN(md->phys_addr);
833 size = md->num_pages << PAGE_SHIFT;
834 end = md->phys_addr + size;
835 end_pfn = PFN_UP(end);
762 836
763 /* 837 if (pfn_range_is_mapped(start_pfn, end_pfn)) {
764 * We don't do virtual mode, since we don't do runtime services, on 838 va = __va(md->phys_addr);
765 * non-native EFI
766 */
767 839
768 if (!efi_is_native()) { 840 if (!(md->attribute & EFI_MEMORY_WB))
769 efi_unmap_memmap(); 841 efi_memory_uc((u64)(unsigned long)va, size);
770 return; 842 } else
771 } 843 va = efi_ioremap(md->phys_addr, size,
844 md->type, md->attribute);
845
846 md->virt_addr = (u64) (unsigned long) va;
847 if (!va)
848 pr_err("ioremap of 0x%llX failed!\n",
849 (unsigned long long)md->phys_addr);
850}
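old_map_region() above keeps the previous ioremap()-based mapping behaviour. The PFN helpers round the physical range outward to whole pages; a quick sketch of that rounding with made-up numbers (PAGE_SHIFT is 12 on x86):

	/* region at 0x10000800 spanning 0x2000 bytes covers pfns 0x10000..0x10002 */
	unsigned long start_pfn = 0x10000800UL >> 12;			/* PFN_DOWN: 0x10000 */
	unsigned long end_pfn = (0x10000800UL + 0x2000 + 0xfff) >> 12;	/* PFN_UP: 0x10003, exclusive */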
851
852static void native_runtime_setup(void)
853{
854 efi.get_time = virt_efi_get_time;
855 efi.set_time = virt_efi_set_time;
856 efi.get_wakeup_time = virt_efi_get_wakeup_time;
857 efi.set_wakeup_time = virt_efi_set_wakeup_time;
858 efi.get_variable = virt_efi_get_variable;
859 efi.get_next_variable = virt_efi_get_next_variable;
860 efi.set_variable = virt_efi_set_variable;
861 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
862 efi.reset_system = virt_efi_reset_system;
863 efi.query_variable_info = virt_efi_query_variable_info;
864 efi.update_capsule = virt_efi_update_capsule;
865 efi.query_capsule_caps = virt_efi_query_capsule_caps;
866}
867
868/* Merge contiguous regions of the same type and attribute */
869static void __init efi_merge_regions(void)
870{
871 void *p;
872 efi_memory_desc_t *md, *prev_md = NULL;
772 873
773 /* Merge contiguous regions of the same type and attribute */
774 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 874 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
775 u64 prev_size; 875 u64 prev_size;
776 md = p; 876 md = p;
@@ -796,6 +896,84 @@ void __init efi_enter_virtual_mode(void)
796 } 896 }
797 prev_md = md; 897 prev_md = md;
798 } 898 }
899}
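The loop body elided by the hunk above boils down to roughly the following sketch: extend prev_md when md starts exactly where it ends and shares type and attributes, and neutralize the swallowed descriptor:

	if (prev_md &&
	    prev_md->type == md->type &&
	    prev_md->attribute == md->attribute) {
		prev_size = prev_md->num_pages << EFI_PAGE_SHIFT;
		if (md->phys_addr == prev_md->phys_addr + prev_size) {
			prev_md->num_pages += md->num_pages;
			md->type = EFI_RESERVED_TYPE;	/* mark md as swallowed */
			md->attribute = 0;
			continue;
		}
	}
	prev_md = md;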
900
901static void __init get_systab_virt_addr(efi_memory_desc_t *md)
902{
903 unsigned long size;
904 u64 end, systab;
905
906 size = md->num_pages << EFI_PAGE_SHIFT;
907 end = md->phys_addr + size;
908 systab = (u64)(unsigned long)efi_phys.systab;
909 if (md->phys_addr <= systab && systab < end) {
910 systab += md->virt_addr - md->phys_addr;
911 efi.systab = (efi_system_table_t *)(unsigned long)systab;
912 }
913}
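The relocation above simply applies the owning region's phys-to-virt delta to the system table pointer. With assumed numbers:

	/* systab at phys 0x7f8e5018, region mapped 0x7f8e0000 -> 0xffffffef00000000 */
	u64 delta = 0xffffffef00000000ULL - 0x7f8e0000ULL;
	u64 systab = 0x7f8e5018ULL + delta;	/* 0xffffffef00005018 */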
914
915static void __init save_runtime_map(void)
916{
917#ifdef CONFIG_KEXEC
918 efi_memory_desc_t *md;
919 void *tmp, *p, *q = NULL;
920 int count = 0;
921
922 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
923 md = p;
924
925 if (!(md->attribute & EFI_MEMORY_RUNTIME) ||
926 (md->type == EFI_BOOT_SERVICES_CODE) ||
927 (md->type == EFI_BOOT_SERVICES_DATA))
928 continue;
929 tmp = krealloc(q, (count + 1) * memmap.desc_size, GFP_KERNEL);
930 if (!tmp)
931 goto out;
932 q = tmp;
933
934 memcpy(q + count * memmap.desc_size, md, memmap.desc_size);
935 count++;
936 }
937
938 efi_runtime_map_setup(q, count, memmap.desc_size);
939 return;
940
941out:
942 kfree(q);
943 pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n");
944#endif
945}
946
947static void *realloc_pages(void *old_memmap, int old_shift)
948{
949 void *ret;
950
951 ret = (void *)__get_free_pages(GFP_KERNEL, old_shift + 1);
952 if (!ret)
953 goto out;
954
955 /*
956 * A first-time allocation doesn't have anything to copy.
957 */
958 if (!old_memmap)
959 return ret;
960
961 memcpy(ret, old_memmap, PAGE_SIZE << old_shift);
962
963out:
964 free_pages((unsigned long)old_memmap, old_shift);
965 return ret;
966}
967
968/*
 969 * Map the EFI memory ranges of the runtime services and update
 970 * new_memmap with their virtual addresses.
971 */
972static void * __init efi_map_regions(int *count, int *pg_shift)
973{
974 void *p, *new_memmap = NULL;
975 unsigned long left = 0;
976 efi_memory_desc_t *md;
799 977
800 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 978 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
801 md = p; 979 md = p;
@@ -807,52 +985,150 @@ void __init efi_enter_virtual_mode(void)
807 continue; 985 continue;
808 } 986 }
809 987
810 size = md->num_pages << EFI_PAGE_SHIFT; 988 efi_map_region(md);
811 end = md->phys_addr + size; 989 get_systab_virt_addr(md);
812 990
813 start_pfn = PFN_DOWN(md->phys_addr); 991 if (left < memmap.desc_size) {
814 end_pfn = PFN_UP(end); 992 new_memmap = realloc_pages(new_memmap, *pg_shift);
815 if (pfn_range_is_mapped(start_pfn, end_pfn)) { 993 if (!new_memmap)
816 va = __va(md->phys_addr); 994 return NULL;
817 995
818 if (!(md->attribute & EFI_MEMORY_WB)) 996 left += PAGE_SIZE << *pg_shift;
819 efi_memory_uc((u64)(unsigned long)va, size); 997 (*pg_shift)++;
820 } else
821 va = efi_ioremap(md->phys_addr, size,
822 md->type, md->attribute);
823
824 md->virt_addr = (u64) (unsigned long) va;
825
826 if (!va) {
827 pr_err("ioremap of 0x%llX failed!\n",
828 (unsigned long long)md->phys_addr);
829 continue;
830 } 998 }
831 999
832 systab = (u64) (unsigned long) efi_phys.systab; 1000 memcpy(new_memmap + (*count * memmap.desc_size), md,
833 if (md->phys_addr <= systab && systab < end) {
834 systab += md->virt_addr - md->phys_addr;
835 efi.systab = (efi_system_table_t *) (unsigned long) systab;
836 }
837 new_memmap = krealloc(new_memmap,
838 (count + 1) * memmap.desc_size,
839 GFP_KERNEL);
840 memcpy(new_memmap + (count * memmap.desc_size), md,
841 memmap.desc_size); 1001 memmap.desc_size);
842 count++; 1002
1003 left -= memmap.desc_size;
1004 (*count)++;
1005 }
1006
1007 return new_memmap;
1008}
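The left/pg_shift bookkeeping above grows the output buffer geometrically: whenever fewer than desc_size bytes remain, realloc_pages() doubles the allocation and left is credited with the size before the doubling. A sanity check of the arithmetic, assuming the common 48-byte descriptor size:

	int per_page = 4096 / 48;	/* 85 descriptors fit in one 4K page */
	int per_2pages = 8192 / 48;	/* 170 after the first doubling */

Note that the very first pass credits only PAGE_SIZE to 'left' even though the fresh order-1 allocation is two pages; that is merely conservative, not a bug.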
1009
1010static void __init kexec_enter_virtual_mode(void)
1011{
1012#ifdef CONFIG_KEXEC
1013 efi_memory_desc_t *md;
1014 void *p;
1015
1016 efi.systab = NULL;
1017
1018 /*
1019 * We don't do virtual mode, since we don't do runtime services, on
1020 * non-native EFI
1021 */
1022 if (!efi_is_native()) {
1023 efi_unmap_memmap();
1024 return;
1025 }
1026
1027 /*
 1028 * Map the EFI regions which were passed via setup_data. The virt_addr is a
 1029 * fixed address which was used in the first kernel of a kexec boot.
1030 */
1031 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
1032 md = p;
1033 efi_map_region_fixed(md); /* FIXME: add error handling */
1034 get_systab_virt_addr(md);
1035 }
1036
1037 save_runtime_map();
1038
1039 BUG_ON(!efi.systab);
1040
1041 efi_sync_low_kernel_mappings();
1042
1043 /*
1044 * Now that EFI is in virtual mode, update the function
1045 * pointers in the runtime service table to the new virtual addresses.
1046 *
1047 * Call EFI services through wrapper functions.
1048 */
1049 efi.runtime_version = efi_systab.hdr.revision;
1050
1051 native_runtime_setup();
1052
1053 efi.set_virtual_address_map = NULL;
1054
1055 if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX))
1056 runtime_code_page_mkexec();
1057
1058 /* clean DUMMY object */
1059 efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
1060 EFI_VARIABLE_NON_VOLATILE |
1061 EFI_VARIABLE_BOOTSERVICE_ACCESS |
1062 EFI_VARIABLE_RUNTIME_ACCESS,
1063 0, NULL);
1064#endif
1065}
1066
1067/*
1068 * This function will switch the EFI runtime services to virtual mode.
1069 * Essentially, we look through the EFI memmap and map every region that
1070 * has the runtime attribute bit set in its memory descriptor into the
1071 * ->trampoline_pgd page table using a top-down VA allocation scheme.
1072 *
1073 * The old method which used to update that memory descriptor with the
1074 * virtual address obtained from ioremap() is still supported when the
 1075 * kernel is booted with efi=old_map on its command line. That old
 1076 * method enabled the runtime services to be called without having to
1077 * thunk back into physical mode for every invocation.
1078 *
1079 * The new method does a pagetable switch in a preemption-safe manner
1080 * so that we're in a different address space when calling a runtime
 1081 * function. For passing function arguments we copy the PGDs of the
 1082 * kernel page table into ->trampoline_pgd prior to each call.
1083 *
 1084 * For a kexec boot specifically, the EFI runtime maps of the previous
 1085 * kernel should be passed in via setup_data. In that case the runtime
 1086 * ranges will be mapped to the same virtual addresses as in the first
 1087 * kernel, see kexec_enter_virtual_mode().
1088 */
1089static void __init __efi_enter_virtual_mode(void)
1090{
1091 int count = 0, pg_shift = 0;
1092 void *new_memmap = NULL;
1093 efi_status_t status;
1094
1095 efi.systab = NULL;
1096
1097 efi_merge_regions();
1098 new_memmap = efi_map_regions(&count, &pg_shift);
1099 if (!new_memmap) {
1100 pr_err("Error reallocating memory, EFI runtime non-functional!\n");
1101 return;
843 } 1102 }
844 1103
1104 save_runtime_map();
1105
845 BUG_ON(!efi.systab); 1106 BUG_ON(!efi.systab);
846 1107
847 status = phys_efi_set_virtual_address_map( 1108 if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift))
848 memmap.desc_size * count, 1109 return;
849 memmap.desc_size, 1110
850 memmap.desc_version, 1111 efi_sync_low_kernel_mappings();
851 (efi_memory_desc_t *)__pa(new_memmap)); 1112 efi_dump_pagetable();
1113
1114 if (efi_is_native()) {
1115 status = phys_efi_set_virtual_address_map(
1116 memmap.desc_size * count,
1117 memmap.desc_size,
1118 memmap.desc_version,
1119 (efi_memory_desc_t *)__pa(new_memmap));
1120 } else {
1121 status = efi_thunk_set_virtual_address_map(
1122 efi_phys.set_virtual_address_map,
1123 memmap.desc_size * count,
1124 memmap.desc_size,
1125 memmap.desc_version,
1126 (efi_memory_desc_t *)__pa(new_memmap));
1127 }
852 1128
853 if (status != EFI_SUCCESS) { 1129 if (status != EFI_SUCCESS) {
854 pr_alert("Unable to switch EFI into virtual mode " 1130 pr_alert("Unable to switch EFI into virtual mode (status=%lx)!\n",
855 "(status=%lx)!\n", status); 1131 status);
856 panic("EFI call to SetVirtualAddressMap() failed!"); 1132 panic("EFI call to SetVirtualAddressMap() failed!");
857 } 1133 }
858 1134
@@ -863,23 +1139,43 @@ void __init efi_enter_virtual_mode(void)
863 * Call EFI services through wrapper functions. 1139 * Call EFI services through wrapper functions.
864 */ 1140 */
865 efi.runtime_version = efi_systab.hdr.revision; 1141 efi.runtime_version = efi_systab.hdr.revision;
866 efi.get_time = virt_efi_get_time; 1142
867 efi.set_time = virt_efi_set_time; 1143 if (efi_is_native())
868 efi.get_wakeup_time = virt_efi_get_wakeup_time; 1144 native_runtime_setup();
869 efi.set_wakeup_time = virt_efi_set_wakeup_time; 1145 else
870 efi.get_variable = virt_efi_get_variable; 1146 efi_thunk_runtime_setup();
871 efi.get_next_variable = virt_efi_get_next_variable; 1147
872 efi.set_variable = virt_efi_set_variable;
873 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
874 efi.reset_system = virt_efi_reset_system;
875 efi.set_virtual_address_map = NULL; 1148 efi.set_virtual_address_map = NULL;
876 efi.query_variable_info = virt_efi_query_variable_info;
877 efi.update_capsule = virt_efi_update_capsule;
878 efi.query_capsule_caps = virt_efi_query_capsule_caps;
879 if (__supported_pte_mask & _PAGE_NX)
880 runtime_code_page_mkexec();
881 1149
882 kfree(new_memmap); 1150 efi_runtime_mkexec();
1151
1152 /*
1153 * We mapped the descriptor array into the EFI pagetable above but we're
1154 * not unmapping it here. Here's why:
1155 *
1156 * We're copying select PGDs from the kernel page table to the EFI page
 1157 * table, so when we then make changes to those PGDs, like unmapping
 1158 * something from them, those changes appear in the kernel page table
 1159 * and we go boom.
1160 *
1161 * From setup_real_mode():
1162 *
1163 * ...
1164 * trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
1165 *
1166 * In this particular case, our allocation is in PGD 0 of the EFI page
 1167 * table, but that PGD is a copy of PGD[272] of the kernel page table:
1168 *
1169 * pgd_index(__PAGE_OFFSET = 0xffff880000000000) = 272
1170 *
1171 * where the direct memory mapping in kernel space is.
1172 *
 1173 * new_memmap's VA comes from that direct mapping, and thus clearing it
 1174 * here would clear it in the kernel page table too.
1175 *
1176 * efi_cleanup_page_tables(__pa(new_memmap), 1 << pg_shift);
1177 */
1178 free_pages((unsigned long)new_memmap, pg_shift);
883 1179
884 /* clean DUMMY object */ 1180 /* clean DUMMY object */
885 efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, 1181 efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
@@ -889,6 +1185,14 @@ void __init efi_enter_virtual_mode(void)
889 0, NULL); 1185 0, NULL);
890} 1186}
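The PGD[272] figure in the comment above falls straight out of 4-level paging, where pgd_index() takes bits 47:39 of the virtual address:

	unsigned long va = 0xffff880000000000UL;	/* __PAGE_OFFSET */
	unsigned long idx = (va >> 39) & 511;		/* 272 */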
891 1187
1188void __init efi_enter_virtual_mode(void)
1189{
1190 if (efi_setup)
1191 kexec_enter_virtual_mode();
1192 else
1193 __efi_enter_virtual_mode();
1194}
1195
892/* 1196/*
893 * Convenience functions to obtain memory types and attributes 1197 * Convenience functions to obtain memory types and attributes
894 */ 1198 */
@@ -926,9 +1230,8 @@ u64 efi_mem_attributes(unsigned long phys_addr)
926} 1230}
927 1231
928/* 1232/*
929 * Some firmware has serious problems when using more than 50% of the EFI 1233 * Some firmware implementations refuse to boot if there's insufficient space
930 * variable store, i.e. it triggers bugs that can brick machines. Ensure that 1234 * in the variable store. Ensure that we never use more than a safe limit.
931 * we never use more than this safe limit.
932 * 1235 *
933 * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable 1236 * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable
934 * store. 1237 * store.
@@ -947,10 +1250,9 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
947 return status; 1250 return status;
948 1251
949 /* 1252 /*
950 * Some firmware implementations refuse to boot if there's insufficient 1253 * We account for that by refusing the write if permitting it would
951 * space in the variable store. We account for that by refusing the 1254 * reduce the available space to under 5KB. This figure was provided by
952 * write if permitting it would reduce the available space to under 1255 * Samsung, so should be safe.
953 * 5KB. This figure was provided by Samsung, so should be safe.
954 */ 1256 */
955 if ((remaining_size - size < EFI_MIN_RESERVE) && 1257 if ((remaining_size - size < EFI_MIN_RESERVE) &&
956 !efi_no_storage_paranoia) { 1258 !efi_no_storage_paranoia) {
@@ -1006,3 +1308,34 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
1006 return EFI_SUCCESS; 1308 return EFI_SUCCESS;
1007} 1309}
1008EXPORT_SYMBOL_GPL(efi_query_variable_store); 1310EXPORT_SYMBOL_GPL(efi_query_variable_store);
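To make the guard concrete, assuming EFI_MIN_RESERVE is the 5KB (5120 byte) reserve described above:

	/* 6KB free, 2KB write: 6144 - 2048 = 4096 < 5120, refused */
	/* 6KB free, 1KB write: 6144 - 1024 = 5120, allowed */
	bool refused = (remaining_size - size < 5120) && !efi_no_storage_paranoia;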
1311
1312static int __init parse_efi_cmdline(char *str)
1313{
1314 if (*str == '=')
1315 str++;
1316
1317 if (!strncmp(str, "old_map", 7))
1318 set_bit(EFI_OLD_MEMMAP, &efi.flags);
1319
1320 return 0;
1321}
1322early_param("efi", parse_efi_cmdline);
1323
1324void __init efi_apply_memmap_quirks(void)
1325{
1326 /*
1327 * Once setup is done earlier, unmap the EFI memory map on mismatched
1328 * firmware/kernel architectures since there is no support for runtime
1329 * services.
1330 */
1331 if (!efi_runtime_supported()) {
1332 pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
1333 efi_unmap_memmap();
1334 }
1335
1336 /*
1337 * UV doesn't support the new EFI pagetable mapping yet.
1338 */
1339 if (is_uv_system())
1340 set_bit(EFI_OLD_MEMMAP, &efi.flags);
1341}
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 40e446941dd7..9ee3491e31fb 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -37,9 +37,24 @@
37 * claim EFI runtime service handler exclusively and to duplicate a memory in 37 * claim EFI runtime service handler exclusively and to duplicate a memory in
38 * low memory space say 0 - 3G. 38 * low memory space say 0 - 3G.
39 */ 39 */
40
41static unsigned long efi_rt_eflags; 40static unsigned long efi_rt_eflags;
42 41
42void efi_sync_low_kernel_mappings(void) {}
43void __init efi_dump_pagetable(void) {}
44int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
45{
46 return 0;
47}
48void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) {}
49
50void __init efi_map_region(efi_memory_desc_t *md)
51{
52 old_map_region(md);
53}
54
55void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
56void __init parse_efi_setup(u64 phys_addr, u32 data_len) {}
57
43void efi_call_phys_prelog(void) 58void efi_call_phys_prelog(void)
44{ 59{
45 struct desc_ptr gdt_descr; 60 struct desc_ptr gdt_descr;
@@ -67,3 +82,9 @@ void efi_call_phys_epilog(void)
67 82
68 local_irq_restore(efi_rt_eflags); 83 local_irq_restore(efi_rt_eflags);
69} 84}
85
86void __init efi_runtime_mkexec(void)
87{
88 if (__supported_pte_mask & _PAGE_NX)
89 runtime_code_page_mkexec();
90}
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 39a0e7f1f0a3..290d397e1dd9 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -38,10 +38,30 @@
38#include <asm/efi.h> 38#include <asm/efi.h>
39#include <asm/cacheflush.h> 39#include <asm/cacheflush.h>
40#include <asm/fixmap.h> 40#include <asm/fixmap.h>
41#include <asm/realmode.h>
42#include <asm/time.h>
41 43
42static pgd_t *save_pgd __initdata; 44static pgd_t *save_pgd __initdata;
43static unsigned long efi_flags __initdata; 45static unsigned long efi_flags __initdata;
44 46
47/*
 48 * We allocate runtime services regions downwards, starting from -4G, i.e.
 49 * 0xffff_ffff_0000_0000, and limit the EFI VA mapping space to 64G.
50 */
51static u64 efi_va = -4 * (1UL << 30);
52#define EFI_VA_END (-68 * (1UL << 30))
53
54/*
55 * Scratch space used for switching the pagetable in the EFI stub
56 */
57struct efi_scratch {
58 u64 r15;
59 u64 prev_cr3;
60 pgd_t *efi_pgt;
61 bool use_pgd;
62 u64 phys_stack;
63} __packed;
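As a quick check of the constants above, the carved-out window is exactly 64GB:

	u64 va_start = -4ULL * (1ULL << 30);	/* 0xffffffff00000000 */
	u64 va_end = -68ULL * (1ULL << 30);	/* 0xffffffef00000000 == EFI_VA_END */
	u64 window = va_start - va_end;		/* 0x1000000000 == 64G */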
64
45static void __init early_code_mapping_set_exec(int executable) 65static void __init early_code_mapping_set_exec(int executable)
46{ 66{
47 efi_memory_desc_t *md; 67 efi_memory_desc_t *md;
@@ -65,6 +85,9 @@ void __init efi_call_phys_prelog(void)
65 int pgd; 85 int pgd;
66 int n_pgds; 86 int n_pgds;
67 87
88 if (!efi_enabled(EFI_OLD_MEMMAP))
89 return;
90
68 early_code_mapping_set_exec(1); 91 early_code_mapping_set_exec(1);
69 local_irq_save(efi_flags); 92 local_irq_save(efi_flags);
70 93
@@ -86,6 +109,10 @@ void __init efi_call_phys_epilog(void)
86 */ 109 */
87 int pgd; 110 int pgd;
88 int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); 111 int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
112
113 if (!efi_enabled(EFI_OLD_MEMMAP))
114 return;
115
89 for (pgd = 0; pgd < n_pgds; pgd++) 116 for (pgd = 0; pgd < n_pgds; pgd++)
90 set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]); 117 set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
91 kfree(save_pgd); 118 kfree(save_pgd);
@@ -94,6 +121,158 @@ void __init efi_call_phys_epilog(void)
94 early_code_mapping_set_exec(0); 121 early_code_mapping_set_exec(0);
95} 122}
96 123
124/*
125 * Add low kernel mappings for passing arguments to EFI functions.
126 */
127void efi_sync_low_kernel_mappings(void)
128{
129 unsigned num_pgds;
130 pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
131
132 if (efi_enabled(EFI_OLD_MEMMAP))
133 return;
134
135 num_pgds = pgd_index(MODULES_END - 1) - pgd_index(PAGE_OFFSET);
136
137 memcpy(pgd + pgd_index(PAGE_OFFSET),
138 init_mm.pgd + pgd_index(PAGE_OFFSET),
139 sizeof(pgd_t) * num_pgds);
140}
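The copied window spans the direct mapping up through the module area. Assuming the usual layout where MODULES_END sits in the topmost 512GB slot (the exact constant here is an assumption), that is PGDs 272 through 511, i.e. 239 entries:

	unsigned long lo = (0xffff880000000000UL >> 39) & 511;		/* PAGE_OFFSET: 272 */
	unsigned long hi = ((0xffffffffff000000UL - 1) >> 39) & 511;	/* MODULES_END - 1: 511 */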
141
142int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
143{
144 unsigned long text;
145 struct page *page;
146 unsigned npages;
147 pgd_t *pgd;
148
149 if (efi_enabled(EFI_OLD_MEMMAP))
150 return 0;
151
152 efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
153 pgd = __va(efi_scratch.efi_pgt);
154
155 /*
156 * It can happen that the physical address of new_memmap lands in memory
157 * which is not mapped in the EFI page table. Therefore we need to go
158 * and ident-map those pages containing the map before calling
159 * phys_efi_set_virtual_address_map().
160 */
161 if (kernel_map_pages_in_pgd(pgd, pa_memmap, pa_memmap, num_pages, _PAGE_NX)) {
162 pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
163 return 1;
164 }
165
166 efi_scratch.use_pgd = true;
167
168 /*
169 * When making calls to the firmware everything needs to be 1:1
170 * mapped and addressable with 32-bit pointers. Map the kernel
171 * text and allocate a new stack because we can't rely on the
172 * stack pointer being < 4GB.
173 */
174 if (!IS_ENABLED(CONFIG_EFI_MIXED))
175 return 0;
176
177 page = alloc_page(GFP_KERNEL|__GFP_DMA32);
178 if (!page)
179 panic("Unable to allocate EFI runtime stack < 4GB\n");
180
181 efi_scratch.phys_stack = virt_to_phys(page_address(page));
182 efi_scratch.phys_stack += PAGE_SIZE; /* stack grows down */
183
184 npages = (_end - _text) >> PAGE_SHIFT;
185 text = __pa(_text);
186
187 if (kernel_map_pages_in_pgd(pgd, text >> PAGE_SHIFT, text, npages, 0)) {
188 pr_err("Failed to map kernel text 1:1\n");
189 return 1;
190 }
191
192 return 0;
193}
194
195void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
196{
197 pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
198
199 kernel_unmap_pages_in_pgd(pgd, pa_memmap, num_pages);
200}
201
202static void __init __map_region(efi_memory_desc_t *md, u64 va)
203{
204 pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
205 unsigned long pf = 0;
206
207 if (!(md->attribute & EFI_MEMORY_WB))
208 pf |= _PAGE_PCD;
209
210 if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))
211 pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
212 md->phys_addr, va);
213}
214
215void __init efi_map_region(efi_memory_desc_t *md)
216{
217 unsigned long size = md->num_pages << PAGE_SHIFT;
218 u64 pa = md->phys_addr;
219
220 if (efi_enabled(EFI_OLD_MEMMAP))
221 return old_map_region(md);
222
223 /*
224 * Make sure the 1:1 mappings are present as a catch-all for b0rked
225 * firmware which doesn't update all internal pointers after switching
226 * to virtual mode and would otherwise crap on us.
227 */
228 __map_region(md, md->phys_addr);
229
230 /*
231 * Enforce the 1:1 mapping as the default virtual address when
232 * booting in EFI mixed mode, because even though we may be
233 * running a 64-bit kernel, the firmware may only be 32-bit.
234 */
 235 if (!efi_is_native() && IS_ENABLED(CONFIG_EFI_MIXED)) {
236 md->virt_addr = md->phys_addr;
237 return;
238 }
239
240 efi_va -= size;
241
242 /* Is PA 2M-aligned? */
243 if (!(pa & (PMD_SIZE - 1))) {
244 efi_va &= PMD_MASK;
245 } else {
246 u64 pa_offset = pa & (PMD_SIZE - 1);
247 u64 prev_va = efi_va;
248
249 /* get us the same offset within this 2M page */
250 efi_va = (efi_va & PMD_MASK) + pa_offset;
251
252 if (efi_va > prev_va)
253 efi_va -= PMD_SIZE;
254 }
255
256 if (efi_va < EFI_VA_END) {
257 pr_warn(FW_WARN "VA address range overflow!\n");
258 return;
259 }
260
261 /* Do the VA map */
262 __map_region(md, efi_va);
263 md->virt_addr = efi_va;
264}
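The alignment dance above keeps the VA at the same offset within its 2MB page as the PA, so the region remains eligible for large-page mappings. A worked example with assumed numbers:

	u64 pa = 0x7f8e5000ULL;
	u64 pa_offset = pa & (0x200000 - 1);			/* 0xe5000 */
	u64 efi_va = 0xffffffff00000000ULL - 0x5000;		/* after efi_va -= size */
	u64 prev_va = efi_va;

	efi_va = (efi_va & ~(0x200000ULL - 1)) + pa_offset;	/* same 2M offset */
	if (efi_va > prev_va)					/* rounded up? back off 2M */
		efi_va -= 0x200000;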
265
266/*
 267 * The kexec kernel will use efi_map_region_fixed to map EFI runtime memory
 268 * ranges. md->virt_addr is the original virtual address which had been
 269 * mapped in the first kernel of a kexec boot.
270 */
271void __init efi_map_region_fixed(efi_memory_desc_t *md)
272{
273 __map_region(md, md->virt_addr);
274}
275
97void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, 276void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
98 u32 type, u64 attribute) 277 u32 type, u64 attribute)
99{ 278{
@@ -113,3 +292,313 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
113 292
114 return (void __iomem *)__va(phys_addr); 293 return (void __iomem *)__va(phys_addr);
115} 294}
295
296void __init parse_efi_setup(u64 phys_addr, u32 data_len)
297{
298 efi_setup = phys_addr + sizeof(struct setup_data);
299}
300
301void __init efi_runtime_mkexec(void)
302{
303 if (!efi_enabled(EFI_OLD_MEMMAP))
304 return;
305
306 if (__supported_pte_mask & _PAGE_NX)
307 runtime_code_page_mkexec();
308}
309
310void __init efi_dump_pagetable(void)
311{
312#ifdef CONFIG_EFI_PGT_DUMP
313 pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
314
315 ptdump_walk_pgd_level(NULL, pgd);
316#endif
317}
318
319#ifdef CONFIG_EFI_MIXED
320extern efi_status_t efi64_thunk(u32, ...);
321
322#define runtime_service32(func) \
323({ \
324 u32 table = (u32)(unsigned long)efi.systab; \
325 u32 *rt, *___f; \
326 \
327 rt = (u32 *)(table + offsetof(efi_system_table_32_t, runtime)); \
328 ___f = (u32 *)(*rt + offsetof(efi_runtime_services_32_t, func)); \
329 *___f; \
330})
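The macro above is a two-step pointer chase done entirely through 32-bit quantities; written out longhand it is roughly:

	/* sketch: systab points at a 32-bit system table in the 1:1 mapping */
	u32 table = (u32)(unsigned long)efi.systab;
	u32 rt = *(u32 *)(table + offsetof(efi_system_table_32_t, runtime));
	u32 fn = *(u32 *)(rt + offsetof(efi_runtime_services_32_t, get_time));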
331
332/*
333 * Switch to the EFI page tables early so that we can access the 1:1
334 * runtime services mappings which are not mapped in any other page
335 * tables. This function must be called before runtime_service32().
336 *
337 * Also, disable interrupts because the IDT points to 64-bit handlers,
338 * which aren't going to function correctly when we switch to 32-bit.
339 */
340#define efi_thunk(f, ...) \
341({ \
342 efi_status_t __s; \
343 unsigned long flags; \
344 u32 func; \
345 \
346 efi_sync_low_kernel_mappings(); \
347 local_irq_save(flags); \
348 \
349 efi_scratch.prev_cr3 = read_cr3(); \
350 write_cr3((unsigned long)efi_scratch.efi_pgt); \
351 __flush_tlb_all(); \
352 \
353 func = runtime_service32(f); \
354 __s = efi64_thunk(func, __VA_ARGS__); \
355 \
356 write_cr3(efi_scratch.prev_cr3); \
357 __flush_tlb_all(); \
358 local_irq_restore(flags); \
359 \
360 __s; \
361})
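Each runtime wrapper further down then reduces to a single efi_thunk() invocation with physical argument addresses, as in the get_time wrapper that follows (tm and tc being its arguments):

	status = efi_thunk(get_time, virt_to_phys(tm), virt_to_phys(tc));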
362
363efi_status_t efi_thunk_set_virtual_address_map(
364 void *phys_set_virtual_address_map,
365 unsigned long memory_map_size,
366 unsigned long descriptor_size,
367 u32 descriptor_version,
368 efi_memory_desc_t *virtual_map)
369{
370 efi_status_t status;
371 unsigned long flags;
372 u32 func;
373
374 efi_sync_low_kernel_mappings();
375 local_irq_save(flags);
376
377 efi_scratch.prev_cr3 = read_cr3();
378 write_cr3((unsigned long)efi_scratch.efi_pgt);
379 __flush_tlb_all();
380
381 func = (u32)(unsigned long)phys_set_virtual_address_map;
382 status = efi64_thunk(func, memory_map_size, descriptor_size,
383 descriptor_version, virtual_map);
384
385 write_cr3(efi_scratch.prev_cr3);
386 __flush_tlb_all();
387 local_irq_restore(flags);
388
389 return status;
390}
391
392static efi_status_t efi_thunk_get_time(efi_time_t *tm, efi_time_cap_t *tc)
393{
394 efi_status_t status;
395 u32 phys_tm, phys_tc;
396
397 spin_lock(&rtc_lock);
398
399 phys_tm = virt_to_phys(tm);
400 phys_tc = virt_to_phys(tc);
401
402 status = efi_thunk(get_time, phys_tm, phys_tc);
403
404 spin_unlock(&rtc_lock);
405
406 return status;
407}
408
409static efi_status_t efi_thunk_set_time(efi_time_t *tm)
410{
411 efi_status_t status;
412 u32 phys_tm;
413
414 spin_lock(&rtc_lock);
415
416 phys_tm = virt_to_phys(tm);
417
418 status = efi_thunk(set_time, phys_tm);
419
420 spin_unlock(&rtc_lock);
421
422 return status;
423}
424
425static efi_status_t
426efi_thunk_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending,
427 efi_time_t *tm)
428{
429 efi_status_t status;
430 u32 phys_enabled, phys_pending, phys_tm;
431
432 spin_lock(&rtc_lock);
433
434 phys_enabled = virt_to_phys(enabled);
435 phys_pending = virt_to_phys(pending);
436 phys_tm = virt_to_phys(tm);
437
438 status = efi_thunk(get_wakeup_time, phys_enabled,
439 phys_pending, phys_tm);
440
441 spin_unlock(&rtc_lock);
442
443 return status;
444}
445
446static efi_status_t
447efi_thunk_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
448{
449 efi_status_t status;
450 u32 phys_tm;
451
452 spin_lock(&rtc_lock);
453
454 phys_tm = virt_to_phys(tm);
455
456 status = efi_thunk(set_wakeup_time, enabled, phys_tm);
457
458 spin_unlock(&rtc_lock);
459
460 return status;
461}
462
463
464static efi_status_t
465efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor,
466 u32 *attr, unsigned long *data_size, void *data)
467{
468 efi_status_t status;
469 u32 phys_name, phys_vendor, phys_attr;
470 u32 phys_data_size, phys_data;
471
472 phys_data_size = virt_to_phys(data_size);
473 phys_vendor = virt_to_phys(vendor);
474 phys_name = virt_to_phys(name);
475 phys_attr = virt_to_phys(attr);
476 phys_data = virt_to_phys(data);
477
478 status = efi_thunk(get_variable, phys_name, phys_vendor,
479 phys_attr, phys_data_size, phys_data);
480
481 return status;
482}
483
484static efi_status_t
485efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor,
486 u32 attr, unsigned long data_size, void *data)
487{
488 u32 phys_name, phys_vendor, phys_data;
489 efi_status_t status;
490
491 phys_name = virt_to_phys(name);
492 phys_vendor = virt_to_phys(vendor);
493 phys_data = virt_to_phys(data);
494
495 /* If data_size is > sizeof(u32) we've got problems */
496 status = efi_thunk(set_variable, phys_name, phys_vendor,
497 attr, data_size, phys_data);
498
499 return status;
500}
501
502static efi_status_t
503efi_thunk_get_next_variable(unsigned long *name_size,
504 efi_char16_t *name,
505 efi_guid_t *vendor)
506{
507 efi_status_t status;
508 u32 phys_name_size, phys_name, phys_vendor;
509
510 phys_name_size = virt_to_phys(name_size);
511 phys_vendor = virt_to_phys(vendor);
512 phys_name = virt_to_phys(name);
513
514 status = efi_thunk(get_next_variable, phys_name_size,
515 phys_name, phys_vendor);
516
517 return status;
518}
519
520static efi_status_t
521efi_thunk_get_next_high_mono_count(u32 *count)
522{
523 efi_status_t status;
524 u32 phys_count;
525
526 phys_count = virt_to_phys(count);
527 status = efi_thunk(get_next_high_mono_count, phys_count);
528
529 return status;
530}
531
532static void
533efi_thunk_reset_system(int reset_type, efi_status_t status,
534 unsigned long data_size, efi_char16_t *data)
535{
536 u32 phys_data;
537
538 phys_data = virt_to_phys(data);
539
540 efi_thunk(reset_system, reset_type, status, data_size, phys_data);
541}
542
543static efi_status_t
544efi_thunk_update_capsule(efi_capsule_header_t **capsules,
545 unsigned long count, unsigned long sg_list)
546{
547 /*
548 * To properly support this function we would need to repackage
549 * 'capsules' because the firmware doesn't understand 64-bit
550 * pointers.
551 */
552 return EFI_UNSUPPORTED;
553}
554
555static efi_status_t
556efi_thunk_query_variable_info(u32 attr, u64 *storage_space,
557 u64 *remaining_space,
558 u64 *max_variable_size)
559{
560 efi_status_t status;
561 u32 phys_storage, phys_remaining, phys_max;
562
563 if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
564 return EFI_UNSUPPORTED;
565
566 phys_storage = virt_to_phys(storage_space);
567 phys_remaining = virt_to_phys(remaining_space);
568 phys_max = virt_to_phys(max_variable_size);
569
570 status = efi_thunk(query_variable_info, attr, phys_storage,
571 phys_remaining, phys_max);
572
573 return status;
574}
575
576static efi_status_t
577efi_thunk_query_capsule_caps(efi_capsule_header_t **capsules,
578 unsigned long count, u64 *max_size,
579 int *reset_type)
580{
581 /*
582 * To properly support this function we would need to repackage
583 * 'capsules' because the firmware doesn't understand 64-bit
584 * pointers.
585 */
586 return EFI_UNSUPPORTED;
587}
588
589void efi_thunk_runtime_setup(void)
590{
591 efi.get_time = efi_thunk_get_time;
592 efi.set_time = efi_thunk_set_time;
593 efi.get_wakeup_time = efi_thunk_get_wakeup_time;
594 efi.set_wakeup_time = efi_thunk_set_wakeup_time;
595 efi.get_variable = efi_thunk_get_variable;
596 efi.get_next_variable = efi_thunk_get_next_variable;
597 efi.set_variable = efi_thunk_set_variable;
598 efi.get_next_high_mono_count = efi_thunk_get_next_high_mono_count;
599 efi.reset_system = efi_thunk_reset_system;
600 efi.query_variable_info = efi_thunk_query_variable_info;
601 efi.update_capsule = efi_thunk_update_capsule;
602 efi.query_capsule_caps = efi_thunk_query_capsule_caps;
603}
604#endif /* CONFIG_EFI_MIXED */
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 4c07ccab8146..e0984ef0374b 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -7,6 +7,10 @@
7 */ 7 */
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/segment.h>
11#include <asm/msr.h>
12#include <asm/processor-flags.h>
13#include <asm/page_types.h>
10 14
11#define SAVE_XMM \ 15#define SAVE_XMM \
12 mov %rsp, %rax; \ 16 mov %rsp, %rax; \
@@ -34,10 +38,47 @@
34 mov %rsi, %cr0; \ 38 mov %rsi, %cr0; \
35 mov (%rsp), %rsp 39 mov (%rsp), %rsp
36 40
41 /* stolen from gcc */
42 .macro FLUSH_TLB_ALL
43 movq %r15, efi_scratch(%rip)
44 movq %r14, efi_scratch+8(%rip)
45 movq %cr4, %r15
46 movq %r15, %r14
47 andb $0x7f, %r14b
48 movq %r14, %cr4
49 movq %r15, %cr4
50 movq efi_scratch+8(%rip), %r14
51 movq efi_scratch(%rip), %r15
52 .endm
53
54 .macro SWITCH_PGT
55 cmpb $0, efi_scratch+24(%rip)
56 je 1f
57 movq %r15, efi_scratch(%rip) # r15
58 # save previous CR3
59 movq %cr3, %r15
60 movq %r15, efi_scratch+8(%rip) # prev_cr3
61 movq efi_scratch+16(%rip), %r15 # EFI pgt
62 movq %r15, %cr3
63 1:
64 .endm
65
66 .macro RESTORE_PGT
67 cmpb $0, efi_scratch+24(%rip)
68 je 2f
69 movq efi_scratch+8(%rip), %r15
70 movq %r15, %cr3
71 movq efi_scratch(%rip), %r15
72 FLUSH_TLB_ALL
73 2:
74 .endm
75
37ENTRY(efi_call0) 76ENTRY(efi_call0)
38 SAVE_XMM 77 SAVE_XMM
39 subq $32, %rsp 78 subq $32, %rsp
79 SWITCH_PGT
40 call *%rdi 80 call *%rdi
81 RESTORE_PGT
41 addq $32, %rsp 82 addq $32, %rsp
42 RESTORE_XMM 83 RESTORE_XMM
43 ret 84 ret
@@ -47,7 +88,9 @@ ENTRY(efi_call1)
47 SAVE_XMM 88 SAVE_XMM
48 subq $32, %rsp 89 subq $32, %rsp
49 mov %rsi, %rcx 90 mov %rsi, %rcx
91 SWITCH_PGT
50 call *%rdi 92 call *%rdi
93 RESTORE_PGT
51 addq $32, %rsp 94 addq $32, %rsp
52 RESTORE_XMM 95 RESTORE_XMM
53 ret 96 ret
@@ -57,7 +100,9 @@ ENTRY(efi_call2)
57 SAVE_XMM 100 SAVE_XMM
58 subq $32, %rsp 101 subq $32, %rsp
59 mov %rsi, %rcx 102 mov %rsi, %rcx
103 SWITCH_PGT
60 call *%rdi 104 call *%rdi
105 RESTORE_PGT
61 addq $32, %rsp 106 addq $32, %rsp
62 RESTORE_XMM 107 RESTORE_XMM
63 ret 108 ret
@@ -68,7 +113,9 @@ ENTRY(efi_call3)
68 subq $32, %rsp 113 subq $32, %rsp
69 mov %rcx, %r8 114 mov %rcx, %r8
70 mov %rsi, %rcx 115 mov %rsi, %rcx
116 SWITCH_PGT
71 call *%rdi 117 call *%rdi
118 RESTORE_PGT
72 addq $32, %rsp 119 addq $32, %rsp
73 RESTORE_XMM 120 RESTORE_XMM
74 ret 121 ret
@@ -80,7 +127,9 @@ ENTRY(efi_call4)
80 mov %r8, %r9 127 mov %r8, %r9
81 mov %rcx, %r8 128 mov %rcx, %r8
82 mov %rsi, %rcx 129 mov %rsi, %rcx
130 SWITCH_PGT
83 call *%rdi 131 call *%rdi
132 RESTORE_PGT
84 addq $32, %rsp 133 addq $32, %rsp
85 RESTORE_XMM 134 RESTORE_XMM
86 ret 135 ret
@@ -93,7 +142,9 @@ ENTRY(efi_call5)
93 mov %r8, %r9 142 mov %r8, %r9
94 mov %rcx, %r8 143 mov %rcx, %r8
95 mov %rsi, %rcx 144 mov %rsi, %rcx
145 SWITCH_PGT
96 call *%rdi 146 call *%rdi
147 RESTORE_PGT
97 addq $48, %rsp 148 addq $48, %rsp
98 RESTORE_XMM 149 RESTORE_XMM
99 ret 150 ret
@@ -109,8 +160,177 @@ ENTRY(efi_call6)
109 mov %r8, %r9 160 mov %r8, %r9
110 mov %rcx, %r8 161 mov %rcx, %r8
111 mov %rsi, %rcx 162 mov %rsi, %rcx
163 SWITCH_PGT
112 call *%rdi 164 call *%rdi
165 RESTORE_PGT
113 addq $48, %rsp 166 addq $48, %rsp
114 RESTORE_XMM 167 RESTORE_XMM
115 ret 168 ret
116ENDPROC(efi_call6) 169ENDPROC(efi_call6)
170
171#ifdef CONFIG_EFI_MIXED
172
173/*
174 * We run this function from the 1:1 mapping.
175 *
176 * This function must be invoked with a 1:1 mapped stack.
177 */
178ENTRY(__efi64_thunk)
179 movl %ds, %eax
180 push %rax
181 movl %es, %eax
182 push %rax
183 movl %ss, %eax
184 push %rax
185
186 subq $32, %rsp
187 movl %esi, 0x0(%rsp)
188 movl %edx, 0x4(%rsp)
189 movl %ecx, 0x8(%rsp)
190 movq %r8, %rsi
191 movl %esi, 0xc(%rsp)
192 movq %r9, %rsi
193 movl %esi, 0x10(%rsp)
194
195 sgdt save_gdt(%rip)
196
197 leaq 1f(%rip), %rbx
198 movq %rbx, func_rt_ptr(%rip)
199
200 /* Switch to gdt with 32-bit segments */
201 movl 64(%rsp), %eax
202 lgdt (%rax)
203
204 leaq efi_enter32(%rip), %rax
205 pushq $__KERNEL_CS
206 pushq %rax
207 lretq
208
2091: addq $32, %rsp
210
211 lgdt save_gdt(%rip)
212
213 pop %rbx
214 movl %ebx, %ss
215 pop %rbx
216 movl %ebx, %es
217 pop %rbx
218 movl %ebx, %ds
219
220 /*
221 * Convert 32-bit status code into 64-bit.
222 */
223 test %rax, %rax
224 jz 1f
225 movl %eax, %ecx
226 andl $0x0fffffff, %ecx
227 andl $0xf0000000, %eax
228 shl $32, %rax
229 or %rcx, %rax
2301:
231 ret
232ENDPROC(__efi64_thunk)
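The status conversion at the end of __efi64_thunk moves the EFI error bit from bit 31 (32-bit ABI) to bit 63 (64-bit ABI) while preserving the low code bits; in C it would read roughly:

	u64 status = eax;
	if (eax)	/* EFI_SUCCESS (0) needs no fixup */
		status = ((u64)(eax & 0xf0000000) << 32) | (eax & 0x0fffffff);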
233
234ENTRY(efi_exit32)
235 movq func_rt_ptr(%rip), %rax
236 push %rax
237 mov %rdi, %rax
238 ret
239ENDPROC(efi_exit32)
240
241 .code32
242/*
243 * EFI service pointer must be in %edi.
244 *
245 * The stack should represent the 32-bit calling convention.
246 */
247ENTRY(efi_enter32)
248 movl $__KERNEL_DS, %eax
249 movl %eax, %ds
250 movl %eax, %es
251 movl %eax, %ss
252
253 /* Reload pgtables */
254 movl %cr3, %eax
255 movl %eax, %cr3
256
257 /* Disable paging */
258 movl %cr0, %eax
259 btrl $X86_CR0_PG_BIT, %eax
260 movl %eax, %cr0
261
262 /* Disable long mode via EFER */
263 movl $MSR_EFER, %ecx
264 rdmsr
265 btrl $_EFER_LME, %eax
266 wrmsr
267
268 call *%edi
269
270 /* We must preserve return value */
271 movl %eax, %edi
272
273 /*
274 * Some firmware will return with interrupts enabled. Be sure to
275 * disable them before we switch GDTs.
276 */
277 cli
278
279 movl 68(%esp), %eax
280 movl %eax, 2(%eax)
281 lgdtl (%eax)
282
283 movl %cr4, %eax
284 btsl $(X86_CR4_PAE_BIT), %eax
285 movl %eax, %cr4
286
287 movl %cr3, %eax
288 movl %eax, %cr3
289
290 movl $MSR_EFER, %ecx
291 rdmsr
292 btsl $_EFER_LME, %eax
293 wrmsr
294
295 xorl %eax, %eax
296 lldt %ax
297
298 movl 72(%esp), %eax
299 pushl $__KERNEL_CS
300 pushl %eax
301
302 /* Enable paging */
303 movl %cr0, %eax
304 btsl $X86_CR0_PG_BIT, %eax
305 movl %eax, %cr0
306 lret
307ENDPROC(efi_enter32)
308
309 .data
310 .balign 8
311 .global efi32_boot_gdt
312efi32_boot_gdt: .word 0
313 .quad 0
314
315save_gdt: .word 0
316 .quad 0
317func_rt_ptr: .quad 0
318
319 .global efi_gdt64
320efi_gdt64:
321 .word efi_gdt64_end - efi_gdt64
322 .long 0 /* Filled out by user */
323 .word 0
324 .quad 0x0000000000000000 /* NULL descriptor */
325 .quad 0x00af9a000000ffff /* __KERNEL_CS */
326 .quad 0x00cf92000000ffff /* __KERNEL_DS */
327 .quad 0x0080890000000000 /* TS descriptor */
328 .quad 0x0000000000000000 /* TS continued */
329efi_gdt64_end:
330#endif /* CONFIG_EFI_MIXED */
331
332 .data
333ENTRY(efi_scratch)
334 .fill 3,8,0
335 .byte 0
336 .quad 0
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
new file mode 100644
index 000000000000..8806fa73e6e6
--- /dev/null
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -0,0 +1,65 @@
1/*
2 * Copyright (C) 2014 Intel Corporation; author Matt Fleming
3 */
4
5#include <linux/linkage.h>
6#include <asm/page_types.h>
7
8 .text
9 .code64
10ENTRY(efi64_thunk)
11 push %rbp
12 push %rbx
13
14 /*
15 * Switch to 1:1 mapped 32-bit stack pointer.
16 */
17 movq %rsp, efi_saved_sp(%rip)
18 movq efi_scratch+25(%rip), %rsp
19
20 /*
21 * Calculate the physical address of the kernel text.
22 */
23 movq $__START_KERNEL_map, %rax
24 subq phys_base(%rip), %rax
25
26 /*
27 * Push some physical addresses onto the stack. This is easier
28 * to do now in a code64 section while the assembler can address
29 * 64-bit values. Note that all the addresses on the stack are
30 * 32-bit.
31 */
32 subq $16, %rsp
33 leaq efi_exit32(%rip), %rbx
34 subq %rax, %rbx
35 movl %ebx, 8(%rsp)
36 leaq efi_gdt64(%rip), %rbx
37 subq %rax, %rbx
38 movl %ebx, 2(%ebx)
39 movl %ebx, 4(%rsp)
40 leaq efi_gdt32(%rip), %rbx
41 subq %rax, %rbx
42 movl %ebx, 2(%ebx)
43 movl %ebx, (%rsp)
44
45 leaq __efi64_thunk(%rip), %rbx
46 subq %rax, %rbx
47 call *%rbx
48
49 movq efi_saved_sp(%rip), %rsp
50 pop %rbx
51 pop %rbp
52 retq
53ENDPROC(efi64_thunk)
54
55 .data
56efi_gdt32:
57 .word efi_gdt32_end - efi_gdt32
58 .long 0 /* Filled out above */
59 .word 0
60 .quad 0x0000000000000000 /* NULL descriptor */
61 .quad 0x00cf9a000000ffff /* __KERNEL_CS */
62 .quad 0x00cf93000000ffff /* __KERNEL_DS */
63efi_gdt32_end:
64
65efi_saved_sp: .quad 0
diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile
index 01cc29ea5ff7..0a8ee703b9fa 100644
--- a/arch/x86/platform/intel-mid/Makefile
+++ b/arch/x86/platform/intel-mid/Makefile
@@ -1,6 +1,6 @@
1obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o 1obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfl.o
2obj-$(CONFIG_X86_INTEL_MID) += intel_mid_vrtc.o
3obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o 2obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o
3
4# SFI specific code 4# SFI specific code
5ifdef CONFIG_X86_INTEL_MID 5ifdef CONFIG_X86_INTEL_MID
6obj-$(CONFIG_SFI) += sfi.o device_libs/ 6obj-$(CONFIG_SFI) += sfi.o device_libs/
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
index 0d942c1d26d5..69a783689d21 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
@@ -22,7 +22,9 @@ static void __init *emc1403_platform_data(void *info)
22 int intr = get_gpio_by_name("thermal_int"); 22 int intr = get_gpio_by_name("thermal_int");
23 int intr2nd = get_gpio_by_name("thermal_alert"); 23 int intr2nd = get_gpio_by_name("thermal_alert");
24 24
25 if (intr == -1 || intr2nd == -1) 25 if (intr < 0)
26 return NULL;
27 if (intr2nd < 0)
26 return NULL; 28 return NULL;
27 29
28 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; 30 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
index a013a4834bbe..dccae6b0413f 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
@@ -66,7 +66,7 @@ static int __init pb_keys_init(void)
66 gb[i].gpio = get_gpio_by_name(gb[i].desc); 66 gb[i].gpio = get_gpio_by_name(gb[i].desc);
67 pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, 67 pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc,
68 gb[i].gpio); 68 gb[i].gpio);
69 if (gb[i].gpio == -1) 69 if (gb[i].gpio < 0)
70 continue; 70 continue;
71 71
72 if (i != good) 72 if (i != good)
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h b/arch/x86/platform/intel-mid/device_libs/platform_ipc.h
index 8f568dd79605..79bb09d4f718 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h
+++ b/arch/x86/platform/intel-mid/device_libs/platform_ipc.h
@@ -12,6 +12,7 @@
12#ifndef _PLATFORM_IPC_H_ 12#ifndef _PLATFORM_IPC_H_
13#define _PLATFORM_IPC_H_ 13#define _PLATFORM_IPC_H_
14 14
15extern void __init ipc_device_handler(struct sfi_device_table_entry *pentry, 15void __init
16 struct devs_id *dev) __attribute__((weak)); 16ipc_device_handler(struct sfi_device_table_entry *pentry, struct devs_id *dev);
17
17#endif 18#endif
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
index 15278c11f714..54226de7541a 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
@@ -21,7 +21,9 @@ static void __init *lis331dl_platform_data(void *info)
21 int intr = get_gpio_by_name("accel_int"); 21 int intr = get_gpio_by_name("accel_int");
22 int intr2nd = get_gpio_by_name("accel_2"); 22 int intr2nd = get_gpio_by_name("accel_2");
23 23
24 if (intr == -1 || intr2nd == -1) 24 if (intr < 0)
25 return NULL;
26 if (intr2nd < 0)
25 return NULL; 27 return NULL;
26 28
27 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; 29 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
index 94ade10024ae..2c8acbc1e9ad 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
@@ -48,7 +48,7 @@ static void __init *max7315_platform_data(void *info)
48 gpio_base = get_gpio_by_name(base_pin_name); 48 gpio_base = get_gpio_by_name(base_pin_name);
49 intr = get_gpio_by_name(intr_pin_name); 49 intr = get_gpio_by_name(intr_pin_name);
50 50
51 if (gpio_base == -1) 51 if (gpio_base < 0)
52 return NULL; 52 return NULL;
53 max7315->gpio_base = gpio_base; 53 max7315->gpio_base = gpio_base;
54 if (intr != -1) { 54 if (intr != -1) {
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
index dd28d63c84fb..cfe9a47a1e87 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
@@ -19,7 +19,7 @@ static void *mpu3050_platform_data(void *info)
19 struct i2c_board_info *i2c_info = info; 19 struct i2c_board_info *i2c_info = info;
20 int intr = get_gpio_by_name("mpu3050_int"); 20 int intr = get_gpio_by_name("mpu3050_int");
21 21
22 if (intr == -1) 22 if (intr < 0)
23 return NULL; 23 return NULL;
24 24
25 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; 25 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.h b/arch/x86/platform/intel-mid/device_libs/platform_msic.h
index 917eb56d77da..b7be1d041da2 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic.h
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic.h
@@ -14,6 +14,6 @@
14 14
15extern struct intel_msic_platform_data msic_pdata; 15extern struct intel_msic_platform_data msic_pdata;
16 16
17extern void *msic_generic_platform_data(void *info, 17void *msic_generic_platform_data(void *info, enum intel_msic_block block);
18 enum intel_msic_block block) __attribute__((weak)); 18
19#endif 19#endif
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
index d87182a09263..65c2a9a19db4 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
@@ -26,7 +26,7 @@ static void __init *pmic_gpio_platform_data(void *info)
26 static struct intel_pmic_gpio_platform_data pmic_gpio_pdata; 26 static struct intel_pmic_gpio_platform_data pmic_gpio_pdata;
27 int gpio_base = get_gpio_by_name("pmic_gpio_base"); 27 int gpio_base = get_gpio_by_name("pmic_gpio_base");
28 28
29 if (gpio_base == -1) 29 if (gpio_base < 0)
30 gpio_base = 64; 30 gpio_base = 64;
31 pmic_gpio_pdata.gpio_base = gpio_base; 31 pmic_gpio_pdata.gpio_base = gpio_base;
32 pmic_gpio_pdata.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; 32 pmic_gpio_pdata.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
index 22881c9a6737..33be0b3be6e1 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
@@ -34,10 +34,10 @@ static void *tca6416_platform_data(void *info)
34 gpio_base = get_gpio_by_name(base_pin_name); 34 gpio_base = get_gpio_by_name(base_pin_name);
35 intr = get_gpio_by_name(intr_pin_name); 35 intr = get_gpio_by_name(intr_pin_name);
36 36
37 if (gpio_base == -1) 37 if (gpio_base < 0)
38 return NULL; 38 return NULL;
39 tca6416.gpio_base = gpio_base; 39 tca6416.gpio_base = gpio_base;
40 if (intr != -1) { 40 if (intr >= 0) {
41 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; 41 i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
42 tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; 42 tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
43 } else { 43 } else {
diff --git a/arch/x86/platform/intel-mid/early_printk_intel_mid.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c
index 4f702f554f6e..e0bd082a80e0 100644
--- a/arch/x86/platform/intel-mid/early_printk_intel_mid.c
+++ b/arch/x86/platform/intel-mid/early_printk_intel_mid.c
@@ -22,7 +22,6 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/delay.h> 24#include <linux/delay.h>
25#include <linux/init.h>
26#include <linux/io.h> 25#include <linux/io.h>
27 26
28#include <asm/fixmap.h> 27#include <asm/fixmap.h>
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index f90e290f689f..1bbedc4b0f88 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -35,6 +35,8 @@
 #include <asm/apb_timer.h>
 #include <asm/reboot.h>
 
+#include "intel_mid_weak_decls.h"
+
 /*
  * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
  * cmdline option x86_intel_mid_timer can be used to override the configuration
@@ -58,12 +60,16 @@
 
 enum intel_mid_timer_options intel_mid_timer_options;
 
+/* intel_mid_ops to store sub arch ops */
+struct intel_mid_ops *intel_mid_ops;
+/* getter function for sub arch ops */
+static void *(*get_intel_mid_ops[])(void) = INTEL_MID_OPS_INIT;
 enum intel_mid_cpu_type __intel_mid_cpu_chip;
 EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip);
 
 static void intel_mid_power_off(void)
 {
-}
+};
 
 static void intel_mid_reboot(void)
 {
@@ -72,32 +78,6 @@ static void intel_mid_reboot(void)
 
 static unsigned long __init intel_mid_calibrate_tsc(void)
 {
-	unsigned long fast_calibrate;
-	u32 lo, hi, ratio, fsb;
-
-	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
-	pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi);
-	ratio = (hi >> 8) & 0x1f;
-	pr_debug("ratio is %d\n", ratio);
-	if (!ratio) {
-		pr_err("read a zero ratio, should be incorrect!\n");
-		pr_err("force tsc ratio to 16 ...\n");
-		ratio = 16;
-	}
-	rdmsr(MSR_FSB_FREQ, lo, hi);
-	if ((lo & 0x7) == 0x7)
-		fsb = PENWELL_FSB_FREQ_83SKU;
-	else
-		fsb = PENWELL_FSB_FREQ_100SKU;
-	fast_calibrate = ratio * fsb;
-	pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
-	lapic_timer_frequency = fsb * 1000 / HZ;
-	/* mark tsc clocksource as reliable */
-	set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
-
-	if (fast_calibrate)
-		return fast_calibrate;
-
 	return 0;
 }
 
@@ -125,13 +105,37 @@ static void __init intel_mid_time_init(void)
 
 static void intel_mid_arch_setup(void)
 {
-	if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
-		__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
-	else {
+	if (boot_cpu_data.x86 != 6) {
 		pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n",
 			boot_cpu_data.x86, boot_cpu_data.x86_model);
 		__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
+		goto out;
 	}
+
+	switch (boot_cpu_data.x86_model) {
+	case 0x35:
+		__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_CLOVERVIEW;
+		break;
+	case 0x3C:
+	case 0x4A:
+		__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_TANGIER;
+		break;
+	case 0x27:
+	default:
+		__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
+		break;
+	}
+
+	if (__intel_mid_cpu_chip < MAX_CPU_OPS(get_intel_mid_ops))
+		intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip]();
+	else {
+		intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL]();
+		pr_info("ARCH: Unknown SoC, assuming PENWELL!\n");
+	}
+
+out:
+	if (intel_mid_ops->arch_setup)
+		intel_mid_ops->arch_setup();
 }
 
 /* MID systems don't have i8042 controller */
diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
new file mode 100644
index 000000000000..46aa25c8ce06
--- /dev/null
+++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
@@ -0,0 +1,19 @@
+/*
+ * intel_mid_weak_decls.h: Weak declarations of intel-mid.c
+ *
+ * (C) Copyright 2013 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+
+/* __attribute__((weak)) makes these declarations overridable */
+/* For every CPU addition a new get_<cpuname>_ops interface needs
+ * to be added.
+ */
+extern void *get_penwell_ops(void) __attribute__((weak));
+extern void *get_cloverview_ops(void) __attribute__((weak));
+extern void *get_tangier_ops(void) __attribute__((weak));
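
The getters are declared weak so that a kernel image built without one of the sub-arch files (mfld.c, mrfl.c) still links: on ELF targets an undefined weak function resolves to address zero, so the symbol can be tested like a pointer before use. A GCC/Clang-specific sketch that builds with or without a strong get_tangier_ops() definition:

#include <stdio.h>

/* Weak declaration: if no strong get_tangier_ops() is linked in, the
 * symbol resolves to address zero instead of causing a link error. */
extern void *get_tangier_ops(void) __attribute__((weak));

int main(void)
{
	if (get_tangier_ops)	/* NULL when the mrfl.c equivalent is absent */
		printf("tangier ops at %p\n", get_tangier_ops());
	else
		puts("no tangier support linked in");
	return 0;
}
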
diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c
new file mode 100644
index 000000000000..23381d2174ae
--- /dev/null
+++ b/arch/x86/platform/intel-mid/mfld.c
@@ -0,0 +1,75 @@
+/*
+ * mfld.c: Intel Medfield platform setup code
+ *
+ * (C) Copyright 2013 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+
+#include <asm/apic.h>
+#include <asm/intel-mid.h>
+#include <asm/intel_mid_vrtc.h>
+
+#include "intel_mid_weak_decls.h"
+
+static void penwell_arch_setup(void);
+/* penwell arch ops */
+static struct intel_mid_ops penwell_ops = {
+	.arch_setup = penwell_arch_setup,
+};
+
+static void mfld_power_off(void)
+{
+}
+
+static unsigned long __init mfld_calibrate_tsc(void)
+{
+	unsigned long fast_calibrate;
+	u32 lo, hi, ratio, fsb;
+
+	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+	pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi);
+	ratio = (hi >> 8) & 0x1f;
+	pr_debug("ratio is %d\n", ratio);
+	if (!ratio) {
+		pr_err("read a zero ratio, should be incorrect!\n");
+		pr_err("force tsc ratio to 16 ...\n");
+		ratio = 16;
+	}
+	rdmsr(MSR_FSB_FREQ, lo, hi);
+	if ((lo & 0x7) == 0x7)
+		fsb = FSB_FREQ_83SKU;
+	else
+		fsb = FSB_FREQ_100SKU;
+	fast_calibrate = ratio * fsb;
+	pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
+	lapic_timer_frequency = fsb * 1000 / HZ;
+	/* mark tsc clocksource as reliable */
+	set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
+
+	if (fast_calibrate)
+		return fast_calibrate;
+
+	return 0;
+}
+
+static void __init penwell_arch_setup(void)
+{
+	x86_platform.calibrate_tsc = mfld_calibrate_tsc;
+	pm_power_off = mfld_power_off;
+}
+
+void *get_penwell_ops(void)
+{
+	return &penwell_ops;
+}
+
+void *get_cloverview_ops(void)
+{
+	return &penwell_ops;
+}
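
The calibration above reduces to multiplier times bus clock: mfld_calibrate_tsc() reads the bus-to-core ratio from a five-bit field of MSR_IA32_PERF_STATUS, picks an FSB SKU from MSR_FSB_FREQ, and the product is the TSC rate in kHz. A worked example with assumed numbers (the real FSB_FREQ_* constants and HZ are configuration-specific, so treat these as placeholders):

#include <stdio.h>

int main(void)
{
	/* Assumed values for illustration; the real constants live in
	 * asm/intel-mid.h and come from the MSRs at boot. */
	unsigned int ratio = 16;	/* bus-to-core multiplier */
	unsigned int fsb = 100000;	/* FSB clock in kHz (~100 MHz SKU) */
	unsigned int hz = 100;		/* kernel HZ, config dependent */

	unsigned long tsc_khz = (unsigned long)ratio * fsb;
	unsigned int lapic_per_jiffy = fsb * 1000 / hz;

	printf("TSC ~ %lu kHz (%.1f GHz)\n", tsc_khz, tsc_khz / 1e6);
	printf("LAPIC timer ticks per jiffy: %u\n", lapic_per_jiffy);
	return 0;
}
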
diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c
new file mode 100644
index 000000000000..aaca91753d32
--- /dev/null
+++ b/arch/x86/platform/intel-mid/mrfl.c
@@ -0,0 +1,103 @@
+/*
+ * mrfl.c: Intel Merrifield platform specific setup code
+ *
+ * (C) Copyright 2013 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+
+#include <asm/apic.h>
+#include <asm/intel-mid.h>
+
+#include "intel_mid_weak_decls.h"
+
+static unsigned long __init tangier_calibrate_tsc(void)
+{
+	unsigned long fast_calibrate;
+	u32 lo, hi, ratio, fsb, bus_freq;
+
+	/* *********************** */
+	/* Compute TSC:Ratio * FSB */
+	/* *********************** */
+
+	/* Compute Ratio */
+	rdmsr(MSR_PLATFORM_INFO, lo, hi);
+	pr_debug("IA32 PLATFORM_INFO is 0x%x : %x\n", hi, lo);
+
+	ratio = (lo >> 8) & 0xFF;
+	pr_debug("ratio is %d\n", ratio);
+	if (!ratio) {
+		pr_err("Read a zero ratio, force tsc ratio to 4 ...\n");
+		ratio = 4;
+	}
+
+	/* Compute FSB */
+	rdmsr(MSR_FSB_FREQ, lo, hi);
+	pr_debug("Actual FSB frequency detected by SOC 0x%x : %x\n",
+			hi, lo);
+
+	bus_freq = lo & 0x7;
+	pr_debug("bus_freq = 0x%x\n", bus_freq);
+
+	if (bus_freq == 0)
+		fsb = FSB_FREQ_100SKU;
+	else if (bus_freq == 1)
+		fsb = FSB_FREQ_100SKU;
+	else if (bus_freq == 2)
+		fsb = FSB_FREQ_133SKU;
+	else if (bus_freq == 3)
+		fsb = FSB_FREQ_167SKU;
+	else if (bus_freq == 4)
+		fsb = FSB_FREQ_83SKU;
+	else if (bus_freq == 5)
+		fsb = FSB_FREQ_400SKU;
+	else if (bus_freq == 6)
+		fsb = FSB_FREQ_267SKU;
+	else if (bus_freq == 7)
+		fsb = FSB_FREQ_333SKU;
+	else {
+		BUG();
+		pr_err("Invalid bus_freq! Setting to minimal value!\n");
+		fsb = FSB_FREQ_100SKU;
+	}
+
+	/* TSC = FSB Freq * Resolved HFM Ratio */
+	fast_calibrate = ratio * fsb;
+	pr_debug("calculate tangier tsc %lu KHz\n", fast_calibrate);
+
+	/* ************************************ */
+	/* Calculate Local APIC Timer Frequency */
+	/* ************************************ */
+	lapic_timer_frequency = (fsb * 1000) / HZ;
+
+	pr_debug("Setting lapic_timer_frequency = %d\n",
+			lapic_timer_frequency);
+
+	/* mark tsc clocksource as reliable */
+	set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
+
+	if (fast_calibrate)
+		return fast_calibrate;
+
+	return 0;
+}
+
+static void __init tangier_arch_setup(void)
+{
+	x86_platform.calibrate_tsc = tangier_calibrate_tsc;
+}
+
+/* tangier arch ops */
+static struct intel_mid_ops tangier_ops = {
+	.arch_setup = tangier_arch_setup,
+};
+
+void *get_tangier_ops(void)
+{
+	return &tangier_ops;
+}
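
Two remarks on tangier_calibrate_tsc() as added above: bus_freq is masked to three bits, so the trailing else is unreachable (and the pr_err() placed after BUG() could never run even if it were reached); and the eight-way ladder is a dense 3-bit-to-frequency map that a lookup table expresses directly. A sketch of the table form, with placeholder kHz values standing in for the FSB_FREQ_* constants:

#include <stdio.h>

int main(void)
{
	/* Same mapping as the if/else chain above; the kHz values are
	 * placeholders for the FSB_FREQ_* constants. */
	static const unsigned int fsb_by_sel[8] = {
		[0] = 100000, [1] = 100000, [2] = 133000, [3] = 167000,
		[4] = 83000,  [5] = 400000, [6] = 267000, [7] = 333000,
	};
	unsigned int bus_freq = 2;	/* 3-bit field from MSR_FSB_FREQ */

	printf("fsb = %u kHz\n", fsb_by_sel[bus_freq & 0x7]);
	return 0;
}
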
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index c84c1ca396bf..994c40bd7cb7 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -224,7 +224,7 @@ int get_gpio_by_name(const char *name)
 		if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
 			return pentry->pin_no;
 	}
-	return -1;
+	return -EINVAL;
 }
 
 void __init intel_scu_device_register(struct platform_device *pdev)
@@ -250,7 +250,7 @@ static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
 			sdev->modalias);
 		return;
 	}
-	memcpy(new_dev, sdev, sizeof(*sdev));
+	*new_dev = *sdev;
 
 	spi_devs[spi_next_dev++] = new_dev;
 }
@@ -271,7 +271,7 @@ static void __init intel_scu_i2c_device_register(int bus,
 			idev->type);
 		return;
 	}
-	memcpy(new_dev, idev, sizeof(*idev));
+	*new_dev = *idev;
 
 	i2c_bus[i2c_next_dev] = bus;
 	i2c_devs[i2c_next_dev++] = new_dev;
@@ -337,6 +337,8 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry,
 	pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n",
 		pentry->name, pentry->irq);
 	pdata = intel_mid_sfi_get_pdata(dev, pentry);
+	if (IS_ERR(pdata))
+		return;
 
 	pdev = platform_device_alloc(pentry->name, 0);
 	if (pdev == NULL) {
@@ -370,6 +372,8 @@ static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry,
 			spi_info.chip_select);
 
 	pdata = intel_mid_sfi_get_pdata(dev, &spi_info);
+	if (IS_ERR(pdata))
+		return;
 
 	spi_info.platform_data = pdata;
 	if (dev->delay)
@@ -395,6 +399,8 @@ static void __init sfi_handle_i2c_dev(struct sfi_device_table_entry *pentry,
 			i2c_info.addr);
 	pdata = intel_mid_sfi_get_pdata(dev, &i2c_info);
 	i2c_info.platform_data = pdata;
+	if (IS_ERR(pdata))
+		return;
 
 	if (dev->delay)
 		intel_scu_i2c_device_register(pentry->host_num, &i2c_info);
@@ -443,13 +449,35 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
 		 * so we have to enable them one by one here
 		 */
 		ioapic = mp_find_ioapic(irq);
-		irq_attr.ioapic = ioapic;
-		irq_attr.ioapic_pin = irq;
-		irq_attr.trigger = 1;
-		irq_attr.polarity = 1;
-		io_apic_set_pci_routing(NULL, irq, &irq_attr);
-	} else
+		if (ioapic >= 0) {
+			irq_attr.ioapic = ioapic;
+			irq_attr.ioapic_pin = irq;
+			irq_attr.trigger = 1;
+			if (intel_mid_identify_cpu() ==
+					INTEL_MID_CPU_CHIP_TANGIER) {
+				if (!strncmp(pentry->name,
+						"r69001-ts-i2c", 13))
+					/* active low */
+					irq_attr.polarity = 1;
+				else if (!strncmp(pentry->name,
+						"synaptics_3202", 14))
+					/* active low */
+					irq_attr.polarity = 1;
+				else if (irq == 41)
+					/* fast_int_1 */
+					irq_attr.polarity = 1;
+				else
+					/* active high */
+					irq_attr.polarity = 0;
+			} else {
+				/* PNW and CLV go with active low */
+				irq_attr.polarity = 1;
+			}
+			io_apic_set_pci_routing(NULL, irq, &irq_attr);
+		}
+	} else {
 		irq = 0; /* No irq */
+	}
 
 	dev = get_device_id(pentry->type, pentry->name);
 
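
The new IS_ERR() bail-outs assume intel_mid_sfi_get_pdata() reports failure through the kernel's ERR_PTR convention, which encodes a small negative errno into the top 4 KiB of pointer space that no real object can occupy. (Note that the i2c hunk stores pdata into i2c_info before the check, unlike the ipc and spi paths.) A freestanding re-implementation of the idiom:

#include <errno.h>
#include <stdio.h>

/* Userspace re-implementation of the kernel's ERR_PTR/IS_ERR idiom. */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int dummy_pdata = 42;

static void *get_pdata(int ok)
{
	return ok ? (void *)&dummy_pdata : ERR_PTR(-ENODEV);
}

int main(void)
{
	void *pdata = get_pdata(0);

	if (IS_ERR(pdata))	/* mirrors the sfi.c bail-outs */
		printf("no pdata, errno %ld\n", -PTR_ERR(pdata));
	return 0;
}
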
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index e6cb80f620af..4d171e8640ef 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -27,7 +27,6 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/delay.h>
-#include <linux/init.h>
 #include <linux/pm.h>
 #include <asm/io.h>
 
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 649a12befba9..08e350e757dc 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -15,8 +15,7 @@
 #include <linux/power_supply.h>
 #include <linux/olpc-ec.h>
 
-#include <acpi/acpi_bus.h>
-#include <acpi/acpi_drivers.h>
+#include <linux/acpi.h>
 #include <asm/olpc.h>
 
 #define DRV_NAME	"olpc-xo15-sci"
diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c
index 39febb214e8c..9471b9456f25 100644
--- a/arch/x86/platform/ts5500/ts5500.c
+++ b/arch/x86/platform/ts5500/ts5500.c
@@ -88,7 +88,7 @@ struct ts5500_sbc {
 static const struct {
 	const char * const string;
 	const ssize_t offset;
-} ts5500_signatures[] __initdata = {
+} ts5500_signatures[] __initconst = {
 	{ "TS-5x00 AMD Elan", 0xb14 },
 };
 
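
The __initdata to __initconst switch above is not cosmetic: ts5500_signatures[] is const-qualified, and compilers reject const objects placed in the writable .init.data section with a section type conflict, so const init-time data must use __initconst to land in .init.rodata.
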
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index efe4d7220397..dfe605ac1bcd 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
 	return;
 }
 
-static inline unsigned long cycles_2_us(unsigned long long cyc)
+/*
+ * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative
+ * number, not an absolute. It converts a duration in cycles to a duration in
+ * ns.
+ */
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
+	struct cyc2ns_data *data = cyc2ns_read_begin();
 	unsigned long long ns;
-	unsigned long us;
-	int cpu = smp_processor_id();
 
-	ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
-	us = ns / 1000;
-	return us;
+	ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+
+	cyc2ns_read_end(data);
+	return ns;
+}
+
+/*
+ * The reverse of the above; converts a duration in ns to a duration in cycles.
+ */
+static inline unsigned long long ns_2_cycles(unsigned long long ns)
+{
+	struct cyc2ns_data *data = cyc2ns_read_begin();
+	unsigned long long cyc;
+
+	cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
+
+	cyc2ns_read_end(data);
+	return cyc;
+}
+
+static inline unsigned long cycles_2_us(unsigned long long cyc)
+{
+	return cycles_2_ns(cyc) / NSEC_PER_USEC;
+}
+
+static inline cycles_t sec_2_cycles(unsigned long sec)
+{
+	return ns_2_cycles(sec * NSEC_PER_SEC);
+}
+
+static inline unsigned long long usec_2_cycles(unsigned long usec)
+{
+	return ns_2_cycles(usec * NSEC_PER_USEC);
 }
 
 /*
@@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc,
 						bcp, try);
 }
 
-static inline cycles_t sec_2_cycles(unsigned long sec)
-{
-	unsigned long ns;
-	cycles_t cyc;
-
-	ns = sec * 1000000000;
-	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
-	return cyc;
-}
-
 /*
  * Our retries are blocked by all destination sw ack resources being
  * in use, and a timeout is pending. In that case hardware immediately
@@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data)
 {
 }
 
-static inline unsigned long long usec_2_cycles(unsigned long microsec)
-{
-	unsigned long ns;
-	unsigned long long cyc;
-
-	ns = microsec * 1000;
-	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
-	return cyc;
-}
-
 /*
  * Display the statistics thru /proc/sgi_uv/ptc_statistics
  * 'data' points to the cpu number
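
The replacement helpers lean on the per-cpu scaling pair from tsc.c: ns = cyc * cyc2ns_mul >> cyc2ns_shift is a fixed-point multiply that avoids a 64-bit division on every conversion, and ns_2_cycles() inverts the same pair. A worked example with an assumed scale for a 1.6 GHz TSC, open-coding mul_u64_u32_shr() via GCC's __int128:

#include <stdint.h>
#include <stdio.h>

/* Open-coded equivalent of mul_u64_u32_shr() for this sketch. */
static uint64_t mul_u64_u32_shr(uint64_t a, uint32_t mul, unsigned shift)
{
	return (uint64_t)(((unsigned __int128)a * mul) >> shift);
}

int main(void)
{
	/* Assumed scale for a 1.6 GHz TSC: 0.625 ns/cycle ~= 640 / 2^10. */
	uint32_t cyc2ns_mul = 640;
	unsigned cyc2ns_shift = 10;

	uint64_t cyc = 1600000;	/* 1 ms worth of cycles at 1.6 GHz */
	uint64_t ns = mul_u64_u32_shr(cyc, cyc2ns_mul, cyc2ns_shift);
	uint64_t cyc_back = (ns << cyc2ns_shift) / cyc2ns_mul;

	printf("%llu cycles -> %llu ns -> %llu cycles\n",
	       (unsigned long long)cyc, (unsigned long long)ns,
	       (unsigned long long)cyc_back);
	return 0;
}
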
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 8eeccba73130..be27da60dc8f 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -74,7 +74,6 @@ static atomic_t uv_in_nmi;
 static atomic_t uv_nmi_cpu = ATOMIC_INIT(-1);
 static atomic_t uv_nmi_cpus_in_nmi = ATOMIC_INIT(-1);
 static atomic_t uv_nmi_slave_continue;
-static atomic_t uv_nmi_kexec_failed;
 static cpumask_var_t uv_nmi_cpu_mask;
 
 /* Values for uv_nmi_slave_continue */
@@ -149,7 +148,8 @@ module_param_named(retry_count, uv_nmi_retry_count, int, 0644);
149 * "dump" - dump process stack for each cpu 148 * "dump" - dump process stack for each cpu
150 * "ips" - dump IP info for each cpu 149 * "ips" - dump IP info for each cpu
151 * "kdump" - do crash dump 150 * "kdump" - do crash dump
152 * "kdb" - enter KDB/KGDB (default) 151 * "kdb" - enter KDB (default)
152 * "kgdb" - enter KGDB
153 */ 153 */
154static char uv_nmi_action[8] = "kdb"; 154static char uv_nmi_action[8] = "kdb";
155module_param_string(action, uv_nmi_action, sizeof(uv_nmi_action), 0644); 155module_param_string(action, uv_nmi_action, sizeof(uv_nmi_action), 0644);
@@ -504,6 +504,7 @@ static void uv_nmi_touch_watchdogs(void)
 }
 
 #if defined(CONFIG_KEXEC)
+static atomic_t uv_nmi_kexec_failed;
 static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
 	/* Call crash to dump system state */
@@ -537,18 +538,45 @@ static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 }
 #endif /* !CONFIG_KEXEC */
 
+#ifdef CONFIG_KGDB
 #ifdef CONFIG_KGDB_KDB
-/* Call KDB from NMI handler */
-static void uv_call_kdb(int cpu, struct pt_regs *regs, int master)
+static inline int uv_nmi_kdb_reason(void)
 {
-	int ret;
+	return KDB_REASON_SYSTEM_NMI;
+}
+#else /* !CONFIG_KGDB_KDB */
+static inline int uv_nmi_kdb_reason(void)
+{
+	/* Ensure user is expecting to attach gdb remote */
+	if (uv_nmi_action_is("kgdb"))
+		return 0;
 
+	pr_err("UV: NMI error: KDB is not enabled in this kernel\n");
+	return -1;
+}
+#endif /* CONFIG_KGDB_KDB */
+
+/*
+ * Call KGDB/KDB from NMI handler
+ *
+ * Note that if both KGDB and KDB are configured, then the action of 'kgdb' or
+ * 'kdb' has no effect on which is used. See the KGDB documentation for further
+ * information.
+ */
+static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master)
+{
 	if (master) {
+		int reason = uv_nmi_kdb_reason();
+		int ret;
+
+		if (reason < 0)
+			return;
+
 		/* call KGDB NMI handler as MASTER */
-		ret = kgdb_nmicallin(cpu, X86_TRAP_NMI, regs,
+		ret = kgdb_nmicallin(cpu, X86_TRAP_NMI, regs, reason,
 				&uv_nmi_slave_continue);
 		if (ret) {
-			pr_alert("KDB returned error, is kgdboc set?\n");
+			pr_alert("KGDB returned error, is kgdboc set?\n");
 			atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT);
 		}
 	} else {
@@ -567,12 +595,12 @@ static void uv_call_kdb(int cpu, struct pt_regs *regs, int master)
 	uv_nmi_sync_exit(master);
 }
 
-#else /* !CONFIG_KGDB_KDB */
-static inline void uv_call_kdb(int cpu, struct pt_regs *regs, int master)
+#else /* !CONFIG_KGDB */
+static inline void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master)
 {
-	pr_err("UV: NMI error: KGDB/KDB is not enabled in this kernel\n");
+	pr_err("UV: NMI error: KGDB is not enabled in this kernel\n");
 }
-#endif /* !CONFIG_KGDB_KDB */
+#endif /* !CONFIG_KGDB */
 
 /*
  * UV NMI handler
@@ -606,9 +634,9 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
 	if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump"))
 		uv_nmi_dump_state(cpu, regs, master);
 
-	/* Call KDB if enabled */
-	else if (uv_nmi_action_is("kdb"))
-		uv_call_kdb(cpu, regs, master);
+	/* Call KGDB/KDB if enabled */
+	else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb"))
+		uv_call_kgdb_kdb(cpu, regs, master);
 
 	/* Clear per_cpu "in nmi" flag */
 	atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_OUT);
@@ -634,7 +662,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
 /*
  * NMI handler for pulling in CPUs when perf events are grabbing our NMI
  */
-int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs)
+static int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs)
 {
 	int ret;
 
@@ -651,7 +679,7 @@ int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs)
 	return ret;
 }
 
-void uv_register_nmi_notifier(void)
+static void uv_register_nmi_notifier(void)
 {
 	if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv"))
 		pr_warn("UV: NMI handler failed to register\n");
@@ -695,6 +723,5 @@ void uv_nmi_setup(void)
 		uv_hub_nmi_per(cpu) = uv_hub_nmi_list[nid];
 	}
 	BUG_ON(!alloc_cpumask_var(&uv_nmi_cpu_mask, GFP_KERNEL));
+	uv_register_nmi_notifier();
 }
-
-
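
The kdb/kgdb action split is steered by the plain string module parameter shown above; uv_nmi_action_is() boils down to a string compare against that buffer, roughly as below (the helper body is an assumption for illustration, not copied from the driver):

#include <stdio.h>
#include <string.h>

/* Sketch of the action-string dispatch driven by the "action"
 * module parameter. */
static char nmi_action[8] = "kgdb";

static int action_is(const char *act)
{
	/* compare including the terminator, so "kdb" != "kdump" */
	return strncmp(nmi_action, act, strlen(act) + 1) == 0;
}

int main(void)
{
	if (action_is("kdb") || action_is("kgdb"))
		puts("would enter the debugger from the NMI handler");
	else if (action_is("kdump"))
		puts("would trigger a crash dump");
	return 0;
}
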
diff --git a/arch/x86/platform/visws/Makefile b/arch/x86/platform/visws/Makefile
deleted file mode 100644
index 91bc17ab2fd5..000000000000
--- a/arch/x86/platform/visws/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
deleted file mode 100644
index 94d8a39332ec..000000000000
--- a/arch/x86/platform/visws/visws_quirks.c
+++ /dev/null
@@ -1,608 +0,0 @@
-/*
- * SGI Visual Workstation support and quirks, unmaintained.
- *
- * Split out from setup.c by davej@suse.de
- *
- * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
- *
- * SGI Visual Workstation interrupt controller
- *
- * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
- * which serves as the main interrupt controller in the system.  Non-legacy
- * hardware in the system uses this controller directly.  Legacy devices
- * are connected to the PIIX4 which in turn has its 8259(s) connected to
- * a of the Cobalt APIC entry.
- *
- * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
- *
- * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
- */
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/visws/cobalt.h>
-#include <asm/visws/piix4.h>
-#include <asm/io_apic.h>
-#include <asm/fixmap.h>
-#include <asm/reboot.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/time.h>
-#include <asm/io.h>
-
-#include <linux/kernel_stat.h>
-
-#include <asm/i8259.h>
-#include <asm/irq_vectors.h>
-#include <asm/visws/lithium.h>
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
-
-extern int no_broadcast;
-
-char visws_board_type = -1;
-char visws_board_rev = -1;
-
-static void __init visws_time_init(void)
-{
-	printk(KERN_INFO "Starting Cobalt Timer system clock\n");
-
-	/* Set the countdown value */
-	co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
-
-	/* Start the timer */
-	co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
-
-	/* Enable (unmask) the timer interrupt */
-	co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
-
-	setup_default_timer_irq();
-}
-
-/* Replaces the default init_ISA_irqs in the generic setup */
-static void __init visws_pre_intr_init(void);
-
-/* Quirk for machine specific memory setup. */
-
-#define MB (1024 * 1024)
-
-unsigned long sgivwfb_mem_phys;
-unsigned long sgivwfb_mem_size;
-EXPORT_SYMBOL(sgivwfb_mem_phys);
-EXPORT_SYMBOL(sgivwfb_mem_size);
-
-long long mem_size __initdata = 0;
-
-static char * __init visws_memory_setup(void)
-{
-	long long gfx_mem_size = 8 * MB;
-
-	mem_size = boot_params.alt_mem_k;
-
-	if (!mem_size) {
-		printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
-		mem_size = 128 * MB;
-	}
-
-	/*
-	 * this hardcodes the graphics memory to 8 MB
-	 * it really should be sized dynamically (or at least
-	 * set as a boot param)
-	 */
-	if (!sgivwfb_mem_size) {
-		printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
-		sgivwfb_mem_size = 8 * MB;
-	}
-
-	/*
-	 * Trim to nearest MB
-	 */
-	sgivwfb_mem_size &= ~((1 << 20) - 1);
-	sgivwfb_mem_phys = mem_size - gfx_mem_size;
-
-	e820_add_region(0, LOWMEMSIZE(), E820_RAM);
-	e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
-	e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
-
-	return "PROM";
-}
-
-static void visws_machine_emergency_restart(void)
-{
-	/*
-	 * Visual Workstations restart after this
-	 * register is poked on the PIIX4
-	 */
-	outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
-}
-
-static void visws_machine_power_off(void)
-{
-	unsigned short pm_status;
-/*	extern unsigned int pci_bus0; */
-
-	while ((pm_status = inw(PMSTS_PORT)) & 0x100)
-		outw(pm_status, PMSTS_PORT);
-
-	outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
-
-	mdelay(10);
-
-#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
-	(0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
-
-/*	outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
-	outl(PIIX_SPECIAL_STOP, 0xCFC);
-}
-
-static void __init visws_get_smp_config(unsigned int early)
-{
-}
-
-/*
- * The Visual Workstation is Intel MP compliant in the hardware
- * sense, but it doesn't have a BIOS(-configuration table).
- * No problem for Linux.
- */
-
-static void __init MP_processor_info(struct mpc_cpu *m)
-{
-	int ver, logical_apicid;
-	physid_mask_t apic_cpus;
-
-	if (!(m->cpuflag & CPU_ENABLED))
-		return;
-
-	logical_apicid = m->apicid;
-	printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
-	       m->cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
-	       m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
-	       (m->cpufeature & CPU_MODEL_MASK) >> 4, m->apicver);
-
-	if (m->cpuflag & CPU_BOOTPROCESSOR)
-		boot_cpu_physical_apicid = m->apicid;
-
-	ver = m->apicver;
-	if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
-		printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
-			m->apicid, MAX_LOCAL_APIC);
-		return;
-	}
-
-	apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
-	physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
-	/*
-	 * Validate version
-	 */
-	if (ver == 0x0) {
-		printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
-			"fixing up to 0x10. (tell your hw vendor)\n",
-			m->apicid);
-		ver = 0x10;
-	}
-	apic_version[m->apicid] = ver;
-}
-
-static void __init visws_find_smp_config(void)
-{
-	struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
-	unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
-
-	if (ncpus > CO_CPU_MAX) {
-		printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
-			ncpus, mp);
-
-		ncpus = CO_CPU_MAX;
-	}
-
-	if (ncpus > setup_max_cpus)
-		ncpus = setup_max_cpus;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	smp_found_config = 1;
-#endif
-	while (ncpus--)
-		MP_processor_info(mp++);
-
-	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-}
-
-static void visws_trap_init(void);
-
-void __init visws_early_detect(void)
-{
-	int raw;
-
-	visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
-							 >> PIIX_GPI_BD_SHIFT;
-
-	if (visws_board_type < 0)
-		return;
-
-	/*
-	 * Override the default platform setup functions
-	 */
-	x86_init.resources.memory_setup = visws_memory_setup;
-	x86_init.mpparse.get_smp_config = visws_get_smp_config;
-	x86_init.mpparse.find_smp_config = visws_find_smp_config;
-	x86_init.irqs.pre_vector_init = visws_pre_intr_init;
-	x86_init.irqs.trap_init = visws_trap_init;
-	x86_init.timers.timer_init = visws_time_init;
-	x86_init.pci.init = pci_visws_init;
-	x86_init.pci.init_irq = x86_init_noop;
-
-	/*
-	 * Install reboot quirks:
-	 */
-	pm_power_off = visws_machine_power_off;
-	machine_ops.emergency_restart = visws_machine_emergency_restart;
-
-	/*
-	 * Do not use broadcast IPIs:
-	 */
-	no_broadcast = 0;
-
-#ifdef CONFIG_X86_IO_APIC
-	/*
-	 * Turn off IO-APIC detection and initialization:
-	 */
-	skip_ioapic_setup = 1;
-#endif
-
-	/*
-	 * Get Board rev.
-	 * First, we have to initialize the 307 part to allow us access
-	 * to the GPIO registers. Let's map them at 0x0fc0 which is right
-	 * after the PIIX4 PM section.
-	 */
-	outb_p(SIO_DEV_SEL, SIO_INDEX);
-	outb_p(SIO_GP_DEV, SIO_DATA);	/* Talk to GPIO regs. */
-
-	outb_p(SIO_DEV_MSB, SIO_INDEX);
-	outb_p(SIO_GP_MSB, SIO_DATA);	/* MSB of GPIO base address */
-
-	outb_p(SIO_DEV_LSB, SIO_INDEX);
-	outb_p(SIO_GP_LSB, SIO_DATA);	/* LSB of GPIO base address */
-
-	outb_p(SIO_DEV_ENB, SIO_INDEX);
-	outb_p(1, SIO_DATA);		/* Enable GPIO registers. */
-
-	/*
-	 * Now, we have to map the power management section to write
-	 * a bit which enables access to the GPIO registers.
-	 * What lunatic came up with this shit?
-	 */
-	outb_p(SIO_DEV_SEL, SIO_INDEX);
-	outb_p(SIO_PM_DEV, SIO_DATA);	/* Talk to GPIO regs. */
-
-	outb_p(SIO_DEV_MSB, SIO_INDEX);
-	outb_p(SIO_PM_MSB, SIO_DATA);	/* MSB of PM base address */
-
-	outb_p(SIO_DEV_LSB, SIO_INDEX);
-	outb_p(SIO_PM_LSB, SIO_DATA);	/* LSB of PM base address */
-
-	outb_p(SIO_DEV_ENB, SIO_INDEX);
-	outb_p(1, SIO_DATA);		/* Enable PM registers. */
-
-	/*
-	 * Now, write the PM register which enables the GPIO registers.
-	 */
-	outb_p(SIO_PM_FER2, SIO_PM_INDEX);
-	outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
-
-	/*
-	 * Now, initialize the GPIO registers.
-	 * We want them all to be inputs which is the
-	 * power on default, so let's leave them alone.
-	 * So, let's just read the board rev!
-	 */
-	raw = inb_p(SIO_GP_DATA1);
-	raw &= 0x7f;	/* 7 bits of valid board revision ID. */
-
-	if (visws_board_type == VISWS_320) {
-		if (raw < 0x6) {
-			visws_board_rev = 4;
-		} else if (raw < 0xc) {
-			visws_board_rev = 5;
-		} else {
-			visws_board_rev = 6;
-		}
-	} else if (visws_board_type == VISWS_540) {
-		visws_board_rev = 2;
-	} else {
-		visws_board_rev = raw;
-	}
-
-	printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
-	       (visws_board_type == VISWS_320 ? "320" :
-	       (visws_board_type == VISWS_540 ? "540" :
-		"unknown")), visws_board_rev);
-}
-
-#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
-#define BCD (LI_INTB | LI_INTC | LI_INTD)
-#define ALLDEVS (A01234 | BCD)
-
-static __init void lithium_init(void)
-{
-	set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
-	set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
-
-	if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
-	    (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
-		printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
-/*		panic("This machine is not SGI Visual Workstation 320/540"); */
-	}
-
-	if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
-	    (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
-		printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
-/*		panic("This machine is not SGI Visual Workstation 320/540"); */
-	}
-
-	li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
-	li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
-}
-
-static __init void cobalt_init(void)
-{
-	/*
-	 * On normal SMP PC this is used only with SMP, but we have to
-	 * use it and set it up here to start the Cobalt clock
-	 */
-	set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
-	setup_local_APIC();
-	printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
-		(unsigned int)apic_read(APIC_LVR),
-		(unsigned int)apic_read(APIC_ID));
-
-	set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
-	set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
-	printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
-		co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
-
-	/* Enable Cobalt APIC being careful to NOT change the ID! */
-	co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
-
-	printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
-		co_apic_read(CO_APIC_ID));
-}
-
-static void __init visws_trap_init(void)
-{
-	lithium_init();
-	cobalt_init();
-}
-
-/*
- * IRQ controller / APIC support:
- */
-
-static DEFINE_SPINLOCK(cobalt_lock);
-
-/*
- * Set the given Cobalt APIC Redirection Table entry to point
- * to the given IDT vector/index.
- */
-static inline void co_apic_set(int entry, int irq)
-{
-	co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
-	co_apic_write(CO_APIC_HI(entry), 0);
-}
-
-/*
- * Cobalt (IO)-APIC functions to handle PCI devices.
- */
-static inline int co_apic_ide0_hack(void)
-{
-	extern char visws_board_type;
-	extern char visws_board_rev;
-
-	if (visws_board_type == VISWS_320 && visws_board_rev == 5)
-		return 5;
-	return CO_APIC_IDE0;
-}
-
-static int is_co_apic(unsigned int irq)
-{
-	if (IS_CO_APIC(irq))
-		return CO_APIC(irq);
-
-	switch (irq) {
-	case 0: return CO_APIC_CPU;
-	case CO_IRQ_IDE0: return co_apic_ide0_hack();
-	case CO_IRQ_IDE1: return CO_APIC_IDE1;
-	default: return -1;
-	}
-}
-
-
-/*
- * This is the SGI Cobalt (IO-)APIC:
- */
-static void enable_cobalt_irq(struct irq_data *data)
-{
-	co_apic_set(is_co_apic(data->irq), data->irq);
-}
-
-static void disable_cobalt_irq(struct irq_data *data)
-{
-	int entry = is_co_apic(data->irq);
-
-	co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
-	co_apic_read(CO_APIC_LO(entry));
-}
-
-static void ack_cobalt_irq(struct irq_data *data)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cobalt_lock, flags);
-	disable_cobalt_irq(data);
-	apic_write(APIC_EOI, APIC_EOI_ACK);
-	spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
-static struct irq_chip cobalt_irq_type = {
-	.name = "Cobalt-APIC",
-	.irq_enable = enable_cobalt_irq,
-	.irq_disable = disable_cobalt_irq,
-	.irq_ack = ack_cobalt_irq,
-};
-
-
-/*
- * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
- * -- not the manner expected by the code in i8259.c.
- *
- * there is a 'master' physical interrupt source that gets sent to
- * the CPU. But in the chipset there are various 'virtual' interrupts
- * waiting to be handled. We represent this to Linux through a 'master'
- * interrupt controller type, and through a special virtual interrupt-
- * controller. Device drivers only see the virtual interrupt sources.
- */
-static unsigned int startup_piix4_master_irq(struct irq_data *data)
-{
-	legacy_pic->init(0);
-	enable_cobalt_irq(data);
-	return 0;
-}
-
-static struct irq_chip piix4_master_irq_type = {
-	.name = "PIIX4-master",
-	.irq_startup = startup_piix4_master_irq,
-	.irq_ack = ack_cobalt_irq,
-};
-
-static void pii4_mask(struct irq_data *data) { }
-
-static struct irq_chip piix4_virtual_irq_type = {
-	.name = "PIIX4-virtual",
-	.irq_mask = pii4_mask,
-};
-
-/*
- * PIIX4-8259 master/virtual functions to handle interrupt requests
- * from legacy devices: floppy, parallel, serial, rtc.
- *
- * None of these get Cobalt APIC entries, neither do they have IDT
- * entries. These interrupts are purely virtual and distributed from
- * the 'master' interrupt source: CO_IRQ_8259.
- *
- * When the 8259 interrupts its handler figures out which of these
- * devices is interrupting and dispatches to its handler.
- *
- * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
- * enable_irq gets the right irq. This 'master' irq is never directly
- * manipulated by any driver.
- */
-static irqreturn_t piix4_master_intr(int irq, void *dev_id)
-{
-	unsigned long flags;
-	int realirq;
-
-	raw_spin_lock_irqsave(&i8259A_lock, flags);
-
-	/* Find out what's interrupting in the PIIX4 master 8259 */
-	outb(0x0c, 0x20);		/* OCW3 Poll command */
-	realirq = inb(0x20);
-
-	/*
-	 * Bit 7 == 0 means invalid/spurious
-	 */
-	if (unlikely(!(realirq & 0x80)))
-		goto out_unlock;
-
-	realirq &= 7;
-
-	if (unlikely(realirq == 2)) {
-		outb(0x0c, 0xa0);
-		realirq = inb(0xa0);
-
-		if (unlikely(!(realirq & 0x80)))
-			goto out_unlock;
-
-		realirq = (realirq & 7) + 8;
-	}
-
-	/* mask and ack interrupt */
-	cached_irq_mask |= 1 << realirq;
-	if (unlikely(realirq > 7)) {
-		inb(0xa1);
-		outb(cached_slave_mask, 0xa1);
-		outb(0x60 + (realirq & 7), 0xa0);
-		outb(0x60 + 2, 0x20);
-	} else {
-		inb(0x21);
-		outb(cached_master_mask, 0x21);
-		outb(0x60 + realirq, 0x20);
-	}
-
-	raw_spin_unlock_irqrestore(&i8259A_lock, flags);
-
-	/*
-	 * handle this 'virtual interrupt' as a Cobalt one now.
-	 */
-	generic_handle_irq(realirq);
-
-	return IRQ_HANDLED;
-
-out_unlock:
-	raw_spin_unlock_irqrestore(&i8259A_lock, flags);
-	return IRQ_NONE;
-}
-
-static struct irqaction master_action = {
-	.handler = piix4_master_intr,
-	.name = "PIIX4-8259",
-	.flags = IRQF_NO_THREAD,
-};
-
-static struct irqaction cascade_action = {
-	.handler = no_action,
-	.name = "cascade",
-	.flags = IRQF_NO_THREAD,
-};
-
-static inline void set_piix4_virtual_irq_type(void)
-{
-	piix4_virtual_irq_type.irq_enable = i8259A_chip.irq_unmask;
-	piix4_virtual_irq_type.irq_disable = i8259A_chip.irq_mask;
-	piix4_virtual_irq_type.irq_unmask = i8259A_chip.irq_unmask;
-}
-
-static void __init visws_pre_intr_init(void)
-{
-	int i;
-
-	set_piix4_virtual_irq_type();
-
-	for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
-		struct irq_chip *chip = NULL;
-
-		if (i == 0)
-			chip = &cobalt_irq_type;
-		else if (i == CO_IRQ_IDE0)
-			chip = &cobalt_irq_type;
-		else if (i == CO_IRQ_IDE1)
-			chip = &cobalt_irq_type;
-		else if (i == CO_IRQ_8259)
-			chip = &piix4_master_irq_type;
-		else if (i < CO_IRQ_APIC0)
-			chip = &piix4_virtual_irq_type;
-		else if (IS_CO_APIC(i))
-			chip = &cobalt_irq_type;
-
-		if (chip)
-			irq_set_chip(i, chip);
-	}
-
-	setup_irq(CO_IRQ_8259, &master_action);
-	setup_irq(2, &cascade_action);
-}
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index a44f457e70a1..bad628a620c4 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -29,12 +29,10 @@ void __init reserve_real_mode(void)
 void __init setup_real_mode(void)
 {
 	u16 real_mode_seg;
-	u32 *rel;
+	const u32 *rel;
 	u32 count;
-	u32 *ptr;
-	u16 *seg;
-	int i;
 	unsigned char *base;
+	unsigned long phys_base;
 	struct trampoline_header *trampoline_header;
 	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
 #ifdef CONFIG_X86_64
@@ -46,23 +44,23 @@ void __init setup_real_mode(void)
 
 	memcpy(base, real_mode_blob, size);
 
-	real_mode_seg = __pa(base) >> 4;
+	phys_base = __pa(base);
+	real_mode_seg = phys_base >> 4;
+
 	rel = (u32 *) real_mode_relocs;
 
 	/* 16-bit segment relocations. */
-	count = rel[0];
-	rel = &rel[1];
-	for (i = 0; i < count; i++) {
-		seg = (u16 *) (base + rel[i]);
+	count = *rel++;
+	while (count--) {
+		u16 *seg = (u16 *) (base + *rel++);
 		*seg = real_mode_seg;
 	}
 
 	/* 32-bit linear relocations. */
-	count = rel[i];
-	rel = &rel[i + 1];
-	for (i = 0; i < count; i++) {
-		ptr = (u32 *) (base + rel[i]);
-		*ptr += __pa(base);
+	count = *rel++;
+	while (count--) {
+		u32 *ptr = (u32 *) (base + *rel++);
+		*ptr += phys_base;
 	}
 
 	/* Must be performed *after* relocation. */
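
The loop rewrite tracks the layout of the relocation blob the build tools emit: a count word followed by that many offsets, first for the 16-bit segment fixups and then again for the 32-bit linear ones. The old indexed form quietly depended on i still holding the first count after the first loop in order to locate the second; the pointer walk makes the layout explicit. A toy walk over the same shape:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Toy relocation blob in the shape setup_real_mode() consumes:
	 * a count followed by that many offsets, twice over. */
	uint32_t blob[] = { 2, 0x10, 0x24,	/* 16-bit segment relocs */
			    1, 0x40 };		/* 32-bit linear relocs */
	const uint32_t *rel = blob;
	uint32_t count;

	count = *rel++;
	while (count--)
		printf("patch segment at offset %#x\n", *rel++);

	count = *rel++;
	while (count--)
		printf("patch linear address at offset %#x\n", *rel++);
	return 0;
}
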
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 9cac82588cbc..3497f14e4dea 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -64,20 +64,7 @@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE
 
 # ---------------------------------------------------------------------------
 
-# How to compile the 16-bit code.  Note we always compile for -march=i386,
-# that way we can complain to the user if the CPU is insufficient.
-KBUILD_CFLAGS	:= $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \
-		   -I$(srctree)/arch/x86/boot \
-		   -DDISABLE_BRANCH_PROFILING \
-		   -Wall -Wstrict-prototypes \
-		   -march=i386 -mregparm=3 \
-		   -include $(srctree)/$(src)/../../boot/code16gcc.h \
-		   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-		   -mno-mmx -mno-sse \
-		   $(call cc-option, -ffreestanding) \
-		   $(call cc-option, -fno-toplevel-reorder,\
-			$(call cc-option, -fno-unit-at-a-time)) \
-		   $(call cc-option, -fno-stack-protector) \
-		   $(call cc-option, -mpreferred-stack-boundary=2)
+KBUILD_CFLAGS	:= $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \
+		   -I$(srctree)/arch/x86/boot
 KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__
 GCOV_PROFILE := n
diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S
index f932ea61d1c8..d66c607bdc58 100644
--- a/arch/x86/realmode/rm/reboot.S
+++ b/arch/x86/realmode/rm/reboot.S
@@ -1,5 +1,4 @@
 #include <linux/linkage.h>
-#include <linux/init.h>
 #include <asm/segment.h>
 #include <asm/page_types.h>
 #include <asm/processor-flags.h>
diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
index c1b2791183e7..48ddd76bc4c3 100644
--- a/arch/x86/realmode/rm/trampoline_32.S
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -20,7 +20,6 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/init.h>
 #include <asm/segment.h>
 #include <asm/page_types.h>
 #include "realmode.h"
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index bb360dc39d21..dac7b20d2f9d 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -25,7 +25,6 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/init.h>
 #include <asm/pgtable_types.h>
 #include <asm/page_types.h>
 #include <asm/msr.h>
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index aabfb8380a1c..96bc506ac6de 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -357,3 +357,5 @@
 348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
 349	i386	kcmp			sys_kcmp
 350	i386	finit_module		sys_finit_module
+351	i386	sched_setattr		sys_sched_setattr
+352	i386	sched_getattr		sys_sched_getattr
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 38ae65dfd14f..04376ac3d9ef 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -320,6 +320,9 @@
 311	64	process_vm_writev	sys_process_vm_writev
 312	common	kcmp			sys_kcmp
 313	common	finit_module		sys_finit_module
+314	common	sched_setattr		sys_sched_setattr
+315	common	sched_getattr		sys_sched_getattr
+316	common	renameat2		sys_renameat2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
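
Per the x86-64 rows above, sched_setattr, sched_getattr and renameat2 are syscalls 314, 315 and 316; until a libc grows wrappers, they are reached through syscall(2) with the raw number. A sketch for renameat2, valid for x86-64 only (the i386 hunk above stops at 352, so the 32-bit number is not shown in this document):

#define _GNU_SOURCE
#include <fcntl.h>		/* AT_FDCWD */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_renameat2
#define __NR_renameat2 316	/* x86-64 number from the table above */
#endif

int main(void)
{
	long ret = syscall(__NR_renameat2, AT_FDCWD, "old.txt",
			   AT_FDCWD, "new.txt", 0 /* flags */);

	if (ret < 0)
		perror("renameat2");
	return 0;
}
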
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index f7bab68a4b83..bbb1d2259ecf 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -69,8 +69,8 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
69 "__per_cpu_load|" 69 "__per_cpu_load|"
70 "init_per_cpu__.*|" 70 "init_per_cpu__.*|"
71 "__end_rodata_hpage_align|" 71 "__end_rodata_hpage_align|"
72 "__vvar_page|"
73#endif 72#endif
73 "__vvar_page|"
74 "_end)$" 74 "_end)$"
75}; 75};
76 76
@@ -722,15 +722,25 @@ static void percpu_init(void)
 
 /*
  * Check to see if a symbol lies in the .data..percpu section.
- * For some as yet not understood reason the "__init_begin"
- * symbol which immediately preceeds the .data..percpu section
- * also shows up as it it were part of it so we do an explict
- * check for that symbol name and ignore it.
+ *
+ * The linker incorrectly associates some symbols with the
+ * .data..percpu section so we also need to check the symbol
+ * name to make sure that we classify the symbol correctly.
+ *
+ * The GNU linker incorrectly associates:
+ *	__init_begin
+ *	__per_cpu_load
+ *
+ * The "gold" linker incorrectly associates:
+ *	init_per_cpu__irq_stack_union
+ *	init_per_cpu__gdt_page
  */
 static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
 {
 	return (sym->st_shndx == per_cpu_shndx) &&
-		strcmp(symname, "__init_begin");
+		strcmp(symname, "__init_begin") &&
+		strcmp(symname, "__per_cpu_load") &&
+		strncmp(symname, "init_per_cpu_", 13);
 }
 
 
1015 } 1025 }
1016} 1026}
1017 1027
1028/*
1029 * As an aid to debugging problems with different linkers
1030 * print summary information about the relocs.
1031 * Since different linkers tend to emit the sections in
1032 * different orders we use the section names in the output.
1033 */
1034static int do_reloc_info(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
1035 const char *symname)
1036{
1037 printf("%s\t%s\t%s\t%s\n",
1038 sec_name(sec->shdr.sh_info),
1039 rel_type(ELF_R_TYPE(rel->r_info)),
1040 symname,
1041 sec_name(sym->st_shndx));
1042 return 0;
1043}
1044
1045static void print_reloc_info(void)
1046{
1047 printf("reloc section\treloc type\tsymbol\tsymbol section\n");
1048 walk_relocs(do_reloc_info);
1049}
1050
1018#if ELF_BITS == 64 1051#if ELF_BITS == 64
1019# define process process_64 1052# define process process_64
1020#else 1053#else
@@ -1022,7 +1055,8 @@ static void emit_relocs(int as_text, int use_real_mode)
 #endif
 
 void process(FILE *fp, int use_real_mode, int as_text,
-	     int show_absolute_syms, int show_absolute_relocs)
+	     int show_absolute_syms, int show_absolute_relocs,
+	     int show_reloc_info)
 {
 	regex_init(use_real_mode);
 	read_ehdr(fp);
@@ -1040,5 +1074,9 @@ void process(FILE *fp, int use_real_mode, int as_text,
 		print_absolute_relocs();
 		return;
 	}
+	if (show_reloc_info) {
+		print_reloc_info();
+		return;
+	}
 	emit_relocs(as_text, use_real_mode);
 }
diff --git a/arch/x86/tools/relocs.h b/arch/x86/tools/relocs.h
index 07cdb1eca4fa..f59590645b68 100644
--- a/arch/x86/tools/relocs.h
+++ b/arch/x86/tools/relocs.h
@@ -29,8 +29,9 @@ enum symtype {
 };
 
 void process_32(FILE *fp, int use_real_mode, int as_text,
-		int show_absolute_syms, int show_absolute_relocs);
+		int show_absolute_syms, int show_absolute_relocs,
+		int show_reloc_info);
 void process_64(FILE *fp, int use_real_mode, int as_text,
-		int show_absolute_syms, int show_absolute_relocs);
-
+		int show_absolute_syms, int show_absolute_relocs,
+		int show_reloc_info);
 #endif /* RELOCS_H */
diff --git a/arch/x86/tools/relocs_common.c b/arch/x86/tools/relocs_common.c
index 44d396823a53..acab636bcb34 100644
--- a/arch/x86/tools/relocs_common.c
+++ b/arch/x86/tools/relocs_common.c
@@ -11,12 +11,13 @@ void die(char *fmt, ...)
 
 static void usage(void)
 {
-	die("relocs [--abs-syms|--abs-relocs|--text|--realmode] vmlinux\n");
+	die("relocs [--abs-syms|--abs-relocs|--reloc-info|--text|--realmode]" \
+	    " vmlinux\n");
 }
 
 int main(int argc, char **argv)
 {
-	int show_absolute_syms, show_absolute_relocs;
+	int show_absolute_syms, show_absolute_relocs, show_reloc_info;
 	int as_text, use_real_mode;
 	const char *fname;
 	FILE *fp;
@@ -25,6 +26,7 @@ int main(int argc, char **argv)
 
 	show_absolute_syms = 0;
 	show_absolute_relocs = 0;
+	show_reloc_info = 0;
 	as_text = 0;
 	use_real_mode = 0;
 	fname = NULL;
@@ -39,6 +41,10 @@ int main(int argc, char **argv)
 			show_absolute_relocs = 1;
 			continue;
 		}
+		if (strcmp(arg, "--reloc-info") == 0) {
+			show_reloc_info = 1;
+			continue;
+		}
 		if (strcmp(arg, "--text") == 0) {
 			as_text = 1;
 			continue;
@@ -67,10 +73,12 @@ int main(int argc, char **argv)
 	rewind(fp);
 	if (e_ident[EI_CLASS] == ELFCLASS64)
 		process_64(fp, use_real_mode, as_text,
-			   show_absolute_syms, show_absolute_relocs);
+			   show_absolute_syms, show_absolute_relocs,
+			   show_reloc_info);
 	else
 		process_32(fp, use_real_mode, as_text,
-			   show_absolute_syms, show_absolute_relocs);
+			   show_absolute_syms, show_absolute_relocs,
+			   show_reloc_info);
 	fclose(fp);
 	return 0;
 }
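
With this in place, relocs --reloc-info vmlinux dumps one tab-separated line per relocation (reloc section, reloc type, symbol, symbol section, matching the header print_reloc_info() emits), which is what makes diffing GNU ld output against gold output practical.
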
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 7d01b8c56c00..cc04e67bfd05 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -40,11 +40,7 @@
 #define smp_rmb()	barrier()
 #endif /* CONFIG_X86_PPRO_FENCE */
 
-#ifdef CONFIG_X86_OOSTORE
-#define smp_wmb()	wmb()
-#else /* CONFIG_X86_OOSTORE */
 #define smp_wmb()	barrier()
-#endif /* CONFIG_X86_OOSTORE */
 
 #define smp_read_barrier_depends()	read_barrier_depends()
 #define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index fd14be1d1472..c580d1210ffe 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -2,6 +2,8 @@
 # Building vDSO images for x86.
 #
 
+KBUILD_CFLAGS += $(DISABLE_LTO)
+
 VDSO64-$(CONFIG_X86_64)		:= y
 VDSOX32-$(CONFIG_X86_X32_ABI)	:= y
 VDSO32-$(CONFIG_X86_32)		:= y
@@ -21,7 +23,8 @@ vobjs-$(VDSOX32-y) += $(vobjx32s-compat)
 vobj64s := $(filter-out $(vobjx32s-compat),$(vobjs-y))
 
 # files to link into kernel
-obj-$(VDSO64-y)			+= vma.o vdso.o
+obj-y				+= vma.o
+obj-$(VDSO64-y)			+= vdso.o
 obj-$(VDSOX32-y)		+= vdsox32.o
 obj-$(VDSO32-y)			+= vdso32.o vdso32-setup.o
 
@@ -35,7 +38,8 @@ export CPPFLAGS_vdso.lds += -P -C
 
 VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
 			-Wl,--no-undefined \
-			-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
+			-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \
+			$(DISABLE_LTO)
 
 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
 
@@ -127,7 +131,7 @@ vdso32.so-$(VDSO32-y) += sysenter
 vdso32-images			= $(vdso32.so-y:%=vdso32-%.so)
 
 CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
-VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1
+VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
 
 # This makes sure the $(obj) subdirectory exists even though vdso32/
 # is not a kbuild sub-make subdirectory.
@@ -135,7 +139,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
135 139
136targets += vdso32/vdso32.lds 140targets += vdso32/vdso32.lds
137targets += $(vdso32-images) $(vdso32-images:=.dbg) 141targets += $(vdso32-images) $(vdso32-images:=.dbg)
138targets += vdso32/note.o $(vdso32.so-y:%=vdso32/%.o) 142targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o)
139 143
140extra-y += $(vdso32-images) 144extra-y += $(vdso32-images)
141 145
@@ -145,8 +149,19 @@ KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
145$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) 149$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
146$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 150$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32
147 151
152KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
153KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
154KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
155KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
156KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
157KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
158KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
159KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
160$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
161
148$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ 162$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
149 $(obj)/vdso32/vdso32.lds \ 163 $(obj)/vdso32/vdso32.lds \
164 $(obj)/vdso32/vclock_gettime.o \
150 $(obj)/vdso32/note.o \ 165 $(obj)/vdso32/note.o \
151 $(obj)/vdso32/%.o 166 $(obj)/vdso32/%.o
152 $(call if_changed,vdso) 167 $(call if_changed,vdso)
@@ -181,7 +196,8 @@ quiet_cmd_vdso = VDSO $@
181 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ 196 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
182 sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' 197 sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
183 198
184VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) 199VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \
200 $(LTO_CFLAGS)
185GCOV_PROFILE := n 201GCOV_PROFILE := n
186 202
187# 203#
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index eb5d7a56f8d4..16d686171e9a 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -4,6 +4,9 @@
4 * 4 *
5 * Fast user context implementation of clock_gettime, gettimeofday, and time. 5 * Fast user context implementation of clock_gettime, gettimeofday, and time.
6 * 6 *
7 * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net>
8 * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
9 *
7 * The code should have no internal unresolved relocations. 10 * The code should have no internal unresolved relocations.
8 * Check with readelf after changing. 11 * Check with readelf after changing.
9 */ 12 */
@@ -11,56 +14,55 @@
11/* Disable profiling for userspace code: */ 14/* Disable profiling for userspace code: */
12#define DISABLE_BRANCH_PROFILING 15#define DISABLE_BRANCH_PROFILING
13 16
14#include <linux/kernel.h> 17#include <uapi/linux/time.h>
15#include <linux/posix-timers.h>
16#include <linux/time.h>
17#include <linux/string.h>
18#include <asm/vsyscall.h>
19#include <asm/fixmap.h>
20#include <asm/vgtod.h> 18#include <asm/vgtod.h>
21#include <asm/timex.h>
22#include <asm/hpet.h> 19#include <asm/hpet.h>
20#include <asm/vvar.h>
23#include <asm/unistd.h> 21#include <asm/unistd.h>
24#include <asm/io.h> 22#include <asm/msr.h>
25#include <asm/pvclock.h> 23#include <linux/math64.h>
24#include <linux/time.h>
26 25
27#define gtod (&VVAR(vsyscall_gtod_data)) 26#define gtod (&VVAR(vsyscall_gtod_data))
28 27
29notrace static cycle_t vread_tsc(void) 28extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts);
29extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz);
30extern time_t __vdso_time(time_t *t);
31
32#ifdef CONFIG_HPET_TIMER
33static inline u32 read_hpet_counter(const volatile void *addr)
30{ 34{
31 cycle_t ret; 35 return *(const volatile u32 *) (addr + HPET_COUNTER);
32 u64 last; 36}
37#endif
33 38
34 /* 39#ifndef BUILD_VDSO32
35 * Empirically, a fence (of type that depends on the CPU)
36 * before rdtsc is enough to ensure that rdtsc is ordered
37 * with respect to loads. The various CPU manuals are unclear
38 * as to whether rdtsc can be reordered with later loads,
39 * but no one has ever seen it happen.
40 */
41 rdtsc_barrier();
42 ret = (cycle_t)vget_cycles();
43 40
44 last = VVAR(vsyscall_gtod_data).clock.cycle_last; 41#include <linux/kernel.h>
42#include <asm/vsyscall.h>
43#include <asm/fixmap.h>
44#include <asm/pvclock.h>
45 45
46 if (likely(ret >= last)) 46static notrace cycle_t vread_hpet(void)
47 return ret; 47{
48 return read_hpet_counter((const void *)fix_to_virt(VSYSCALL_HPET));
49}
48 50
49 /* 51notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
50 * GCC likes to generate cmov here, but this branch is extremely 52{
51 * predictable (it's just a function of time and the likely is 53 long ret;
52 * very likely) and there's a data dependence, so force GCC 54 asm("syscall" : "=a" (ret) :
53 * to generate a branch instead. I don't barrier() because 55 "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory");
54 * we don't actually need a barrier, and if this function 56 return ret;
55 * ever gets inlined it will generate worse code.
56 */
57 asm volatile ("");
58 return last;
59} 57}
60 58
61static notrace cycle_t vread_hpet(void) 59notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
62{ 60{
63 return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + HPET_COUNTER); 61 long ret;
62
63 asm("syscall" : "=a" (ret) :
64 "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
65 return ret;
64} 66}
65 67
66#ifdef CONFIG_PARAVIRT_CLOCK 68#ifdef CONFIG_PARAVIRT_CLOCK
@@ -124,7 +126,7 @@ static notrace cycle_t vread_pvclock(int *mode)
124 *mode = VCLOCK_NONE; 126 *mode = VCLOCK_NONE;
125 127
126 /* refer to tsc.c read_tsc() comment for rationale */ 128 /* refer to tsc.c read_tsc() comment for rationale */
127 last = VVAR(vsyscall_gtod_data).clock.cycle_last; 129 last = gtod->cycle_last;
128 130
129 if (likely(ret >= last)) 131 if (likely(ret >= last))
130 return ret; 132 return ret;
@@ -133,11 +135,30 @@ static notrace cycle_t vread_pvclock(int *mode)
133} 135}
134#endif 136#endif
135 137
138#else
139
140extern u8 hpet_page
141 __attribute__((visibility("hidden")));
142
143#ifdef CONFIG_HPET_TIMER
144static notrace cycle_t vread_hpet(void)
145{
146 return read_hpet_counter((const void *)(&hpet_page));
147}
148#endif
149
136notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) 150notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
137{ 151{
138 long ret; 152 long ret;
139 asm("syscall" : "=a" (ret) : 153
140 "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory"); 154 asm(
155 "mov %%ebx, %%edx \n"
156 "mov %2, %%ebx \n"
157 "call VDSO32_vsyscall \n"
158 "mov %%edx, %%ebx \n"
159 : "=a" (ret)
160 : "0" (__NR_clock_gettime), "g" (clock), "c" (ts)
161 : "memory", "edx");
141 return ret; 162 return ret;
142} 163}
143 164
@@ -145,28 +166,79 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
145{ 166{
146 long ret; 167 long ret;
147 168
148 asm("syscall" : "=a" (ret) : 169 asm(
149 "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); 170 "mov %%ebx, %%edx \n"
171 "mov %2, %%ebx \n"
172 "call VDSO32_vsyscall \n"
173 "mov %%edx, %%ebx \n"
174 : "=a" (ret)
175 : "0" (__NR_gettimeofday), "g" (tv), "c" (tz)
176 : "memory", "edx");
150 return ret; 177 return ret;
151} 178}
152 179
180#ifdef CONFIG_PARAVIRT_CLOCK
181
182static notrace cycle_t vread_pvclock(int *mode)
183{
184 *mode = VCLOCK_NONE;
185 return 0;
186}
187#endif
188
189#endif
190
191notrace static cycle_t vread_tsc(void)
192{
193 cycle_t ret;
194 u64 last;
195
196 /*
197 * Empirically, a fence (of type that depends on the CPU)
198 * before rdtsc is enough to ensure that rdtsc is ordered
199 * with respect to loads. The various CPU manuals are unclear
200 * as to whether rdtsc can be reordered with later loads,
201 * but no one has ever seen it happen.
202 */
203 rdtsc_barrier();
204 ret = (cycle_t)__native_read_tsc();
205
206 last = gtod->cycle_last;
207
208 if (likely(ret >= last))
209 return ret;
210
211 /*
212 * GCC likes to generate cmov here, but this branch is extremely
213 * predictable (it's just a function of time and the likely is
214 * very likely) and there's a data dependence, so force GCC
215 * to generate a branch instead. I don't barrier() because
216 * we don't actually need a barrier, and if this function
217 * ever gets inlined it will generate worse code.
218 */
219 asm volatile ("");
220 return last;
221}
153 222
154notrace static inline u64 vgetsns(int *mode) 223notrace static inline u64 vgetsns(int *mode)
155{ 224{
156 long v; 225 u64 v;
157 cycles_t cycles; 226 cycles_t cycles;
158 if (gtod->clock.vclock_mode == VCLOCK_TSC) 227
228 if (gtod->vclock_mode == VCLOCK_TSC)
159 cycles = vread_tsc(); 229 cycles = vread_tsc();
160 else if (gtod->clock.vclock_mode == VCLOCK_HPET) 230#ifdef CONFIG_HPET_TIMER
231 else if (gtod->vclock_mode == VCLOCK_HPET)
161 cycles = vread_hpet(); 232 cycles = vread_hpet();
233#endif
162#ifdef CONFIG_PARAVIRT_CLOCK 234#ifdef CONFIG_PARAVIRT_CLOCK
163 else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK) 235 else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
164 cycles = vread_pvclock(mode); 236 cycles = vread_pvclock(mode);
165#endif 237#endif
166 else 238 else
167 return 0; 239 return 0;
168 v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; 240 v = (cycles - gtod->cycle_last) & gtod->mask;
169 return v * gtod->clock.mult; 241 return v * gtod->mult;
170} 242}
171 243
172/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ 244/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
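The rewritten vgetsns() together with the `ns >>= gtod->shift` in its callers is the usual clocksource scaling; condensed into a single helper for clarity (a sketch using the flattened vsyscall_gtod_data field names introduced by this patch):

/* ns = ((cycles - cycle_last) & mask) * mult >> shift */
static inline u64 cycles_to_ns(u64 cycles, u64 cycle_last,
			       u64 mask, u32 mult, u32 shift)
{
	return (((cycles - cycle_last) & mask) * mult) >> shift;
}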
@@ -176,106 +248,102 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
176 u64 ns; 248 u64 ns;
177 int mode; 249 int mode;
178 250
179 ts->tv_nsec = 0;
180 do { 251 do {
181 seq = raw_read_seqcount_begin(&gtod->seq); 252 seq = gtod_read_begin(gtod);
182 mode = gtod->clock.vclock_mode; 253 mode = gtod->vclock_mode;
183 ts->tv_sec = gtod->wall_time_sec; 254 ts->tv_sec = gtod->wall_time_sec;
184 ns = gtod->wall_time_snsec; 255 ns = gtod->wall_time_snsec;
185 ns += vgetsns(&mode); 256 ns += vgetsns(&mode);
186 ns >>= gtod->clock.shift; 257 ns >>= gtod->shift;
187 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 258 } while (unlikely(gtod_read_retry(gtod, seq)));
259
260 ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
261 ts->tv_nsec = ns;
188 262
189 timespec_add_ns(ts, ns);
190 return mode; 263 return mode;
191} 264}
192 265
193notrace static int do_monotonic(struct timespec *ts) 266notrace static int __always_inline do_monotonic(struct timespec *ts)
194{ 267{
195 unsigned long seq; 268 unsigned long seq;
196 u64 ns; 269 u64 ns;
197 int mode; 270 int mode;
198 271
199 ts->tv_nsec = 0;
200 do { 272 do {
201 seq = raw_read_seqcount_begin(&gtod->seq); 273 seq = gtod_read_begin(gtod);
202 mode = gtod->clock.vclock_mode; 274 mode = gtod->vclock_mode;
203 ts->tv_sec = gtod->monotonic_time_sec; 275 ts->tv_sec = gtod->monotonic_time_sec;
204 ns = gtod->monotonic_time_snsec; 276 ns = gtod->monotonic_time_snsec;
205 ns += vgetsns(&mode); 277 ns += vgetsns(&mode);
206 ns >>= gtod->clock.shift; 278 ns >>= gtod->shift;
207 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 279 } while (unlikely(gtod_read_retry(gtod, seq)));
208 timespec_add_ns(ts, ns); 280
281 ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
282 ts->tv_nsec = ns;
209 283
210 return mode; 284 return mode;
211} 285}
212 286
213notrace static int do_realtime_coarse(struct timespec *ts) 287notrace static void do_realtime_coarse(struct timespec *ts)
214{ 288{
215 unsigned long seq; 289 unsigned long seq;
216 do { 290 do {
217 seq = raw_read_seqcount_begin(&gtod->seq); 291 seq = gtod_read_begin(gtod);
218 ts->tv_sec = gtod->wall_time_coarse.tv_sec; 292 ts->tv_sec = gtod->wall_time_coarse_sec;
219 ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; 293 ts->tv_nsec = gtod->wall_time_coarse_nsec;
220 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 294 } while (unlikely(gtod_read_retry(gtod, seq)));
221 return 0;
222} 295}
223 296
224notrace static int do_monotonic_coarse(struct timespec *ts) 297notrace static void do_monotonic_coarse(struct timespec *ts)
225{ 298{
226 unsigned long seq; 299 unsigned long seq;
227 do { 300 do {
228 seq = raw_read_seqcount_begin(&gtod->seq); 301 seq = gtod_read_begin(gtod);
229 ts->tv_sec = gtod->monotonic_time_coarse.tv_sec; 302 ts->tv_sec = gtod->monotonic_time_coarse_sec;
230 ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec; 303 ts->tv_nsec = gtod->monotonic_time_coarse_nsec;
231 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 304 } while (unlikely(gtod_read_retry(gtod, seq)));
232
233 return 0;
234} 305}
235 306
236notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) 307notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
237{ 308{
238 int ret = VCLOCK_NONE;
239
240 switch (clock) { 309 switch (clock) {
241 case CLOCK_REALTIME: 310 case CLOCK_REALTIME:
242 ret = do_realtime(ts); 311 if (do_realtime(ts) == VCLOCK_NONE)
312 goto fallback;
243 break; 313 break;
244 case CLOCK_MONOTONIC: 314 case CLOCK_MONOTONIC:
245 ret = do_monotonic(ts); 315 if (do_monotonic(ts) == VCLOCK_NONE)
316 goto fallback;
246 break; 317 break;
247 case CLOCK_REALTIME_COARSE: 318 case CLOCK_REALTIME_COARSE:
248 return do_realtime_coarse(ts); 319 do_realtime_coarse(ts);
320 break;
249 case CLOCK_MONOTONIC_COARSE: 321 case CLOCK_MONOTONIC_COARSE:
250 return do_monotonic_coarse(ts); 322 do_monotonic_coarse(ts);
323 break;
324 default:
325 goto fallback;
251 } 326 }
252 327
253 if (ret == VCLOCK_NONE)
254 return vdso_fallback_gettime(clock, ts);
255 return 0; 328 return 0;
329fallback:
330 return vdso_fallback_gettime(clock, ts);
256} 331}
257int clock_gettime(clockid_t, struct timespec *) 332int clock_gettime(clockid_t, struct timespec *)
258 __attribute__((weak, alias("__vdso_clock_gettime"))); 333 __attribute__((weak, alias("__vdso_clock_gettime")));
259 334
260notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) 335notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
261{ 336{
262 long ret = VCLOCK_NONE;
263
264 if (likely(tv != NULL)) { 337 if (likely(tv != NULL)) {
265 BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != 338 if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE))
266 offsetof(struct timespec, tv_nsec) || 339 return vdso_fallback_gtod(tv, tz);
267 sizeof(*tv) != sizeof(struct timespec));
268 ret = do_realtime((struct timespec *)tv);
269 tv->tv_usec /= 1000; 340 tv->tv_usec /= 1000;
270 } 341 }
271 if (unlikely(tz != NULL)) { 342 if (unlikely(tz != NULL)) {
272 /* Avoid memcpy. Some old compilers fail to inline it */ 343 tz->tz_minuteswest = gtod->tz_minuteswest;
273 tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest; 344 tz->tz_dsttime = gtod->tz_dsttime;
274 tz->tz_dsttime = gtod->sys_tz.tz_dsttime;
275 } 345 }
276 346
277 if (ret == VCLOCK_NONE)
278 return vdso_fallback_gtod(tv, tz);
279 return 0; 347 return 0;
280} 348}
281int gettimeofday(struct timeval *, struct timezone *) 349int gettimeofday(struct timeval *, struct timezone *)
@@ -287,8 +355,8 @@ int gettimeofday(struct timeval *, struct timezone *)
287 */ 355 */
288notrace time_t __vdso_time(time_t *t) 356notrace time_t __vdso_time(time_t *t)
289{ 357{
290 /* This is atomic on x86_64 so we don't need any locks. */ 358 /* This is atomic on x86 so we don't need any locks. */
291 time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); 359 time_t result = ACCESS_ONCE(gtod->wall_time_sec);
292 360
293 if (t) 361 if (t)
294 *t = result; 362 *t = result;
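Userspace is unaffected by the restructuring: libc keeps resolving these calls through the vDSO wherever one is mapped, so after this patch 32-bit callers stay in user mode as well. A minimal caller, using only standard POSIX APIs (older glibc may need -lrt at link time):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* serviced by __vdso_clock_gettime when the vDSO is mapped */
	if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
		printf("%ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}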
diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S
index 634a2cf62046..2e263f367b13 100644
--- a/arch/x86/vdso/vdso-layout.lds.S
+++ b/arch/x86/vdso/vdso-layout.lds.S
@@ -6,7 +6,25 @@
6 6
7SECTIONS 7SECTIONS
8{ 8{
9 . = VDSO_PRELINK + SIZEOF_HEADERS; 9#ifdef BUILD_VDSO32
10#include <asm/vdso32.h>
11
12 .hpet_sect : {
13 hpet_page = . - VDSO_OFFSET(VDSO_HPET_PAGE);
14 } :text :hpet_sect
15
16 .vvar_sect : {
17 vvar = . - VDSO_OFFSET(VDSO_VVAR_PAGE);
18
19 /* Place all vvars at the offsets in asm/vvar.h. */
20#define EMIT_VVAR(name, offset) vvar_ ## name = vvar + offset;
21#define __VVAR_KERNEL_LDS
22#include <asm/vvar.h>
23#undef __VVAR_KERNEL_LDS
24#undef EMIT_VVAR
25 } :text :vvar_sect
26#endif
27 . = SIZEOF_HEADERS;
10 28
11 .hash : { *(.hash) } :text 29 .hash : { *(.hash) } :text
12 .gnu.hash : { *(.gnu.hash) } 30 .gnu.hash : { *(.gnu.hash) }
@@ -44,6 +62,11 @@ SECTIONS
44 . = ALIGN(0x100); 62 . = ALIGN(0x100);
45 63
46 .text : { *(.text*) } :text =0x90909090 64 .text : { *(.text*) } :text =0x90909090
65
66 /DISCARD/ : {
67 *(.discard)
68 *(.discard.*)
69 }
47} 70}
48 71
49/* 72/*
@@ -61,4 +84,8 @@ PHDRS
61 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ 84 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
62 note PT_NOTE FLAGS(4); /* PF_R */ 85 note PT_NOTE FLAGS(4); /* PF_R */
63 eh_frame_hdr PT_GNU_EH_FRAME; 86 eh_frame_hdr PT_GNU_EH_FRAME;
87#ifdef BUILD_VDSO32
88 vvar_sect PT_NULL FLAGS(4); /* PF_R */
89 hpet_sect PT_NULL FLAGS(4); /* PF_R */
90#endif
64} 91}
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
index 01f5e3b4613c..be3f23b09af5 100644
--- a/arch/x86/vdso/vdso.S
+++ b/arch/x86/vdso/vdso.S
@@ -1,22 +1,3 @@
1#include <asm/page_types.h> 1#include <asm/vdso.h>
2#include <linux/linkage.h>
3#include <linux/init.h>
4 2
5__PAGE_ALIGNED_DATA 3DEFINE_VDSO_IMAGE(vdso, "arch/x86/vdso/vdso.so")
6
7 .globl vdso_start, vdso_end
8 .align PAGE_SIZE
9vdso_start:
10 .incbin "arch/x86/vdso/vdso.so"
11vdso_end:
12 .align PAGE_SIZE /* extra data here leaks to userspace. */
13
14.previous
15
16 .globl vdso_pages
17 .bss
18 .align 8
19 .type vdso_pages, @object
20vdso_pages:
21 .zero (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE * 8
22 .size vdso_pages, .-vdso_pages
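DEFINE_VDSO_IMAGE replaces the hand-rolled image boilerplate deleted above (and the equivalent blocks in vdso32.S and vdsox32.S below). Judging from that boilerplate, the assembler-side macro in <asm/vdso.h> presumably expands to roughly the following; this is a reconstruction from the deleted lines, not the verified header contents:

#define DEFINE_VDSO_IMAGE(symname, filename)				\
__PAGE_ALIGNED_DATA ;							\
	.globl symname##_start, symname##_end ;				\
	.align PAGE_SIZE ;						\
 symname##_start: ;							\
	.incbin filename ;						\
 symname##_end: ;							\
	.align PAGE_SIZE ; /* extra data here leaks to userspace. */	\
.previous ;								\
	.globl symname##_pages ;					\
	.bss ;								\
	.align 8 ;							\
	.type symname##_pages, @object ;				\
 symname##_pages: ;							\
	.zero (symname##_end - symname##_start + PAGE_SIZE - 1) / PAGE_SIZE * 8 ; \
	.size symname##_pages, .-symname##_pages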
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index d6bfb876cfb0..00348980a3a6 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -16,6 +16,7 @@
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/err.h> 17#include <linux/err.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/slab.h>
19 20
20#include <asm/cpufeature.h> 21#include <asm/cpufeature.h>
21#include <asm/msr.h> 22#include <asm/msr.h>
@@ -25,17 +26,14 @@
25#include <asm/tlbflush.h> 26#include <asm/tlbflush.h>
26#include <asm/vdso.h> 27#include <asm/vdso.h>
27#include <asm/proto.h> 28#include <asm/proto.h>
28 29#include <asm/fixmap.h>
29enum { 30#include <asm/hpet.h>
30 VDSO_DISABLED = 0, 31#include <asm/vvar.h>
31 VDSO_ENABLED = 1,
32 VDSO_COMPAT = 2,
33};
34 32
35#ifdef CONFIG_COMPAT_VDSO 33#ifdef CONFIG_COMPAT_VDSO
36#define VDSO_DEFAULT VDSO_COMPAT 34#define VDSO_DEFAULT 0
37#else 35#else
38#define VDSO_DEFAULT VDSO_ENABLED 36#define VDSO_DEFAULT 1
39#endif 37#endif
40 38
41#ifdef CONFIG_X86_64 39#ifdef CONFIG_X86_64
@@ -44,13 +42,6 @@ enum {
44#endif 42#endif
45 43
46/* 44/*
47 * This is the difference between the prelinked addresses in the vDSO images
48 * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
49 * in the user address space.
50 */
51#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
52
53/*
54 * Should the kernel map a VDSO page into processes and pass its 45 * Should the kernel map a VDSO page into processes and pass its
55 * address down to glibc upon exec()? 46 * address down to glibc upon exec()?
56 */ 47 */
@@ -60,6 +51,9 @@ static int __init vdso_setup(char *s)
60{ 51{
61 vdso_enabled = simple_strtoul(s, NULL, 0); 52 vdso_enabled = simple_strtoul(s, NULL, 0);
62 53
54 if (vdso_enabled > 1)
55 pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n");
56
63 return 1; 57 return 1;
64} 58}
65 59
@@ -76,124 +70,8 @@ __setup_param("vdso=", vdso32_setup, vdso_setup, 0);
76EXPORT_SYMBOL_GPL(vdso_enabled); 70EXPORT_SYMBOL_GPL(vdso_enabled);
77#endif 71#endif
78 72
79static __init void reloc_symtab(Elf32_Ehdr *ehdr, 73static struct page **vdso32_pages;
80 unsigned offset, unsigned size) 74static unsigned vdso32_size;
81{
82 Elf32_Sym *sym = (void *)ehdr + offset;
83 unsigned nsym = size / sizeof(*sym);
84 unsigned i;
85
86 for(i = 0; i < nsym; i++, sym++) {
87 if (sym->st_shndx == SHN_UNDEF ||
88 sym->st_shndx == SHN_ABS)
89 continue; /* skip */
90
91 if (sym->st_shndx > SHN_LORESERVE) {
92 printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
93 sym->st_shndx);
94 continue;
95 }
96
97 switch(ELF_ST_TYPE(sym->st_info)) {
98 case STT_OBJECT:
99 case STT_FUNC:
100 case STT_SECTION:
101 case STT_FILE:
102 sym->st_value += VDSO_ADDR_ADJUST;
103 }
104 }
105}
106
107static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
108{
109 Elf32_Dyn *dyn = (void *)ehdr + offset;
110
111 for(; dyn->d_tag != DT_NULL; dyn++)
112 switch(dyn->d_tag) {
113 case DT_PLTGOT:
114 case DT_HASH:
115 case DT_STRTAB:
116 case DT_SYMTAB:
117 case DT_RELA:
118 case DT_INIT:
119 case DT_FINI:
120 case DT_REL:
121 case DT_DEBUG:
122 case DT_JMPREL:
123 case DT_VERSYM:
124 case DT_VERDEF:
125 case DT_VERNEED:
126 case DT_ADDRRNGLO ... DT_ADDRRNGHI:
127 /* definitely pointers needing relocation */
128 dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
129 break;
130
131 case DT_ENCODING ... OLD_DT_LOOS-1:
132 case DT_LOOS ... DT_HIOS-1:
133 /* Tags above DT_ENCODING are pointers if
134 they're even */
135 if (dyn->d_tag >= DT_ENCODING &&
136 (dyn->d_tag & 1) == 0)
137 dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
138 break;
139
140 case DT_VERDEFNUM:
141 case DT_VERNEEDNUM:
142 case DT_FLAGS_1:
143 case DT_RELACOUNT:
144 case DT_RELCOUNT:
145 case DT_VALRNGLO ... DT_VALRNGHI:
146 /* definitely not pointers */
147 break;
148
149 case OLD_DT_LOOS ... DT_LOOS-1:
150 case DT_HIOS ... DT_VALRNGLO-1:
151 default:
152 if (dyn->d_tag > DT_ENCODING)
153 printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
154 dyn->d_tag);
155 break;
156 }
157}
158
159static __init void relocate_vdso(Elf32_Ehdr *ehdr)
160{
161 Elf32_Phdr *phdr;
162 Elf32_Shdr *shdr;
163 int i;
164
165 BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
166 !elf_check_arch_ia32(ehdr) ||
167 ehdr->e_type != ET_DYN);
168
169 ehdr->e_entry += VDSO_ADDR_ADJUST;
170
171 /* rebase phdrs */
172 phdr = (void *)ehdr + ehdr->e_phoff;
173 for (i = 0; i < ehdr->e_phnum; i++) {
174 phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
175
176 /* relocate dynamic stuff */
177 if (phdr[i].p_type == PT_DYNAMIC)
178 reloc_dyn(ehdr, phdr[i].p_offset);
179 }
180
181 /* rebase sections */
182 shdr = (void *)ehdr + ehdr->e_shoff;
183 for(i = 0; i < ehdr->e_shnum; i++) {
184 if (!(shdr[i].sh_flags & SHF_ALLOC))
185 continue;
186
187 shdr[i].sh_addr += VDSO_ADDR_ADJUST;
188
189 if (shdr[i].sh_type == SHT_SYMTAB ||
190 shdr[i].sh_type == SHT_DYNSYM)
191 reloc_symtab(ehdr, shdr[i].sh_offset,
192 shdr[i].sh_size);
193 }
194}
195
196static struct page *vdso32_pages[1];
197 75
198#ifdef CONFIG_X86_64 76#ifdef CONFIG_X86_64
199 77
@@ -212,12 +90,6 @@ void syscall32_cpu_init(void)
212 wrmsrl(MSR_CSTAR, ia32_cstar_target); 90 wrmsrl(MSR_CSTAR, ia32_cstar_target);
213} 91}
214 92
215#define compat_uses_vma 1
216
217static inline void map_compat_vdso(int map)
218{
219}
220
221#else /* CONFIG_X86_32 */ 93#else /* CONFIG_X86_32 */
222 94
223#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) 95#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
@@ -241,64 +113,36 @@ void enable_sep_cpu(void)
241 put_cpu(); 113 put_cpu();
242} 114}
243 115
244static struct vm_area_struct gate_vma;
245
246static int __init gate_vma_init(void)
247{
248 gate_vma.vm_mm = NULL;
249 gate_vma.vm_start = FIXADDR_USER_START;
250 gate_vma.vm_end = FIXADDR_USER_END;
251 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
252 gate_vma.vm_page_prot = __P101;
253
254 return 0;
255}
256
257#define compat_uses_vma 0
258
259static void map_compat_vdso(int map)
260{
261 static int vdso_mapped;
262
263 if (map == vdso_mapped)
264 return;
265
266 vdso_mapped = map;
267
268 __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
269 map ? PAGE_READONLY_EXEC : PAGE_NONE);
270
271 /* flush stray tlbs */
272 flush_tlb_all();
273}
274
275#endif /* CONFIG_X86_64 */ 116#endif /* CONFIG_X86_64 */
276 117
277int __init sysenter_setup(void) 118int __init sysenter_setup(void)
278{ 119{
279 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); 120 char *vdso32_start, *vdso32_end;
280 const void *vsyscall; 121 int npages, i;
281 size_t vsyscall_len;
282
283 vdso32_pages[0] = virt_to_page(syscall_page);
284
285#ifdef CONFIG_X86_32
286 gate_vma_init();
287#endif
288 122
123#ifdef CONFIG_COMPAT
289 if (vdso32_syscall()) { 124 if (vdso32_syscall()) {
290 vsyscall = &vdso32_syscall_start; 125 vdso32_start = vdso32_syscall_start;
291 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start; 126 vdso32_end = vdso32_syscall_end;
292 } else if (vdso32_sysenter()){ 127 vdso32_pages = vdso32_syscall_pages;
293 vsyscall = &vdso32_sysenter_start; 128 } else
294 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; 129#endif
130 if (vdso32_sysenter()) {
131 vdso32_start = vdso32_sysenter_start;
132 vdso32_end = vdso32_sysenter_end;
133 vdso32_pages = vdso32_sysenter_pages;
295 } else { 134 } else {
296 vsyscall = &vdso32_int80_start; 135 vdso32_start = vdso32_int80_start;
297 vsyscall_len = &vdso32_int80_end - &vdso32_int80_start; 136 vdso32_end = vdso32_int80_end;
137 vdso32_pages = vdso32_int80_pages;
298 } 138 }
299 139
300 memcpy(syscall_page, vsyscall, vsyscall_len); 140 npages = ((vdso32_end - vdso32_start) + PAGE_SIZE - 1) / PAGE_SIZE;
301 relocate_vdso(syscall_page); 141 vdso32_size = npages << PAGE_SHIFT;
142 for (i = 0; i < npages; i++)
143 vdso32_pages[i] = virt_to_page(vdso32_start + i*PAGE_SIZE);
144
145 patch_vdso32(vdso32_start, vdso32_size);
302 146
303 return 0; 147 return 0;
304} 148}
@@ -309,48 +153,73 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
309 struct mm_struct *mm = current->mm; 153 struct mm_struct *mm = current->mm;
310 unsigned long addr; 154 unsigned long addr;
311 int ret = 0; 155 int ret = 0;
312 bool compat; 156 struct vm_area_struct *vma;
313 157
314#ifdef CONFIG_X86_X32_ABI 158#ifdef CONFIG_X86_X32_ABI
315 if (test_thread_flag(TIF_X32)) 159 if (test_thread_flag(TIF_X32))
316 return x32_setup_additional_pages(bprm, uses_interp); 160 return x32_setup_additional_pages(bprm, uses_interp);
317#endif 161#endif
318 162
319 if (vdso_enabled == VDSO_DISABLED) 163 if (vdso_enabled != 1) /* Other values all mean "disabled" */
320 return 0; 164 return 0;
321 165
322 down_write(&mm->mmap_sem); 166 down_write(&mm->mmap_sem);
323 167
324 /* Test compat mode once here, in case someone 168 addr = get_unmapped_area(NULL, 0, vdso32_size + VDSO_OFFSET(VDSO_PREV_PAGES), 0, 0);
325 changes it via sysctl */ 169 if (IS_ERR_VALUE(addr)) {
326 compat = (vdso_enabled == VDSO_COMPAT); 170 ret = addr;
171 goto up_fail;
172 }
173
174 addr += VDSO_OFFSET(VDSO_PREV_PAGES);
327 175
328 map_compat_vdso(compat); 176 current->mm->context.vdso = (void *)addr;
329 177
330 if (compat) 178 /*
331 addr = VDSO_HIGH_BASE; 179 * MAYWRITE to allow gdb to COW and set breakpoints
332 else { 180 */
333 addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); 181 ret = install_special_mapping(mm,
334 if (IS_ERR_VALUE(addr)) { 182 addr,
335 ret = addr; 183 vdso32_size,
336 goto up_fail; 184 VM_READ|VM_EXEC|
337 } 185 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
186 vdso32_pages);
187
188 if (ret)
189 goto up_fail;
190
191 vma = _install_special_mapping(mm,
192 addr - VDSO_OFFSET(VDSO_PREV_PAGES),
193 VDSO_OFFSET(VDSO_PREV_PAGES),
194 VM_READ,
195 NULL);
196
197 if (IS_ERR(vma)) {
198 ret = PTR_ERR(vma);
199 goto up_fail;
338 } 200 }
339 201
340 current->mm->context.vdso = (void *)addr; 202 ret = remap_pfn_range(vma,
203 addr - VDSO_OFFSET(VDSO_VVAR_PAGE),
204 __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
205 PAGE_SIZE,
206 PAGE_READONLY);
207
208 if (ret)
209 goto up_fail;
341 210
342 if (compat_uses_vma || !compat) { 211#ifdef CONFIG_HPET_TIMER
343 /* 212 if (hpet_address) {
344 * MAYWRITE to allow gdb to COW and set breakpoints 213 ret = io_remap_pfn_range(vma,
345 */ 214 addr - VDSO_OFFSET(VDSO_HPET_PAGE),
346 ret = install_special_mapping(mm, addr, PAGE_SIZE, 215 hpet_address >> PAGE_SHIFT,
347 VM_READ|VM_EXEC| 216 PAGE_SIZE,
348 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 217 pgprot_noncached(PAGE_READONLY));
349 vdso32_pages);
350 218
351 if (ret) 219 if (ret)
352 goto up_fail; 220 goto up_fail;
353 } 221 }
222#endif
354 223
355 current_thread_info()->sysenter_return = 224 current_thread_info()->sysenter_return =
356 VDSO32_SYMBOL(addr, SYSENTER_RETURN); 225 VDSO32_SYMBOL(addr, SYSENTER_RETURN);
@@ -411,20 +280,12 @@ const char *arch_vma_name(struct vm_area_struct *vma)
411 280
412struct vm_area_struct *get_gate_vma(struct mm_struct *mm) 281struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
413{ 282{
414 /*
415 * Check to see if the corresponding task was created in compat vdso
416 * mode.
417 */
418 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
419 return &gate_vma;
420 return NULL; 283 return NULL;
421} 284}
422 285
423int in_gate_area(struct mm_struct *mm, unsigned long addr) 286int in_gate_area(struct mm_struct *mm, unsigned long addr)
424{ 287{
425 const struct vm_area_struct *vma = get_gate_vma(mm); 288 return 0;
426
427 return vma && addr >= vma->vm_start && addr < vma->vm_end;
428} 289}
429 290
430int in_gate_area_no_mm(unsigned long addr) 291int in_gate_area_no_mm(unsigned long addr)
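The net effect of the rewritten arch_setup_additional_pages() is a multi-page vDSO image with VDSO_PREV_PAGES of read-only vvar/HPET mappings installed just below it. One quick way to inspect the result from a running process (standard /proc interface; the extra pages typically appear as an adjacent unnamed read-only mapping rather than under their own label):

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/self/maps", "r");
	char line[512];

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "vdso"))	/* prints the [vdso] line */
			fputs(line, stdout);
	fclose(f);
	return 0;
}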
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
index 2ce5f82c333b..018bcd9f97b4 100644
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -1,22 +1,9 @@
1#include <linux/init.h> 1#include <asm/vdso.h>
2 2
3__INITDATA 3DEFINE_VDSO_IMAGE(vdso32_int80, "arch/x86/vdso/vdso32-int80.so")
4 4
5 .globl vdso32_int80_start, vdso32_int80_end
6vdso32_int80_start:
7 .incbin "arch/x86/vdso/vdso32-int80.so"
8vdso32_int80_end:
9
10 .globl vdso32_syscall_start, vdso32_syscall_end
11vdso32_syscall_start:
12#ifdef CONFIG_COMPAT 5#ifdef CONFIG_COMPAT
13 .incbin "arch/x86/vdso/vdso32-syscall.so" 6DEFINE_VDSO_IMAGE(vdso32_syscall, "arch/x86/vdso/vdso32-syscall.so")
14#endif 7#endif
15vdso32_syscall_end:
16
17 .globl vdso32_sysenter_start, vdso32_sysenter_end
18vdso32_sysenter_start:
19 .incbin "arch/x86/vdso/vdso32-sysenter.so"
20vdso32_sysenter_end:
21 8
22__FINIT 9DEFINE_VDSO_IMAGE(vdso32_sysenter, "arch/x86/vdso/vdso32-sysenter.so")
diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
new file mode 100644
index 000000000000..175cc72c0f68
--- /dev/null
+++ b/arch/x86/vdso/vdso32/vclock_gettime.c
@@ -0,0 +1,30 @@
1#define BUILD_VDSO32
2
3#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE
4#undef CONFIG_OPTIMIZE_INLINING
5#endif
6
7#undef CONFIG_X86_PPRO_FENCE
8
9#ifdef CONFIG_X86_64
10
11/*
12 * In the case of a 32-bit vDSO for a 64-bit kernel, fake a 32-bit kernel
13 * configuration.
14 */
15#undef CONFIG_64BIT
16#undef CONFIG_X86_64
17#undef CONFIG_ILLEGAL_POINTER_VALUE
18#undef CONFIG_SPARSEMEM_VMEMMAP
19#undef CONFIG_NR_CPUS
20
21#define CONFIG_X86_32 1
22#define CONFIG_PAGE_OFFSET 0
23#define CONFIG_ILLEGAL_POINTER_VALUE 0
24#define CONFIG_NR_CPUS 1
25
26#define BUILD_VDSO32_64
27
28#endif
29
30#include "../vclock_gettime.c"
diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S
index 976124bb5f92..aadb8b9994cd 100644
--- a/arch/x86/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/vdso/vdso32/vdso32.lds.S
@@ -8,7 +8,11 @@
8 * values visible using the asm-x86/vdso.h macros from the kernel proper. 8 * values visible using the asm-x86/vdso.h macros from the kernel proper.
9 */ 9 */
10 10
11#include <asm/page.h>
12
13#define BUILD_VDSO32
11#define VDSO_PRELINK 0 14#define VDSO_PRELINK 0
15
12#include "../vdso-layout.lds.S" 16#include "../vdso-layout.lds.S"
13 17
14/* The ELF entry point can be used to set the AT_SYSINFO value. */ 18/* The ELF entry point can be used to set the AT_SYSINFO value. */
@@ -19,6 +23,13 @@ ENTRY(__kernel_vsyscall);
19 */ 23 */
20VERSION 24VERSION
21{ 25{
26 LINUX_2.6 {
27 global:
28 __vdso_clock_gettime;
29 __vdso_gettimeofday;
30 __vdso_time;
31 };
32
22 LINUX_2.5 { 33 LINUX_2.5 {
23 global: 34 global:
24 __kernel_vsyscall; 35 __kernel_vsyscall;
@@ -31,7 +42,9 @@ VERSION
31/* 42/*
32 * Symbols we define here called VDSO* get their values into vdso32-syms.h. 43 * Symbols we define here called VDSO* get their values into vdso32-syms.h.
33 */ 44 */
34VDSO32_PRELINK = VDSO_PRELINK;
35VDSO32_vsyscall = __kernel_vsyscall; 45VDSO32_vsyscall = __kernel_vsyscall;
36VDSO32_sigreturn = __kernel_sigreturn; 46VDSO32_sigreturn = __kernel_sigreturn;
37VDSO32_rt_sigreturn = __kernel_rt_sigreturn; 47VDSO32_rt_sigreturn = __kernel_rt_sigreturn;
48VDSO32_clock_gettime = clock_gettime;
49VDSO32_gettimeofday = gettimeofday;
50VDSO32_time = time;
diff --git a/arch/x86/vdso/vdsox32.S b/arch/x86/vdso/vdsox32.S
index d6b9a7f42a8a..f4aa34e7f370 100644
--- a/arch/x86/vdso/vdsox32.S
+++ b/arch/x86/vdso/vdsox32.S
@@ -1,22 +1,3 @@
1#include <asm/page_types.h> 1#include <asm/vdso.h>
2#include <linux/linkage.h>
3#include <linux/init.h>
4 2
5__PAGE_ALIGNED_DATA 3DEFINE_VDSO_IMAGE(vdsox32, "arch/x86/vdso/vdsox32.so")
6
7 .globl vdsox32_start, vdsox32_end
8 .align PAGE_SIZE
9vdsox32_start:
10 .incbin "arch/x86/vdso/vdsox32.so"
11vdsox32_end:
12 .align PAGE_SIZE /* extra data here leaks to userspace. */
13
14.previous
15
16 .globl vdsox32_pages
17 .bss
18 .align 8
19 .type vdsox32_pages, @object
20vdsox32_pages:
21 .zero (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE * 8
22 .size vdsox32_pages, .-vdsox32_pages
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 431e87544411..1ad102613127 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -16,20 +16,22 @@
16#include <asm/vdso.h> 16#include <asm/vdso.h>
17#include <asm/page.h> 17#include <asm/page.h>
18 18
19#if defined(CONFIG_X86_64)
19unsigned int __read_mostly vdso_enabled = 1; 20unsigned int __read_mostly vdso_enabled = 1;
20 21
21extern char vdso_start[], vdso_end[]; 22DECLARE_VDSO_IMAGE(vdso);
22extern unsigned short vdso_sync_cpuid; 23extern unsigned short vdso_sync_cpuid;
23
24extern struct page *vdso_pages[];
25static unsigned vdso_size; 24static unsigned vdso_size;
26 25
27#ifdef CONFIG_X86_X32_ABI 26#ifdef CONFIG_X86_X32_ABI
28extern char vdsox32_start[], vdsox32_end[]; 27DECLARE_VDSO_IMAGE(vdsox32);
29extern struct page *vdsox32_pages[];
30static unsigned vdsox32_size; 28static unsigned vdsox32_size;
29#endif
30#endif
31 31
32static void __init patch_vdsox32(void *vdso, size_t len) 32#if defined(CONFIG_X86_32) || defined(CONFIG_X86_X32_ABI) || \
33 defined(CONFIG_COMPAT)
34void __init patch_vdso32(void *vdso, size_t len)
33{ 35{
34 Elf32_Ehdr *hdr = vdso; 36 Elf32_Ehdr *hdr = vdso;
35 Elf32_Shdr *sechdrs, *alt_sec = 0; 37 Elf32_Shdr *sechdrs, *alt_sec = 0;
@@ -52,7 +54,7 @@ static void __init patch_vdsox32(void *vdso, size_t len)
52 } 54 }
53 55
54 /* If we get here, it's probably a bug. */ 56 /* If we get here, it's probably a bug. */
55 pr_warning("patch_vdsox32: .altinstructions not found\n"); 57 pr_warning("patch_vdso32: .altinstructions not found\n");
56 return; /* nothing to patch */ 58 return; /* nothing to patch */
57 59
58found: 60found:
@@ -61,6 +63,7 @@ found:
61} 63}
62#endif 64#endif
63 65
66#if defined(CONFIG_X86_64)
64static void __init patch_vdso64(void *vdso, size_t len) 67static void __init patch_vdso64(void *vdso, size_t len)
65{ 68{
66 Elf64_Ehdr *hdr = vdso; 69 Elf64_Ehdr *hdr = vdso;
@@ -104,7 +107,7 @@ static int __init init_vdso(void)
104 vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE); 107 vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE);
105 108
106#ifdef CONFIG_X86_X32_ABI 109#ifdef CONFIG_X86_X32_ABI
107 patch_vdsox32(vdsox32_start, vdsox32_end - vdsox32_start); 110 patch_vdso32(vdsox32_start, vdsox32_end - vdsox32_start);
108 npages = (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE; 111 npages = (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE;
109 vdsox32_size = npages << PAGE_SHIFT; 112 vdsox32_size = npages << PAGE_SHIFT;
110 for (i = 0; i < npages; i++) 113 for (i = 0; i < npages; i++)
@@ -204,3 +207,4 @@ static __init int vdso_setup(char *s)
204 return 0; 207 return 0;
205} 208}
206__setup("vdso=", vdso_setup); 209__setup("vdso=", vdso_setup);
210#endif
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 1a3c76505649..e88fda867a33 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -7,7 +7,7 @@ config XEN
7 depends on PARAVIRT 7 depends on PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 select XEN_HAVE_PVMMU 9 select XEN_HAVE_PVMMU
10 depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS) 10 depends on X86_64 || (X86_32 && X86_PAE)
11 depends on X86_TSC 11 depends on X86_TSC
12 help 12 help
13 This is the Linux Xen port. Enabling this will allow the 13 This is the Linux Xen port. Enabling this will allow the
@@ -19,11 +19,6 @@ config XEN_DOM0
19 depends on XEN && PCI_XEN && SWIOTLB_XEN 19 depends on XEN && PCI_XEN && SWIOTLB_XEN
20 depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI 20 depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
21 21
22# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
23# name in tools.
24config XEN_PRIVILEGED_GUEST
25 def_bool XEN_DOM0
26
27config XEN_PVHVM 22config XEN_PVHVM
28 def_bool y 23 def_bool y
29 depends on XEN && PCI && X86_LOCAL_APIC 24 depends on XEN && PCI && X86_LOCAL_APIC
@@ -51,3 +46,7 @@ config XEN_DEBUG_FS
51 Enable statistics output and various tuning options in debugfs. 46 Enable statistics output and various tuning options in debugfs.
52 Enabling this option may incur a significant performance overhead. 47 Enabling this option may incur a significant performance overhead.
53 48
49config XEN_PVH
50 bool "Support for running as a PVH guest"
51 depends on X86_64 && XEN && XEN_PVHVM
52 def_bool n
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index fa6ade76ef3f..201d09a7c46b 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -262,8 +262,9 @@ static void __init xen_banner(void)
262 struct xen_extraversion extra; 262 struct xen_extraversion extra;
263 HYPERVISOR_xen_version(XENVER_extraversion, &extra); 263 HYPERVISOR_xen_version(XENVER_extraversion, &extra);
264 264
265 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 265 pr_info("Booting paravirtualized kernel %son %s\n",
266 pv_info.name); 266 xen_feature(XENFEAT_auto_translated_physmap) ?
267 "with PVH extensions " : "", pv_info.name);
267 printk(KERN_INFO "Xen version: %d.%d%s%s\n", 268 printk(KERN_INFO "Xen version: %d.%d%s%s\n",
268 version >> 16, version & 0xffff, extra.extraversion, 269 version >> 16, version & 0xffff, extra.extraversion,
269 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); 270 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
@@ -433,7 +434,7 @@ static void __init xen_init_cpuid_mask(void)
433 434
434 ax = 1; 435 ax = 1;
435 cx = 0; 436 cx = 0;
436 xen_cpuid(&ax, &bx, &cx, &dx); 437 cpuid(1, &ax, &bx, &cx, &dx);
437 438
438 xsave_mask = 439 xsave_mask =
439 (1 << (X86_FEATURE_XSAVE % 32)) | 440 (1 << (X86_FEATURE_XSAVE % 32)) |
@@ -1142,8 +1143,9 @@ void xen_setup_vcpu_info_placement(void)
1142 xen_vcpu_setup(cpu); 1143 xen_vcpu_setup(cpu);
1143 1144
1144 /* xen_vcpu_setup managed to place the vcpu_info within the 1145 /* xen_vcpu_setup managed to place the vcpu_info within the
1145 percpu area for all cpus, so make use of it */ 1146 * percpu area for all cpus, so make use of it. Note that for
1146 if (have_vcpu_info_placement) { 1147 * PVH we want to use the native IRQ mechanism. */
1148 if (have_vcpu_info_placement && !xen_pvh_domain()) {
1147 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); 1149 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
1148 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); 1150 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
1149 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); 1151 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -1407,9 +1409,49 @@ static void __init xen_boot_params_init_edd(void)
1407 * Set up the GDT and segment registers for -fstack-protector. Until 1409 * Set up the GDT and segment registers for -fstack-protector. Until
1408 * we do this, we have to be careful not to call any stack-protected 1410 * we do this, we have to be careful not to call any stack-protected
1409 * function, which is most of the kernel. 1411 * function, which is most of the kernel.
1412 *
1413 * Note that it is __ref because the only caller of this after init
1414 * is PVH which is not going to use xen_load_gdt_boot or other
1415 * __init functions.
1410 */ 1416 */
1411static void __init xen_setup_stackprotector(void) 1417static void __ref xen_setup_gdt(int cpu)
1412{ 1418{
1419 if (xen_feature(XENFEAT_auto_translated_physmap)) {
1420#ifdef CONFIG_X86_64
1421 unsigned long dummy;
1422
1423 load_percpu_segment(cpu); /* We need to access per-cpu area */
1424 switch_to_new_gdt(cpu); /* GDT and GS set */
1425
1426 /* We are switching from the Xen-provided GDT to our HVM mode
1427 * GDT. The new GDT has __KERNEL_CS with CS.L = 1
1428 * and we are jumping to reload it.
1429 */
1430 asm volatile ("pushq %0\n"
1431 "leaq 1f(%%rip),%0\n"
1432 "pushq %0\n"
1433 "lretq\n"
1434 "1:\n"
1435 : "=&r" (dummy) : "0" (__KERNEL_CS));
1436
1437 /*
1438 * While not needed, we also set the %es, %ds, and %fs
1439 * to zero. We don't care about %ss as it is NULL.
1440 * Strictly speaking this is not needed as Xen zeros those
1441 * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE)
1442 *
1443 * Linux zeros them in cpu_init() and in secondary_startup_64
1444 * (for BSP).
1445 */
1446 loadsegment(es, 0);
1447 loadsegment(ds, 0);
1448 loadsegment(fs, 0);
1449#else
1450 /* PVH: TODO Implement. */
1451 BUG();
1452#endif
1453 return; /* PVH does not need any PV GDT ops. */
1454 }
1413 pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; 1455 pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
1414 pv_cpu_ops.load_gdt = xen_load_gdt_boot; 1456 pv_cpu_ops.load_gdt = xen_load_gdt_boot;
1415 1457
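The pushq/lretq sequence in xen_setup_gdt() above is the conventional way to reload %cs on x86-64, where CS cannot be written with a plain mov: the far return pops the new RIP and then the new CS in one instruction. Schematically, with the constants spelled out:

	pushq	$__KERNEL_CS		/* new CS, consumed second */
	leaq	1f(%rip), %rax		/* new RIP, consumed first */
	pushq	%rax
	lretq				/* pops RIP, then CS */
1: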
@@ -1420,6 +1462,58 @@ static void __init xen_setup_stackprotector(void)
1420 pv_cpu_ops.load_gdt = xen_load_gdt; 1462 pv_cpu_ops.load_gdt = xen_load_gdt;
1421} 1463}
1422 1464
1465/*
1466 * A PV guest starts with default flags that are not set for PVH, set them
1467 * here asap.
1468 */
1469static void xen_pvh_set_cr_flags(int cpu)
1470{
1471
1472 /* Some of these are set up in 'secondary_startup_64'. The others:
1473 * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests
1474 * (whose codepaths PVH shares), while X86_CR0_PG is for PVH. */
1475 write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM);
1476
1477 if (!cpu)
1478 return;
1479 /*
1480 * For BSP, PSE PGE are set in probe_page_size_mask(), for APs
1481 * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
1482 */
1483 if (cpu_has_pse)
1484 set_in_cr4(X86_CR4_PSE);
1485
1486 if (cpu_has_pge)
1487 set_in_cr4(X86_CR4_PGE);
1488}
1489
1490/*
1491 * Note that it is __ref because the only caller of this after init
1492 * is PVH which is not going to use xen_load_gdt_boot or other
1493 * __init functions.
1494 */
1495void __ref xen_pvh_secondary_vcpu_init(int cpu)
1496{
1497 xen_setup_gdt(cpu);
1498 xen_pvh_set_cr_flags(cpu);
1499}
1500
1501static void __init xen_pvh_early_guest_init(void)
1502{
1503 if (!xen_feature(XENFEAT_auto_translated_physmap))
1504 return;
1505
1506 if (!xen_feature(XENFEAT_hvm_callback_vector))
1507 return;
1508
1509 xen_have_vector_callback = 1;
1510 xen_pvh_set_cr_flags(0);
1511
1512#ifdef CONFIG_X86_32
1513 BUG(); /* PVH: Implement proper support. */
1514#endif
1515}
1516
1423/* First C function to be called on Xen boot */ 1517/* First C function to be called on Xen boot */
1424asmlinkage void __init xen_start_kernel(void) 1518asmlinkage void __init xen_start_kernel(void)
1425{ 1519{
@@ -1431,13 +1525,16 @@ asmlinkage void __init xen_start_kernel(void)
1431 1525
1432 xen_domain_type = XEN_PV_DOMAIN; 1526 xen_domain_type = XEN_PV_DOMAIN;
1433 1527
1528 xen_setup_features();
1529 xen_pvh_early_guest_init();
1434 xen_setup_machphys_mapping(); 1530 xen_setup_machphys_mapping();
1435 1531
1436 /* Install Xen paravirt ops */ 1532 /* Install Xen paravirt ops */
1437 pv_info = xen_info; 1533 pv_info = xen_info;
1438 pv_init_ops = xen_init_ops; 1534 pv_init_ops = xen_init_ops;
1439 pv_cpu_ops = xen_cpu_ops;
1440 pv_apic_ops = xen_apic_ops; 1535 pv_apic_ops = xen_apic_ops;
1536 if (!xen_pvh_domain())
1537 pv_cpu_ops = xen_cpu_ops;
1441 1538
1442 x86_init.resources.memory_setup = xen_memory_setup; 1539 x86_init.resources.memory_setup = xen_memory_setup;
1443 x86_init.oem.arch_setup = xen_arch_setup; 1540 x86_init.oem.arch_setup = xen_arch_setup;
@@ -1469,17 +1566,14 @@ asmlinkage void __init xen_start_kernel(void)
1469 /* Work out if we support NX */ 1566 /* Work out if we support NX */
1470 x86_configure_nx(); 1567 x86_configure_nx();
1471 1568
1472 xen_setup_features();
1473
1474 /* Get mfn list */ 1569 /* Get mfn list */
1475 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1570 xen_build_dynamic_phys_to_machine();
1476 xen_build_dynamic_phys_to_machine();
1477 1571
1478 /* 1572 /*
1479 * Set up kernel GDT and segment registers, mainly so that 1573 * Set up kernel GDT and segment registers, mainly so that
1480 * -fstack-protector code can be executed. 1574 * -fstack-protector code can be executed.
1481 */ 1575 */
1482 xen_setup_stackprotector(); 1576 xen_setup_gdt(0);
1483 1577
1484 xen_init_irq_ops(); 1578 xen_init_irq_ops();
1485 xen_init_cpuid_mask(); 1579 xen_init_cpuid_mask();
@@ -1548,14 +1642,18 @@ asmlinkage void __init xen_start_kernel(void)
1548 /* set the limit of our address space */ 1642 /* set the limit of our address space */
1549 xen_reserve_top(); 1643 xen_reserve_top();
1550 1644
1551 /* We used to do this in xen_arch_setup, but that is too late on AMD 1645 /* PVH: runs at default kernel iopl of 0 */
1552 * where early_cpu_init (run before ->arch_setup()) calls early_amd_init 1646 if (!xen_pvh_domain()) {
1553 * which pokes the 0xcf8 port. 1647 /*
1554 */ 1648 * We used to do this in xen_arch_setup, but that is too late
1555 set_iopl.iopl = 1; 1649 * on AMD where early_cpu_init (run before ->arch_setup()) calls
1556 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 1650 * early_amd_init which pokes the 0xcf8 port.
1557 if (rc != 0) 1651 */
1558 xen_raw_printk("physdev_op failed %d\n", rc); 1652 set_iopl.iopl = 1;
1653 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1654 if (rc != 0)
1655 xen_raw_printk("physdev_op failed %d\n", rc);
1656 }
1559 1657
1560#ifdef CONFIG_X86_32 1658#ifdef CONFIG_X86_32
1561 /* set up basic CPUID stuff */ 1659 /* set up basic CPUID stuff */
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 3a5f55d51907..c98583588580 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -125,3 +125,67 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
125 apply_to_page_range(&init_mm, (unsigned long)shared, 125 apply_to_page_range(&init_mm, (unsigned long)shared,
126 PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); 126 PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
127} 127}
128#ifdef CONFIG_XEN_PVH
129#include <xen/balloon.h>
130#include <xen/events.h>
131#include <xen/xen.h>
132#include <linux/slab.h>
133static int __init xlated_setup_gnttab_pages(void)
134{
135 struct page **pages;
136 xen_pfn_t *pfns;
137 int rc;
138 unsigned int i;
139 unsigned long nr_grant_frames = gnttab_max_grant_frames();
140
141 BUG_ON(nr_grant_frames == 0);
142 pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL);
143 if (!pages)
144 return -ENOMEM;
145
146 pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL);
147 if (!pfns) {
148 kfree(pages);
149 return -ENOMEM;
150 }
151 rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */);
152 if (rc) {
153 pr_warn("%s Couldn't balloon alloc %lu pfns rc:%d\n", __func__,
154 nr_grant_frames, rc);
155 kfree(pages);
156 kfree(pfns);
157 return rc;
158 }
159 for (i = 0; i < nr_grant_frames; i++)
160 pfns[i] = page_to_pfn(pages[i]);
161
162 rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames,
163 &xen_auto_xlat_grant_frames.vaddr);
164
165 if (rc) {
166 pr_warn("%s Couldn't map %lu pfns rc:%d\n", __func__,
167 nr_grant_frames, rc);
168 free_xenballooned_pages(nr_grant_frames, pages);
169 kfree(pages);
170 kfree(pfns);
171 return rc;
172 }
173 kfree(pages);
174
175 xen_auto_xlat_grant_frames.pfn = pfns;
176 xen_auto_xlat_grant_frames.count = nr_grant_frames;
177
178 return 0;
179}
180
181static int __init xen_pvh_gnttab_setup(void)
182{
183 if (!xen_pvh_domain())
184 return -ENODEV;
185
186 return xlated_setup_gnttab_pages();
187}
188/* Call it _before_ __gnttab_init as we need to initialize the
189 * xen_auto_xlat_grant_frames first. */
190core_initcall(xen_pvh_gnttab_setup);
191#endif
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 0da7f863056f..08f763de26fe 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -5,6 +5,7 @@
5#include <xen/interface/xen.h> 5#include <xen/interface/xen.h>
6#include <xen/interface/sched.h> 6#include <xen/interface/sched.h>
7#include <xen/interface/vcpu.h> 7#include <xen/interface/vcpu.h>
8#include <xen/features.h>
8#include <xen/events.h> 9#include <xen/events.h>
9 10
10#include <asm/xen/hypercall.h> 11#include <asm/xen/hypercall.h>
@@ -22,7 +23,7 @@ void xen_force_evtchn_callback(void)
22 (void)HYPERVISOR_xen_version(0, NULL); 23 (void)HYPERVISOR_xen_version(0, NULL);
23} 24}
24 25
25static unsigned long xen_save_fl(void) 26asmlinkage unsigned long xen_save_fl(void)
26{ 27{
27 struct vcpu_info *vcpu; 28 struct vcpu_info *vcpu;
28 unsigned long flags; 29 unsigned long flags;
@@ -40,7 +41,7 @@ static unsigned long xen_save_fl(void)
40} 41}
41PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); 42PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
42 43
43static void xen_restore_fl(unsigned long flags) 44__visible void xen_restore_fl(unsigned long flags)
44{ 45{
45 struct vcpu_info *vcpu; 46 struct vcpu_info *vcpu;
46 47
@@ -62,7 +63,7 @@ static void xen_restore_fl(unsigned long flags)
62} 63}
63PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); 64PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
64 65
65static void xen_irq_disable(void) 66asmlinkage void xen_irq_disable(void)
66{ 67{
67 /* There's a one instruction preempt window here. We need to 68 /* There's a one instruction preempt window here. We need to
68 make sure we don't switch CPUs between getting the vcpu 69
@@ -73,7 +74,7 @@ static void xen_irq_disable(void)
73} 74}
74PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); 75PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
75 76
76static void xen_irq_enable(void) 77asmlinkage void xen_irq_enable(void)
77{ 78{
78 struct vcpu_info *vcpu; 79 struct vcpu_info *vcpu;
79 80
@@ -128,6 +129,8 @@ static const struct pv_irq_ops xen_irq_ops __initconst = {
128 129
129void __init xen_init_irq_ops(void) 130void __init xen_init_irq_ops(void)
130{ 131{
131 pv_irq_ops = xen_irq_ops; 132 /* For PVH we use default pv_irq_ops settings. */
133 if (!xen_feature(XENFEAT_hvm_callback_vector))
134 pv_irq_ops = xen_irq_ops;
132 x86_init.irqs.intr_init = xen_init_IRQ; 135 x86_init.irqs.intr_init = xen_init_IRQ;
133} 136}
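The static-to-asmlinkage changes in this file, like the __visible annotations in mmu.c below, are LTO preparation: PV_CALLEE_SAVE_REGS_THUNK emits assembler references to these functions by name, so they must not be internalized or renamed by whole-program optimization (the same series is why the vDSO Makefile earlier in this patch adds $(DISABLE_LTO)). For reference, the standard definition from the kernel's gcc compiler headers:

#define __visible __attribute__((externally_visible))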
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ce563be09cc1..86e02eabb640 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -431,7 +431,7 @@ static pteval_t iomap_pte(pteval_t val)
431 return val; 431 return val;
432} 432}
433 433
434static pteval_t xen_pte_val(pte_t pte) 434__visible pteval_t xen_pte_val(pte_t pte)
435{ 435{
436 pteval_t pteval = pte.pte; 436 pteval_t pteval = pte.pte;
437#if 0 437#if 0
@@ -448,7 +448,7 @@ static pteval_t xen_pte_val(pte_t pte)
448} 448}
449PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); 449PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
450 450
451static pgdval_t xen_pgd_val(pgd_t pgd) 451__visible pgdval_t xen_pgd_val(pgd_t pgd)
452{ 452{
453 return pte_mfn_to_pfn(pgd.pgd); 453 return pte_mfn_to_pfn(pgd.pgd);
454} 454}
@@ -479,7 +479,7 @@ void xen_set_pat(u64 pat)
479 WARN_ON(pat != 0x0007010600070106ull); 479 WARN_ON(pat != 0x0007010600070106ull);
480} 480}
481 481
482static pte_t xen_make_pte(pteval_t pte) 482__visible pte_t xen_make_pte(pteval_t pte)
483{ 483{
484 phys_addr_t addr = (pte & PTE_PFN_MASK); 484 phys_addr_t addr = (pte & PTE_PFN_MASK);
485#if 0 485#if 0
@@ -514,14 +514,14 @@ static pte_t xen_make_pte(pteval_t pte)
514} 514}
515PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 515PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
516 516
517static pgd_t xen_make_pgd(pgdval_t pgd) 517__visible pgd_t xen_make_pgd(pgdval_t pgd)
518{ 518{
519 pgd = pte_pfn_to_mfn(pgd); 519 pgd = pte_pfn_to_mfn(pgd);
520 return native_make_pgd(pgd); 520 return native_make_pgd(pgd);
521} 521}
522PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); 522PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
523 523
524static pmdval_t xen_pmd_val(pmd_t pmd) 524__visible pmdval_t xen_pmd_val(pmd_t pmd)
525{ 525{
526 return pte_mfn_to_pfn(pmd.pmd); 526 return pte_mfn_to_pfn(pmd.pmd);
527} 527}
@@ -580,7 +580,7 @@ static void xen_pmd_clear(pmd_t *pmdp)
580} 580}
581#endif /* CONFIG_X86_PAE */ 581#endif /* CONFIG_X86_PAE */
582 582
583static pmd_t xen_make_pmd(pmdval_t pmd) 583__visible pmd_t xen_make_pmd(pmdval_t pmd)
584{ 584{
585 pmd = pte_pfn_to_mfn(pmd); 585 pmd = pte_pfn_to_mfn(pmd);
586 return native_make_pmd(pmd); 586 return native_make_pmd(pmd);
@@ -588,13 +588,13 @@ static pmd_t xen_make_pmd(pmdval_t pmd)
588PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); 588PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
589 589
590#if PAGETABLE_LEVELS == 4 590#if PAGETABLE_LEVELS == 4
591static pudval_t xen_pud_val(pud_t pud) 591__visible pudval_t xen_pud_val(pud_t pud)
592{ 592{
593 return pte_mfn_to_pfn(pud.pud); 593 return pte_mfn_to_pfn(pud.pud);
594} 594}
595PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); 595PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
596 596
597static pud_t xen_make_pud(pudval_t pud) 597__visible pud_t xen_make_pud(pudval_t pud)
598{ 598{
599 pud = pte_pfn_to_mfn(pud); 599 pud = pte_pfn_to_mfn(pud);
600 600
@@ -1198,44 +1198,40 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
1198 * instead of somewhere later and be confusing. */ 1198 * instead of somewhere later and be confusing. */
1199 xen_mc_flush(); 1199 xen_mc_flush();
1200} 1200}
1201#endif 1201static void __init xen_pagetable_p2m_copy(void)
1202static void __init xen_pagetable_init(void)
1203{ 1202{
1204#ifdef CONFIG_X86_64
1205 unsigned long size; 1203 unsigned long size;
1206 unsigned long addr; 1204 unsigned long addr;
1207#endif 1205 unsigned long new_mfn_list;
1208 paging_init(); 1206
1209 xen_setup_shared_info(); 1207 if (xen_feature(XENFEAT_auto_translated_physmap))
1210#ifdef CONFIG_X86_64 1208 return;
1211 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1209
1212 unsigned long new_mfn_list; 1210 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1213 1211
1214 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 1212 new_mfn_list = xen_revector_p2m_tree();
1215 1213 /* No memory or already called. */
1216 /* On 32-bit, we get zero so this never gets executed. */ 1214 if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list)
1217 new_mfn_list = xen_revector_p2m_tree(); 1215 return;
1218 if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { 1216
1219 /* using __ka address and sticking INVALID_P2M_ENTRY! */ 1217 /* using __ka address and sticking INVALID_P2M_ENTRY! */
1220 memset((void *)xen_start_info->mfn_list, 0xff, size); 1218 memset((void *)xen_start_info->mfn_list, 0xff, size);
1221 1219
1222 /* We should be in __ka space. */ 1220 /* We should be in __ka space. */
1223 BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); 1221 BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
1224 addr = xen_start_info->mfn_list; 1222 addr = xen_start_info->mfn_list;
1225 /* We roundup to the PMD, which means that if anybody at this stage is 1223 /* We roundup to the PMD, which means that if anybody at this stage is
1226 * using the __ka address of xen_start_info or xen_start_info->shared_info 1224 * using the __ka address of xen_start_info or xen_start_info->shared_info
1227 * they are going to crash. Fortunately we have already revectored 1225 * they are going to crash. Fortunately we have already revectored
1228 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ 1226 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
1229 size = roundup(size, PMD_SIZE); 1227 size = roundup(size, PMD_SIZE);
1230 xen_cleanhighmap(addr, addr + size); 1228 xen_cleanhighmap(addr, addr + size);
1231 1229
1232 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 1230 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1233 memblock_free(__pa(xen_start_info->mfn_list), size); 1231 memblock_free(__pa(xen_start_info->mfn_list), size);
1234 /* And revector! Bye bye old array */ 1232 /* And revector! Bye bye old array */
1235 xen_start_info->mfn_list = new_mfn_list; 1233 xen_start_info->mfn_list = new_mfn_list;
1236 } else 1234
1237 goto skip;
1238 }
1239 /* At this stage, cleanup_highmap has already cleaned __ka space 1235 /* At this stage, cleanup_highmap has already cleaned __ka space
1240 * from _brk_limit way up to the max_pfn_mapped (which is the end of 1236 * from _brk_limit way up to the max_pfn_mapped (which is the end of
1241 * the ramdisk). We continue on, erasing PMD entries that point to page 1237 * the ramdisk). We continue on, erasing PMD entries that point to page
@@ -1255,7 +1251,15 @@ static void __init xen_pagetable_init(void)
1255 * anything at this stage. */ 1251 * anything at this stage. */
1256 xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); 1252 xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1257#endif 1253#endif
1258skip: 1254}
1255#endif
1256
1257static void __init xen_pagetable_init(void)
1258{
1259 paging_init();
1260 xen_setup_shared_info();
1261#ifdef CONFIG_X86_64
1262 xen_pagetable_p2m_copy();
1259#endif 1263#endif
1260 xen_post_allocator_init(); 1264 xen_post_allocator_init();
1261} 1265}
@@ -1753,6 +1757,10 @@ static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
1753 unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 1757 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1754 pte_t pte = pfn_pte(pfn, prot); 1758 pte_t pte = pfn_pte(pfn, prot);
1755 1759
1760 /* For PVH there is no need to set pages R/O or R/W to pin or unpin them. */
1761 if (xen_feature(XENFEAT_auto_translated_physmap))
1762 return;
1763
1756 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) 1764 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
1757 BUG(); 1765 BUG();
1758} 1766}
@@ -1863,6 +1871,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1863 * but that's enough to get __va working. We need to fill in the rest 1871 * but that's enough to get __va working. We need to fill in the rest
1864 * of the physical mapping once some sort of allocator has been set 1872 * of the physical mapping once some sort of allocator has been set
1865 * up. 1873 * up.
1874 * NOTE: for PVH, the page tables are native.
1866 */ 1875 */
1867void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1876void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1868{ 1877{
@@ -1884,17 +1893,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1884 /* Zap identity mapping */ 1893 /* Zap identity mapping */
1885 init_level4_pgt[0] = __pgd(0); 1894 init_level4_pgt[0] = __pgd(0);
1886 1895
1887 /* Pre-constructed entries are in pfn, so convert to mfn */ 1896 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1888 /* L4[272] -> level3_ident_pgt 1897 /* Pre-constructed entries are in pfn, so convert to mfn */
1889 * L4[511] -> level3_kernel_pgt */ 1898 /* L4[272] -> level3_ident_pgt
1890 convert_pfn_mfn(init_level4_pgt); 1899 * L4[511] -> level3_kernel_pgt */
1891 1900 convert_pfn_mfn(init_level4_pgt);
1892 /* L3_i[0] -> level2_ident_pgt */ 1901
1893 convert_pfn_mfn(level3_ident_pgt); 1902 /* L3_i[0] -> level2_ident_pgt */
1894 /* L3_k[510] -> level2_kernel_pgt 1903 convert_pfn_mfn(level3_ident_pgt);
1895 * L3_i[511] -> level2_fixmap_pgt */ 1904 /* L3_k[510] -> level2_kernel_pgt
1896 convert_pfn_mfn(level3_kernel_pgt); 1905 * L3_i[511] -> level2_fixmap_pgt */
1897 1906 convert_pfn_mfn(level3_kernel_pgt);
1907 }
1898 /* We get [511][511] and have Xen's version of level2_kernel_pgt */ 1908 /* We get [511][511] and have Xen's version of level2_kernel_pgt */
1899 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); 1909 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1900 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); 1910 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
@@ -1918,31 +1928,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1918 copy_page(level2_fixmap_pgt, l2); 1928 copy_page(level2_fixmap_pgt, l2);
1919 /* Note that we don't do anything with level1_fixmap_pgt which 1929 /* Note that we don't do anything with level1_fixmap_pgt which
1920 * we don't need. */ 1930 * we don't need. */
1931 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1932 /* Make pagetable pieces RO */
1933 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1934 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1935 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1936 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1937 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1938 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1939 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1940
1941 /* Pin down new L4 */
1942 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1943 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1944
1945 /* Unpin Xen-provided one */
1946 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1921 1947
1922 /* Make pagetable pieces RO */ 1948 /*
1923 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); 1949 * At this stage there can be no user pgd, and no page
1924 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); 1950 * structure to attach it to, so make sure we just set kernel
1925 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); 1951 * pgd.
1926 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); 1952 */
1927 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); 1953 xen_mc_batch();
1928 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); 1954 __xen_write_cr3(true, __pa(init_level4_pgt));
1929 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); 1955 xen_mc_issue(PARAVIRT_LAZY_CPU);
1930 1956 } else
1931 /* Pin down new L4 */ 1957 native_write_cr3(__pa(init_level4_pgt));
1932 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1933 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1934
1935 /* Unpin Xen-provided one */
1936 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1937
1938 /*
1939 * At this stage there can be no user pgd, and no page
1940 * structure to attach it to, so make sure we just set kernel
1941 * pgd.
1942 */
1943 xen_mc_batch();
1944 __xen_write_cr3(true, __pa(init_level4_pgt));
1945 xen_mc_issue(PARAVIRT_LAZY_CPU);
1946 1958
1947 /* We can't that easily rip out L3 and L2, as the Xen pagetables are 1959 /* We can't that easily rip out L3 and L2, as the Xen pagetables are
1948 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for 1960 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
@@ -2046,7 +2058,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2046 case FIX_RO_IDT: 2058 case FIX_RO_IDT:
2047#ifdef CONFIG_X86_32 2059#ifdef CONFIG_X86_32
2048 case FIX_WP_TEST: 2060 case FIX_WP_TEST:
2049 case FIX_VDSO:
2050# ifdef CONFIG_HIGHMEM 2061# ifdef CONFIG_HIGHMEM
2051 case FIX_KMAP_BEGIN ... FIX_KMAP_END: 2062 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
2052# endif 2063# endif
@@ -2103,6 +2114,9 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2103 2114
2104static void __init xen_post_allocator_init(void) 2115static void __init xen_post_allocator_init(void)
2105{ 2116{
2117 if (xen_feature(XENFEAT_auto_translated_physmap))
2118 return;
2119
2106 pv_mmu_ops.set_pte = xen_set_pte; 2120 pv_mmu_ops.set_pte = xen_set_pte;
2107 pv_mmu_ops.set_pmd = xen_set_pmd; 2121 pv_mmu_ops.set_pmd = xen_set_pmd;
2108 pv_mmu_ops.set_pud = xen_set_pud; 2122 pv_mmu_ops.set_pud = xen_set_pud;
@@ -2207,6 +2221,15 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2207void __init xen_init_mmu_ops(void) 2221void __init xen_init_mmu_ops(void)
2208{ 2222{
2209 x86_init.paging.pagetable_init = xen_pagetable_init; 2223 x86_init.paging.pagetable_init = xen_pagetable_init;
2224
2225 /* Optimization: we could use the HVM variant, but it has no idea which
2226 * VCPUs are descheduled and would needlessly IPI them. Xen knows, so
2227 * let it do the job.
2228 */
2229 if (xen_feature(XENFEAT_auto_translated_physmap)) {
2230 pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
2231 return;
2232 }
2210 pv_mmu_ops = xen_mmu_ops; 2233 pv_mmu_ops = xen_mmu_ops;
2211 2234
2212 memset(dummy_mapping, 0xff, PAGE_SIZE); 2235 memset(dummy_mapping, 0xff, PAGE_SIZE);
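The mmu.c changes above repeat one pattern: every PV-specific MMU path now bails out early when the domain is auto-translated (PVH), because the hardware/EPT does the translation and the page tables stay native. A minimal sketch of that gating pattern (demo_pv_only_mmu_op() is a hypothetical name, not part of the patch):

#include <xen/features.h>

static void demo_pv_only_mmu_op(void)
{
	/* PVH: page tables are native, nothing to remap or pin. */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;

	/* PV-only PFN<->MFN conversion work would go here. */
}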
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 2ae8699e8767..85e5d78c9874 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -280,6 +280,9 @@ void __ref xen_build_mfn_list_list(void)
280{ 280{
281 unsigned long pfn; 281 unsigned long pfn;
282 282
283 if (xen_feature(XENFEAT_auto_translated_physmap))
284 return;
285
283 /* Pre-initialize p2m_top_mfn to be completely missing */ 286 /* Pre-initialize p2m_top_mfn to be completely missing */
284 if (p2m_top_mfn == NULL) { 287 if (p2m_top_mfn == NULL) {
285 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); 288 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
@@ -336,6 +339,9 @@ void __ref xen_build_mfn_list_list(void)
336 339
337void xen_setup_mfn_list_list(void) 340void xen_setup_mfn_list_list(void)
338{ 341{
342 if (xen_feature(XENFEAT_auto_translated_physmap))
343 return;
344
339 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); 345 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
340 346
341 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = 347 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
@@ -346,10 +352,15 @@ void xen_setup_mfn_list_list(void)
346/* Set up p2m_top to point to the domain-builder provided p2m pages */ 352/* Set up p2m_top to point to the domain-builder provided p2m pages */
347void __init xen_build_dynamic_phys_to_machine(void) 353void __init xen_build_dynamic_phys_to_machine(void)
348{ 354{
349 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; 355 unsigned long *mfn_list;
350 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); 356 unsigned long max_pfn;
351 unsigned long pfn; 357 unsigned long pfn;
352 358
359 if (xen_feature(XENFEAT_auto_translated_physmap))
360 return;
361
362 mfn_list = (unsigned long *)xen_start_info->mfn_list;
363 max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
353 xen_max_p2m_pfn = max_pfn; 364 xen_max_p2m_pfn = max_pfn;
354 365
355 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); 366 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
@@ -870,6 +881,65 @@ static unsigned long mfn_hash(unsigned long mfn)
870 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); 881 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
871} 882}
872 883
884int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
885 struct gnttab_map_grant_ref *kmap_ops,
886 struct page **pages, unsigned int count)
887{
888 int i, ret = 0;
889 bool lazy = false;
890 pte_t *pte;
891
892 if (xen_feature(XENFEAT_auto_translated_physmap))
893 return 0;
894
895 if (kmap_ops &&
896 !in_interrupt() &&
897 paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
898 arch_enter_lazy_mmu_mode();
899 lazy = true;
900 }
901
902 for (i = 0; i < count; i++) {
903 unsigned long mfn, pfn;
904
905 /* Do not add to override if the map failed. */
906 if (map_ops[i].status)
907 continue;
908
909 if (map_ops[i].flags & GNTMAP_contains_pte) {
910 pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
911 (map_ops[i].host_addr & ~PAGE_MASK));
912 mfn = pte_mfn(*pte);
913 } else {
914 mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
915 }
916 pfn = page_to_pfn(pages[i]);
917
918 WARN_ON(PagePrivate(pages[i]));
919 SetPagePrivate(pages[i]);
920 set_page_private(pages[i], mfn);
921 pages[i]->index = pfn_to_mfn(pfn);
922
923 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
924 ret = -ENOMEM;
925 goto out;
926 }
927
928 if (kmap_ops) {
929 ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
930 if (ret)
931 goto out;
932 }
933 }
934
935out:
936 if (lazy)
937 arch_leave_lazy_mmu_mode();
938
939 return ret;
940}
941EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
942
873/* Add an MFN override for a particular page */ 943/* Add an MFN override for a particular page */
874int m2p_add_override(unsigned long mfn, struct page *page, 944int m2p_add_override(unsigned long mfn, struct page *page,
875 struct gnttab_map_grant_ref *kmap_op) 945 struct gnttab_map_grant_ref *kmap_op)
@@ -888,13 +958,6 @@ int m2p_add_override(unsigned long mfn, struct page *page,
888 "m2p_add_override: pfn %lx not mapped", pfn)) 958 "m2p_add_override: pfn %lx not mapped", pfn))
889 return -EINVAL; 959 return -EINVAL;
890 } 960 }
891 WARN_ON(PagePrivate(page));
892 SetPagePrivate(page);
893 set_page_private(page, mfn);
894 page->index = pfn_to_mfn(pfn);
895
896 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
897 return -ENOMEM;
898 961
899 if (kmap_op != NULL) { 962 if (kmap_op != NULL) {
900 if (!PageHighMem(page)) { 963 if (!PageHighMem(page)) {
@@ -932,20 +995,62 @@ int m2p_add_override(unsigned long mfn, struct page *page,
932 return 0; 995 return 0;
933} 996}
934EXPORT_SYMBOL_GPL(m2p_add_override); 997EXPORT_SYMBOL_GPL(m2p_add_override);
998
999int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
1000 struct gnttab_map_grant_ref *kmap_ops,
1001 struct page **pages, unsigned int count)
1002{
1003 int i, ret = 0;
1004 bool lazy = false;
1005
1006 if (xen_feature(XENFEAT_auto_translated_physmap))
1007 return 0;
1008
1009 if (kmap_ops &&
1010 !in_interrupt() &&
1011 paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
1012 arch_enter_lazy_mmu_mode();
1013 lazy = true;
1014 }
1015
1016 for (i = 0; i < count; i++) {
1017 unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i]));
1018 unsigned long pfn = page_to_pfn(pages[i]);
1019
1020 if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
1021 ret = -EINVAL;
1022 goto out;
1023 }
1024
1025 set_page_private(pages[i], INVALID_P2M_ENTRY);
1026 WARN_ON(!PagePrivate(pages[i]));
1027 ClearPagePrivate(pages[i]);
1028 set_phys_to_machine(pfn, pages[i]->index);
1029
1030 if (kmap_ops)
1031 ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
1032 if (ret)
1033 goto out;
1034 }
1035
1036out:
1037 if (lazy)
1038 arch_leave_lazy_mmu_mode();
1039 return ret;
1040}
1041EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
1042
935int m2p_remove_override(struct page *page, 1043int m2p_remove_override(struct page *page,
936 struct gnttab_map_grant_ref *kmap_op) 1044 struct gnttab_map_grant_ref *kmap_op,
1045 unsigned long mfn)
937{ 1046{
938 unsigned long flags; 1047 unsigned long flags;
939 unsigned long mfn;
940 unsigned long pfn; 1048 unsigned long pfn;
941 unsigned long uninitialized_var(address); 1049 unsigned long uninitialized_var(address);
942 unsigned level; 1050 unsigned level;
943 pte_t *ptep = NULL; 1051 pte_t *ptep = NULL;
944 1052
945 pfn = page_to_pfn(page); 1053 pfn = page_to_pfn(page);
946 mfn = get_phys_to_machine(pfn);
947 if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
948 return -EINVAL;
949 1054
950 if (!PageHighMem(page)) { 1055 if (!PageHighMem(page)) {
951 address = (unsigned long)__va(pfn << PAGE_SHIFT); 1056 address = (unsigned long)__va(pfn << PAGE_SHIFT);
@@ -959,10 +1064,7 @@ int m2p_remove_override(struct page *page,
959 spin_lock_irqsave(&m2p_override_lock, flags); 1064 spin_lock_irqsave(&m2p_override_lock, flags);
960 list_del(&page->lru); 1065 list_del(&page->lru);
961 spin_unlock_irqrestore(&m2p_override_lock, flags); 1066 spin_unlock_irqrestore(&m2p_override_lock, flags);
962 WARN_ON(!PagePrivate(page));
963 ClearPagePrivate(page);
964 1067
965 set_phys_to_machine(pfn, page->index);
966 if (kmap_op != NULL) { 1068 if (kmap_op != NULL) {
967 if (!PageHighMem(page)) { 1069 if (!PageHighMem(page)) {
968 struct multicall_space mcs; 1070 struct multicall_space mcs;
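The two new p2m.c entry points above batch what used to be open-coded per-page m2p_add_override()/m2p_remove_override() bookkeeping. A hedged sketch of how a grant-mapping backend might pair them; demo_use_grants() and its trivial error handling are hypothetical, the two exported functions and their signatures come from the hunks above, and the declarations are assumed visible via asm/xen/page.h:

#include <xen/grant_table.h>
#include <asm/xen/page.h>

static int demo_use_grants(struct gnttab_map_grant_ref *map_ops,
			   struct gnttab_unmap_grant_ref *unmap_ops,
			   struct page **pages, unsigned int count)
{
	int err;

	/* Record the freshly granted MFNs in the p2m (no-op on PVH). */
	err = set_foreign_p2m_mapping(map_ops, NULL, pages, count);
	if (err)
		return err;

	/* ... access the foreign pages here ... */

	/* Undo the p2m entries before the grants go away. */
	return clear_foreign_p2m_mapping(unmap_ops, NULL, pages, count);
}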
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 0a7852483ffe..a8261716d58d 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -30,10 +30,9 @@
30#define XEN_PLATFORM_ERR_PROTOCOL -2 30#define XEN_PLATFORM_ERR_PROTOCOL -2
31#define XEN_PLATFORM_ERR_BLACKLIST -3 31#define XEN_PLATFORM_ERR_BLACKLIST -3
32 32
33/* store the value of xen_emul_unplug after the unplug is done */
34int xen_platform_pci_unplug;
35EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
36#ifdef CONFIG_XEN_PVHVM 33#ifdef CONFIG_XEN_PVHVM
34/* store the value of xen_emul_unplug after the unplug is done */
35static int xen_platform_pci_unplug;
37static int xen_emul_unplug; 36static int xen_emul_unplug;
38 37
39static int check_platform_magic(void) 38static int check_platform_magic(void)
@@ -69,6 +68,80 @@ static int check_platform_magic(void)
69 return 0; 68 return 0;
70} 69}
71 70
71bool xen_has_pv_devices(void)
72{
73 if (!xen_domain())
74 return false;
75
76 /* PV domains always have them. */
77 if (xen_pv_domain())
78 return true;
79
80 /* And the user has xen_platform_pci=0 set in the guest config, as the
81 * driver did not modify the value. */
82 if (xen_platform_pci_unplug == 0)
83 return false;
84
85 if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER)
86 return false;
87
88 if (xen_platform_pci_unplug & XEN_UNPLUG_ALL)
89 return true;
90
91 /* This is an odd one - we are going to run legacy
92 * and PV drivers at the same time. */
93 if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
94 return true;
95
96 /* And the caller has to follow up with xen_has_pv_{disk,nic}_devices()
97 * to be certain which driver can load. */
98 return false;
99}
100EXPORT_SYMBOL_GPL(xen_has_pv_devices);
101
102static bool __xen_has_pv_device(int state)
103{
104 /* HVM domains might or might not have them. */
105 if (xen_hvm_domain() && (xen_platform_pci_unplug & state))
106 return true;
107
108 return xen_has_pv_devices();
109}
110
111bool xen_has_pv_nic_devices(void)
112{
113 return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL);
114}
115EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices);
116
117bool xen_has_pv_disk_devices(void)
118{
119 return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS |
120 XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL);
121}
122EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices);
123
124/*
125 * This one is odd - it determines whether you want to run PV _and_
126 * legacy (IDE) drivers together. This combination is only possible
127 * under HVM.
128 */
129bool xen_has_pv_and_legacy_disk_devices(void)
130{
131 if (!xen_domain())
132 return false;
133
134 /* N.B. This is only ever used in HVM mode */
135 if (xen_pv_domain())
136 return false;
137
138 if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
139 return true;
140
141 return false;
142}
143EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices);
144
72void xen_unplug_emulated_devices(void) 145void xen_unplug_emulated_devices(void)
73{ 146{
74 int r; 147 int r;
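With xen_platform_pci_unplug now private to this file, frontends are expected to ask the new predicates instead of peeking at the variable. A sketch of a frontend init gate; demo_blkfront_init() is hypothetical, while xen_has_pv_disk_devices() is the helper added above, assumed declared in xen/platform_pci.h:

#include <linux/init.h>
#include <linux/errno.h>
#include <xen/platform_pci.h>

static int __init demo_blkfront_init(void)
{
	/* Bail out when emulated IDE should keep handling the disks. */
	if (!xen_has_pv_disk_devices())
		return -ENODEV;

	/* ... register the xenbus frontend driver here ... */
	return 0;
}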
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 68c054f59de6..0982233b9b84 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -27,6 +27,7 @@
27#include <xen/interface/memory.h> 27#include <xen/interface/memory.h>
28#include <xen/interface/physdev.h> 28#include <xen/interface/physdev.h>
29#include <xen/features.h> 29#include <xen/features.h>
30#include "mmu.h"
30#include "xen-ops.h" 31#include "xen-ops.h"
31#include "vdso.h" 32#include "vdso.h"
32 33
@@ -34,7 +35,7 @@
34extern const char xen_hypervisor_callback[]; 35extern const char xen_hypervisor_callback[];
35extern const char xen_failsafe_callback[]; 36extern const char xen_failsafe_callback[];
36#ifdef CONFIG_X86_64 37#ifdef CONFIG_X86_64
37extern const char nmi[]; 38extern asmlinkage void nmi(void);
38#endif 39#endif
39extern void xen_sysenter_target(void); 40extern void xen_sysenter_target(void);
40extern void xen_syscall_target(void); 41extern void xen_syscall_target(void);
@@ -81,6 +82,9 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
81 82
82 memblock_reserve(start, size); 83 memblock_reserve(start, size);
83 84
85 if (xen_feature(XENFEAT_auto_translated_physmap))
86 return;
87
84 xen_max_p2m_pfn = PFN_DOWN(start + size); 88 xen_max_p2m_pfn = PFN_DOWN(start + size);
85 for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { 89 for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
86 unsigned long mfn = pfn_to_mfn(pfn); 90 unsigned long mfn = pfn_to_mfn(pfn);
@@ -103,6 +107,7 @@ static unsigned long __init xen_do_chunk(unsigned long start,
103 .domid = DOMID_SELF 107 .domid = DOMID_SELF
104 }; 108 };
105 unsigned long len = 0; 109 unsigned long len = 0;
110 int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap);
106 unsigned long pfn; 111 unsigned long pfn;
107 int ret; 112 int ret;
108 113
@@ -116,7 +121,7 @@ static unsigned long __init xen_do_chunk(unsigned long start,
116 continue; 121 continue;
117 frame = mfn; 122 frame = mfn;
118 } else { 123 } else {
119 if (mfn != INVALID_P2M_ENTRY) 124 if (!xlated_phys && mfn != INVALID_P2M_ENTRY)
120 continue; 125 continue;
121 frame = pfn; 126 frame = pfn;
122 } 127 }
@@ -154,6 +159,13 @@ static unsigned long __init xen_do_chunk(unsigned long start,
154static unsigned long __init xen_release_chunk(unsigned long start, 159static unsigned long __init xen_release_chunk(unsigned long start,
155 unsigned long end) 160 unsigned long end)
156{ 161{
162 /*
163 * Xen already ballooned out the E820 non RAM regions for us
164 * and set them up properly in EPT.
165 */
166 if (xen_feature(XENFEAT_auto_translated_physmap))
167 return end - start;
168
157 return xen_do_chunk(start, end, true); 169 return xen_do_chunk(start, end, true);
158} 170}
159 171
@@ -222,7 +234,13 @@ static void __init xen_set_identity_and_release_chunk(
222 * (except for the ISA region which must be 1:1 mapped) to 234 * (except for the ISA region which must be 1:1 mapped) to
223 * release the refcounts (in Xen) on the original frames. 235 * release the refcounts (in Xen) on the original frames.
224 */ 236 */
225 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { 237
238 /*
239 * PVH E820 matches the hypervisor's P2M, which means we need to
240 * account for the proper values of *release and *identity.
241 */
242 for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) &&
243 pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
226 pte_t pte = __pte_ma(0); 244 pte_t pte = __pte_ma(0);
227 245
228 if (pfn < PFN_UP(ISA_END_ADDRESS)) 246 if (pfn < PFN_UP(ISA_END_ADDRESS))
@@ -559,20 +577,17 @@ void xen_enable_syscall(void)
559void xen_enable_nmi(void) 577void xen_enable_nmi(void)
560{ 578{
561#ifdef CONFIG_X86_64 579#ifdef CONFIG_X86_64
562 if (register_callback(CALLBACKTYPE_nmi, nmi)) 580 if (register_callback(CALLBACKTYPE_nmi, (char *)nmi))
563 BUG(); 581 BUG();
564#endif 582#endif
565} 583}
566void __init xen_arch_setup(void) 584void __init xen_pvmmu_arch_setup(void)
567{ 585{
568 xen_panic_handler_init();
569
570 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); 586 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
571 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); 587 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
572 588
573 if (!xen_feature(XENFEAT_auto_translated_physmap)) 589 HYPERVISOR_vm_assist(VMASST_CMD_enable,
574 HYPERVISOR_vm_assist(VMASST_CMD_enable, 590 VMASST_TYPE_pae_extended_cr3);
575 VMASST_TYPE_pae_extended_cr3);
576 591
577 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || 592 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
578 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) 593 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
@@ -581,6 +596,15 @@ void __init xen_arch_setup(void)
581 xen_enable_sysenter(); 596 xen_enable_sysenter();
582 xen_enable_syscall(); 597 xen_enable_syscall();
583 xen_enable_nmi(); 598 xen_enable_nmi();
599}
600
601/* This function is not called for HVM domains */
602void __init xen_arch_setup(void)
603{
604 xen_panic_handler_init();
605 if (!xen_feature(XENFEAT_auto_translated_physmap))
606 xen_pvmmu_arch_setup();
607
584#ifdef CONFIG_ACPI 608#ifdef CONFIG_ACPI
585 if (!(xen_start_info->flags & SIF_INITDOMAIN)) { 609 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
586 printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); 610 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
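One subtlety in the setup.c hunks above: xen_set_identity_and_release_chunk() folds the PVH check into the for-condition, so xen_feature() is evaluated on every iteration just to skip the loop once under PVH. An equivalent and arguably clearer form of the same guard (a sketch, not what the patch does):

	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		for (pfn = start_pfn;
		     pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
			/* ... build and install the identity pte ... */
		}
	}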
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c36b325abd83..a18eadd8bb40 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -73,9 +73,11 @@ static void cpu_bringup(void)
73 touch_softlockup_watchdog(); 73 touch_softlockup_watchdog();
74 preempt_disable(); 74 preempt_disable();
75 75
76 xen_enable_sysenter(); 76 /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
77 xen_enable_syscall(); 77 if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
78 78 xen_enable_sysenter();
79 xen_enable_syscall();
80 }
79 cpu = smp_processor_id(); 81 cpu = smp_processor_id();
80 smp_store_cpu_info(cpu); 82 smp_store_cpu_info(cpu);
81 cpu_data(cpu).x86_max_cores = 1; 83 cpu_data(cpu).x86_max_cores = 1;
@@ -97,8 +99,14 @@ static void cpu_bringup(void)
97 wmb(); /* make sure everything is out */ 99 wmb(); /* make sure everything is out */
98} 100}
99 101
100static void cpu_bringup_and_idle(void) 102/* Note: cpu parameter is only relevant for PVH */
103static void cpu_bringup_and_idle(int cpu)
101{ 104{
105#ifdef CONFIG_X86_64
106 if (xen_feature(XENFEAT_auto_translated_physmap) &&
107 xen_feature(XENFEAT_supervisor_mode_kernel))
108 xen_pvh_secondary_vcpu_init(cpu);
109#endif
102 cpu_bringup(); 110 cpu_bringup();
103 cpu_startup_entry(CPUHP_ONLINE); 111 cpu_startup_entry(CPUHP_ONLINE);
104} 112}
@@ -274,9 +282,10 @@ static void __init xen_smp_prepare_boot_cpu(void)
274 native_smp_prepare_boot_cpu(); 282 native_smp_prepare_boot_cpu();
275 283
276 if (xen_pv_domain()) { 284 if (xen_pv_domain()) {
277 /* We've switched to the "real" per-cpu gdt, so make sure the 285 if (!xen_feature(XENFEAT_writable_page_tables))
278 old memory can be recycled */ 286 /* We've switched to the "real" per-cpu gdt, so make
279 make_lowmem_page_readwrite(xen_initial_gdt); 287 * sure the old memory can be recycled. */
288 make_lowmem_page_readwrite(xen_initial_gdt);
280 289
281#ifdef CONFIG_X86_32 290#ifdef CONFIG_X86_32
282 /* 291 /*
@@ -360,22 +369,21 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
360 369
361 gdt = get_cpu_gdt_table(cpu); 370 gdt = get_cpu_gdt_table(cpu);
362 371
363 ctxt->flags = VGCF_IN_KERNEL;
364 ctxt->user_regs.ss = __KERNEL_DS;
365#ifdef CONFIG_X86_32 372#ifdef CONFIG_X86_32
373 /* Note: PVH is not yet supported on x86_32. */
366 ctxt->user_regs.fs = __KERNEL_PERCPU; 374 ctxt->user_regs.fs = __KERNEL_PERCPU;
367 ctxt->user_regs.gs = __KERNEL_STACK_CANARY; 375 ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
368#else
369 ctxt->gs_base_kernel = per_cpu_offset(cpu);
370#endif 376#endif
371 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 377 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
372 378
373 memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); 379 memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
374 380
375 { 381 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
382 ctxt->flags = VGCF_IN_KERNEL;
376 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 383 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
377 ctxt->user_regs.ds = __USER_DS; 384 ctxt->user_regs.ds = __USER_DS;
378 ctxt->user_regs.es = __USER_DS; 385 ctxt->user_regs.es = __USER_DS;
386 ctxt->user_regs.ss = __KERNEL_DS;
379 387
380 xen_copy_trap_info(ctxt->trap_ctxt); 388 xen_copy_trap_info(ctxt->trap_ctxt);
381 389
@@ -396,18 +404,27 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
396#ifdef CONFIG_X86_32 404#ifdef CONFIG_X86_32
397 ctxt->event_callback_cs = __KERNEL_CS; 405 ctxt->event_callback_cs = __KERNEL_CS;
398 ctxt->failsafe_callback_cs = __KERNEL_CS; 406 ctxt->failsafe_callback_cs = __KERNEL_CS;
407#else
408 ctxt->gs_base_kernel = per_cpu_offset(cpu);
399#endif 409#endif
400 ctxt->event_callback_eip = 410 ctxt->event_callback_eip =
401 (unsigned long)xen_hypervisor_callback; 411 (unsigned long)xen_hypervisor_callback;
402 ctxt->failsafe_callback_eip = 412 ctxt->failsafe_callback_eip =
403 (unsigned long)xen_failsafe_callback; 413 (unsigned long)xen_failsafe_callback;
414 ctxt->user_regs.cs = __KERNEL_CS;
415 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
416#ifdef CONFIG_X86_32
404 } 417 }
405 ctxt->user_regs.cs = __KERNEL_CS; 418#else
419 } else
420 /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with
421 * %rdi holding the cpu number - which means we are passing the
422 * cpu in as the first parameter. Subtle!
423 */
424 ctxt->user_regs.rdi = cpu;
425#endif
406 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 426 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
407
408 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
409 ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); 427 ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
410
411 if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) 428 if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
412 BUG(); 429 BUG();
413 430
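The smp.c change leans on a calling-convention trick worth spelling out: under PVH the new vcpu enters C code directly, so loading user_regs.rdi before VCPUOP_initialise makes the cpu number arrive as the first argument of cpu_bringup_and_idle() per the x86-64 ABI. Distilled from the hunk above (the rest of the vcpu_guest_context setup is elided):

	/* Entry point plus its first argument, x86-64 ABI style. */
	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
	ctxt->user_regs.rdi = cpu;

	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
		BUG();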
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 0e36cde12f7e..4d3acc34a998 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -106,7 +106,7 @@ static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
106static cpumask_t waiting_cpus; 106static cpumask_t waiting_cpus;
107 107
108static bool xen_pvspin = true; 108static bool xen_pvspin = true;
109static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) 109__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
110{ 110{
111 int irq = __this_cpu_read(lock_kicker_irq); 111 int irq = __this_cpu_read(lock_kicker_irq);
112 struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting); 112 struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting);
@@ -183,7 +183,7 @@ static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
183 183
184 local_irq_save(flags); 184 local_irq_save(flags);
185 185
186 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); 186 kstat_incr_irq_this_cpu(irq);
187out: 187out:
188 cpumask_clear_cpu(cpu, &waiting_cpus); 188 cpumask_clear_cpu(cpu, &waiting_cpus);
189 w->lock = NULL; 189 w->lock = NULL;
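The __visible annotation here shares its motivation with the ones added in mmu.c: these functions are referenced only from the assembler emitted by PV_CALLEE_SAVE_REGS_THUNK (or, here, the pvticketlock patch site), which the compiler cannot see, so they must stay externally visible to survive LTO-style whole-program optimization. The pattern, sketched with a hypothetical demo_pmd_val() that mirrors xen_pmd_val from the mmu.c hunks (pte_mfn_to_pfn() is file-local to mmu.c, so the sketch assumes that context):

__visible pmdval_t demo_pmd_val(pmd_t pmd)
{
	return pte_mfn_to_pfn(pmd.pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(demo_pmd_val);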
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 12a1ca707b94..7b78f88c1707 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -446,6 +446,7 @@ void xen_setup_timer(int cpu)
446 IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER| 446 IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
447 IRQF_FORCE_RESUME, 447 IRQF_FORCE_RESUME,
448 name, NULL); 448 name, NULL);
449 (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
449 450
450 memcpy(evt, xen_clockevent, sizeof(*evt)); 451 memcpy(evt, xen_clockevent, sizeof(*evt));
451 452
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7faed5869e5b..485b69585540 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -11,8 +11,28 @@
11#include <asm/page_types.h> 11#include <asm/page_types.h>
12 12
13#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
14#include <xen/interface/features.h>
14#include <asm/xen/interface.h> 15#include <asm/xen/interface.h>
15 16
17#ifdef CONFIG_XEN_PVH
18#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel"
19/* Note the lack of 'hvm_callback_vector'. Older hypervisors will
20 * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in
21 * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore.
22 */
23#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \
24 (1 << XENFEAT_auto_translated_physmap) | \
25 (1 << XENFEAT_supervisor_mode_kernel) | \
26 (1 << XENFEAT_hvm_callback_vector))
27/* The XENFEAT_writable_page_tables is not strictly necessary, as we set that
28 * up regardless of whether this CONFIG option is enabled or not, but it
29 * clarifies what the right flags need to be.
30 */
31#else
32#define PVH_FEATURES_STR ""
33#define PVH_FEATURES (0)
34#endif
35
16 __INIT 36 __INIT
17ENTRY(startup_xen) 37ENTRY(startup_xen)
18 cld 38 cld
@@ -95,7 +115,10 @@ NEXT_HYPERCALL(arch_6)
95#endif 115#endif
96 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) 116 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
97 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) 117 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
98 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 118 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR)
119 ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) |
120 (1 << XENFEAT_writable_page_tables) |
121 (1 << XENFEAT_dom0))
99 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 122 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
100 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 123 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
101 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, 124 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
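For reference, with CONFIG_XEN_PVH enabled the XEN_ELFNOTE_SUPPORTED_FEATURES note above evaluates to the following bit mask; this is just a C restatement of the asm using the XENFEAT_* bit numbers from xen/interface/features.h, with demo_supported_features a hypothetical name:

#include <xen/interface/features.h>

static const unsigned long demo_supported_features =
	(1UL << XENFEAT_writable_page_tables)    |
	(1UL << XENFEAT_auto_translated_physmap) |
	(1UL << XENFEAT_supervisor_mode_kernel)  |
	(1UL << XENFEAT_hvm_callback_vector)     |
	(1UL << XENFEAT_dom0);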
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 95f8c6142328..1cb6f4c37300 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -123,4 +123,5 @@ __visible void xen_adjust_exception_frame(void);
123 123
124extern int xen_panic_handler_init(void); 124extern int xen_panic_handler_init(void);
125 125
126void xen_pvh_secondary_vcpu_init(int cpu);
126#endif /* XEN_OPS_H */ 127#endif /* XEN_OPS_H */