about summary refs log tree commit diff stats
path: root/arch/x86
diff options
context:
space:
mode:
author Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> 2012-09-05 10:22:45 -0400
committer Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> 2012-09-05 10:22:45 -0400
commit 593d0a3e9f813db910dc50574532914db21d09ff (patch)
tree 12d8413ee57b4383ca8c906996ffe02be6d377a5 /arch/x86
parent 50e900417b8096939d12a46848f965e27a905e36 (diff)
parent 4cb38750d49010ae72e718d46605ac9ba5a851b4 (diff)
Merge commit '4cb38750d49010ae72e718d46605ac9ba5a851b4' into stable/for-linus-3.6
* commit '4cb38750d49010ae72e718d46605ac9ba5a851b4': (6849 commits)
  bcma: fix invalid PMU chip control masks
  [libata] pata_cmd64x: whitespace cleanup
  libata-acpi: fix up for acpi_pm_device_sleep_state API
  sata_dwc_460ex: device tree may specify dma_channel
  ahci, trivial: fixed coding style issues related to braces
  ahci_platform: add hibernation callbacks
  libata-eh.c: local functions should not be exposed globally
  libata-transport.c: local functions should not be exposed globally
  sata_dwc_460ex: support hardreset
  ata: use module_pci_driver
  drivers/ata/pata_pcmcia.c: adjust suspicious bit operation
  pata_imx: Convert to clk_prepare_enable/clk_disable_unprepare
  ahci: Enable SB600 64bit DMA on MSI K9AGM2 (MS-7327) v2
  [libata] Prevent interface errors with Seagate FreeAgent GoFlex
  drivers/acpi/glue: revert accidental license-related 6b66d95895c bits
  libata-acpi: add missing inlines in libata.h
  i2c-omap: Add support for I2C_M_STOP message flag
  i2c: Fall back to emulated SMBus if the operation isn't supported natively
  i2c: Add SCCB support
  i2c-tiny-usb: Add support for the Robofuzz OSIF USB/I2C converter
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig.debug19
-rw-r--r--arch/x86/Makefile3
-rw-r--r--arch/x86/boot/compressed/cmdline.c4
-rw-r--r--arch/x86/boot/compressed/early_serial_console.c4
-rw-r--r--arch/x86/boot/compressed/eboot.c198
-rw-r--r--arch/x86/boot/compressed/head_32.S10
-rw-r--r--arch/x86/boot/compressed/head_64.S10
-rw-r--r--arch/x86/boot/compressed/misc.c31
-rw-r--r--arch/x86/boot/compressed/misc.h27
-rw-r--r--arch/x86/boot/header.S11
-rw-r--r--arch/x86/crypto/Makefile14
-rw-r--r--arch/x86/crypto/ablk_helper.c149
-rw-r--r--arch/x86/crypto/aes_glue.c2
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c110
-rw-r--r--arch/x86/crypto/camellia_glue.c355
-rw-r--r--arch/x86/crypto/glue_helper.c307
-rw-r--r--arch/x86/crypto/serpent-avx-x86_64-asm_64.S704
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c636
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c513
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S2
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c6
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S300
-rw-r--r--arch/x86/crypto/twofish_avx_glue.c624
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c409
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/include/asm/alternative.h74
-rw-r--r--arch/x86/include/asm/amd_nb.h21
-rw-r--r--arch/x86/include/asm/apic.h66
-rw-r--r--arch/x86/include/asm/bitops.h7
-rw-r--r--arch/x86/include/asm/bootparam.h1
-rw-r--r--arch/x86/include/asm/cpufeature.h4
-rw-r--r--arch/x86/include/asm/crypto/ablk_helper.h31
-rw-r--r--arch/x86/include/asm/crypto/aes.h (renamed from arch/x86/include/asm/aes.h)0
-rw-r--r--arch/x86/include/asm/crypto/glue_helper.h115
-rw-r--r--arch/x86/include/asm/crypto/serpent-avx.h32
-rw-r--r--arch/x86/include/asm/crypto/serpent-sse2.h (renamed from arch/x86/include/asm/serpent.h)4
-rw-r--r--arch/x86/include/asm/crypto/twofish.h46
-rw-r--r--arch/x86/include/asm/emergency-restart.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h9
-rw-r--r--arch/x86/include/asm/floppy.h2
-rw-r--r--arch/x86/include/asm/hypervisor.h1
-rw-r--r--arch/x86/include/asm/iommu.h1
-rw-r--r--arch/x86/include/asm/irq_vectors.h11
-rw-r--r--arch/x86/include/asm/kvm.h1
-rw-r--r--arch/x86/include/asm/kvm_emulate.h6
-rw-r--r--arch/x86/include/asm/kvm_host.h35
-rw-r--r--arch/x86/include/asm/kvm_para.h7
-rw-r--r--arch/x86/include/asm/msr.h46
-rw-r--r--arch/x86/include/asm/nmi.h20
-rw-r--r--arch/x86/include/asm/paravirt.h46
-rw-r--r--arch/x86/include/asm/paravirt_types.h5
-rw-r--r--arch/x86/include/asm/pci_x86.h15
-rw-r--r--arch/x86/include/asm/percpu.h17
-rw-r--r--arch/x86/include/asm/perf_event.h22
-rw-r--r--arch/x86/include/asm/pgtable-2level.h4
-rw-r--r--arch/x86/include/asm/pgtable-3level.h36
-rw-r--r--arch/x86/include/asm/pgtable_64.h8
-rw-r--r--arch/x86/include/asm/processor-flags.h2
-rw-r--r--arch/x86/include/asm/processor.h13
-rw-r--r--arch/x86/include/asm/realmode.h3
-rw-r--r--arch/x86/include/asm/reboot.h4
-rw-r--r--arch/x86/include/asm/smp.h21
-rw-r--r--arch/x86/include/asm/tlb.h9
-rw-r--r--arch/x86/include/asm/tlbflush.h49
-rw-r--r--arch/x86/include/asm/uaccess_64.h11
-rw-r--r--arch/x86/include/asm/uprobes.h2
-rw-r--r--arch/x86/include/asm/uv/uv.h5
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h28
-rw-r--r--arch/x86/include/asm/vmx.h6
-rw-r--r--arch/x86/include/asm/x2apic.h18
-rw-r--r--arch/x86/include/asm/x86_init.h4
-rw-r--r--arch/x86/kernel/acpi/boot.c27
-rw-r--r--arch/x86/kernel/alternative.c19
-rw-r--r--arch/x86/kernel/amd_nb.c11
-rw-r--r--arch/x86/kernel/apic/apic.c42
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c76
-rw-r--r--arch/x86/kernel/apic/apic_noop.c9
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c50
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c48
-rw-r--r--arch/x86/kernel/apic/es7000_32.c51
-rw-r--r--arch/x86/kernel/apic/io_apic.c350
-rw-r--r--arch/x86/kernel/apic/numaq_32.c30
-rw-r--r--arch/x86/kernel/apic/probe_32.c23
-rw-r--r--arch/x86/kernel/apic/probe_64.c11
-rw-r--r--arch/x86/kernel/apic/summit_32.c68
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c82
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c39
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c45
-rw-r--r--arch/x86/kernel/apm_32.c29
-rw-r--r--arch/x86/kernel/cpu/Makefile6
-rw-r--r--arch/x86/kernel/cpu/amd.c39
-rw-r--r--arch/x86/kernel/cpu/bugs.c20
-rw-r--r--arch/x86/kernel/cpu/common.c33
-rw-r--r--arch/x86/kernel/cpu/cpu.h9
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c3
-rw-r--r--arch/x86/kernel/cpu/intel.c176
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c28
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c264
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.pl25
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event.c111
-rw-r--r--arch/x86/kernel/cpu/perf_event.h26
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c103
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c134
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c1850
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h424
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c16
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c4
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/sched.c55
-rw-r--r--arch/x86/kernel/dumpstack.c5
-rw-r--r--arch/x86/kernel/dumpstack_32.c25
-rw-r--r--arch/x86/kernel/dumpstack_64.c21
-rw-r--r--arch/x86/kernel/entry_64.S38
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kernel/irqinit.c73
-rw-r--r--arch/x86/kernel/kgdb.c8
-rw-r--r--arch/x86/kernel/kvm.c64
-rw-r--r--arch/x86/kernel/microcode_core.c66
-rw-r--r--arch/x86/kernel/module.c34
-rw-r--r--arch/x86/kernel/nmi.c47
-rw-r--r--arch/x86/kernel/nmi_selftest.c7
-rw-r--r--arch/x86/kernel/paravirt.c2
-rw-r--r--arch/x86/kernel/pci-calgary_64.c34
-rw-r--r--arch/x86/kernel/pci-dma.c11
-rw-r--r--arch/x86/kernel/process.c34
-rw-r--r--arch/x86/kernel/process_64.c12
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c82
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/setup_percpu.c2
-rw-r--r--arch/x86/kernel/signal.c5
-rw-r--r--arch/x86/kernel/smpboot.c114
-rw-r--r--arch/x86/kernel/traps.c19
-rw-r--r--arch/x86/kernel/tsc.c50
-rw-r--r--arch/x86/kernel/uprobes.c3
-rw-r--r--arch/x86/kernel/vm86_32.c6
-rw-r--r--arch/x86/kernel/vsmp_64.c44
-rw-r--r--arch/x86/kernel/vsyscall_64.c56
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c1
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/kernel/xsave.c12
-rw-r--r--arch/x86/kvm/cpuid.c46
-rw-r--r--arch/x86/kvm/cpuid.h9
-rw-r--r--arch/x86/kvm/emulate.c273
-rw-r--r--arch/x86/kvm/i8259.c17
-rw-r--r--arch/x86/kvm/lapic.c194
-rw-r--r--arch/x86/kvm/lapic.h11
-rw-r--r--arch/x86/kvm/mmu.c362
-rw-r--r--arch/x86/kvm/mmutrace.h45
-rw-r--r--arch/x86/kvm/paging_tmpl.h3
-rw-r--r--arch/x86/kvm/pmu.c22
-rw-r--r--arch/x86/kvm/svm.c12
-rw-r--r--arch/x86/kvm/trace.h46
-rw-r--r--arch/x86/kvm/vmx.c189
-rw-r--r--arch/x86/kvm/x86.c123
-rw-r--r--arch/x86/lib/csum-wrappers_64.c2
-rw-r--r--arch/x86/lib/msr-reg-export.c4
-rw-r--r--arch/x86/lib/msr-reg.S10
-rw-r--r--arch/x86/mm/init.c2
-rw-r--r--arch/x86/mm/pageattr.c10
-rw-r--r--arch/x86/mm/tlb.c401
-rw-r--r--arch/x86/net/bpf_jit_comp.c4
-rw-r--r--arch/x86/oprofile/op_model_amd.c4
-rw-r--r--arch/x86/pci/acpi.c109
-rw-r--r--arch/x86/pci/amd_bus.c7
-rw-r--r--arch/x86/pci/bus_numa.c22
-rw-r--r--arch/x86/pci/bus_numa.h3
-rw-r--r--arch/x86/pci/common.c2
-rw-r--r--arch/x86/pci/mmconfig-shared.c372
-rw-r--r--arch/x86/pci/mmconfig_32.c30
-rw-r--r--arch/x86/pci/mmconfig_64.c52
-rw-r--r--arch/x86/pci/mrst.c2
-rw-r--r--arch/x86/platform/efi/efi.c30
-rw-r--r--arch/x86/platform/mrst/early_printk_mrst.c13
-rw-r--r--arch/x86/platform/olpc/olpc-xo15-sci.c6
-rw-r--r--arch/x86/platform/uv/tlb_uv.c459
-rw-r--r--arch/x86/platform/uv/uv_irq.c9
-rw-r--r--arch/x86/realmode/rm/Makefile2
-rw-r--r--arch/x86/realmode/rm/header.S4
-rw-r--r--arch/x86/realmode/rm/reboot.S (renamed from arch/x86/realmode/rm/reboot_32.S)30
-rw-r--r--arch/x86/vdso/vdso32-setup.c6
-rw-r--r--arch/x86/xen/enlighten.c2
-rw-r--r--arch/x86/xen/mmu.c12
-rw-r--r--arch/x86/xen/smp.c2
187 files changed, 9698 insertions, 3832 deletions
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index e46c2147397f..b322f124ee3c 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -129,6 +129,25 @@ config DOUBLEFAULT
129 option saves about 4k and might cause you much additional grey 129 option saves about 4k and might cause you much additional grey
130 hair. 130 hair.
131 131
132config DEBUG_TLBFLUSH
133 bool "Set upper limit of TLB entries to flush one-by-one"
134 depends on DEBUG_KERNEL && (X86_64 || X86_INVLPG)
135 ---help---
136
137 X86-only for now.
138
139 This option allows the user to tune the amount of TLB entries the
140 kernel flushes one-by-one instead of doing a full TLB flush. In
141 certain situations, the former is cheaper. This is controlled by the
142 tlb_flushall_shift knob under /sys/kernel/debug/x86. If you set it
143 to -1, the code flushes the whole TLB unconditionally. Otherwise,
144 for positive values of it, the kernel will use single TLB entry
145 invalidating instructions according to the following formula:
146
147 flush_entries <= active_tlb_entries / 2^tlb_flushall_shift
148
149 If in doubt, say "N".
150
132config IOMMU_DEBUG 151config IOMMU_DEBUG
133 bool "Enable IOMMU debugging" 152 bool "Enable IOMMU debugging"
134 depends on GART_IOMMU && DEBUG_KERNEL 153 depends on GART_IOMMU && DEBUG_KERNEL
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1f2521434554..b0c5276861ec 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -49,6 +49,9 @@ else
49 KBUILD_AFLAGS += -m64 49 KBUILD_AFLAGS += -m64
50 KBUILD_CFLAGS += -m64 50 KBUILD_CFLAGS += -m64
51 51
52 # Use -mpreferred-stack-boundary=3 if supported.
53 KBUILD_CFLAGS += $(call cc-option,-mno-sse -mpreferred-stack-boundary=3)
54
52 # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) 55 # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
53 cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) 56 cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
54 cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) 57 cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index cb62f786990d..10f6b1178c68 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,5 +1,7 @@
1#include "misc.h" 1#include "misc.h"
2 2
3#ifdef CONFIG_EARLY_PRINTK
4
3static unsigned long fs; 5static unsigned long fs;
4static inline void set_fs(unsigned long seg) 6static inline void set_fs(unsigned long seg)
5{ 7{
@@ -19,3 +21,5 @@ int cmdline_find_option_bool(const char *option)
19{ 21{
20 return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option); 22 return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
21} 23}
24
25#endif
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
index 261e81fb9582..d3d003cb5481 100644
--- a/arch/x86/boot/compressed/early_serial_console.c
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -1,5 +1,9 @@
1#include "misc.h" 1#include "misc.h"
2 2
3#ifdef CONFIG_EARLY_PRINTK
4
3int early_serial_base; 5int early_serial_base;
4 6
5#include "../early_serial_console.c" 7#include "../early_serial_console.c"
8
9#endif
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 4e85f5f85837..b3e0227df2c9 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -729,32 +729,68 @@ fail:
729 * need to create one ourselves (usually the bootloader would create 729 * need to create one ourselves (usually the bootloader would create
730 * one for us). 730 * one for us).
731 */ 731 */
732static efi_status_t make_boot_params(struct boot_params *boot_params, 732struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
733 efi_loaded_image_t *image,
734 void *handle)
735{ 733{
736 struct efi_info *efi = &boot_params->efi_info; 734 struct boot_params *boot_params;
737 struct apm_bios_info *bi = &boot_params->apm_bios_info; 735 struct sys_desc_table *sdt;
738 struct sys_desc_table *sdt = &boot_params->sys_desc_table; 736 struct apm_bios_info *bi;
739 struct e820entry *e820_map = &boot_params->e820_map[0]; 737 struct setup_header *hdr;
740 struct e820entry *prev = NULL; 738 struct efi_info *efi;
741 struct setup_header *hdr = &boot_params->hdr; 739 efi_loaded_image_t *image;
742 unsigned long size, key, desc_size, _size; 740 void *options;
743 efi_memory_desc_t *mem_map; 741 u32 load_options_size;
744 void *options = image->load_options; 742 efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
745 u32 load_options_size = image->load_options_size / 2; /* ASCII */
746 int options_size = 0; 743 int options_size = 0;
747 efi_status_t status; 744 efi_status_t status;
748 __u32 desc_version;
749 unsigned long cmdline; 745 unsigned long cmdline;
750 u8 nr_entries;
751 u16 *s2; 746 u16 *s2;
752 u8 *s1; 747 u8 *s1;
753 int i; 748 int i;
754 749
750 sys_table = _table;
751
752 /* Check if we were booted by the EFI firmware */
753 if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
754 return NULL;
755
756 status = efi_call_phys3(sys_table->boottime->handle_protocol,
757 handle, &proto, (void *)&image);
758 if (status != EFI_SUCCESS) {
759 efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
760 return NULL;
761 }
762
763 status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
764 if (status != EFI_SUCCESS) {
765 efi_printk("Failed to alloc lowmem for boot params\n");
766 return NULL;
767 }
768
769 memset(boot_params, 0x0, 0x4000);
770
771 hdr = &boot_params->hdr;
772 efi = &boot_params->efi_info;
773 bi = &boot_params->apm_bios_info;
774 sdt = &boot_params->sys_desc_table;
775
776 /* Copy the second sector to boot_params */
777 memcpy(&hdr->jump, image->image_base + 512, 512);
778
779 /*
780 * Fill out some of the header fields ourselves because the
781 * EFI firmware loader doesn't load the first sector.
782 */
783 hdr->root_flags = 1;
784 hdr->vid_mode = 0xffff;
785 hdr->boot_flag = 0xAA55;
786
787 hdr->code32_start = (__u64)(unsigned long)image->image_base;
788
755 hdr->type_of_loader = 0x21; 789 hdr->type_of_loader = 0x21;
756 790
757 /* Convert unicode cmdline to ascii */ 791 /* Convert unicode cmdline to ascii */
792 options = image->load_options;
793 load_options_size = image->load_options_size / 2; /* ASCII */
758 cmdline = 0; 794 cmdline = 0;
759 s2 = (u16 *)options; 795 s2 = (u16 *)options;
760 796
@@ -791,18 +827,36 @@ static efi_status_t make_boot_params(struct boot_params *boot_params,
791 hdr->ramdisk_image = 0; 827 hdr->ramdisk_image = 0;
792 hdr->ramdisk_size = 0; 828 hdr->ramdisk_size = 0;
793 829
794 status = handle_ramdisks(image, hdr);
795 if (status != EFI_SUCCESS)
796 goto free_cmdline;
797
798 setup_graphics(boot_params);
799
800 /* Clear APM BIOS info */ 830 /* Clear APM BIOS info */
801 memset(bi, 0, sizeof(*bi)); 831 memset(bi, 0, sizeof(*bi));
802 832
803 memset(sdt, 0, sizeof(*sdt)); 833 memset(sdt, 0, sizeof(*sdt));
804 834
805 memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32)); 835 status = handle_ramdisks(image, hdr);
836 if (status != EFI_SUCCESS)
837 goto fail2;
838
839 return boot_params;
840fail2:
841 if (options_size)
842 low_free(options_size, hdr->cmd_line_ptr);
843fail:
844 low_free(0x4000, (unsigned long)boot_params);
845 return NULL;
846}
847
848static efi_status_t exit_boot(struct boot_params *boot_params,
849 void *handle)
850{
851 struct efi_info *efi = &boot_params->efi_info;
852 struct e820entry *e820_map = &boot_params->e820_map[0];
853 struct e820entry *prev = NULL;
854 unsigned long size, key, desc_size, _size;
855 efi_memory_desc_t *mem_map;
856 efi_status_t status;
857 __u32 desc_version;
858 u8 nr_entries;
859 int i;
806 860
807 size = sizeof(*mem_map) * 32; 861 size = sizeof(*mem_map) * 32;
808 862
@@ -811,7 +865,7 @@ again:
811 _size = size; 865 _size = size;
812 status = low_alloc(size, 1, (unsigned long *)&mem_map); 866 status = low_alloc(size, 1, (unsigned long *)&mem_map);
813 if (status != EFI_SUCCESS) 867 if (status != EFI_SUCCESS)
814 goto free_cmdline; 868 return status;
815 869
816 status = efi_call_phys5(sys_table->boottime->get_memory_map, &size, 870 status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,
817 mem_map, &key, &desc_size, &desc_version); 871 mem_map, &key, &desc_size, &desc_version);
@@ -823,6 +877,7 @@ again:
823 if (status != EFI_SUCCESS) 877 if (status != EFI_SUCCESS)
824 goto free_mem_map; 878 goto free_mem_map;
825 879
880 memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
826 efi->efi_systab = (unsigned long)sys_table; 881 efi->efi_systab = (unsigned long)sys_table;
827 efi->efi_memdesc_size = desc_size; 882 efi->efi_memdesc_size = desc_size;
828 efi->efi_memdesc_version = desc_version; 883 efi->efi_memdesc_version = desc_version;
@@ -906,61 +961,13 @@ again:
906 961
907free_mem_map: 962free_mem_map:
908 low_free(_size, (unsigned long)mem_map); 963 low_free(_size, (unsigned long)mem_map);
909free_cmdline:
910 if (options_size)
911 low_free(options_size, hdr->cmd_line_ptr);
912fail:
913 return status; 964 return status;
914} 965}
915 966
916/* 967static efi_status_t relocate_kernel(struct setup_header *hdr)
917 * On success we return a pointer to a boot_params structure, and NULL
918 * on failure.
919 */
920struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
921{ 968{
922 struct boot_params *boot_params;
923 unsigned long start, nr_pages; 969 unsigned long start, nr_pages;
924 struct desc_ptr *gdt, *idt;
925 efi_loaded_image_t *image;
926 struct setup_header *hdr;
927 efi_status_t status; 970 efi_status_t status;
928 efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
929 struct desc_struct *desc;
930
931 sys_table = _table;
932
933 /* Check if we were booted by the EFI firmware */
934 if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
935 goto fail;
936
937 status = efi_call_phys3(sys_table->boottime->handle_protocol,
938 handle, &proto, (void *)&image);
939 if (status != EFI_SUCCESS) {
940 efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
941 goto fail;
942 }
943
944 status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
945 if (status != EFI_SUCCESS) {
946 efi_printk("Failed to alloc lowmem for boot params\n");
947 goto fail;
948 }
949
950 memset(boot_params, 0x0, 0x4000);
951
952 hdr = &boot_params->hdr;
953
954 /* Copy the second sector to boot_params */
955 memcpy(&hdr->jump, image->image_base + 512, 512);
956
957 /*
958 * Fill out some of the header fields ourselves because the
959 * EFI firmware loader doesn't load the first sector.
960 */
961 hdr->root_flags = 1;
962 hdr->vid_mode = 0xffff;
963 hdr->boot_flag = 0xAA55;
964 971
965 /* 972 /*
966 * The EFI firmware loader could have placed the kernel image 973 * The EFI firmware loader could have placed the kernel image
@@ -978,16 +985,40 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
978 if (status != EFI_SUCCESS) { 985 if (status != EFI_SUCCESS) {
979 status = low_alloc(hdr->init_size, hdr->kernel_alignment, 986 status = low_alloc(hdr->init_size, hdr->kernel_alignment,
980 &start); 987 &start);
981 if (status != EFI_SUCCESS) { 988 if (status != EFI_SUCCESS)
982 efi_printk("Failed to alloc mem for kernel\n"); 989 efi_printk("Failed to alloc mem for kernel\n");
983 goto fail;
984 }
985 } 990 }
986 991
992 if (status == EFI_SUCCESS)
993 memcpy((void *)start, (void *)(unsigned long)hdr->code32_start,
994 hdr->init_size);
995
996 hdr->pref_address = hdr->code32_start;
987 hdr->code32_start = (__u32)start; 997 hdr->code32_start = (__u32)start;
988 hdr->pref_address = (__u64)(unsigned long)image->image_base;
989 998
990 memcpy((void *)start, image->image_base, image->image_size); 999 return status;
1000}
1001
1002/*
1003 * On success we return a pointer to a boot_params structure, and NULL
1004 * on failure.
1005 */
1006struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
1007 struct boot_params *boot_params)
1008{
1009 struct desc_ptr *gdt, *idt;
1010 efi_loaded_image_t *image;
1011 struct setup_header *hdr = &boot_params->hdr;
1012 efi_status_t status;
1013 struct desc_struct *desc;
1014
1015 sys_table = _table;
1016
1017 /* Check if we were booted by the EFI firmware */
1018 if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
1019 goto fail;
1020
1021 setup_graphics(boot_params);
991 1022
992 status = efi_call_phys3(sys_table->boottime->allocate_pool, 1023 status = efi_call_phys3(sys_table->boottime->allocate_pool,
993 EFI_LOADER_DATA, sizeof(*gdt), 1024 EFI_LOADER_DATA, sizeof(*gdt),
@@ -1015,7 +1046,18 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
1015 idt->size = 0; 1046 idt->size = 0;
1016 idt->address = 0; 1047 idt->address = 0;
1017 1048
1018 status = make_boot_params(boot_params, image, handle); 1049 /*
1050 * If the kernel isn't already loaded at the preferred load
1051 * address, relocate it.
1052 */
1053 if (hdr->pref_address != hdr->code32_start) {
1054 status = relocate_kernel(hdr);
1055
1056 if (status != EFI_SUCCESS)
1057 goto fail;
1058 }
1059
1060 status = exit_boot(boot_params, handle);
1019 if (status != EFI_SUCCESS) 1061 if (status != EFI_SUCCESS)
1020 goto fail; 1062 goto fail;
1021 1063
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index c85e3ac99bba..aa4aaf1b2380 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -42,6 +42,16 @@ ENTRY(startup_32)
42 */ 42 */
43 add $0x4, %esp 43 add $0x4, %esp
44 44
45 call make_boot_params
46 cmpl $0, %eax
47 je 1f
48 movl 0x4(%esp), %esi
49 movl (%esp), %ecx
50 pushl %eax
51 pushl %esi
52 pushl %ecx
53
54 .org 0x30,0x90
45 call efi_main 55 call efi_main
46 cmpl $0, %eax 56 cmpl $0, %eax
47 movl %eax, %esi 57 movl %eax, %esi
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 87e03a13d8e3..2c4b171eec33 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -209,6 +209,16 @@ ENTRY(startup_64)
209 .org 0x210 209 .org 0x210
210 mov %rcx, %rdi 210 mov %rcx, %rdi
211 mov %rdx, %rsi 211 mov %rdx, %rsi
212 pushq %rdi
213 pushq %rsi
214 call make_boot_params
215 cmpq $0,%rax
216 je 1f
217 mov %rax, %rdx
218 popq %rsi
219 popq %rdi
220
221 .org 0x230,0x90
212 call efi_main 222 call efi_main
213 movq %rax,%rsi 223 movq %rax,%rsi
214 cmpq $0,%rax 224 cmpq $0,%rax
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 7116dcba0c9e..88f7ff6da404 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -108,8 +108,6 @@ static void error(char *m);
108 * This is set up by the setup-routine at boot-time 108 * This is set up by the setup-routine at boot-time
109 */ 109 */
110struct boot_params *real_mode; /* Pointer to real-mode data */ 110struct boot_params *real_mode; /* Pointer to real-mode data */
111static int quiet;
112static int debug;
113 111
114void *memset(void *s, int c, size_t n); 112void *memset(void *s, int c, size_t n);
115void *memcpy(void *dest, const void *src, size_t n); 113void *memcpy(void *dest, const void *src, size_t n);
@@ -170,15 +168,11 @@ static void serial_putchar(int ch)
170 outb(ch, early_serial_base + TXR); 168 outb(ch, early_serial_base + TXR);
171} 169}
172 170
173void __putstr(int error, const char *s) 171void __putstr(const char *s)
174{ 172{
175 int x, y, pos; 173 int x, y, pos;
176 char c; 174 char c;
177 175
178#ifndef CONFIG_X86_VERBOSE_BOOTUP
179 if (!error)
180 return;
181#endif
182 if (early_serial_base) { 176 if (early_serial_base) {
183 const char *str = s; 177 const char *str = s;
184 while (*str) { 178 while (*str) {
@@ -265,9 +259,9 @@ void *memcpy(void *dest, const void *src, size_t n)
265 259
266static void error(char *x) 260static void error(char *x)
267{ 261{
268 __putstr(1, "\n\n"); 262 error_putstr("\n\n");
269 __putstr(1, x); 263 error_putstr(x);
270 __putstr(1, "\n\n -- System halted"); 264 error_putstr("\n\n -- System halted");
271 265
272 while (1) 266 while (1)
273 asm("hlt"); 267 asm("hlt");
@@ -294,8 +288,7 @@ static void parse_elf(void *output)
294 return; 288 return;
295 } 289 }
296 290
297 if (!quiet) 291 debug_putstr("Parsing ELF... ");
298 putstr("Parsing ELF... ");
299 292
300 phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum); 293 phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);
301 if (!phdrs) 294 if (!phdrs)
@@ -332,11 +325,6 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
332{ 325{
333 real_mode = rmode; 326 real_mode = rmode;
334 327
335 if (cmdline_find_option_bool("quiet"))
336 quiet = 1;
337 if (cmdline_find_option_bool("debug"))
338 debug = 1;
339
340 if (real_mode->screen_info.orig_video_mode == 7) { 328 if (real_mode->screen_info.orig_video_mode == 7) {
341 vidmem = (char *) 0xb0000; 329 vidmem = (char *) 0xb0000;
342 vidport = 0x3b4; 330 vidport = 0x3b4;
@@ -349,8 +337,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
349 cols = real_mode->screen_info.orig_video_cols; 337 cols = real_mode->screen_info.orig_video_cols;
350 338
351 console_init(); 339 console_init();
352 if (debug) 340 debug_putstr("early console in decompress_kernel\n");
353 putstr("early console in decompress_kernel\n");
354 341
355 free_mem_ptr = heap; /* Heap */ 342 free_mem_ptr = heap; /* Heap */
356 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 343 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
@@ -369,11 +356,9 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
369 error("Wrong destination address"); 356 error("Wrong destination address");
370#endif 357#endif
371 358
372 if (!quiet) 359 debug_putstr("\nDecompressing Linux... ");
373 putstr("\nDecompressing Linux... ");
374 decompress(input_data, input_len, NULL, NULL, output, NULL, error); 360 decompress(input_data, input_len, NULL, NULL, output, NULL, error);
375 parse_elf(output); 361 parse_elf(output);
376 if (!quiet) 362 debug_putstr("done.\nBooting the kernel.\n");
377 putstr("done.\nBooting the kernel.\n");
378 return; 363 return;
379} 364}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 3f19c81a6203..0e6dc0ee0eea 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -24,9 +24,21 @@
24 24
25/* misc.c */ 25/* misc.c */
26extern struct boot_params *real_mode; /* Pointer to real-mode data */ 26extern struct boot_params *real_mode; /* Pointer to real-mode data */
27void __putstr(int error, const char *s); 27void __putstr(const char *s);
28#define putstr(__x) __putstr(0, __x) 28#define error_putstr(__x) __putstr(__x)
29#define puts(__x) __putstr(0, __x) 29
30#ifdef CONFIG_X86_VERBOSE_BOOTUP
31
32#define debug_putstr(__x) __putstr(__x)
33
34#else
35
36static inline void debug_putstr(const char *s)
37{ }
38
39#endif
40
41#ifdef CONFIG_EARLY_PRINTK
30 42
31/* cmdline.c */ 43/* cmdline.c */
32int cmdline_find_option(const char *option, char *buffer, int bufsize); 44int cmdline_find_option(const char *option, char *buffer, int bufsize);
@@ -36,4 +48,13 @@ int cmdline_find_option_bool(const char *option);
36extern int early_serial_base; 48extern int early_serial_base;
37void console_init(void); 49void console_init(void);
38 50
51#else
52
53/* early_serial_console.c */
54static const int early_serial_base;
55static inline void console_init(void)
56{ }
57
58#endif
59
39#endif 60#endif
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index efe5acfc79c3..b4e15dd6786a 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -283,7 +283,7 @@ _start:
283 # Part 2 of the header, from the old setup.S 283 # Part 2 of the header, from the old setup.S
284 284
285 .ascii "HdrS" # header signature 285 .ascii "HdrS" # header signature
286 .word 0x020a # header version number (>= 0x0105) 286 .word 0x020b # header version number (>= 0x0105)
287 # or else old loadlin-1.5 will fail) 287 # or else old loadlin-1.5 will fail)
288 .globl realmode_swtch 288 .globl realmode_swtch
289realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 289realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -401,18 +401,13 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
401#define INIT_SIZE VO_INIT_SIZE 401#define INIT_SIZE VO_INIT_SIZE
402#endif 402#endif
403init_size: .long INIT_SIZE # kernel initialization size 403init_size: .long INIT_SIZE # kernel initialization size
404handover_offset: .long 0x30 # offset to the handover
405 # protocol entry point
404 406
405# End of setup header ##################################################### 407# End of setup header #####################################################
406 408
407 .section ".entrytext", "ax" 409 .section ".entrytext", "ax"
408start_of_setup: 410start_of_setup:
409#ifdef SAFE_RESET_DISK_CONTROLLER
410# Reset the disk controller.
411 movw $0x0000, %ax # Reset disk controller
412 movb $0x80, %dl # All disks
413 int $0x13
414#endif
415
416# Force %es = %ds 411# Force %es = %ds
417 movw %ds, %ax 412 movw %ds, %ax
418 movw %ax, %es 413 movw %ax, %es
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e191ac048b59..e908e5de82d3 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,9 @@
2# Arch-specific CryptoAPI modules. 2# Arch-specific CryptoAPI modules.
3# 3#
4 4
5obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
6obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
7
5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 8obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 9obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o 10obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
@@ -12,8 +15,10 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
12obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o 15obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
13obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 16obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
14obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o 17obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
18obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
15obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 19obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
16obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o 20obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
21obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
17obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o 22obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
18obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o 23obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
19 24
@@ -30,16 +35,11 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
30blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o 35blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
31twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o 36twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
32twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o 37twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
38twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
33salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 39salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
34serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o 40serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
41serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
35 42
36aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o 43aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
37
38ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o 44ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
39
40# enable AVX support only when $(AS) can actually assemble the instructions
41ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes)
42AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT
43CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT
44endif
45sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o 45sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/ablk_helper.c b/arch/x86/crypto/ablk_helper.c
new file mode 100644
index 000000000000..43282fe04a8b
--- /dev/null
+++ b/arch/x86/crypto/ablk_helper.c
@@ -0,0 +1,149 @@
1/*
2 * Shared async block cipher helpers
3 *
4 * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Based on aesni-intel_glue.c by:
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27#include <linux/kernel.h>
28#include <linux/crypto.h>
29#include <linux/init.h>
30#include <linux/module.h>
31#include <crypto/algapi.h>
32#include <crypto/cryptd.h>
33#include <asm/i387.h>
34#include <asm/crypto/ablk_helper.h>
35
36int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
37 unsigned int key_len)
38{
39 struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
40 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
41 int err;
42
43 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
44 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
45 & CRYPTO_TFM_REQ_MASK);
46 err = crypto_ablkcipher_setkey(child, key, key_len);
47 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
48 & CRYPTO_TFM_RES_MASK);
49 return err;
50}
51EXPORT_SYMBOL_GPL(ablk_set_key);
52
53int __ablk_encrypt(struct ablkcipher_request *req)
54{
55 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
56 struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
57 struct blkcipher_desc desc;
58
59 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
60 desc.info = req->info;
61 desc.flags = 0;
62
63 return crypto_blkcipher_crt(desc.tfm)->encrypt(
64 &desc, req->dst, req->src, req->nbytes);
65}
66EXPORT_SYMBOL_GPL(__ablk_encrypt);
67
68int ablk_encrypt(struct ablkcipher_request *req)
69{
70 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
71 struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
72
73 if (!irq_fpu_usable()) {
74 struct ablkcipher_request *cryptd_req =
75 ablkcipher_request_ctx(req);
76
77 memcpy(cryptd_req, req, sizeof(*req));
78 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
79
80 return crypto_ablkcipher_encrypt(cryptd_req);
81 } else {
82 return __ablk_encrypt(req);
83 }
84}
85EXPORT_SYMBOL_GPL(ablk_encrypt);
86
87int ablk_decrypt(struct ablkcipher_request *req)
88{
89 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
90 struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
91
92 if (!irq_fpu_usable()) {
93 struct ablkcipher_request *cryptd_req =
94 ablkcipher_request_ctx(req);
95
96 memcpy(cryptd_req, req, sizeof(*req));
97 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
98
99 return crypto_ablkcipher_decrypt(cryptd_req);
100 } else {
101 struct blkcipher_desc desc;
102
103 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
104 desc.info = req->info;
105 desc.flags = 0;
106
107 return crypto_blkcipher_crt(desc.tfm)->decrypt(
108 &desc, req->dst, req->src, req->nbytes);
109 }
110}
111EXPORT_SYMBOL_GPL(ablk_decrypt);
112
113void ablk_exit(struct crypto_tfm *tfm)
114{
115 struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
116
117 cryptd_free_ablkcipher(ctx->cryptd_tfm);
118}
119EXPORT_SYMBOL_GPL(ablk_exit);
120
121int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
122{
123 struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
124 struct cryptd_ablkcipher *cryptd_tfm;
125
126 cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
127 if (IS_ERR(cryptd_tfm))
128 return PTR_ERR(cryptd_tfm);
129
130 ctx->cryptd_tfm = cryptd_tfm;
131 tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
132 crypto_ablkcipher_reqsize(&cryptd_tfm->base);
133
134 return 0;
135}
136EXPORT_SYMBOL_GPL(ablk_init_common);
137
138int ablk_init(struct crypto_tfm *tfm)
139{
140 char drv_name[CRYPTO_MAX_ALG_NAME];
141
142 snprintf(drv_name, sizeof(drv_name), "__driver-%s",
143 crypto_tfm_alg_driver_name(tfm));
144
145 return ablk_init_common(tfm, drv_name);
146}
147EXPORT_SYMBOL_GPL(ablk_init);
148
149MODULE_LICENSE("GPL");
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
index 8efcf42a9d7e..59b37deb8c8d 100644
--- a/arch/x86/crypto/aes_glue.c
+++ b/arch/x86/crypto/aes_glue.c
@@ -5,7 +5,7 @@
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <crypto/aes.h> 7#include <crypto/aes.h>
8#include <asm/aes.h> 8#include <asm/crypto/aes.h>
9 9
10asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); 10asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
11asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); 11asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index ac7f5cd019e8..34fdcff4d2c8 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -30,7 +30,8 @@
30#include <crypto/ctr.h> 30#include <crypto/ctr.h>
31#include <asm/cpu_device_id.h> 31#include <asm/cpu_device_id.h>
32#include <asm/i387.h> 32#include <asm/i387.h>
33#include <asm/aes.h> 33#include <asm/crypto/aes.h>
34#include <asm/crypto/ablk_helper.h>
34#include <crypto/scatterwalk.h> 35#include <crypto/scatterwalk.h>
35#include <crypto/internal/aead.h> 36#include <crypto/internal/aead.h>
36#include <linux/workqueue.h> 37#include <linux/workqueue.h>
@@ -52,10 +53,6 @@
52#define HAS_XTS 53#define HAS_XTS
53#endif 54#endif
54 55
55struct async_aes_ctx {
56 struct cryptd_ablkcipher *cryptd_tfm;
57};
58
59/* This data is stored at the end of the crypto_tfm struct. 56/* This data is stored at the end of the crypto_tfm struct.
60 * It's a type of per "session" data storage location. 57 * It's a type of per "session" data storage location.
61 * This needs to be 16 byte aligned. 58 * This needs to be 16 byte aligned.
@@ -377,87 +374,6 @@ static int ctr_crypt(struct blkcipher_desc *desc,
377} 374}
378#endif 375#endif
379 376
380static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
381 unsigned int key_len)
382{
383 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
384 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
385 int err;
386
387 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
388 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
389 & CRYPTO_TFM_REQ_MASK);
390 err = crypto_ablkcipher_setkey(child, key, key_len);
391 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
392 & CRYPTO_TFM_RES_MASK);
393 return err;
394}
395
396static int ablk_encrypt(struct ablkcipher_request *req)
397{
398 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
399 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
400
401 if (!irq_fpu_usable()) {
402 struct ablkcipher_request *cryptd_req =
403 ablkcipher_request_ctx(req);
404 memcpy(cryptd_req, req, sizeof(*req));
405 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
406 return crypto_ablkcipher_encrypt(cryptd_req);
407 } else {
408 struct blkcipher_desc desc;
409 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
410 desc.info = req->info;
411 desc.flags = 0;
412 return crypto_blkcipher_crt(desc.tfm)->encrypt(
413 &desc, req->dst, req->src, req->nbytes);
414 }
415}
416
417static int ablk_decrypt(struct ablkcipher_request *req)
418{
419 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
420 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
421
422 if (!irq_fpu_usable()) {
423 struct ablkcipher_request *cryptd_req =
424 ablkcipher_request_ctx(req);
425 memcpy(cryptd_req, req, sizeof(*req));
426 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
427 return crypto_ablkcipher_decrypt(cryptd_req);
428 } else {
429 struct blkcipher_desc desc;
430 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
431 desc.info = req->info;
432 desc.flags = 0;
433 return crypto_blkcipher_crt(desc.tfm)->decrypt(
434 &desc, req->dst, req->src, req->nbytes);
435 }
436}
437
438static void ablk_exit(struct crypto_tfm *tfm)
439{
440 struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
441
442 cryptd_free_ablkcipher(ctx->cryptd_tfm);
443}
444
445static int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
446{
447 struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
448 struct cryptd_ablkcipher *cryptd_tfm;
449
450 cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
451 if (IS_ERR(cryptd_tfm))
452 return PTR_ERR(cryptd_tfm);
453
454 ctx->cryptd_tfm = cryptd_tfm;
455 tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
456 crypto_ablkcipher_reqsize(&cryptd_tfm->base);
457
458 return 0;
459}
460
461static int ablk_ecb_init(struct crypto_tfm *tfm) 377static int ablk_ecb_init(struct crypto_tfm *tfm)
462{ 378{
463 return ablk_init_common(tfm, "__driver-ecb-aes-aesni"); 379 return ablk_init_common(tfm, "__driver-ecb-aes-aesni");
@@ -613,7 +529,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
613 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); 529 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
614 struct aesni_rfc4106_gcm_ctx *child_ctx = 530 struct aesni_rfc4106_gcm_ctx *child_ctx =
615 aesni_rfc4106_gcm_ctx_get(cryptd_child); 531 aesni_rfc4106_gcm_ctx_get(cryptd_child);
616 u8 *new_key_mem = NULL; 532 u8 *new_key_align, *new_key_mem = NULL;
617 533
618 if (key_len < 4) { 534 if (key_len < 4) {
619 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); 535 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
@@ -637,9 +553,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
637 if (!new_key_mem) 553 if (!new_key_mem)
638 return -ENOMEM; 554 return -ENOMEM;
639 555
640 new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN); 556 new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
641 memcpy(new_key_mem, key, key_len); 557 memcpy(new_key_align, key, key_len);
642 key = new_key_mem; 558 key = new_key_align;
643 } 559 }
644 560
645 if (!irq_fpu_usable()) 561 if (!irq_fpu_usable())
@@ -968,7 +884,7 @@ static struct crypto_alg aesni_algs[] = { {
968 .cra_priority = 400, 884 .cra_priority = 400,
969 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 885 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
970 .cra_blocksize = AES_BLOCK_SIZE, 886 .cra_blocksize = AES_BLOCK_SIZE,
971 .cra_ctxsize = sizeof(struct async_aes_ctx), 887 .cra_ctxsize = sizeof(struct async_helper_ctx),
972 .cra_alignmask = 0, 888 .cra_alignmask = 0,
973 .cra_type = &crypto_ablkcipher_type, 889 .cra_type = &crypto_ablkcipher_type,
974 .cra_module = THIS_MODULE, 890 .cra_module = THIS_MODULE,
@@ -989,7 +905,7 @@ static struct crypto_alg aesni_algs[] = { {
989 .cra_priority = 400, 905 .cra_priority = 400,
990 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 906 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
991 .cra_blocksize = AES_BLOCK_SIZE, 907 .cra_blocksize = AES_BLOCK_SIZE,
992 .cra_ctxsize = sizeof(struct async_aes_ctx), 908 .cra_ctxsize = sizeof(struct async_helper_ctx),
993 .cra_alignmask = 0, 909 .cra_alignmask = 0,
994 .cra_type = &crypto_ablkcipher_type, 910 .cra_type = &crypto_ablkcipher_type,
995 .cra_module = THIS_MODULE, 911 .cra_module = THIS_MODULE,
@@ -1033,7 +949,7 @@ static struct crypto_alg aesni_algs[] = { {
1033 .cra_priority = 400, 949 .cra_priority = 400,
1034 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 950 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1035 .cra_blocksize = 1, 951 .cra_blocksize = 1,
1036 .cra_ctxsize = sizeof(struct async_aes_ctx), 952 .cra_ctxsize = sizeof(struct async_helper_ctx),
1037 .cra_alignmask = 0, 953 .cra_alignmask = 0,
1038 .cra_type = &crypto_ablkcipher_type, 954 .cra_type = &crypto_ablkcipher_type,
1039 .cra_module = THIS_MODULE, 955 .cra_module = THIS_MODULE,
@@ -1098,7 +1014,7 @@ static struct crypto_alg aesni_algs[] = { {
1098 .cra_priority = 400, 1014 .cra_priority = 400,
1099 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1015 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1100 .cra_blocksize = 1, 1016 .cra_blocksize = 1,
1101 .cra_ctxsize = sizeof(struct async_aes_ctx), 1017 .cra_ctxsize = sizeof(struct async_helper_ctx),
1102 .cra_alignmask = 0, 1018 .cra_alignmask = 0,
1103 .cra_type = &crypto_ablkcipher_type, 1019 .cra_type = &crypto_ablkcipher_type,
1104 .cra_module = THIS_MODULE, 1020 .cra_module = THIS_MODULE,
@@ -1126,7 +1042,7 @@ static struct crypto_alg aesni_algs[] = { {
1126 .cra_priority = 400, 1042 .cra_priority = 400,
1127 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1043 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1128 .cra_blocksize = AES_BLOCK_SIZE, 1044 .cra_blocksize = AES_BLOCK_SIZE,
1129 .cra_ctxsize = sizeof(struct async_aes_ctx), 1045 .cra_ctxsize = sizeof(struct async_helper_ctx),
1130 .cra_alignmask = 0, 1046 .cra_alignmask = 0,
1131 .cra_type = &crypto_ablkcipher_type, 1047 .cra_type = &crypto_ablkcipher_type,
1132 .cra_module = THIS_MODULE, 1048 .cra_module = THIS_MODULE,
@@ -1150,7 +1066,7 @@ static struct crypto_alg aesni_algs[] = { {
1150 .cra_priority = 400, 1066 .cra_priority = 400,
1151 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1067 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1152 .cra_blocksize = AES_BLOCK_SIZE, 1068 .cra_blocksize = AES_BLOCK_SIZE,
1153 .cra_ctxsize = sizeof(struct async_aes_ctx), 1069 .cra_ctxsize = sizeof(struct async_helper_ctx),
1154 .cra_alignmask = 0, 1070 .cra_alignmask = 0,
1155 .cra_type = &crypto_ablkcipher_type, 1071 .cra_type = &crypto_ablkcipher_type,
1156 .cra_module = THIS_MODULE, 1072 .cra_module = THIS_MODULE,
@@ -1174,7 +1090,7 @@ static struct crypto_alg aesni_algs[] = { {
1174 .cra_priority = 400, 1090 .cra_priority = 400,
1175 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1091 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1176 .cra_blocksize = AES_BLOCK_SIZE, 1092 .cra_blocksize = AES_BLOCK_SIZE,
1177 .cra_ctxsize = sizeof(struct async_aes_ctx), 1093 .cra_ctxsize = sizeof(struct async_helper_ctx),
1178 .cra_alignmask = 0, 1094 .cra_alignmask = 0,
1179 .cra_type = &crypto_ablkcipher_type, 1095 .cra_type = &crypto_ablkcipher_type,
1180 .cra_module = THIS_MODULE, 1096 .cra_module = THIS_MODULE,
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 3306dc0b139e..eeb2b3b743e9 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -5,10 +5,6 @@
5 * 5 *
6 * Camellia parts based on code by: 6 * Camellia parts based on code by:
7 * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation) 7 * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation)
8 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
9 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
10 * CTR part based on code (crypto/ctr.c) by:
11 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
12 * 8 *
13 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by 10 * it under the terms of the GNU General Public License as published by
@@ -34,9 +30,9 @@
34#include <linux/module.h> 30#include <linux/module.h>
35#include <linux/types.h> 31#include <linux/types.h>
36#include <crypto/algapi.h> 32#include <crypto/algapi.h>
37#include <crypto/b128ops.h>
38#include <crypto/lrw.h> 33#include <crypto/lrw.h>
39#include <crypto/xts.h> 34#include <crypto/xts.h>
35#include <asm/crypto/glue_helper.h>
40 36
41#define CAMELLIA_MIN_KEY_SIZE 16 37#define CAMELLIA_MIN_KEY_SIZE 16
42#define CAMELLIA_MAX_KEY_SIZE 32 38#define CAMELLIA_MAX_KEY_SIZE 32
@@ -1312,307 +1308,128 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
1312 &tfm->crt_flags); 1308 &tfm->crt_flags);
1313} 1309}
1314 1310
1315static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, 1311static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
1316 void (*fn)(struct camellia_ctx *, u8 *, const u8 *),
1317 void (*fn_2way)(struct camellia_ctx *, u8 *, const u8 *))
1318{ 1312{
1319 struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 1313 u128 iv = *src;
1320 unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1321 unsigned int nbytes;
1322 int err;
1323
1324 err = blkcipher_walk_virt(desc, walk);
1325
1326 while ((nbytes = walk->nbytes)) {
1327 u8 *wsrc = walk->src.virt.addr;
1328 u8 *wdst = walk->dst.virt.addr;
1329
1330 /* Process two block batch */
1331 if (nbytes >= bsize * 2) {
1332 do {
1333 fn_2way(ctx, wdst, wsrc);
1334
1335 wsrc += bsize * 2;
1336 wdst += bsize * 2;
1337 nbytes -= bsize * 2;
1338 } while (nbytes >= bsize * 2);
1339
1340 if (nbytes < bsize)
1341 goto done;
1342 }
1343
1344 /* Handle leftovers */
1345 do {
1346 fn(ctx, wdst, wsrc);
1347
1348 wsrc += bsize;
1349 wdst += bsize;
1350 nbytes -= bsize;
1351 } while (nbytes >= bsize);
1352
1353done:
1354 err = blkcipher_walk_done(desc, walk, nbytes);
1355 }
1356
1357 return err;
1358}
1359
1360static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1361 struct scatterlist *src, unsigned int nbytes)
1362{
1363 struct blkcipher_walk walk;
1364
1365 blkcipher_walk_init(&walk, dst, src, nbytes);
1366 return ecb_crypt(desc, &walk, camellia_enc_blk, camellia_enc_blk_2way);
1367}
1368 1314
1369static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1315 camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
1370 struct scatterlist *src, unsigned int nbytes)
1371{
1372 struct blkcipher_walk walk;
1373
1374 blkcipher_walk_init(&walk, dst, src, nbytes);
1375 return ecb_crypt(desc, &walk, camellia_dec_blk, camellia_dec_blk_2way);
1376}
1377 1316
1378static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, 1317 u128_xor(&dst[1], &dst[1], &iv);
1379 struct blkcipher_walk *walk)
1380{
1381 struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1382 unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1383 unsigned int nbytes = walk->nbytes;
1384 u128 *src = (u128 *)walk->src.virt.addr;
1385 u128 *dst = (u128 *)walk->dst.virt.addr;
1386 u128 *iv = (u128 *)walk->iv;
1387
1388 do {
1389 u128_xor(dst, src, iv);
1390 camellia_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
1391 iv = dst;
1392
1393 src += 1;
1394 dst += 1;
1395 nbytes -= bsize;
1396 } while (nbytes >= bsize);
1397
1398 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
1399 return nbytes;
1400} 1318}
1401 1319
1402static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1320static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
1403 struct scatterlist *src, unsigned int nbytes)
1404{ 1321{
1405 struct blkcipher_walk walk; 1322 be128 ctrblk;
1406 int err;
1407 1323
1408 blkcipher_walk_init(&walk, dst, src, nbytes); 1324 if (dst != src)
1409 err = blkcipher_walk_virt(desc, &walk); 1325 *dst = *src;
1410 1326
1411 while ((nbytes = walk.nbytes)) { 1327 u128_to_be128(&ctrblk, iv);
1412 nbytes = __cbc_encrypt(desc, &walk); 1328 u128_inc(iv);
1413 err = blkcipher_walk_done(desc, &walk, nbytes);
1414 }
1415 1329
1416 return err; 1330 camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
1417} 1331}
1418 1332
1419static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, 1333static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
1420 struct blkcipher_walk *walk) 1334 u128 *iv)
1421{ 1335{
1422 struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 1336 be128 ctrblks[2];
1423 unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1424 unsigned int nbytes = walk->nbytes;
1425 u128 *src = (u128 *)walk->src.virt.addr;
1426 u128 *dst = (u128 *)walk->dst.virt.addr;
1427 u128 ivs[2 - 1];
1428 u128 last_iv;
1429 1337
1430 /* Start of the last block. */ 1338 if (dst != src) {
1431 src += nbytes / bsize - 1; 1339 dst[0] = src[0];
1432 dst += nbytes / bsize - 1; 1340 dst[1] = src[1];
1433
1434 last_iv = *src;
1435
1436 /* Process two block batch */
1437 if (nbytes >= bsize * 2) {
1438 do {
1439 nbytes -= bsize * (2 - 1);
1440 src -= 2 - 1;
1441 dst -= 2 - 1;
1442
1443 ivs[0] = src[0];
1444
1445 camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
1446
1447 u128_xor(dst + 1, dst + 1, ivs + 0);
1448
1449 nbytes -= bsize;
1450 if (nbytes < bsize)
1451 goto done;
1452
1453 u128_xor(dst, dst, src - 1);
1454 src -= 1;
1455 dst -= 1;
1456 } while (nbytes >= bsize * 2);
1457
1458 if (nbytes < bsize)
1459 goto done;
1460 } 1341 }
1461 1342
1462 /* Handle leftovers */ 1343 u128_to_be128(&ctrblks[0], iv);
1463 for (;;) { 1344 u128_inc(iv);
1464 camellia_dec_blk(ctx, (u8 *)dst, (u8 *)src); 1345 u128_to_be128(&ctrblks[1], iv);
1465 1346 u128_inc(iv);
1466 nbytes -= bsize;
1467 if (nbytes < bsize)
1468 break;
1469 1347
1470 u128_xor(dst, dst, src - 1); 1348 camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
1471 src -= 1;
1472 dst -= 1;
1473 }
1474
1475done:
1476 u128_xor(dst, dst, (u128 *)walk->iv);
1477 *(u128 *)walk->iv = last_iv;
1478
1479 return nbytes;
1480} 1349}
1481 1350
1482static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1351static const struct common_glue_ctx camellia_enc = {
1483 struct scatterlist *src, unsigned int nbytes) 1352 .num_funcs = 2,
1484{ 1353 .fpu_blocks_limit = -1,
1485 struct blkcipher_walk walk; 1354
1486 int err; 1355 .funcs = { {
1487 1356 .num_blocks = 2,
1488 blkcipher_walk_init(&walk, dst, src, nbytes); 1357 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
1489 err = blkcipher_walk_virt(desc, &walk); 1358 }, {
1359 .num_blocks = 1,
1360 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
1361 } }
1362};
1490 1363
1491 while ((nbytes = walk.nbytes)) { 1364static const struct common_glue_ctx camellia_ctr = {
1492 nbytes = __cbc_decrypt(desc, &walk); 1365 .num_funcs = 2,
1493 err = blkcipher_walk_done(desc, &walk, nbytes); 1366 .fpu_blocks_limit = -1,
1494 } 1367
1368 .funcs = { {
1369 .num_blocks = 2,
1370 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
1371 }, {
1372 .num_blocks = 1,
1373 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
1374 } }
1375};
1495 1376
1496 return err; 1377static const struct common_glue_ctx camellia_dec = {
1497} 1378 .num_funcs = 2,
1379 .fpu_blocks_limit = -1,
1380
1381 .funcs = { {
1382 .num_blocks = 2,
1383 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
1384 }, {
1385 .num_blocks = 1,
1386 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
1387 } }
1388};
1498 1389
1499static inline void u128_to_be128(be128 *dst, const u128 *src) 1390static const struct common_glue_ctx camellia_dec_cbc = {
1500{ 1391 .num_funcs = 2,
1501 dst->a = cpu_to_be64(src->a); 1392 .fpu_blocks_limit = -1,
1502 dst->b = cpu_to_be64(src->b); 1393
1503} 1394 .funcs = { {
1395 .num_blocks = 2,
1396 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
1397 }, {
1398 .num_blocks = 1,
1399 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
1400 } }
1401};
1504 1402
1505static inline void be128_to_u128(u128 *dst, const be128 *src) 1403static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1404 struct scatterlist *src, unsigned int nbytes)
1506{ 1405{
1507 dst->a = be64_to_cpu(src->a); 1406 return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
1508 dst->b = be64_to_cpu(src->b);
1509} 1407}
1510 1408
1511static inline void u128_inc(u128 *i) 1409static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1410 struct scatterlist *src, unsigned int nbytes)
1512{ 1411{
1513 i->b++; 1412 return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
1514 if (!i->b)
1515 i->a++;
1516} 1413}
1517 1414
1518static void ctr_crypt_final(struct blkcipher_desc *desc, 1415static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1519 struct blkcipher_walk *walk) 1416 struct scatterlist *src, unsigned int nbytes)
1520{ 1417{
1521 struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 1418 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
1522 u8 keystream[CAMELLIA_BLOCK_SIZE]; 1419 dst, src, nbytes);
1523 u8 *src = walk->src.virt.addr;
1524 u8 *dst = walk->dst.virt.addr;
1525 unsigned int nbytes = walk->nbytes;
1526 u128 ctrblk;
1527
1528 memcpy(keystream, src, nbytes);
1529 camellia_enc_blk_xor(ctx, keystream, walk->iv);
1530 memcpy(dst, keystream, nbytes);
1531
1532 be128_to_u128(&ctrblk, (be128 *)walk->iv);
1533 u128_inc(&ctrblk);
1534 u128_to_be128((be128 *)walk->iv, &ctrblk);
1535} 1420}
1536 1421
1537static unsigned int __ctr_crypt(struct blkcipher_desc *desc, 1422static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1538 struct blkcipher_walk *walk) 1423 struct scatterlist *src, unsigned int nbytes)
1539{ 1424{
1540 struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 1425 return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
1541 unsigned int bsize = CAMELLIA_BLOCK_SIZE; 1426 nbytes);
1542 unsigned int nbytes = walk->nbytes;
1543 u128 *src = (u128 *)walk->src.virt.addr;
1544 u128 *dst = (u128 *)walk->dst.virt.addr;
1545 u128 ctrblk;
1546 be128 ctrblocks[2];
1547
1548 be128_to_u128(&ctrblk, (be128 *)walk->iv);
1549
1550 /* Process two block batch */
1551 if (nbytes >= bsize * 2) {
1552 do {
1553 if (dst != src) {
1554 dst[0] = src[0];
1555 dst[1] = src[1];
1556 }
1557
1558 /* create ctrblks for parallel encrypt */
1559 u128_to_be128(&ctrblocks[0], &ctrblk);
1560 u128_inc(&ctrblk);
1561 u128_to_be128(&ctrblocks[1], &ctrblk);
1562 u128_inc(&ctrblk);
1563
1564 camellia_enc_blk_xor_2way(ctx, (u8 *)dst,
1565 (u8 *)ctrblocks);
1566
1567 src += 2;
1568 dst += 2;
1569 nbytes -= bsize * 2;
1570 } while (nbytes >= bsize * 2);
1571
1572 if (nbytes < bsize)
1573 goto done;
1574 }
1575
1576 /* Handle leftovers */
1577 do {
1578 if (dst != src)
1579 *dst = *src;
1580
1581 u128_to_be128(&ctrblocks[0], &ctrblk);
1582 u128_inc(&ctrblk);
1583
1584 camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks);
1585
1586 src += 1;
1587 dst += 1;
1588 nbytes -= bsize;
1589 } while (nbytes >= bsize);
1590
1591done:
1592 u128_to_be128((be128 *)walk->iv, &ctrblk);
1593 return nbytes;
1594} 1427}
1595 1428
1596static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1429static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1597 struct scatterlist *src, unsigned int nbytes) 1430 struct scatterlist *src, unsigned int nbytes)
1598{ 1431{
1599 struct blkcipher_walk walk; 1432 return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
1600 int err;
1601
1602 blkcipher_walk_init(&walk, dst, src, nbytes);
1603 err = blkcipher_walk_virt_block(desc, &walk, CAMELLIA_BLOCK_SIZE);
1604
1605 while ((nbytes = walk.nbytes) >= CAMELLIA_BLOCK_SIZE) {
1606 nbytes = __ctr_crypt(desc, &walk);
1607 err = blkcipher_walk_done(desc, &walk, nbytes);
1608 }
1609
1610 if (walk.nbytes) {
1611 ctr_crypt_final(desc, &walk);
1612 err = blkcipher_walk_done(desc, &walk, 0);
1613 }
1614
1615 return err;
1616} 1433}
1617 1434
1618static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) 1435static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
new file mode 100644
index 000000000000..4854f0f31e4f
--- /dev/null
+++ b/arch/x86/crypto/glue_helper.c
@@ -0,0 +1,307 @@
1/*
2 * Shared glue code for 128bit block ciphers
3 *
4 * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
7 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
8 * CTR part based on code (crypto/ctr.c) by:
9 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
24 * USA
25 *
26 */
27
28#include <linux/module.h>
29#include <crypto/b128ops.h>
30#include <crypto/lrw.h>
31#include <crypto/xts.h>
32#include <asm/crypto/glue_helper.h>
33#include <crypto/scatterwalk.h>
34
35static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
36 struct blkcipher_desc *desc,
37 struct blkcipher_walk *walk)
38{
39 void *ctx = crypto_blkcipher_ctx(desc->tfm);
40 const unsigned int bsize = 128 / 8;
41 unsigned int nbytes, i, func_bytes;
42 bool fpu_enabled = false;
43 int err;
44
45 err = blkcipher_walk_virt(desc, walk);
46
47 while ((nbytes = walk->nbytes)) {
48 u8 *wsrc = walk->src.virt.addr;
49 u8 *wdst = walk->dst.virt.addr;
50
51 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
52 desc, fpu_enabled, nbytes);
53
54 for (i = 0; i < gctx->num_funcs; i++) {
55 func_bytes = bsize * gctx->funcs[i].num_blocks;
56
57 /* Process multi-block batch */
58 if (nbytes >= func_bytes) {
59 do {
60 gctx->funcs[i].fn_u.ecb(ctx, wdst,
61 wsrc);
62
63 wsrc += func_bytes;
64 wdst += func_bytes;
65 nbytes -= func_bytes;
66 } while (nbytes >= func_bytes);
67
68 if (nbytes < bsize)
69 goto done;
70 }
71 }
72
73done:
74 err = blkcipher_walk_done(desc, walk, nbytes);
75 }
76
77 glue_fpu_end(fpu_enabled);
78 return err;
79}
80
81int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
82 struct blkcipher_desc *desc, struct scatterlist *dst,
83 struct scatterlist *src, unsigned int nbytes)
84{
85 struct blkcipher_walk walk;
86
87 blkcipher_walk_init(&walk, dst, src, nbytes);
88 return __glue_ecb_crypt_128bit(gctx, desc, &walk);
89}
90EXPORT_SYMBOL_GPL(glue_ecb_crypt_128bit);
91
92static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn,
93 struct blkcipher_desc *desc,
94 struct blkcipher_walk *walk)
95{
96 void *ctx = crypto_blkcipher_ctx(desc->tfm);
97 const unsigned int bsize = 128 / 8;
98 unsigned int nbytes = walk->nbytes;
99 u128 *src = (u128 *)walk->src.virt.addr;
100 u128 *dst = (u128 *)walk->dst.virt.addr;
101 u128 *iv = (u128 *)walk->iv;
102
103 do {
104 u128_xor(dst, src, iv);
105 fn(ctx, (u8 *)dst, (u8 *)dst);
106 iv = dst;
107
108 src += 1;
109 dst += 1;
110 nbytes -= bsize;
111 } while (nbytes >= bsize);
112
113 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
114 return nbytes;
115}
116
117int glue_cbc_encrypt_128bit(const common_glue_func_t fn,
118 struct blkcipher_desc *desc,
119 struct scatterlist *dst,
120 struct scatterlist *src, unsigned int nbytes)
121{
122 struct blkcipher_walk walk;
123 int err;
124
125 blkcipher_walk_init(&walk, dst, src, nbytes);
126 err = blkcipher_walk_virt(desc, &walk);
127
128 while ((nbytes = walk.nbytes)) {
129 nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk);
130 err = blkcipher_walk_done(desc, &walk, nbytes);
131 }
132
133 return err;
134}
135EXPORT_SYMBOL_GPL(glue_cbc_encrypt_128bit);
136
/*
 * CBC-decrypt the complete 128-bit blocks of the current walk chunk,
 * working from the last block towards the first.
 *
 * The entries of gctx->funcs[] are tried in array order.  Each call of a
 * cbc routine covers a batch of num_blocks blocks; only the first block
 * of a batch is chained here, either with the ciphertext block in front
 * of it (src - 1) or, for the first block of the chunk, with walk->iv.
 * NOTE(review): the cbc routines presumably chain the remaining blocks
 * of a batch themselves - confirm against the routines that are plugged in.
 *
 * On return walk->iv holds the last ciphertext block of the chunk.
 * Returns the number of trailing bytes that were not processed.
 */
137static unsigned int
138__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
139 struct blkcipher_desc *desc,
140 struct blkcipher_walk *walk)
141{
142 void *ctx = crypto_blkcipher_ctx(desc->tfm);
143 const unsigned int bsize = 128 / 8;
144 unsigned int nbytes = walk->nbytes;
145 u128 *src = (u128 *)walk->src.virt.addr;
146 u128 *dst = (u128 *)walk->dst.virt.addr;
147 u128 last_iv;
148 unsigned int num_blocks, func_bytes;
149 unsigned int i;
150
151 /* Start of the last block. */
152 src += nbytes / bsize - 1;
153 dst += nbytes / bsize - 1;
154
 /*
  * Save the last ciphertext block now; it becomes the IV on exit.
  * Presumably needed because dst may alias src - confirm with callers.
  */
155 last_iv = *src;
156
157 for (i = 0; i < gctx->num_funcs; i++) {
158 num_blocks = gctx->funcs[i].num_blocks;
159 func_bytes = bsize * num_blocks;
160
161 /* Process multi-block batch */
162 if (nbytes >= func_bytes) {
163 do {
 /* Step back so src/dst point at the first block of the batch. */
164 nbytes -= func_bytes - bsize;
165 src -= num_blocks - 1;
166 dst -= num_blocks - 1;
167
168 gctx->funcs[i].fn_u.cbc(ctx, dst, src);
169
 /* Account for the first block of the batch as well. */
170 nbytes -= bsize;
 /* Nothing in front of this block: it is chained with walk->iv at done. */
171 if (nbytes < bsize)
172 goto done;
173
 /* Chain the first block of the batch with the preceding ciphertext block. */
174 u128_xor(dst, dst, src - 1);
175 src -= 1;
176 dst -= 1;
177 } while (nbytes >= func_bytes);
178
179 if (nbytes < bsize)
180 goto done;
181 }
182 }
183
184done:
 /* dst now points at the first block of the chunk: chain it with the incoming IV. */
185 u128_xor(dst, dst, (u128 *)walk->iv);
186 *(u128 *)walk->iv = last_iv;
187
188 return nbytes;
189}
190
191int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
192 struct blkcipher_desc *desc,
193 struct scatterlist *dst,
194 struct scatterlist *src, unsigned int nbytes)
195{
196 const unsigned int bsize = 128 / 8;
197 bool fpu_enabled = false;
198 struct blkcipher_walk walk;
199 int err;
200
201 blkcipher_walk_init(&walk, dst, src, nbytes);
202 err = blkcipher_walk_virt(desc, &walk);
203
204 while ((nbytes = walk.nbytes)) {
205 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
206 desc, fpu_enabled, nbytes);
207 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
208 err = blkcipher_walk_done(desc, &walk, nbytes);
209 }
210
211 glue_fpu_end(fpu_enabled);
212 return err;
213}
214EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
215
216static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
217 struct blkcipher_desc *desc,
218 struct blkcipher_walk *walk)
219{
220 void *ctx = crypto_blkcipher_ctx(desc->tfm);
221 u8 *src = (u8 *)walk->src.virt.addr;
222 u8 *dst = (u8 *)walk->dst.virt.addr;
223 unsigned int nbytes = walk->nbytes;
224 u128 ctrblk;
225 u128 tmp;
226
227 be128_to_u128(&ctrblk, (be128 *)walk->iv);
228
229 memcpy(&tmp, src, nbytes);
230 fn_ctr(ctx, &tmp, &tmp, &ctrblk);
231 memcpy(dst, &tmp, nbytes);
232
233 u128_to_be128((be128 *)walk->iv, &ctrblk);
234}
235EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
236
237static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
238 struct blkcipher_desc *desc,
239 struct blkcipher_walk *walk)
240{
241 const unsigned int bsize = 128 / 8;
242 void *ctx = crypto_blkcipher_ctx(desc->tfm);
243 unsigned int nbytes = walk->nbytes;
244 u128 *src = (u128 *)walk->src.virt.addr;
245 u128 *dst = (u128 *)walk->dst.virt.addr;
246 u128 ctrblk;
247 unsigned int num_blocks, func_bytes;
248 unsigned int i;
249
250 be128_to_u128(&ctrblk, (be128 *)walk->iv);
251
252 /* Process multi-block batch */
253 for (i = 0; i < gctx->num_funcs; i++) {
254 num_blocks = gctx->funcs[i].num_blocks;
255 func_bytes = bsize * num_blocks;
256
257 if (nbytes >= func_bytes) {
258 do {
259 gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk);
260
261 src += num_blocks;
262 dst += num_blocks;
263 nbytes -= func_bytes;
264 } while (nbytes >= func_bytes);
265
266 if (nbytes < bsize)
267 goto done;
268 }
269 }
270
271done:
272 u128_to_be128((be128 *)walk->iv, &ctrblk);
273 return nbytes;
274}
275
276int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
277 struct blkcipher_desc *desc, struct scatterlist *dst,
278 struct scatterlist *src, unsigned int nbytes)
279{
280 const unsigned int bsize = 128 / 8;
281 bool fpu_enabled = false;
282 struct blkcipher_walk walk;
283 int err;
284
285 blkcipher_walk_init(&walk, dst, src, nbytes);
286 err = blkcipher_walk_virt_block(desc, &walk, bsize);
287
288 while ((nbytes = walk.nbytes) >= bsize) {
289 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
290 desc, fpu_enabled, nbytes);
291 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
292 err = blkcipher_walk_done(desc, &walk, nbytes);
293 }
294
295 glue_fpu_end(fpu_enabled);
296
297 if (walk.nbytes) {
298 glue_ctr_crypt_final_128bit(
299 gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
300 err = blkcipher_walk_done(desc, &walk, 0);
301 }
302
303 return err;
304}
305EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit);
306
307MODULE_LICENSE("GPL");
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..504106bf04a2
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -0,0 +1,704 @@
1/*
2 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
8 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27.file "serpent-avx-x86_64-asm_64.S"
28.text
29
/* Base register for key loads; the entry points receive the ctx pointer in %rdi. */
30#define CTX %rdi
31
32/**********************************************************************
33 8-way AVX serpent
34 **********************************************************************/
/*
 * First group of working registers.  The two-group macros (K2, LK2, KL2,
 * S, SP) take the base names RA..RE and paste a 1 or 2 onto them.
 */
35#define RA1 %xmm0
36#define RB1 %xmm1
37#define RC1 %xmm2
38#define RD1 %xmm3
39#define RE1 %xmm4
40
/* Scratch register used inside the S-box macros. */
41#define tp %xmm5
42
/* Second group of working registers. */
43#define RA2 %xmm6
44#define RB2 %xmm7
45#define RC2 %xmm8
46#define RD2 %xmm9
47#define RE2 %xmm10
48
/* Set to all ones by the entry points; XOR with it acts as bitwise NOT. */
49#define RNOT %xmm11
50
/*
 * Key words loaded by get_key.  The entry points also pass RK0..RK2 as the
 * temporaries of read_blocks/write_blocks/xor_blocks.
 */
51#define RK0 %xmm12
52#define RK1 %xmm13
53#define RK2 %xmm14
54#define RK3 %xmm15
55
56
/*
 * Forward S-box instruction sequences S0..S7, each split into a _1 and a
 * _2 half.  The S() and SP() wrappers expand SBOX_1 followed by SBOX_2 on
 * the same five registers, so the halves are not meant to be used alone.
 *
 * x0..x4 are the caller's working registers and are updated in place.
 * tp is a scratch register; several _2 halves (S2_2 and S4_2, for
 * example) read the value the matching _1 half left in it, so tp must not
 * be clobbered between the two halves.  XOR with RNOT serves as bitwise
 * NOT, which relies on the entry points having set RNOT to all ones.
 */
57#define S0_1(x0, x1, x2, x3, x4) \
58 vpor x0, x3, tp; \
59 vpxor x3, x0, x0; \
60 vpxor x2, x3, x4; \
61 vpxor RNOT, x4, x4; \
62 vpxor x1, tp, x3; \
63 vpand x0, x1, x1; \
64 vpxor x4, x1, x1; \
65 vpxor x0, x2, x2;
66#define S0_2(x0, x1, x2, x3, x4) \
67 vpxor x3, x0, x0; \
68 vpor x0, x4, x4; \
69 vpxor x2, x0, x0; \
70 vpand x1, x2, x2; \
71 vpxor x2, x3, x3; \
72 vpxor RNOT, x1, x1; \
73 vpxor x4, x2, x2; \
74 vpxor x2, x1, x1;
75
76#define S1_1(x0, x1, x2, x3, x4) \
77 vpxor x0, x1, tp; \
78 vpxor x3, x0, x0; \
79 vpxor RNOT, x3, x3; \
80 vpand tp, x1, x4; \
81 vpor tp, x0, x0; \
82 vpxor x2, x3, x3; \
83 vpxor x3, x0, x0; \
84 vpxor x3, tp, x1;
85#define S1_2(x0, x1, x2, x3, x4) \
86 vpxor x4, x3, x3; \
87 vpor x4, x1, x1; \
88 vpxor x2, x4, x4; \
89 vpand x0, x2, x2; \
90 vpxor x1, x2, x2; \
91 vpor x0, x1, x1; \
92 vpxor RNOT, x0, x0; \
93 vpxor x2, x0, x0; \
94 vpxor x1, x4, x4;
95
96#define S2_1(x0, x1, x2, x3, x4) \
97 vpxor RNOT, x3, x3; \
98 vpxor x0, x1, x1; \
99 vpand x2, x0, tp; \
100 vpxor x3, tp, tp; \
101 vpor x0, x3, x3; \
102 vpxor x1, x2, x2; \
103 vpxor x1, x3, x3; \
104 vpand tp, x1, x1;
105#define S2_2(x0, x1, x2, x3, x4) \
106 vpxor x2, tp, tp; \
107 vpand x3, x2, x2; \
108 vpor x1, x3, x3; \
109 vpxor RNOT, tp, tp; \
110 vpxor tp, x3, x3; \
111 vpxor tp, x0, x4; \
112 vpxor x2, tp, x0; \
113 vpor x2, x1, x1;
114
115#define S3_1(x0, x1, x2, x3, x4) \
116 vpxor x3, x1, tp; \
117 vpor x0, x3, x3; \
118 vpand x0, x1, x4; \
119 vpxor x2, x0, x0; \
120 vpxor tp, x2, x2; \
121 vpand x3, tp, x1; \
122 vpxor x3, x2, x2; \
123 vpor x4, x0, x0; \
124 vpxor x3, x4, x4;
125#define S3_2(x0, x1, x2, x3, x4) \
126 vpxor x0, x1, x1; \
127 vpand x3, x0, x0; \
128 vpand x4, x3, x3; \
129 vpxor x2, x3, x3; \
130 vpor x1, x4, x4; \
131 vpand x1, x2, x2; \
132 vpxor x3, x4, x4; \
133 vpxor x3, x0, x0; \
134 vpxor x2, x3, x3;
135
136#define S4_1(x0, x1, x2, x3, x4) \
137 vpand x0, x3, tp; \
138 vpxor x3, x0, x0; \
139 vpxor x2, tp, tp; \
140 vpor x3, x2, x2; \
141 vpxor x1, x0, x0; \
142 vpxor tp, x3, x4; \
143 vpor x0, x2, x2; \
144 vpxor x1, x2, x2;
145#define S4_2(x0, x1, x2, x3, x4) \
146 vpand x0, x1, x1; \
147 vpxor x4, x1, x1; \
148 vpand x2, x4, x4; \
149 vpxor tp, x2, x2; \
150 vpxor x0, x4, x4; \
151 vpor x1, tp, x3; \
152 vpxor RNOT, x1, x1; \
153 vpxor x0, x3, x3;
154
155#define S5_1(x0, x1, x2, x3, x4) \
156 vpor x0, x1, tp; \
157 vpxor tp, x2, x2; \
158 vpxor RNOT, x3, x3; \
159 vpxor x0, x1, x4; \
160 vpxor x2, x0, x0; \
161 vpand x4, tp, x1; \
162 vpor x3, x4, x4; \
163 vpxor x0, x4, x4;
164#define S5_2(x0, x1, x2, x3, x4) \
165 vpand x3, x0, x0; \
166 vpxor x3, x1, x1; \
167 vpxor x2, x3, x3; \
168 vpxor x1, x0, x0; \
169 vpand x4, x2, x2; \
170 vpxor x2, x1, x1; \
171 vpand x0, x2, x2; \
172 vpxor x2, x3, x3;
173
174#define S6_1(x0, x1, x2, x3, x4) \
175 vpxor x0, x3, x3; \
176 vpxor x2, x1, tp; \
177 vpxor x0, x2, x2; \
178 vpand x3, x0, x0; \
179 vpor x3, tp, tp; \
180 vpxor RNOT, x1, x4; \
181 vpxor tp, x0, x0; \
182 vpxor x2, tp, x1;
183#define S6_2(x0, x1, x2, x3, x4) \
184 vpxor x4, x3, x3; \
185 vpxor x0, x4, x4; \
186 vpand x0, x2, x2; \
187 vpxor x1, x4, x4; \
188 vpxor x3, x2, x2; \
189 vpand x1, x3, x3; \
190 vpxor x0, x3, x3; \
191 vpxor x2, x1, x1;
192
193#define S7_1(x0, x1, x2, x3, x4) \
194 vpxor RNOT, x1, tp; \
195 vpxor RNOT, x0, x0; \
196 vpand x2, tp, x1; \
197 vpxor x3, x1, x1; \
198 vpor tp, x3, x3; \
199 vpxor x2, tp, x4; \
200 vpxor x3, x2, x2; \
201 vpxor x0, x3, x3; \
202 vpor x1, x0, x0;
203#define S7_2(x0, x1, x2, x3, x4) \
204 vpand x0, x2, x2; \
205 vpxor x4, x0, x0; \
206 vpxor x3, x4, x4; \
207 vpand x0, x3, x3; \
208 vpxor x1, x4, x4; \
209 vpxor x4, x2, x2; \
210 vpxor x1, x3, x3; \
211 vpor x0, x4, x4; \
212 vpxor x1, x4, x4;
213
/*
 * Inverse S-box instruction sequences SI0..SI7, used by the decryption
 * entry point.  Each is split into a _1 and a _2 half that the S() and
 * SP() wrappers expand one after the other on the same five registers.
 *
 * x0..x4 are the caller's working registers and are updated in place.
 * tp is a scratch register; several _2 halves (SI1_2 and SI5_2, for
 * example) read the value the matching _1 half left in it, so tp must not
 * be clobbered between the two halves.  XOR with RNOT serves as bitwise
 * NOT, which relies on the entry points having set RNOT to all ones.
 */
214#define SI0_1(x0, x1, x2, x3, x4) \
215 vpxor x0, x1, x1; \
216 vpor x1, x3, tp; \
217 vpxor x1, x3, x4; \
218 vpxor RNOT, x0, x0; \
219 vpxor tp, x2, x2; \
220 vpxor x0, tp, x3; \
221 vpand x1, x0, x0; \
222 vpxor x2, x0, x0;
223#define SI0_2(x0, x1, x2, x3, x4) \
224 vpand x3, x2, x2; \
225 vpxor x4, x3, x3; \
226 vpxor x3, x2, x2; \
227 vpxor x3, x1, x1; \
228 vpand x0, x3, x3; \
229 vpxor x0, x1, x1; \
230 vpxor x2, x0, x0; \
231 vpxor x3, x4, x4;
232
233#define SI1_1(x0, x1, x2, x3, x4) \
234 vpxor x3, x1, x1; \
235 vpxor x2, x0, tp; \
236 vpxor RNOT, x2, x2; \
237 vpor x1, x0, x4; \
238 vpxor x3, x4, x4; \
239 vpand x1, x3, x3; \
240 vpxor x2, x1, x1; \
241 vpand x4, x2, x2;
242#define SI1_2(x0, x1, x2, x3, x4) \
243 vpxor x1, x4, x4; \
244 vpor x3, x1, x1; \
245 vpxor tp, x3, x3; \
246 vpxor tp, x2, x2; \
247 vpor x4, tp, x0; \
248 vpxor x4, x2, x2; \
249 vpxor x0, x1, x1; \
250 vpxor x1, x4, x4;
251
252#define SI2_1(x0, x1, x2, x3, x4) \
253 vpxor x1, x2, x2; \
254 vpxor RNOT, x3, tp; \
255 vpor x2, tp, tp; \
256 vpxor x3, x2, x2; \
257 vpxor x0, x3, x4; \
258 vpxor x1, tp, x3; \
259 vpor x2, x1, x1; \
260 vpxor x0, x2, x2;
261#define SI2_2(x0, x1, x2, x3, x4) \
262 vpxor x4, x1, x1; \
263 vpor x3, x4, x4; \
264 vpxor x3, x2, x2; \
265 vpxor x2, x4, x4; \
266 vpand x1, x2, x2; \
267 vpxor x3, x2, x2; \
268 vpxor x4, x3, x3; \
269 vpxor x0, x4, x4;
270
271#define SI3_1(x0, x1, x2, x3, x4) \
272 vpxor x1, x2, x2; \
273 vpand x2, x1, tp; \
274 vpxor x0, tp, tp; \
275 vpor x1, x0, x0; \
276 vpxor x3, x1, x4; \
277 vpxor x3, x0, x0; \
278 vpor tp, x3, x3; \
279 vpxor x2, tp, x1;
280#define SI3_2(x0, x1, x2, x3, x4) \
281 vpxor x3, x1, x1; \
282 vpxor x2, x0, x0; \
283 vpxor x3, x2, x2; \
284 vpand x1, x3, x3; \
285 vpxor x0, x1, x1; \
286 vpand x2, x0, x0; \
287 vpxor x3, x4, x4; \
288 vpxor x0, x3, x3; \
289 vpxor x1, x0, x0;
290
291#define SI4_1(x0, x1, x2, x3, x4) \
292 vpxor x3, x2, x2; \
293 vpand x1, x0, tp; \
294 vpxor x2, tp, tp; \
295 vpor x3, x2, x2; \
296 vpxor RNOT, x0, x4; \
297 vpxor tp, x1, x1; \
298 vpxor x2, tp, x0; \
299 vpand x4, x2, x2;
300#define SI4_2(x0, x1, x2, x3, x4) \
301 vpxor x0, x2, x2; \
302 vpor x4, x0, x0; \
303 vpxor x3, x0, x0; \
304 vpand x2, x3, x3; \
305 vpxor x3, x4, x4; \
306 vpxor x1, x3, x3; \
307 vpand x0, x1, x1; \
308 vpxor x1, x4, x4; \
309 vpxor x3, x0, x0;
310
311#define SI5_1(x0, x1, x2, x3, x4) \
312 vpor x2, x1, tp; \
313 vpxor x1, x2, x2; \
314 vpxor x3, tp, tp; \
315 vpand x1, x3, x3; \
316 vpxor x3, x2, x2; \
317 vpor x0, x3, x3; \
318 vpxor RNOT, x0, x0; \
319 vpxor x2, x3, x3; \
320 vpor x0, x2, x2;
321#define SI5_2(x0, x1, x2, x3, x4) \
322 vpxor tp, x1, x4; \
323 vpxor x4, x2, x2; \
324 vpand x0, x4, x4; \
325 vpxor tp, x0, x0; \
326 vpxor x3, tp, x1; \
327 vpand x2, x0, x0; \
328 vpxor x3, x2, x2; \
329 vpxor x2, x0, x0; \
330 vpxor x4, x2, x2; \
331 vpxor x3, x4, x4;
332
333#define SI6_1(x0, x1, x2, x3, x4) \
334 vpxor x2, x0, x0; \
335 vpand x3, x0, tp; \
336 vpxor x3, x2, x2; \
337 vpxor x2, tp, tp; \
338 vpxor x1, x3, x3; \
339 vpor x0, x2, x2; \
340 vpxor x3, x2, x2; \
341 vpand tp, x3, x3;
342#define SI6_2(x0, x1, x2, x3, x4) \
343 vpxor RNOT, tp, tp; \
344 vpxor x1, x3, x3; \
345 vpand x2, x1, x1; \
346 vpxor tp, x0, x4; \
347 vpxor x4, x3, x3; \
348 vpxor x2, x4, x4; \
349 vpxor x1, tp, x0; \
350 vpxor x0, x2, x2;
351
352#define SI7_1(x0, x1, x2, x3, x4) \
353 vpand x0, x3, tp; \
354 vpxor x2, x0, x0; \
355 vpor x3, x2, x2; \
356 vpxor x1, x3, x4; \
357 vpxor RNOT, x0, x0; \
358 vpor tp, x1, x1; \
359 vpxor x0, x4, x4; \
360 vpand x2, x0, x0; \
361 vpxor x1, x0, x0;
362#define SI7_2(x0, x1, x2, x3, x4) \
363 vpand x2, x1, x1; \
364 vpxor x2, tp, x3; \
365 vpxor x3, x4, x4; \
366 vpand x3, x2, x2; \
367 vpor x0, x3, x3; \
368 vpxor x4, x1, x1; \
369 vpxor x4, x3, x3; \
370 vpand x0, x4, x4; \
371 vpxor x2, x4, x4;
372
/*
 * Load the 32-bit key word at byte offset (4*i + j)*4 from CTX and
 * broadcast it to every 32-bit lane of t.
 */
373#define get_key(i, j, t) \
374 vbroadcastss (4*(i)+(j))*4(CTX), t;
375
/*
 * Load the four key words of subkey i into RK0..RK3 and XOR them into
 * x0..x3 of both register groups (suffix 1 and suffix 2).  x4 is accepted
 * for symmetry with the other round macros but is not referenced.
 */
376#define K2(x0, x1, x2, x3, x4, i) \
377 get_key(i, 0, RK0); \
378 get_key(i, 1, RK1); \
379 get_key(i, 2, RK2); \
380 get_key(i, 3, RK3); \
381 vpxor RK0, x0 ## 1, x0 ## 1; \
382 vpxor RK1, x1 ## 1, x1 ## 1; \
383 vpxor RK2, x2 ## 1, x2 ## 1; \
384 vpxor RK3, x3 ## 1, x3 ## 1; \
385 vpxor RK0, x0 ## 2, x0 ## 2; \
386 vpxor RK1, x1 ## 2, x1 ## 2; \
387 vpxor RK2, x2 ## 2, x2 ## 2; \
388 vpxor RK3, x3 ## 2, x3 ## 2;
389
/*
 * Encryption-side mixing step followed by the XOR with subkey i, applied
 * to both register groups.
 *
 * Each shift-left / shift-right / or triple forms a 32-bit left rotate
 * (by 13, 3, 1, 7, 5 and 22); x4 of the respective group is the scratch
 * register for it.  The four get_key loads for subkey i are interleaved
 * with the mixing instructions, and the loaded words are XORed into
 * x0..x3 at the end.
 */
390#define LK2(x0, x1, x2, x3, x4, i) \
391 vpslld $13, x0 ## 1, x4 ## 1; \
392 vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
393 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
394 vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
395 vpslld $3, x2 ## 1, x4 ## 1; \
396 vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
397 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
398 vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
399 vpslld $13, x0 ## 2, x4 ## 2; \
400 vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
401 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
402 vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
403 vpslld $3, x2 ## 2, x4 ## 2; \
404 vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
405 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
406 vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
407 vpslld $1, x1 ## 1, x4 ## 1; \
408 vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
409 vpor x4 ## 1, x1 ## 1, x1 ## 1; \
410 vpslld $3, x0 ## 1, x4 ## 1; \
411 vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
412 vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
413 get_key(i, 1, RK1); \
414 vpslld $1, x1 ## 2, x4 ## 2; \
415 vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
416 vpor x4 ## 2, x1 ## 2, x1 ## 2; \
417 vpslld $3, x0 ## 2, x4 ## 2; \
418 vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
419 vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
420 get_key(i, 3, RK3); \
421 vpslld $7, x3 ## 1, x4 ## 1; \
422 vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
423 vpor x4 ## 1, x3 ## 1, x3 ## 1; \
424 vpslld $7, x1 ## 1, x4 ## 1; \
425 vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
426 vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
427 vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
428 vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
429 get_key(i, 0, RK0); \
430 vpslld $7, x3 ## 2, x4 ## 2; \
431 vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
432 vpor x4 ## 2, x3 ## 2, x3 ## 2; \
433 vpslld $7, x1 ## 2, x4 ## 2; \
434 vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
435 vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
436 vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
437 vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
438 get_key(i, 2, RK2); \
439 vpxor RK1, x1 ## 1, x1 ## 1; \
440 vpxor RK3, x3 ## 1, x3 ## 1; \
441 vpslld $5, x0 ## 1, x4 ## 1; \
442 vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
443 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
444 vpslld $22, x2 ## 1, x4 ## 1; \
445 vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
446 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
447 vpxor RK0, x0 ## 1, x0 ## 1; \
448 vpxor RK2, x2 ## 1, x2 ## 1; \
449 vpxor RK1, x1 ## 2, x1 ## 2; \
450 vpxor RK3, x3 ## 2, x3 ## 2; \
451 vpslld $5, x0 ## 2, x4 ## 2; \
452 vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
453 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
454 vpslld $22, x2 ## 2, x4 ## 2; \
455 vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
456 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
457 vpxor RK0, x0 ## 2, x0 ## 2; \
458 vpxor RK2, x2 ## 2, x2 ## 2;
459
/*
 * Decryption-side counterpart of LK2, applied to both register groups:
 * first XOR RK0..RK3 into x0..x3, then run the mixing step with right
 * rotates by the amounts LK2 rotates left (5, 22, 1, 7, 13 and 3).  Each
 * shift-right / shift-left / or triple forms one rotate; x4 of the
 * respective group is the scratch register.
 *
 * The expansion does not reference i and loads no key words itself:
 * RK0..RK3 must already hold the subkey.  In the decryption entry point
 * the preceding SP() loads them.
 */
460#define KL2(x0, x1, x2, x3, x4, i) \
461 vpxor RK0, x0 ## 1, x0 ## 1; \
462 vpxor RK2, x2 ## 1, x2 ## 1; \
463 vpsrld $5, x0 ## 1, x4 ## 1; \
464 vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
465 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
466 vpxor RK3, x3 ## 1, x3 ## 1; \
467 vpxor RK1, x1 ## 1, x1 ## 1; \
468 vpsrld $22, x2 ## 1, x4 ## 1; \
469 vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
470 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
471 vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
472 vpxor RK0, x0 ## 2, x0 ## 2; \
473 vpxor RK2, x2 ## 2, x2 ## 2; \
474 vpsrld $5, x0 ## 2, x4 ## 2; \
475 vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
476 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
477 vpxor RK3, x3 ## 2, x3 ## 2; \
478 vpxor RK1, x1 ## 2, x1 ## 2; \
479 vpsrld $22, x2 ## 2, x4 ## 2; \
480 vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
481 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
482 vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
483 vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
484 vpslld $7, x1 ## 1, x4 ## 1; \
485 vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
486 vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
487 vpsrld $1, x1 ## 1, x4 ## 1; \
488 vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
489 vpor x4 ## 1, x1 ## 1, x1 ## 1; \
490 vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
491 vpslld $7, x1 ## 2, x4 ## 2; \
492 vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
493 vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
494 vpsrld $1, x1 ## 2, x4 ## 2; \
495 vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
496 vpor x4 ## 2, x1 ## 2, x1 ## 2; \
497 vpsrld $7, x3 ## 1, x4 ## 1; \
498 vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
499 vpor x4 ## 1, x3 ## 1, x3 ## 1; \
500 vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
501 vpslld $3, x0 ## 1, x4 ## 1; \
502 vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
503 vpsrld $7, x3 ## 2, x4 ## 2; \
504 vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
505 vpor x4 ## 2, x3 ## 2, x3 ## 2; \
506 vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
507 vpslld $3, x0 ## 2, x4 ## 2; \
508 vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
509 vpsrld $13, x0 ## 1, x4 ## 1; \
510 vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
511 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
512 vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
513 vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
514 vpsrld $3, x2 ## 1, x4 ## 1; \
515 vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
516 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
517 vpsrld $13, x0 ## 2, x4 ## 2; \
518 vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
519 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
520 vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
521 vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
522 vpsrld $3, x2 ## 2, x4 ## 2; \
523 vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
524 vpor x4 ## 2, x2 ## 2, x2 ## 2;
525
/*
 * Apply S-box SBOX to both register groups: expands SBOX_1 then SBOX_2 on
 * the suffix-1 registers, followed by the same pair on the suffix-2
 * registers.
 */
526#define S(SBOX, x0, x1, x2, x3, x4) \
527 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
528 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
529 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
530 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
531
/*
 * Like S(), but additionally loads the four key words of subkey i into
 * RK0..RK3, with one get_key placed in front of each S-box half.  The
 * loaded words are not used here; in the decryption entry point the
 * following KL2() consumes them.
 *
 * The last line of the macro ends in a backslash, so the blank line after
 * it is part of the definition.
 */
532#define SP(SBOX, x0, x1, x2, x3, x4, i) \
533 get_key(i, 0, RK0); \
534 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
535 get_key(i, 2, RK2); \
536 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
537 get_key(i, 3, RK3); \
538 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
539 get_key(i, 1, RK1); \
540 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
541
/*
 * Transpose the 4x4 matrix of 32-bit words held in x0..x3 in place.
 * The dword unpacks interleave row pairs, the qword unpacks then assemble
 * the transposed rows.  t0, t1 and t2 are clobbered.
 */
542#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
543 vpunpckldq x1, x0, t0; \
544 vpunpckhdq x1, x0, t2; \
545 vpunpckldq x3, x2, t1; \
546 vpunpckhdq x3, x2, x3; \
547 \
548 vpunpcklqdq t1, t0, x0; \
549 vpunpckhqdq t1, t0, x1; \
550 vpunpcklqdq x3, t2, x2; \
551 vpunpckhqdq x3, t2, x3;
552
/*
 * Load four consecutive 16-byte blocks from (in) with unaligned loads and
 * transpose them, so that afterwards x0..x3 each hold one 32-bit word
 * position of all four blocks.  t0..t2 are clobbered.
 */
553#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
554 vmovdqu (0*4*4)(in), x0; \
555 vmovdqu (1*4*4)(in), x1; \
556 vmovdqu (2*4*4)(in), x2; \
557 vmovdqu (3*4*4)(in), x3; \
558 \
559 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
560
/*
 * Transpose x0..x3 back into block order and store the four 16-byte
 * blocks to (out) with unaligned stores.  t0..t2 are clobbered.
 */
561#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
562 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
563 \
564 vmovdqu x0, (0*4*4)(out); \
565 vmovdqu x1, (1*4*4)(out); \
566 vmovdqu x2, (2*4*4)(out); \
567 vmovdqu x3, (3*4*4)(out);
568
/*
 * Transpose x0..x3 back into block order, XOR each block with the 16
 * bytes already stored at its slot in (out), and store the result there.
 * t0..t2 are clobbered.
 */
569#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
570 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
571 \
572 vpxor (0*4*4)(out), x0, x0; \
573 vmovdqu x0, (0*4*4)(out); \
574 vpxor (1*4*4)(out), x1, x1; \
575 vmovdqu x1, (1*4*4)(out); \
576 vpxor (2*4*4)(out), x2, x2; \
577 vmovdqu x2, (2*4*4)(out); \
578 vpxor (3*4*4)(out), x3, x3; \
579 vmovdqu x3, (3*4*4)(out);
580
/*
 * Encrypt eight 16-byte blocks: two groups of four are loaded from src
 * and src + 64, run through the key-mixing / S-box / LK2 sequence below,
 * and the result is either stored to dst or XORed into the data already
 * at dst, depending on %cl.
 */
581.align 8
582.global __serpent_enc_blk_8way_avx
583.type __serpent_enc_blk_8way_avx,@function;
584
585__serpent_enc_blk_8way_avx:
586 /* input:
587 * %rdi: ctx, CTX
588 * %rsi: dst
589 * %rdx: src
590 * %rcx: bool, if true: xor output
591 */
592
 /* Comparing a register with itself sets every bit: RNOT = all ones. */
593 vpcmpeqd RNOT, RNOT, RNOT;
594
 /* %rax = src + 64: the second group of four blocks. */
595 leaq (4*4*4)(%rdx), %rax;
596 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
597 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
598
 /*
  * XOR in subkey 0, then 32 S-box steps cycling S0..S7.  Each of the
  * first 31 is followed by LK2 with subkeys 1..31; the last is followed
  * by a plain key XOR with subkey 32.  The register arguments are
  * permuted from step to step, following where each macro leaves its
  * results.
  */
599 K2(RA, RB, RC, RD, RE, 0);
600 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
601 S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
602 S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
603 S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
604 S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
605 S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
606 S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
607 S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
608 S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
609 S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
610 S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
611 S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
612 S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
613 S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
614 S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
615 S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
616 S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
617 S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
618 S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
619 S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
620 S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
621 S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
622 S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
623 S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
624 S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
625 S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
626 S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
627 S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
628 S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
629 S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
630 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
631 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
632
 /* %rax = dst + 64: destination of the second group. */
633 leaq (4*4*4)(%rsi), %rax;
634
 /* Non-zero low byte of %rcx selects the XOR-into-dst variant. */
635 testb %cl, %cl;
636 jnz __enc_xor8;
637
638 write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
639 write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
640
641 ret;
642
643__enc_xor8:
644 xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
645 xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
646
647 ret;
648
/*
 * Decrypt eight 16-byte blocks: two groups of four are loaded from src
 * and src + 64, run through the key-mixing / inverse S-box / KL2 sequence
 * below, and stored to dst and dst + 64.
 */
649.align 8
650.global serpent_dec_blk_8way_avx
651.type serpent_dec_blk_8way_avx,@function;
652
653serpent_dec_blk_8way_avx:
654 /* input:
655 * %rdi: ctx, CTX
656 * %rsi: dst
657 * %rdx: src
658 */
659
 /* Comparing a register with itself sets every bit: RNOT = all ones. */
660 vpcmpeqd RNOT, RNOT, RNOT;
661
 /* %rax = src + 64: the second group of four blocks. */
662 leaq (4*4*4)(%rdx), %rax;
663 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
664 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
665
 /*
  * XOR in subkey 32, then walk the subkeys down from 31 to 1: SP() runs
  * the inverse S-box (SI7..SI0, cycling) while loading subkey i into
  * RK0..RK3, and KL2() consumes those key words.  The final step is a
  * plain S(SI0) followed by the XOR with subkey 0.  The register
  * arguments are permuted from step to step, following where each macro
  * leaves its results.
  */
666 K2(RA, RB, RC, RD, RE, 32);
667 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
668 SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
669 SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
670 SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
671 SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
672 SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
673 SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
674 SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
675 SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
676 SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
677 SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
678 SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
679 SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
680 SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
681 SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
682 SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
683 SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
684 SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
685 SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
686 SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
687 SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
688 SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
689 SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
690 SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
691 SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
692 SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
693 SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
694 SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
695 SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
696 SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
697 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
698 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
699
 /* Results end up in RC, RD, RB, RE; %rax = dst + 64 for the second group. */
700 leaq (4*4*4)(%rsi), %rax;
701 write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
702 write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
703
704 ret;
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
new file mode 100644
index 000000000000..b36bdac237eb
--- /dev/null
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -0,0 +1,636 @@
1/*
2 * Glue Code for AVX assembler versions of Serpent Cipher
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Glue code based on serpent_sse2_glue.c by:
8 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/hardirq.h>
29#include <linux/types.h>
30#include <linux/crypto.h>
31#include <linux/err.h>
32#include <crypto/algapi.h>
33#include <crypto/serpent.h>
34#include <crypto/cryptd.h>
35#include <crypto/b128ops.h>
36#include <crypto/ctr.h>
37#include <crypto/lrw.h>
38#include <crypto/xts.h>
39#include <asm/xcr.h>
40#include <asm/xsave.h>
41#include <asm/crypto/serpent-avx.h>
42#include <asm/crypto/ablk_helper.h>
43#include <asm/crypto/glue_helper.h>
44
45static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
46{
47 u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
48 unsigned int j;
49
50 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
51 ivs[j] = src[j];
52
53 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
54
55 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
56 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
57}
58
59static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
60{
61 be128 ctrblk;
62
63 u128_to_be128(&ctrblk, iv);
64 u128_inc(iv);
65
66 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
67 u128_xor(dst, src, (u128 *)&ctrblk);
68}
69
70static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
71 u128 *iv)
72{
73 be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
74 unsigned int i;
75
76 for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
77 if (dst != src)
78 dst[i] = src[i];
79
80 u128_to_be128(&ctrblks[i], iv);
81 u128_inc(iv);
82 }
83
84 serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
85}
86
87static const struct common_glue_ctx serpent_enc = {
88 .num_funcs = 2,
89 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
90
91 .funcs = { {
92 .num_blocks = SERPENT_PARALLEL_BLOCKS,
93 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
94 }, {
95 .num_blocks = 1,
96 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
97 } }
98};
99
100static const struct common_glue_ctx serpent_ctr = {
101 .num_funcs = 2,
102 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
103
104 .funcs = { {
105 .num_blocks = SERPENT_PARALLEL_BLOCKS,
106 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
107 }, {
108 .num_blocks = 1,
109 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
110 } }
111};
112
113static const struct common_glue_ctx serpent_dec = {
114 .num_funcs = 2,
115 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
116
117 .funcs = { {
118 .num_blocks = SERPENT_PARALLEL_BLOCKS,
119 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
120 }, {
121 .num_blocks = 1,
122 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
123 } }
124};
125
126static const struct common_glue_ctx serpent_dec_cbc = {
127 .num_funcs = 2,
128 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
129
130 .funcs = { {
131 .num_blocks = SERPENT_PARALLEL_BLOCKS,
132 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
133 }, {
134 .num_blocks = 1,
135 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
136 } }
137};
138
139static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
140 struct scatterlist *src, unsigned int nbytes)
141{
142 return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
143}
144
145static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
146 struct scatterlist *src, unsigned int nbytes)
147{
148 return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
149}
150
151static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
152 struct scatterlist *src, unsigned int nbytes)
153{
154 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
155 dst, src, nbytes);
156}
157
158static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
159 struct scatterlist *src, unsigned int nbytes)
160{
161 return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
162 nbytes);
163}
164
165static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
166 struct scatterlist *src, unsigned int nbytes)
167{
168 return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
169}
170
171static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
172{
173 return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS,
174 NULL, fpu_enabled, nbytes);
175}
176
177static inline void serpent_fpu_end(bool fpu_enabled)
178{
179 glue_fpu_end(fpu_enabled);
180}
181
182struct crypt_priv {
183 struct serpent_ctx *ctx;
184 bool fpu_enabled;
185};
186
187static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
188{
189 const unsigned int bsize = SERPENT_BLOCK_SIZE;
190 struct crypt_priv *ctx = priv;
191 int i;
192
193 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
194
195 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
196 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
197 return;
198 }
199
200 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
201 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
202}
203
204static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
205{
206 const unsigned int bsize = SERPENT_BLOCK_SIZE;
207 struct crypt_priv *ctx = priv;
208 int i;
209
210 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
211
212 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
213 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
214 return;
215 }
216
217 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
218 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
219}
220
221struct serpent_lrw_ctx {
222 struct lrw_table_ctx lrw_table;
223 struct serpent_ctx serpent_ctx;
224};
225
226static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
227 unsigned int keylen)
228{
229 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
230 int err;
231
232 err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
233 SERPENT_BLOCK_SIZE);
234 if (err)
235 return err;
236
237 return lrw_init_table(&ctx->lrw_table, key + keylen -
238 SERPENT_BLOCK_SIZE);
239}
240
241static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
242 struct scatterlist *src, unsigned int nbytes)
243{
244 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
245 be128 buf[SERPENT_PARALLEL_BLOCKS];
246 struct crypt_priv crypt_ctx = {
247 .ctx = &ctx->serpent_ctx,
248 .fpu_enabled = false,
249 };
250 struct lrw_crypt_req req = {
251 .tbuf = buf,
252 .tbuflen = sizeof(buf),
253
254 .table_ctx = &ctx->lrw_table,
255 .crypt_ctx = &crypt_ctx,
256 .crypt_fn = encrypt_callback,
257 };
258 int ret;
259
260 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
261 ret = lrw_crypt(desc, dst, src, nbytes, &req);
262 serpent_fpu_end(crypt_ctx.fpu_enabled);
263
264 return ret;
265}
266
267static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
268 struct scatterlist *src, unsigned int nbytes)
269{
270 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
271 be128 buf[SERPENT_PARALLEL_BLOCKS];
272 struct crypt_priv crypt_ctx = {
273 .ctx = &ctx->serpent_ctx,
274 .fpu_enabled = false,
275 };
276 struct lrw_crypt_req req = {
277 .tbuf = buf,
278 .tbuflen = sizeof(buf),
279
280 .table_ctx = &ctx->lrw_table,
281 .crypt_ctx = &crypt_ctx,
282 .crypt_fn = decrypt_callback,
283 };
284 int ret;
285
286 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
287 ret = lrw_crypt(desc, dst, src, nbytes, &req);
288 serpent_fpu_end(crypt_ctx.fpu_enabled);
289
290 return ret;
291}
292
293static void lrw_exit_tfm(struct crypto_tfm *tfm)
294{
295 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
296
297 lrw_free_table(&ctx->lrw_table);
298}
299
300struct serpent_xts_ctx {
301 struct serpent_ctx tweak_ctx;
302 struct serpent_ctx crypt_ctx;
303};
304
305static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
306 unsigned int keylen)
307{
308 struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
309 u32 *flags = &tfm->crt_flags;
310 int err;
311
312 /* key consists of keys of equal size concatenated, therefore
313 * the length must be even
314 */
315 if (keylen % 2) {
316 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
317 return -EINVAL;
318 }
319
320 /* first half of xts-key is for crypt */
321 err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
322 if (err)
323 return err;
324
325 /* second half of xts-key is for tweak */
326 return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
327}
328
329static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
330 struct scatterlist *src, unsigned int nbytes)
331{
332 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
333 be128 buf[SERPENT_PARALLEL_BLOCKS];
334 struct crypt_priv crypt_ctx = {
335 .ctx = &ctx->crypt_ctx,
336 .fpu_enabled = false,
337 };
338 struct xts_crypt_req req = {
339 .tbuf = buf,
340 .tbuflen = sizeof(buf),
341
342 .tweak_ctx = &ctx->tweak_ctx,
343 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
344 .crypt_ctx = &crypt_ctx,
345 .crypt_fn = encrypt_callback,
346 };
347 int ret;
348
349 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
350 ret = xts_crypt(desc, dst, src, nbytes, &req);
351 serpent_fpu_end(crypt_ctx.fpu_enabled);
352
353 return ret;
354}
355
356static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
357 struct scatterlist *src, unsigned int nbytes)
358{
359 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
360 be128 buf[SERPENT_PARALLEL_BLOCKS];
361 struct crypt_priv crypt_ctx = {
362 .ctx = &ctx->crypt_ctx,
363 .fpu_enabled = false,
364 };
365 struct xts_crypt_req req = {
366 .tbuf = buf,
367 .tbuflen = sizeof(buf),
368
369 .tweak_ctx = &ctx->tweak_ctx,
370 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
371 .crypt_ctx = &crypt_ctx,
372 .crypt_fn = decrypt_callback,
373 };
374 int ret;
375
376 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
377 ret = xts_crypt(desc, dst, src, nbytes, &req);
378 serpent_fpu_end(crypt_ctx.fpu_enabled);
379
380 return ret;
381}
382
383static struct crypto_alg serpent_algs[10] = { {
384 .cra_name = "__ecb-serpent-avx",
385 .cra_driver_name = "__driver-ecb-serpent-avx",
386 .cra_priority = 0,
387 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
388 .cra_blocksize = SERPENT_BLOCK_SIZE,
389 .cra_ctxsize = sizeof(struct serpent_ctx),
390 .cra_alignmask = 0,
391 .cra_type = &crypto_blkcipher_type,
392 .cra_module = THIS_MODULE,
393 .cra_list = LIST_HEAD_INIT(serpent_algs[0].cra_list),
394 .cra_u = {
395 .blkcipher = {
396 .min_keysize = SERPENT_MIN_KEY_SIZE,
397 .max_keysize = SERPENT_MAX_KEY_SIZE,
398 .setkey = serpent_setkey,
399 .encrypt = ecb_encrypt,
400 .decrypt = ecb_decrypt,
401 },
402 },
403}, {
404 .cra_name = "__cbc-serpent-avx",
405 .cra_driver_name = "__driver-cbc-serpent-avx",
406 .cra_priority = 0,
407 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
408 .cra_blocksize = SERPENT_BLOCK_SIZE,
409 .cra_ctxsize = sizeof(struct serpent_ctx),
410 .cra_alignmask = 0,
411 .cra_type = &crypto_blkcipher_type,
412 .cra_module = THIS_MODULE,
413 .cra_list = LIST_HEAD_INIT(serpent_algs[1].cra_list),
414 .cra_u = {
415 .blkcipher = {
416 .min_keysize = SERPENT_MIN_KEY_SIZE,
417 .max_keysize = SERPENT_MAX_KEY_SIZE,
418 .setkey = serpent_setkey,
419 .encrypt = cbc_encrypt,
420 .decrypt = cbc_decrypt,
421 },
422 },
423}, {
424 .cra_name = "__ctr-serpent-avx",
425 .cra_driver_name = "__driver-ctr-serpent-avx",
426 .cra_priority = 0,
427 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
428 .cra_blocksize = 1,
429 .cra_ctxsize = sizeof(struct serpent_ctx),
430 .cra_alignmask = 0,
431 .cra_type = &crypto_blkcipher_type,
432 .cra_module = THIS_MODULE,
433 .cra_list = LIST_HEAD_INIT(serpent_algs[2].cra_list),
434 .cra_u = {
435 .blkcipher = {
436 .min_keysize = SERPENT_MIN_KEY_SIZE,
437 .max_keysize = SERPENT_MAX_KEY_SIZE,
438 .ivsize = SERPENT_BLOCK_SIZE,
439 .setkey = serpent_setkey,
440 .encrypt = ctr_crypt,
441 .decrypt = ctr_crypt,
442 },
443 },
444}, {
445 .cra_name = "__lrw-serpent-avx",
446 .cra_driver_name = "__driver-lrw-serpent-avx",
447 .cra_priority = 0,
448 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
449 .cra_blocksize = SERPENT_BLOCK_SIZE,
450 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
451 .cra_alignmask = 0,
452 .cra_type = &crypto_blkcipher_type,
453 .cra_module = THIS_MODULE,
454 .cra_list = LIST_HEAD_INIT(serpent_algs[3].cra_list),
455 .cra_exit = lrw_exit_tfm,
456 .cra_u = {
457 .blkcipher = {
458 .min_keysize = SERPENT_MIN_KEY_SIZE +
459 SERPENT_BLOCK_SIZE,
460 .max_keysize = SERPENT_MAX_KEY_SIZE +
461 SERPENT_BLOCK_SIZE,
462 .ivsize = SERPENT_BLOCK_SIZE,
463 .setkey = lrw_serpent_setkey,
464 .encrypt = lrw_encrypt,
465 .decrypt = lrw_decrypt,
466 },
467 },
468}, {
469 .cra_name = "__xts-serpent-avx",
470 .cra_driver_name = "__driver-xts-serpent-avx",
471 .cra_priority = 0,
472 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
473 .cra_blocksize = SERPENT_BLOCK_SIZE,
474 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
475 .cra_alignmask = 0,
476 .cra_type = &crypto_blkcipher_type,
477 .cra_module = THIS_MODULE,
478 .cra_list = LIST_HEAD_INIT(serpent_algs[4].cra_list),
479 .cra_u = {
480 .blkcipher = {
481 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
482 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
483 .ivsize = SERPENT_BLOCK_SIZE,
484 .setkey = xts_serpent_setkey,
485 .encrypt = xts_encrypt,
486 .decrypt = xts_decrypt,
487 },
488 },
489}, {
490 .cra_name = "ecb(serpent)",
491 .cra_driver_name = "ecb-serpent-avx",
492 .cra_priority = 500,
493 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
494 .cra_blocksize = SERPENT_BLOCK_SIZE,
495 .cra_ctxsize = sizeof(struct async_helper_ctx),
496 .cra_alignmask = 0,
497 .cra_type = &crypto_ablkcipher_type,
498 .cra_module = THIS_MODULE,
499 .cra_list = LIST_HEAD_INIT(serpent_algs[5].cra_list),
500 .cra_init = ablk_init,
501 .cra_exit = ablk_exit,
502 .cra_u = {
503 .ablkcipher = {
504 .min_keysize = SERPENT_MIN_KEY_SIZE,
505 .max_keysize = SERPENT_MAX_KEY_SIZE,
506 .setkey = ablk_set_key,
507 .encrypt = ablk_encrypt,
508 .decrypt = ablk_decrypt,
509 },
510 },
511}, {
512 .cra_name = "cbc(serpent)",
513 .cra_driver_name = "cbc-serpent-avx",
514 .cra_priority = 500,
515 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
516 .cra_blocksize = SERPENT_BLOCK_SIZE,
517 .cra_ctxsize = sizeof(struct async_helper_ctx),
518 .cra_alignmask = 0,
519 .cra_type = &crypto_ablkcipher_type,
520 .cra_module = THIS_MODULE,
521 .cra_list = LIST_HEAD_INIT(serpent_algs[6].cra_list),
522 .cra_init = ablk_init,
523 .cra_exit = ablk_exit,
524 .cra_u = {
525 .ablkcipher = {
526 .min_keysize = SERPENT_MIN_KEY_SIZE,
527 .max_keysize = SERPENT_MAX_KEY_SIZE,
528 .ivsize = SERPENT_BLOCK_SIZE,
529 .setkey = ablk_set_key,
530 .encrypt = __ablk_encrypt,
531 .decrypt = ablk_decrypt,
532 },
533 },
534}, {
535 .cra_name = "ctr(serpent)",
536 .cra_driver_name = "ctr-serpent-avx",
537 .cra_priority = 500,
538 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
539 .cra_blocksize = 1,
540 .cra_ctxsize = sizeof(struct async_helper_ctx),
541 .cra_alignmask = 0,
542 .cra_type = &crypto_ablkcipher_type,
543 .cra_module = THIS_MODULE,
544 .cra_list = LIST_HEAD_INIT(serpent_algs[7].cra_list),
545 .cra_init = ablk_init,
546 .cra_exit = ablk_exit,
547 .cra_u = {
548 .ablkcipher = {
549 .min_keysize = SERPENT_MIN_KEY_SIZE,
550 .max_keysize = SERPENT_MAX_KEY_SIZE,
551 .ivsize = SERPENT_BLOCK_SIZE,
552 .setkey = ablk_set_key,
553 .encrypt = ablk_encrypt,
554 .decrypt = ablk_encrypt,
555 .geniv = "chainiv",
556 },
557 },
558}, {
559 .cra_name = "lrw(serpent)",
560 .cra_driver_name = "lrw-serpent-avx",
561 .cra_priority = 500,
562 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
563 .cra_blocksize = SERPENT_BLOCK_SIZE,
564 .cra_ctxsize = sizeof(struct async_helper_ctx),
565 .cra_alignmask = 0,
566 .cra_type = &crypto_ablkcipher_type,
567 .cra_module = THIS_MODULE,
568 .cra_list = LIST_HEAD_INIT(serpent_algs[8].cra_list),
569 .cra_init = ablk_init,
570 .cra_exit = ablk_exit,
571 .cra_u = {
572 .ablkcipher = {
573 .min_keysize = SERPENT_MIN_KEY_SIZE +
574 SERPENT_BLOCK_SIZE,
575 .max_keysize = SERPENT_MAX_KEY_SIZE +
576 SERPENT_BLOCK_SIZE,
577 .ivsize = SERPENT_BLOCK_SIZE,
578 .setkey = ablk_set_key,
579 .encrypt = ablk_encrypt,
580 .decrypt = ablk_decrypt,
581 },
582 },
583}, {
584 .cra_name = "xts(serpent)",
585 .cra_driver_name = "xts-serpent-avx",
586 .cra_priority = 500,
587 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
588 .cra_blocksize = SERPENT_BLOCK_SIZE,
589 .cra_ctxsize = sizeof(struct async_helper_ctx),
590 .cra_alignmask = 0,
591 .cra_type = &crypto_ablkcipher_type,
592 .cra_module = THIS_MODULE,
593 .cra_list = LIST_HEAD_INIT(serpent_algs[9].cra_list),
594 .cra_init = ablk_init,
595 .cra_exit = ablk_exit,
596 .cra_u = {
597 .ablkcipher = {
598 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
599 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
600 .ivsize = SERPENT_BLOCK_SIZE,
601 .setkey = ablk_set_key,
602 .encrypt = ablk_encrypt,
603 .decrypt = ablk_decrypt,
604 },
605 },
606} };
607
608static int __init serpent_init(void)
609{
610 u64 xcr0;
611
612 if (!cpu_has_avx || !cpu_has_osxsave) {
613 printk(KERN_INFO "AVX instructions are not detected.\n");
614 return -ENODEV;
615 }
616
617 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
618 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
619 printk(KERN_INFO "AVX detected but unusable.\n");
620 return -ENODEV;
621 }
622
623 return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
624}
625
626static void __exit serpent_exit(void)
627{
628 crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
629}
630
631module_init(serpent_init);
632module_exit(serpent_exit);
633
634MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized");
635MODULE_LICENSE("GPL");
636MODULE_ALIAS("serpent");
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 4b21be85e0a1..d679c8675f4a 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -41,358 +41,145 @@
41#include <crypto/ctr.h> 41#include <crypto/ctr.h>
42#include <crypto/lrw.h> 42#include <crypto/lrw.h>
43#include <crypto/xts.h> 43#include <crypto/xts.h>
44#include <asm/i387.h> 44#include <asm/crypto/serpent-sse2.h>
45#include <asm/serpent.h> 45#include <asm/crypto/ablk_helper.h>
46#include <crypto/scatterwalk.h> 46#include <asm/crypto/glue_helper.h>
47#include <linux/workqueue.h>
48#include <linux/spinlock.h>
49
50struct async_serpent_ctx {
51 struct cryptd_ablkcipher *cryptd_tfm;
52};
53 47
54static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) 48static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
55{
56 if (fpu_enabled)
57 return true;
58
59 /* SSE2 is only used when chunk to be processed is large enough, so
60 * do not enable FPU until it is necessary.
61 */
62 if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
63 return false;
64
65 kernel_fpu_begin();
66 return true;
67}
68
69static inline void serpent_fpu_end(bool fpu_enabled)
70{ 49{
71 if (fpu_enabled) 50 u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
72 kernel_fpu_end(); 51 unsigned int j;
73}
74
75static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
76 bool enc)
77{
78 bool fpu_enabled = false;
79 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
80 const unsigned int bsize = SERPENT_BLOCK_SIZE;
81 unsigned int nbytes;
82 int err;
83
84 err = blkcipher_walk_virt(desc, walk);
85 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
86
87 while ((nbytes = walk->nbytes)) {
88 u8 *wsrc = walk->src.virt.addr;
89 u8 *wdst = walk->dst.virt.addr;
90
91 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
92
93 /* Process multi-block batch */
94 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
95 do {
96 if (enc)
97 serpent_enc_blk_xway(ctx, wdst, wsrc);
98 else
99 serpent_dec_blk_xway(ctx, wdst, wsrc);
100
101 wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
102 wdst += bsize * SERPENT_PARALLEL_BLOCKS;
103 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
104 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
105
106 if (nbytes < bsize)
107 goto done;
108 }
109
110 /* Handle leftovers */
111 do {
112 if (enc)
113 __serpent_encrypt(ctx, wdst, wsrc);
114 else
115 __serpent_decrypt(ctx, wdst, wsrc);
116
117 wsrc += bsize;
118 wdst += bsize;
119 nbytes -= bsize;
120 } while (nbytes >= bsize);
121
122done:
123 err = blkcipher_walk_done(desc, walk, nbytes);
124 }
125 52
126 serpent_fpu_end(fpu_enabled); 53 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
127 return err; 54 ivs[j] = src[j];
128}
129 55
130static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 56 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
131 struct scatterlist *src, unsigned int nbytes)
132{
133 struct blkcipher_walk walk;
134 57
135 blkcipher_walk_init(&walk, dst, src, nbytes); 58 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
136 return ecb_crypt(desc, &walk, true); 59 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
137} 60}
138 61
139static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 62static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
140 struct scatterlist *src, unsigned int nbytes)
141{ 63{
142 struct blkcipher_walk walk; 64 be128 ctrblk;
143 65
144 blkcipher_walk_init(&walk, dst, src, nbytes); 66 u128_to_be128(&ctrblk, iv);
145 return ecb_crypt(desc, &walk, false); 67 u128_inc(iv);
146}
147 68
148static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, 69 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
149 struct blkcipher_walk *walk) 70 u128_xor(dst, src, (u128 *)&ctrblk);
150{
151 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
152 const unsigned int bsize = SERPENT_BLOCK_SIZE;
153 unsigned int nbytes = walk->nbytes;
154 u128 *src = (u128 *)walk->src.virt.addr;
155 u128 *dst = (u128 *)walk->dst.virt.addr;
156 u128 *iv = (u128 *)walk->iv;
157
158 do {
159 u128_xor(dst, src, iv);
160 __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
161 iv = dst;
162
163 src += 1;
164 dst += 1;
165 nbytes -= bsize;
166 } while (nbytes >= bsize);
167
168 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
169 return nbytes;
170} 71}
171 72
172static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 73static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
173 struct scatterlist *src, unsigned int nbytes) 74 u128 *iv)
174{ 75{
175 struct blkcipher_walk walk; 76 be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
176 int err; 77 unsigned int i;
177 78
178 blkcipher_walk_init(&walk, dst, src, nbytes); 79 for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
179 err = blkcipher_walk_virt(desc, &walk); 80 if (dst != src)
81 dst[i] = src[i];
180 82
181 while ((nbytes = walk.nbytes)) { 83 u128_to_be128(&ctrblks[i], iv);
182 nbytes = __cbc_encrypt(desc, &walk); 84 u128_inc(iv);
183 err = blkcipher_walk_done(desc, &walk, nbytes);
184 } 85 }
185 86
186 return err; 87 serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
187} 88}
188 89
189static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, 90static const struct common_glue_ctx serpent_enc = {
190 struct blkcipher_walk *walk) 91 .num_funcs = 2,
191{ 92 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
192 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
193 const unsigned int bsize = SERPENT_BLOCK_SIZE;
194 unsigned int nbytes = walk->nbytes;
195 u128 *src = (u128 *)walk->src.virt.addr;
196 u128 *dst = (u128 *)walk->dst.virt.addr;
197 u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
198 u128 last_iv;
199 int i;
200
201 /* Start of the last block. */
202 src += nbytes / bsize - 1;
203 dst += nbytes / bsize - 1;
204
205 last_iv = *src;
206
207 /* Process multi-block batch */
208 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
209 do {
210 nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
211 src -= SERPENT_PARALLEL_BLOCKS - 1;
212 dst -= SERPENT_PARALLEL_BLOCKS - 1;
213
214 for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
215 ivs[i] = src[i];
216
217 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
218
219 for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
220 u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
221
222 nbytes -= bsize;
223 if (nbytes < bsize)
224 goto done;
225 93
226 u128_xor(dst, dst, src - 1); 94 .funcs = { {
227 src -= 1; 95 .num_blocks = SERPENT_PARALLEL_BLOCKS,
228 dst -= 1; 96 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
229 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); 97 }, {
230 98 .num_blocks = 1,
231 if (nbytes < bsize) 99 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
232 goto done; 100 } }
233 } 101};
234
235 /* Handle leftovers */
236 for (;;) {
237 __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
238
239 nbytes -= bsize;
240 if (nbytes < bsize)
241 break;
242 102
243 u128_xor(dst, dst, src - 1); 103static const struct common_glue_ctx serpent_ctr = {
244 src -= 1; 104 .num_funcs = 2,
245 dst -= 1; 105 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
246 } 106
107 .funcs = { {
108 .num_blocks = SERPENT_PARALLEL_BLOCKS,
109 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
110 }, {
111 .num_blocks = 1,
112 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
113 } }
114};
247 115
248done: 116static const struct common_glue_ctx serpent_dec = {
249 u128_xor(dst, dst, (u128 *)walk->iv); 117 .num_funcs = 2,
250 *(u128 *)walk->iv = last_iv; 118 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
119
120 .funcs = { {
121 .num_blocks = SERPENT_PARALLEL_BLOCKS,
122 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
123 }, {
124 .num_blocks = 1,
125 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
126 } }
127};
251 128
252 return nbytes; 129static const struct common_glue_ctx serpent_dec_cbc = {
253} 130 .num_funcs = 2,
131 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
132
133 .funcs = { {
134 .num_blocks = SERPENT_PARALLEL_BLOCKS,
135 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
136 }, {
137 .num_blocks = 1,
138 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
139 } }
140};
254 141
255static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 142static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
256 struct scatterlist *src, unsigned int nbytes) 143 struct scatterlist *src, unsigned int nbytes)
257{ 144{
258 bool fpu_enabled = false; 145 return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
259 struct blkcipher_walk walk;
260 int err;
261
262 blkcipher_walk_init(&walk, dst, src, nbytes);
263 err = blkcipher_walk_virt(desc, &walk);
264 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
265
266 while ((nbytes = walk.nbytes)) {
267 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
268 nbytes = __cbc_decrypt(desc, &walk);
269 err = blkcipher_walk_done(desc, &walk, nbytes);
270 }
271
272 serpent_fpu_end(fpu_enabled);
273 return err;
274} 146}
275 147
276static inline void u128_to_be128(be128 *dst, const u128 *src) 148static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
149 struct scatterlist *src, unsigned int nbytes)
277{ 150{
278 dst->a = cpu_to_be64(src->a); 151 return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
279 dst->b = cpu_to_be64(src->b);
280} 152}
281 153
282static inline void be128_to_u128(u128 *dst, const be128 *src) 154static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
155 struct scatterlist *src, unsigned int nbytes)
283{ 156{
284 dst->a = be64_to_cpu(src->a); 157 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
285 dst->b = be64_to_cpu(src->b); 158 dst, src, nbytes);
286} 159}
287 160
288static inline void u128_inc(u128 *i) 161static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
162 struct scatterlist *src, unsigned int nbytes)
289{ 163{
290 i->b++; 164 return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
291 if (!i->b) 165 nbytes);
292 i->a++;
293} 166}
294 167
295static void ctr_crypt_final(struct blkcipher_desc *desc, 168static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
296 struct blkcipher_walk *walk) 169 struct scatterlist *src, unsigned int nbytes)
297{ 170{
298 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 171 return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
299 u8 *ctrblk = walk->iv;
300 u8 keystream[SERPENT_BLOCK_SIZE];
301 u8 *src = walk->src.virt.addr;
302 u8 *dst = walk->dst.virt.addr;
303 unsigned int nbytes = walk->nbytes;
304
305 __serpent_encrypt(ctx, keystream, ctrblk);
306 crypto_xor(keystream, src, nbytes);
307 memcpy(dst, keystream, nbytes);
308
309 crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
310} 172}
311 173
312static unsigned int __ctr_crypt(struct blkcipher_desc *desc, 174static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
313 struct blkcipher_walk *walk)
314{ 175{
315 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 176 return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS,
316 const unsigned int bsize = SERPENT_BLOCK_SIZE; 177 NULL, fpu_enabled, nbytes);
317 unsigned int nbytes = walk->nbytes;
318 u128 *src = (u128 *)walk->src.virt.addr;
319 u128 *dst = (u128 *)walk->dst.virt.addr;
320 u128 ctrblk;
321 be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
322 int i;
323
324 be128_to_u128(&ctrblk, (be128 *)walk->iv);
325
326 /* Process multi-block batch */
327 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
328 do {
329 /* create ctrblks for parallel encrypt */
330 for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
331 if (dst != src)
332 dst[i] = src[i];
333
334 u128_to_be128(&ctrblocks[i], &ctrblk);
335 u128_inc(&ctrblk);
336 }
337
338 serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
339 (u8 *)ctrblocks);
340
341 src += SERPENT_PARALLEL_BLOCKS;
342 dst += SERPENT_PARALLEL_BLOCKS;
343 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
344 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
345
346 if (nbytes < bsize)
347 goto done;
348 }
349
350 /* Handle leftovers */
351 do {
352 if (dst != src)
353 *dst = *src;
354
355 u128_to_be128(&ctrblocks[0], &ctrblk);
356 u128_inc(&ctrblk);
357
358 __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
359 u128_xor(dst, dst, (u128 *)ctrblocks);
360
361 src += 1;
362 dst += 1;
363 nbytes -= bsize;
364 } while (nbytes >= bsize);
365
366done:
367 u128_to_be128((be128 *)walk->iv, &ctrblk);
368 return nbytes;
369} 178}
370 179
371static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, 180static inline void serpent_fpu_end(bool fpu_enabled)
372 struct scatterlist *src, unsigned int nbytes)
373{ 181{
374 bool fpu_enabled = false; 182 glue_fpu_end(fpu_enabled);
375 struct blkcipher_walk walk;
376 int err;
377
378 blkcipher_walk_init(&walk, dst, src, nbytes);
379 err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
380 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
381
382 while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
383 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
384 nbytes = __ctr_crypt(desc, &walk);
385 err = blkcipher_walk_done(desc, &walk, nbytes);
386 }
387
388 serpent_fpu_end(fpu_enabled);
389
390 if (walk.nbytes) {
391 ctr_crypt_final(desc, &walk);
392 err = blkcipher_walk_done(desc, &walk, 0);
393 }
394
395 return err;
396} 183}
397 184
398struct crypt_priv { 185struct crypt_priv {
@@ -596,106 +383,6 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
596 return ret; 383 return ret;
597} 384}
598 385
599static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
600 unsigned int key_len)
601{
602 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
603 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
604 int err;
605
606 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
607 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
608 & CRYPTO_TFM_REQ_MASK);
609 err = crypto_ablkcipher_setkey(child, key, key_len);
610 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
611 & CRYPTO_TFM_RES_MASK);
612 return err;
613}
614
615static int __ablk_encrypt(struct ablkcipher_request *req)
616{
617 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
618 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
619 struct blkcipher_desc desc;
620
621 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
622 desc.info = req->info;
623 desc.flags = 0;
624
625 return crypto_blkcipher_crt(desc.tfm)->encrypt(
626 &desc, req->dst, req->src, req->nbytes);
627}
628
629static int ablk_encrypt(struct ablkcipher_request *req)
630{
631 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
632 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
633
634 if (!irq_fpu_usable()) {
635 struct ablkcipher_request *cryptd_req =
636 ablkcipher_request_ctx(req);
637
638 memcpy(cryptd_req, req, sizeof(*req));
639 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
640
641 return crypto_ablkcipher_encrypt(cryptd_req);
642 } else {
643 return __ablk_encrypt(req);
644 }
645}
646
647static int ablk_decrypt(struct ablkcipher_request *req)
648{
649 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
650 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
651
652 if (!irq_fpu_usable()) {
653 struct ablkcipher_request *cryptd_req =
654 ablkcipher_request_ctx(req);
655
656 memcpy(cryptd_req, req, sizeof(*req));
657 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
658
659 return crypto_ablkcipher_decrypt(cryptd_req);
660 } else {
661 struct blkcipher_desc desc;
662
663 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
664 desc.info = req->info;
665 desc.flags = 0;
666
667 return crypto_blkcipher_crt(desc.tfm)->decrypt(
668 &desc, req->dst, req->src, req->nbytes);
669 }
670}
671
672static void ablk_exit(struct crypto_tfm *tfm)
673{
674 struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
675
676 cryptd_free_ablkcipher(ctx->cryptd_tfm);
677}
678
679static int ablk_init(struct crypto_tfm *tfm)
680{
681 struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
682 struct cryptd_ablkcipher *cryptd_tfm;
683 char drv_name[CRYPTO_MAX_ALG_NAME];
684
685 snprintf(drv_name, sizeof(drv_name), "__driver-%s",
686 crypto_tfm_alg_driver_name(tfm));
687
688 cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
689 if (IS_ERR(cryptd_tfm))
690 return PTR_ERR(cryptd_tfm);
691
692 ctx->cryptd_tfm = cryptd_tfm;
693 tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
694 crypto_ablkcipher_reqsize(&cryptd_tfm->base);
695
696 return 0;
697}
698
699static struct crypto_alg serpent_algs[10] = { { 386static struct crypto_alg serpent_algs[10] = { {
700 .cra_name = "__ecb-serpent-sse2", 387 .cra_name = "__ecb-serpent-sse2",
701 .cra_driver_name = "__driver-ecb-serpent-sse2", 388 .cra_driver_name = "__driver-ecb-serpent-sse2",
@@ -808,7 +495,7 @@ static struct crypto_alg serpent_algs[10] = { {
808 .cra_priority = 400, 495 .cra_priority = 400,
809 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 496 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
810 .cra_blocksize = SERPENT_BLOCK_SIZE, 497 .cra_blocksize = SERPENT_BLOCK_SIZE,
811 .cra_ctxsize = sizeof(struct async_serpent_ctx), 498 .cra_ctxsize = sizeof(struct async_helper_ctx),
812 .cra_alignmask = 0, 499 .cra_alignmask = 0,
813 .cra_type = &crypto_ablkcipher_type, 500 .cra_type = &crypto_ablkcipher_type,
814 .cra_module = THIS_MODULE, 501 .cra_module = THIS_MODULE,
@@ -830,7 +517,7 @@ static struct crypto_alg serpent_algs[10] = { {
830 .cra_priority = 400, 517 .cra_priority = 400,
831 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 518 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
832 .cra_blocksize = SERPENT_BLOCK_SIZE, 519 .cra_blocksize = SERPENT_BLOCK_SIZE,
833 .cra_ctxsize = sizeof(struct async_serpent_ctx), 520 .cra_ctxsize = sizeof(struct async_helper_ctx),
834 .cra_alignmask = 0, 521 .cra_alignmask = 0,
835 .cra_type = &crypto_ablkcipher_type, 522 .cra_type = &crypto_ablkcipher_type,
836 .cra_module = THIS_MODULE, 523 .cra_module = THIS_MODULE,
@@ -853,7 +540,7 @@ static struct crypto_alg serpent_algs[10] = { {
853 .cra_priority = 400, 540 .cra_priority = 400,
854 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 541 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
855 .cra_blocksize = 1, 542 .cra_blocksize = 1,
856 .cra_ctxsize = sizeof(struct async_serpent_ctx), 543 .cra_ctxsize = sizeof(struct async_helper_ctx),
857 .cra_alignmask = 0, 544 .cra_alignmask = 0,
858 .cra_type = &crypto_ablkcipher_type, 545 .cra_type = &crypto_ablkcipher_type,
859 .cra_module = THIS_MODULE, 546 .cra_module = THIS_MODULE,
@@ -877,7 +564,7 @@ static struct crypto_alg serpent_algs[10] = { {
877 .cra_priority = 400, 564 .cra_priority = 400,
878 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 565 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
879 .cra_blocksize = SERPENT_BLOCK_SIZE, 566 .cra_blocksize = SERPENT_BLOCK_SIZE,
880 .cra_ctxsize = sizeof(struct async_serpent_ctx), 567 .cra_ctxsize = sizeof(struct async_helper_ctx),
881 .cra_alignmask = 0, 568 .cra_alignmask = 0,
882 .cra_type = &crypto_ablkcipher_type, 569 .cra_type = &crypto_ablkcipher_type,
883 .cra_module = THIS_MODULE, 570 .cra_module = THIS_MODULE,
@@ -902,7 +589,7 @@ static struct crypto_alg serpent_algs[10] = { {
902 .cra_priority = 400, 589 .cra_priority = 400,
903 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 590 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
904 .cra_blocksize = SERPENT_BLOCK_SIZE, 591 .cra_blocksize = SERPENT_BLOCK_SIZE,
905 .cra_ctxsize = sizeof(struct async_serpent_ctx), 592 .cra_ctxsize = sizeof(struct async_helper_ctx),
906 .cra_alignmask = 0, 593 .cra_alignmask = 0,
907 .cra_type = &crypto_ablkcipher_type, 594 .cra_type = &crypto_ablkcipher_type,
908 .cra_module = THIS_MODULE, 595 .cra_module = THIS_MODULE,
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index b2c2f57d70e8..49d6987a73d9 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -468,7 +468,7 @@ W_PRECALC_SSSE3
468 */ 468 */
469SHA1_VECTOR_ASM sha1_transform_ssse3 469SHA1_VECTOR_ASM sha1_transform_ssse3
470 470
471#ifdef SHA1_ENABLE_AVX_SUPPORT 471#ifdef CONFIG_AS_AVX
472 472
473.macro W_PRECALC_AVX 473.macro W_PRECALC_AVX
474 474
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index f916499d0abe..4a11a9d72451 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -35,7 +35,7 @@
35 35
36asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, 36asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
37 unsigned int rounds); 37 unsigned int rounds);
38#ifdef SHA1_ENABLE_AVX_SUPPORT 38#ifdef CONFIG_AS_AVX
39asmlinkage void sha1_transform_avx(u32 *digest, const char *data, 39asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
40 unsigned int rounds); 40 unsigned int rounds);
41#endif 41#endif
@@ -184,7 +184,7 @@ static struct shash_alg alg = {
184 } 184 }
185}; 185};
186 186
187#ifdef SHA1_ENABLE_AVX_SUPPORT 187#ifdef CONFIG_AS_AVX
188static bool __init avx_usable(void) 188static bool __init avx_usable(void)
189{ 189{
190 u64 xcr0; 190 u64 xcr0;
@@ -209,7 +209,7 @@ static int __init sha1_ssse3_mod_init(void)
209 if (cpu_has_ssse3) 209 if (cpu_has_ssse3)
210 sha1_transform_asm = sha1_transform_ssse3; 210 sha1_transform_asm = sha1_transform_ssse3;
211 211
212#ifdef SHA1_ENABLE_AVX_SUPPORT 212#ifdef CONFIG_AS_AVX
213 /* allow AVX to override SSSE3, it's a little faster */ 213 /* allow AVX to override SSSE3, it's a little faster */
214 if (avx_usable()) 214 if (avx_usable())
215 sha1_transform_asm = sha1_transform_avx; 215 sha1_transform_asm = sha1_transform_avx;
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..35f45574390d
--- /dev/null
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -0,0 +1,300 @@
1/*
2 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * USA
21 *
22 */
23
24.file "twofish-avx-x86_64-asm_64.S"
25.text
26
27/* structure of crypto context */
28#define s0 0
29#define s1 1024
30#define s2 2048
31#define s3 3072
32#define w 4096
33#define k 4128
34
35/**********************************************************************
36 8-way AVX twofish
37 **********************************************************************/
38#define CTX %rdi
39
40#define RA1 %xmm0
41#define RB1 %xmm1
42#define RC1 %xmm2
43#define RD1 %xmm3
44
45#define RA2 %xmm4
46#define RB2 %xmm5
47#define RC2 %xmm6
48#define RD2 %xmm7
49
50#define RX %xmm8
51#define RY %xmm9
52
53#define RK1 %xmm10
54#define RK2 %xmm11
55
56#define RID1 %rax
57#define RID1b %al
58#define RID2 %rbx
59#define RID2b %bl
60
61#define RGI1 %rdx
62#define RGI1bl %dl
63#define RGI1bh %dh
64#define RGI2 %rcx
65#define RGI2bl %cl
66#define RGI2bh %ch
67
68#define RGS1 %r8
69#define RGS1d %r8d
70#define RGS2 %r9
71#define RGS2d %r9d
72#define RGS3 %r10
73#define RGS3d %r10d
74
75
/*
 * lookup_32bit(t0, t1, t2, t3, src, dst)
 *
 * Byte-wise table lookup over the low 32 bits of 'src'. Byte 0 indexes t0,
 * byte 1 indexes t1, byte 2 indexes t2 and byte 3 indexes t3 (each tN is a
 * byte offset from CTX, entries are 4 bytes wide). The four entries are
 * combined with XOR and written to the 32-bit register dst##d.
 *
 * Only the low byte of RID1/RID2 is written here, so the caller must have
 * cleared their upper bits beforehand. Side effect: src is left shifted
 * right by 16 bits.
 */
76#define lookup_32bit(t0, t1, t2, t3, src, dst) \
77 movb src ## bl, RID1b; \
78 movb src ## bh, RID2b; \
79 movl t0(CTX, RID1, 4), dst ## d; \
80 xorl t1(CTX, RID2, 4), dst ## d; \
81 shrq $16, src; \
82 movb src ## bl, RID1b; \
83 movb src ## bh, RID2b; \
84 xorl t2(CTX, RID1, 4), dst ## d; \
85 xorl t3(CTX, RID2, 4), dst ## d;
86
/*
 * G(a, x, t0, t1, t2, t3)
 *
 * Runs lookup_32bit on each of the four 32-bit lanes of xmm register 'a' and
 * stores the four results, lane for lane, in xmm register 'x'.
 *
 * The low quadword of 'a' is processed through RGI1 and the high quadword
 * (extracted with vpsrldq $8) through RGI2. Each pair of 32-bit results is
 * packed into one 64-bit GPR (RGS2 for the low half, RGS3 for the high half)
 * before being moved back into 'x' with vmovq/vpinsrq.
 *
 * Clobbers RGI1, RGI2, RGS1, RGS2, RGS3 and the low bytes of RID1/RID2.
 * 'x' is also used as a temporary, so it must not alias 'a'.
 */
87#define G(a, x, t0, t1, t2, t3) \
88 vmovq a, RGI1; \
89 vpsrldq $8, a, x; \
90 vmovq x, RGI2; \
91 \
92 lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
93 shrq $16, RGI1; \
94 lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
95 shlq $32, RGS2; \
96 orq RGS1, RGS2; \
97 \
98 lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
99 shrq $16, RGI2; \
100 lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
101 shlq $32, RGS3; \
102 orq RGS1, RGS3; \
103 \
104 vmovq RGS2, x; \
105 vpinsrq $1, RGS3, x, x;
106
/*
 * encround(a, b, c, d, x, y)
 *
 * One encryption round on four blocks held column-wise in a, b, c, d
 * (one 32-bit word of each block per lane). Uses the round keys already
 * broadcast into RK1/RK2.
 *
 *   x = G(a)            tables in order s0, s1, s2, s3
 *   y = G(b)            tables in order s1, s2, s3, s0
 *   x = x + y;  y = x + y          (so y ends up as G(a) + 2*G(b))
 *   x += RK1;   y += RK2
 *   c = ror32(c ^ x, 1)
 *   d = rol32(d, 1) ^ y
 *
 * x and y are scratch and are clobbered, as is everything G() clobbers.
 */
107#define encround(a, b, c, d, x, y) \
108 G(a, x, s0, s1, s2, s3); \
109 G(b, y, s1, s2, s3, s0); \
110 vpaddd x, y, x; \
111 vpaddd y, x, y; \
112 vpaddd x, RK1, x; \
113 vpaddd y, RK2, y; \
114 vpxor x, c, c; \
115 vpsrld $1, c, x; \
116 vpslld $(32 - 1), c, c; \
117 vpor c, x, c; \
118 vpslld $1, d, x; \
119 vpsrld $(32 - 1), d, d; \
120 vpor d, x, d; \
121 vpxor d, y, d;
122
/*
 * decround(a, b, c, d, x, y)
 *
 * One decryption round; undoes the c/d update performed by encround for the
 * same a, b and round keys (RK1/RK2 must already be loaded).
 *
 *   x = G(a);  y = G(b)            same table orders as encround
 *   x = x + y;  y = x + y
 *   y += RK2
 *   d = ror32(d ^ y, 1)
 *   c = rol32(c, 1) ^ (x + RK1)
 *
 * y is reused as the rotate temporary once d has been XORed, which is why
 * the RK1 addition to x is deferred until the end.
 */
123#define decround(a, b, c, d, x, y) \
124 G(a, x, s0, s1, s2, s3); \
125 G(b, y, s1, s2, s3, s0); \
126 vpaddd x, y, x; \
127 vpaddd y, x, y; \
128 vpaddd y, RK2, y; \
129 vpxor d, y, d; \
130 vpsrld $1, d, y; \
131 vpslld $(32 - 1), d, d; \
132 vpor d, y, d; \
133 vpslld $1, c, y; \
134 vpsrld $(32 - 1), c, c; \
135 vpor c, y, c; \
136 vpaddd x, RK1, x; \
137 vpxor x, c, c;
138
/*
 * encrypt_round(n, a, b, c, d)
 *
 * Loads the two 32-bit round-key words for round n (at byte offsets
 * k + 4*(2n) and k + 4*(2n+1) from CTX), broadcasts each to all lanes of
 * RK1/RK2, then runs encround on both register sets (suffix 1 and suffix 2),
 * i.e. on all eight blocks.
 */
139#define encrypt_round(n, a, b, c, d) \
140 vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
141 vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
142 encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
143 encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
144
/*
 * decrypt_round(n, a, b, c, d)
 *
 * Decryption counterpart of encrypt_round: broadcasts the same two round-key
 * words for round n into RK1/RK2 and runs decround on both register sets.
 */
145#define decrypt_round(n, a, b, c, d) \
146 vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
147 vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
148 decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
149 decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
150
/*
 * encrypt_cycle(n): rounds 2n and 2n+1. The second round swaps the roles of
 * the (A, B) and (C, D) register pairs instead of moving data between them.
 */
151#define encrypt_cycle(n) \
152 encrypt_round((2*n), RA, RB, RC, RD); \
153 encrypt_round(((2*n) + 1), RC, RD, RA, RB);
154
/*
 * decrypt_cycle(n): undoes encrypt_cycle(n) by running round 2n+1 first
 * (with the swapped register roles) and round 2n second.
 */
155#define decrypt_cycle(n) \
156 decrypt_round(((2*n) + 1), RC, RD, RA, RB); \
157 decrypt_round((2*n), RA, RB, RC, RD);
158
159
/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 *
 * In-place transpose of the 4x4 matrix of 32-bit words held in x0..x3
 * (one row per register). First interleaves dwords of row pairs, then
 * interleaves the resulting qwords. t0, t1 and t2 are clobbered.
 */
160#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
161 vpunpckldq x1, x0, t0; \
162 vpunpckhdq x1, x0, t2; \
163 vpunpckldq x3, x2, t1; \
164 vpunpckhdq x3, x2, x3; \
165 \
166 vpunpcklqdq t1, t0, x0; \
167 vpunpckhqdq t1, t0, x1; \
168 vpunpcklqdq x3, t2, x2; \
169 vpunpckhqdq x3, t2, x3;
170
/*
 * inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2)
 *
 * Loads four consecutive 16-byte blocks from 'in', XORs each with 'wkey',
 * and transposes them so that x0..x3 each hold one 32-bit word position of
 * all four blocks. t0..t2 are scratch for the transpose.
 */
171#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
172 vpxor (0*4*4)(in), wkey, x0; \
173 vpxor (1*4*4)(in), wkey, x1; \
174 vpxor (2*4*4)(in), wkey, x2; \
175 vpxor (3*4*4)(in), wkey, x3; \
176 \
177 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
178
/*
 * outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2)
 *
 * Inverse of inpack_blocks: transposes x0..x3 back to one block per
 * register, XORs each with 'wkey' and stores the four 16-byte blocks to
 * 'out' with unaligned stores. Previous contents of 'out' are overwritten.
 */
179#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
180 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
181 \
182 vpxor x0, wkey, x0; \
183 vmovdqu x0, (0*4*4)(out); \
184 vpxor x1, wkey, x1; \
185 vmovdqu x1, (1*4*4)(out); \
186 vpxor x2, wkey, x2; \
187 vmovdqu x2, (2*4*4)(out); \
188 vpxor x3, wkey, x3; \
189 vmovdqu x3, (3*4*4)(out);
190
/*
 * outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2)
 *
 * Same as outunpack_blocks, except that each result block is additionally
 * XORed with the 16 bytes already present at its destination in 'out'
 * before being stored (read-modify-write of 'out').
 */
191#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
192 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
193 \
194 vpxor x0, wkey, x0; \
195 vpxor (0*4*4)(out), x0, x0; \
196 vmovdqu x0, (0*4*4)(out); \
197 vpxor x1, wkey, x1; \
198 vpxor (1*4*4)(out), x1, x1; \
199 vmovdqu x1, (1*4*4)(out); \
200 vpxor x2, wkey, x2; \
201 vpxor (2*4*4)(out), x2, x2; \
202 vmovdqu x2, (2*4*4)(out); \
203 vpxor x3, wkey, x3; \
204 vpxor (3*4*4)(out), x3, x3; \
205 vmovdqu x3, (3*4*4)(out);
206
/*
 * __twofish_enc_blk_8way: encrypt eight 16-byte blocks in parallel.
 * Reads 128 bytes from src and writes 128 bytes to dst; when the xor flag
 * is non-zero the result is XORed into the existing contents of dst.
 */
207.align 8
208.global __twofish_enc_blk_8way
209.type __twofish_enc_blk_8way,@function;
210
211__twofish_enc_blk_8way:
212 /* input:
213 * %rdi: ctx, CTX
214 * %rsi: dst
215 * %rdx: src
216 * %rcx: bool, if true: xor output
217 */
218
 /* %rbx is RID2 (callee-saved); %rcx holds the xor flag but doubles as
  * RGI2, which G() clobbers, so it is parked on the stack for the rounds.
  */
219 pushq %rbx;
220 pushq %rcx;
221
 /* first four 32-bit words at offset w: XORed into the input blocks */
222 vmovdqu w(CTX), RK1;
223
 /* %rax = src + 64: blocks 4..7 go to register set 2. src (%rdx, also
  * RGI1) must be consumed here, before the rounds overwrite it.
  */
224 leaq (4*4*4)(%rdx), %rax;
225 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
226 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
227
 /* lookup_32bit only writes the low byte of the index registers */
228 xorq RID1, RID1;
229 xorq RID2, RID2;
230
 /* 8 cycles of 2 rounds each = 16 rounds */
231 encrypt_cycle(0);
232 encrypt_cycle(1);
233 encrypt_cycle(2);
234 encrypt_cycle(3);
235 encrypt_cycle(4);
236 encrypt_cycle(5);
237 encrypt_cycle(6);
238 encrypt_cycle(7);
239
 /* next four 32-bit words at offset w: XORed into the output blocks */
240 vmovdqu (w+4*4)(CTX), RK1;
241
242 popq %rcx;
243 popq %rbx;
244
245 leaq (4*4*4)(%rsi), %rax;
246
247 testb %cl, %cl;
248 jnz __enc_xor8;
249
 /* output word order is (C, D, A, B): the two halves are swapped
  * relative to the input order
  */
250 outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
251 outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
252
253 ret;
254
255__enc_xor8:
 /* xor variant: dst ^= encrypted blocks */
256 outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
257 outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
258
259 ret;
260
/*
 * twofish_dec_blk_8way: decrypt eight 16-byte blocks in parallel.
 * Reads 128 bytes from src and writes 128 bytes to dst (no xor variant).
 * Mirrors __twofish_enc_blk_8way: the key words at w+16 are applied on
 * input, the cycles run 7..0, and the key words at w are applied on output.
 */
261.align 8
262.global twofish_dec_blk_8way
263.type twofish_dec_blk_8way,@function;
264
265twofish_dec_blk_8way:
266 /* input:
267 * %rdi: ctx, CTX
268 * %rsi: dst
269 * %rdx: src
270 */
271
 /* %rbx is RID2 and is callee-saved */
272 pushq %rbx;
273
274 vmovdqu (w+4*4)(CTX), RK1;
275
 /* input is loaded in (C, D, A, B) order, matching the encrypt output */
276 leaq (4*4*4)(%rdx), %rax;
277 inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
278 inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
279
 /* lookup_32bit only writes the low byte of the index registers */
280 xorq RID1, RID1;
281 xorq RID2, RID2;
282
283 decrypt_cycle(7);
284 decrypt_cycle(6);
285 decrypt_cycle(5);
286 decrypt_cycle(4);
287 decrypt_cycle(3);
288 decrypt_cycle(2);
289 decrypt_cycle(1);
290 decrypt_cycle(0);
291
292 vmovdqu (w)(CTX), RK1;
293
294 popq %rbx;
295
296 leaq (4*4*4)(%rsi), %rax;
297 outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
298 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
299
300 ret;
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
new file mode 100644
index 000000000000..782b67ddaf6a
--- /dev/null
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -0,0 +1,624 @@
1/*
2 * Glue Code for AVX assembler version of Twofish Cipher
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * USA
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/hardirq.h>
26#include <linux/types.h>
27#include <linux/crypto.h>
28#include <linux/err.h>
29#include <crypto/algapi.h>
30#include <crypto/twofish.h>
31#include <crypto/cryptd.h>
32#include <crypto/b128ops.h>
33#include <crypto/ctr.h>
34#include <crypto/lrw.h>
35#include <crypto/xts.h>
36#include <asm/i387.h>
37#include <asm/xcr.h>
38#include <asm/xsave.h>
39#include <asm/crypto/twofish.h>
40#include <asm/crypto/ablk_helper.h>
41#include <asm/crypto/glue_helper.h>
42#include <crypto/scatterwalk.h>
43#include <linux/workqueue.h>
44#include <linux/spinlock.h>
45
46#define TWOFISH_PARALLEL_BLOCKS 8
47
48static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
49 const u8 *src)
50{
51 __twofish_enc_blk_3way(ctx, dst, src, false);
52}
53
54/* 8-way parallel cipher functions */
55asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst,
56 const u8 *src, bool xor);
57asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst,
58 const u8 *src);
59
60static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst,
61 const u8 *src)
62{
63 __twofish_enc_blk_8way(ctx, dst, src, false);
64}
65
66static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
67 const u8 *src)
68{
69 __twofish_enc_blk_8way(ctx, dst, src, true);
70}
71
72static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst,
73 const u8 *src)
74{
75 twofish_dec_blk_8way(ctx, dst, src);
76}
77
78static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
79{
80 u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
81 unsigned int j;
82
83 for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
84 ivs[j] = src[j];
85
86 twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
87
88 for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
89 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
90}
91
92static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
93 u128 *iv)
94{
95 be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
96 unsigned int i;
97
98 for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
99 if (dst != src)
100 dst[i] = src[i];
101
102 u128_to_be128(&ctrblks[i], iv);
103 u128_inc(iv);
104 }
105
106 twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
107}
108
/*
 * ECB encryption dispatch table handed to glue_ecb_crypt_128bit().
 * Entries run from the widest routine (8 blocks, AVX) through the 3-way
 * routine down to the single-block routine.
 * NOTE(review): how the glue helper picks an entry and how it interprets
 * fpu_blocks_limit is defined in glue_helper, not visible here.
 */
109static const struct common_glue_ctx twofish_enc = {
110 .num_funcs = 3,
111 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
112
113 .funcs = { {
114 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
115 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) }
116 }, {
117 .num_blocks = 3,
118 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
119 }, {
120 .num_blocks = 1,
121 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
122 } }
123};
124
/*
 * CTR dispatch table handed to glue_ctr_crypt_128bit(): 8-way (local AVX
 * helper), 3-way and single-block counter-mode routines, widest first.
 * The 3-way and 1-way helpers are defined outside this file.
 */
125static const struct common_glue_ctx twofish_ctr = {
126 .num_funcs = 3,
127 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
128
129 .funcs = { {
130 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
131 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) }
132 }, {
133 .num_blocks = 3,
134 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
135 }, {
136 .num_blocks = 1,
137 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
138 } }
139};
140
/*
 * ECB decryption dispatch table handed to glue_ecb_crypt_128bit():
 * 8-way AVX, 3-way and single-block decrypt routines, widest first.
 */
141static const struct common_glue_ctx twofish_dec = {
142 .num_funcs = 3,
143 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
144
145 .funcs = { {
146 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
147 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) }
148 }, {
149 .num_blocks = 3,
150 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
151 }, {
152 .num_blocks = 1,
153 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
154 } }
155};
156
/*
 * CBC decryption dispatch table handed to glue_cbc_decrypt_128bit().
 * The multi-block entries use the *_cbc_* helpers, which chain blocks inside
 * a batch; the single-block entry is the plain block decrypt routine cast to
 * the CBC function type.
 * NOTE(review): presumably the glue helper does the XOR with the previous
 * ciphertext block itself in the single-block case - confirm.
 */
157static const struct common_glue_ctx twofish_dec_cbc = {
158 .num_funcs = 3,
159 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
160
161 .funcs = { {
162 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
163 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) }
164 }, {
165 .num_blocks = 3,
166 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
167 }, {
168 .num_blocks = 1,
169 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
170 } }
171};
172
173static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
174 struct scatterlist *src, unsigned int nbytes)
175{
176 return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
177}
178
179static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
180 struct scatterlist *src, unsigned int nbytes)
181{
182 return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
183}
184
185static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
186 struct scatterlist *src, unsigned int nbytes)
187{
188 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
189 dst, src, nbytes);
190}
191
192static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
193 struct scatterlist *src, unsigned int nbytes)
194{
195 return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
196 nbytes);
197}
198
199static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
200 struct scatterlist *src, unsigned int nbytes)
201{
202 return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
203}
204
205static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
206{
207 return glue_fpu_begin(TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS, NULL,
208 fpu_enabled, nbytes);
209}
210
211static inline void twofish_fpu_end(bool fpu_enabled)
212{
213 glue_fpu_end(fpu_enabled);
214}
215
/*
 * Per-request state passed (as crypt_ctx) to encrypt_callback() and
 * decrypt_callback() by the LRW and XTS entry points.
 */
216struct crypt_priv {
 /* key schedule handed to the twofish block routines */
217 struct twofish_ctx *ctx;
 /* result of the last twofish_fpu_begin(); given to twofish_fpu_end() once the request is done */
218 bool fpu_enabled;
219};
220
221static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
222{
223 const unsigned int bsize = TF_BLOCK_SIZE;
224 struct crypt_priv *ctx = priv;
225 int i;
226
227 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
228
229 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
230 twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst);
231 return;
232 }
233
234 for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
235 twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
236
237 nbytes %= bsize * 3;
238
239 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
240 twofish_enc_blk(ctx->ctx, srcdst, srcdst);
241}
242
243static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
244{
245 const unsigned int bsize = TF_BLOCK_SIZE;
246 struct crypt_priv *ctx = priv;
247 int i;
248
249 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
250
251 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
252 twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst);
253 return;
254 }
255
256 for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
257 twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
258
259 nbytes %= bsize * 3;
260
261 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
262 twofish_dec_blk(ctx->ctx, srcdst, srcdst);
263}
264
265static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
266 struct scatterlist *src, unsigned int nbytes)
267{
268 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
269 be128 buf[TWOFISH_PARALLEL_BLOCKS];
270 struct crypt_priv crypt_ctx = {
271 .ctx = &ctx->twofish_ctx,
272 .fpu_enabled = false,
273 };
274 struct lrw_crypt_req req = {
275 .tbuf = buf,
276 .tbuflen = sizeof(buf),
277
278 .table_ctx = &ctx->lrw_table,
279 .crypt_ctx = &crypt_ctx,
280 .crypt_fn = encrypt_callback,
281 };
282 int ret;
283
284 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
285 ret = lrw_crypt(desc, dst, src, nbytes, &req);
286 twofish_fpu_end(crypt_ctx.fpu_enabled);
287
288 return ret;
289}
290
291static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
292 struct scatterlist *src, unsigned int nbytes)
293{
294 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
295 be128 buf[TWOFISH_PARALLEL_BLOCKS];
296 struct crypt_priv crypt_ctx = {
297 .ctx = &ctx->twofish_ctx,
298 .fpu_enabled = false,
299 };
300 struct lrw_crypt_req req = {
301 .tbuf = buf,
302 .tbuflen = sizeof(buf),
303
304 .table_ctx = &ctx->lrw_table,
305 .crypt_ctx = &crypt_ctx,
306 .crypt_fn = decrypt_callback,
307 };
308 int ret;
309
310 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
311 ret = lrw_crypt(desc, dst, src, nbytes, &req);
312 twofish_fpu_end(crypt_ctx.fpu_enabled);
313
314 return ret;
315}
316
317static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
318 struct scatterlist *src, unsigned int nbytes)
319{
320 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
321 be128 buf[TWOFISH_PARALLEL_BLOCKS];
322 struct crypt_priv crypt_ctx = {
323 .ctx = &ctx->crypt_ctx,
324 .fpu_enabled = false,
325 };
326 struct xts_crypt_req req = {
327 .tbuf = buf,
328 .tbuflen = sizeof(buf),
329
330 .tweak_ctx = &ctx->tweak_ctx,
331 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
332 .crypt_ctx = &crypt_ctx,
333 .crypt_fn = encrypt_callback,
334 };
335 int ret;
336
337 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
338 ret = xts_crypt(desc, dst, src, nbytes, &req);
339 twofish_fpu_end(crypt_ctx.fpu_enabled);
340
341 return ret;
342}
343
344static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
345 struct scatterlist *src, unsigned int nbytes)
346{
347 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
348 be128 buf[TWOFISH_PARALLEL_BLOCKS];
349 struct crypt_priv crypt_ctx = {
350 .ctx = &ctx->crypt_ctx,
351 .fpu_enabled = false,
352 };
353 struct xts_crypt_req req = {
354 .tbuf = buf,
355 .tbuflen = sizeof(buf),
356
357 .tweak_ctx = &ctx->tweak_ctx,
358 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
359 .crypt_ctx = &crypt_ctx,
360 .crypt_fn = decrypt_callback,
361 };
362 int ret;
363
364 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
365 ret = xts_crypt(desc, dst, src, nbytes, &req);
366 twofish_fpu_end(crypt_ctx.fpu_enabled);
367
368 return ret;
369}
370
/*
 * Algorithm registrations for the AVX twofish driver.
 *
 * Entries 0-4: synchronous blkcipher implementations (ecb, cbc, ctr, lrw,
 * xts). They carry "__"-prefixed names and priority 0 and use the mode
 * functions defined in this file.
 *
 * Entries 5-9: asynchronous ablkcipher front ends with the public names
 * ("ecb(twofish)", ...) and priority 400. They use the shared ablk_* helpers
 * and struct async_helper_ctx.
 * NOTE(review): the ablk_* helpers are declared outside this file
 * (presumably asm/crypto/ablk_helper.h); how they locate the matching
 * "__driver-" entry is not visible here.
 */
371static struct crypto_alg twofish_algs[10] = { {
372 .cra_name = "__ecb-twofish-avx",
373 .cra_driver_name = "__driver-ecb-twofish-avx",
374 .cra_priority = 0,
375 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
376 .cra_blocksize = TF_BLOCK_SIZE,
377 .cra_ctxsize = sizeof(struct twofish_ctx),
378 .cra_alignmask = 0,
379 .cra_type = &crypto_blkcipher_type,
380 .cra_module = THIS_MODULE,
381 .cra_list = LIST_HEAD_INIT(twofish_algs[0].cra_list),
382 .cra_u = {
383 .blkcipher = {
384 .min_keysize = TF_MIN_KEY_SIZE,
385 .max_keysize = TF_MAX_KEY_SIZE,
386 .setkey = twofish_setkey,
387 .encrypt = ecb_encrypt,
388 .decrypt = ecb_decrypt,
389 },
390 },
391}, {
392 .cra_name = "__cbc-twofish-avx",
393 .cra_driver_name = "__driver-cbc-twofish-avx",
394 .cra_priority = 0,
395 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
396 .cra_blocksize = TF_BLOCK_SIZE,
397 .cra_ctxsize = sizeof(struct twofish_ctx),
398 .cra_alignmask = 0,
399 .cra_type = &crypto_blkcipher_type,
400 .cra_module = THIS_MODULE,
401 .cra_list = LIST_HEAD_INIT(twofish_algs[1].cra_list),
402 .cra_u = {
403 .blkcipher = {
404 .min_keysize = TF_MIN_KEY_SIZE,
405 .max_keysize = TF_MAX_KEY_SIZE,
406 .setkey = twofish_setkey,
407 .encrypt = cbc_encrypt,
408 .decrypt = cbc_decrypt,
409 },
410 },
411}, {
 /* stream mode: block size 1; one function serves both directions */
412 .cra_name = "__ctr-twofish-avx",
413 .cra_driver_name = "__driver-ctr-twofish-avx",
414 .cra_priority = 0,
415 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
416 .cra_blocksize = 1,
417 .cra_ctxsize = sizeof(struct twofish_ctx),
418 .cra_alignmask = 0,
419 .cra_type = &crypto_blkcipher_type,
420 .cra_module = THIS_MODULE,
421 .cra_list = LIST_HEAD_INIT(twofish_algs[2].cra_list),
422 .cra_u = {
423 .blkcipher = {
424 .min_keysize = TF_MIN_KEY_SIZE,
425 .max_keysize = TF_MAX_KEY_SIZE,
426 .ivsize = TF_BLOCK_SIZE,
427 .setkey = twofish_setkey,
428 .encrypt = ctr_crypt,
429 .decrypt = ctr_crypt,
430 },
431 },
432}, {
 /* LRW: key is a cipher key plus one block of tweak key material */
433 .cra_name = "__lrw-twofish-avx",
434 .cra_driver_name = "__driver-lrw-twofish-avx",
435 .cra_priority = 0,
436 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
437 .cra_blocksize = TF_BLOCK_SIZE,
438 .cra_ctxsize = sizeof(struct twofish_lrw_ctx),
439 .cra_alignmask = 0,
440 .cra_type = &crypto_blkcipher_type,
441 .cra_module = THIS_MODULE,
442 .cra_list = LIST_HEAD_INIT(twofish_algs[3].cra_list),
443 .cra_exit = lrw_twofish_exit_tfm,
444 .cra_u = {
445 .blkcipher = {
446 .min_keysize = TF_MIN_KEY_SIZE +
447 TF_BLOCK_SIZE,
448 .max_keysize = TF_MAX_KEY_SIZE +
449 TF_BLOCK_SIZE,
450 .ivsize = TF_BLOCK_SIZE,
451 .setkey = lrw_twofish_setkey,
452 .encrypt = lrw_encrypt,
453 .decrypt = lrw_decrypt,
454 },
455 },
456}, {
 /* XTS: two cipher keys (data + tweak), hence the doubled key sizes */
457 .cra_name = "__xts-twofish-avx",
458 .cra_driver_name = "__driver-xts-twofish-avx",
459 .cra_priority = 0,
460 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
461 .cra_blocksize = TF_BLOCK_SIZE,
462 .cra_ctxsize = sizeof(struct twofish_xts_ctx),
463 .cra_alignmask = 0,
464 .cra_type = &crypto_blkcipher_type,
465 .cra_module = THIS_MODULE,
466 .cra_list = LIST_HEAD_INIT(twofish_algs[4].cra_list),
467 .cra_u = {
468 .blkcipher = {
469 .min_keysize = TF_MIN_KEY_SIZE * 2,
470 .max_keysize = TF_MAX_KEY_SIZE * 2,
471 .ivsize = TF_BLOCK_SIZE,
472 .setkey = xts_twofish_setkey,
473 .encrypt = xts_encrypt,
474 .decrypt = xts_decrypt,
475 },
476 },
477}, {
 /* ---- async front ends (public names) start here ---- */
478 .cra_name = "ecb(twofish)",
479 .cra_driver_name = "ecb-twofish-avx",
480 .cra_priority = 400,
481 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
482 .cra_blocksize = TF_BLOCK_SIZE,
483 .cra_ctxsize = sizeof(struct async_helper_ctx),
484 .cra_alignmask = 0,
485 .cra_type = &crypto_ablkcipher_type,
486 .cra_module = THIS_MODULE,
487 .cra_list = LIST_HEAD_INIT(twofish_algs[5].cra_list),
488 .cra_init = ablk_init,
489 .cra_exit = ablk_exit,
490 .cra_u = {
491 .ablkcipher = {
492 .min_keysize = TF_MIN_KEY_SIZE,
493 .max_keysize = TF_MAX_KEY_SIZE,
494 .setkey = ablk_set_key,
495 .encrypt = ablk_encrypt,
496 .decrypt = ablk_decrypt,
497 },
498 },
499}, {
 /*
  * cbc: .encrypt is __ablk_encrypt, not ablk_encrypt as in the other modes.
  * NOTE(review): presumably because CBC encryption is serial and only uses
  * the single-block routine (see cbc_encrypt) - confirm against ablk_helper.
  */
500 .cra_name = "cbc(twofish)",
501 .cra_driver_name = "cbc-twofish-avx",
502 .cra_priority = 400,
503 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
504 .cra_blocksize = TF_BLOCK_SIZE,
505 .cra_ctxsize = sizeof(struct async_helper_ctx),
506 .cra_alignmask = 0,
507 .cra_type = &crypto_ablkcipher_type,
508 .cra_module = THIS_MODULE,
509 .cra_list = LIST_HEAD_INIT(twofish_algs[6].cra_list),
510 .cra_init = ablk_init,
511 .cra_exit = ablk_exit,
512 .cra_u = {
513 .ablkcipher = {
514 .min_keysize = TF_MIN_KEY_SIZE,
515 .max_keysize = TF_MAX_KEY_SIZE,
516 .ivsize = TF_BLOCK_SIZE,
517 .setkey = ablk_set_key,
518 .encrypt = __ablk_encrypt,
519 .decrypt = ablk_decrypt,
520 },
521 },
522}, {
 /* ctr: .decrypt deliberately points at ablk_encrypt; the sync entry also uses one function for both directions */
523 .cra_name = "ctr(twofish)",
524 .cra_driver_name = "ctr-twofish-avx",
525 .cra_priority = 400,
526 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
527 .cra_blocksize = 1,
528 .cra_ctxsize = sizeof(struct async_helper_ctx),
529 .cra_alignmask = 0,
530 .cra_type = &crypto_ablkcipher_type,
531 .cra_module = THIS_MODULE,
532 .cra_list = LIST_HEAD_INIT(twofish_algs[7].cra_list),
533 .cra_init = ablk_init,
534 .cra_exit = ablk_exit,
535 .cra_u = {
536 .ablkcipher = {
537 .min_keysize = TF_MIN_KEY_SIZE,
538 .max_keysize = TF_MAX_KEY_SIZE,
539 .ivsize = TF_BLOCK_SIZE,
540 .setkey = ablk_set_key,
541 .encrypt = ablk_encrypt,
542 .decrypt = ablk_encrypt,
543 .geniv = "chainiv",
544 },
545 },
546}, {
547 .cra_name = "lrw(twofish)",
548 .cra_driver_name = "lrw-twofish-avx",
549 .cra_priority = 400,
550 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
551 .cra_blocksize = TF_BLOCK_SIZE,
552 .cra_ctxsize = sizeof(struct async_helper_ctx),
553 .cra_alignmask = 0,
554 .cra_type = &crypto_ablkcipher_type,
555 .cra_module = THIS_MODULE,
556 .cra_list = LIST_HEAD_INIT(twofish_algs[8].cra_list),
557 .cra_init = ablk_init,
558 .cra_exit = ablk_exit,
559 .cra_u = {
560 .ablkcipher = {
561 .min_keysize = TF_MIN_KEY_SIZE +
562 TF_BLOCK_SIZE,
563 .max_keysize = TF_MAX_KEY_SIZE +
564 TF_BLOCK_SIZE,
565 .ivsize = TF_BLOCK_SIZE,
566 .setkey = ablk_set_key,
567 .encrypt = ablk_encrypt,
568 .decrypt = ablk_decrypt,
569 },
570 },
571}, {
572 .cra_name = "xts(twofish)",
573 .cra_driver_name = "xts-twofish-avx",
574 .cra_priority = 400,
575 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
576 .cra_blocksize = TF_BLOCK_SIZE,
577 .cra_ctxsize = sizeof(struct async_helper_ctx),
578 .cra_alignmask = 0,
579 .cra_type = &crypto_ablkcipher_type,
580 .cra_module = THIS_MODULE,
581 .cra_list = LIST_HEAD_INIT(twofish_algs[9].cra_list),
582 .cra_init = ablk_init,
583 .cra_exit = ablk_exit,
584 .cra_u = {
585 .ablkcipher = {
586 .min_keysize = TF_MIN_KEY_SIZE * 2,
587 .max_keysize = TF_MAX_KEY_SIZE * 2,
588 .ivsize = TF_BLOCK_SIZE,
589 .setkey = ablk_set_key,
590 .encrypt = ablk_encrypt,
591 .decrypt = ablk_decrypt,
592 },
593 },
594} };
595
596static int __init twofish_init(void)
597{
598 u64 xcr0;
599
600 if (!cpu_has_avx || !cpu_has_osxsave) {
601 printk(KERN_INFO "AVX instructions are not detected.\n");
602 return -ENODEV;
603 }
604
605 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
606 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
607 printk(KERN_INFO "AVX detected but unusable.\n");
608 return -ENODEV;
609 }
610
611 return crypto_register_algs(twofish_algs, ARRAY_SIZE(twofish_algs));
612}
613
614static void __exit twofish_exit(void)
615{
616 crypto_unregister_algs(twofish_algs, ARRAY_SIZE(twofish_algs));
617}
618
619module_init(twofish_init);
620module_exit(twofish_exit);
621
622MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized");
623MODULE_LICENSE("GPL");
624MODULE_ALIAS("twofish");
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 922ab24cce31..15f9347316c8 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -3,11 +3,6 @@
3 * 3 *
4 * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 4 * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 * 5 *
6 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
7 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
8 * CTR part based on code (crypto/ctr.c) by:
9 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or 8 * the Free Software Foundation; either version 2 of the License, or
@@ -33,20 +28,13 @@
33#include <crypto/algapi.h> 28#include <crypto/algapi.h>
34#include <crypto/twofish.h> 29#include <crypto/twofish.h>
35#include <crypto/b128ops.h> 30#include <crypto/b128ops.h>
31#include <asm/crypto/twofish.h>
32#include <asm/crypto/glue_helper.h>
36#include <crypto/lrw.h> 33#include <crypto/lrw.h>
37#include <crypto/xts.h> 34#include <crypto/xts.h>
38 35
39/* regular block cipher functions from twofish_x86_64 module */ 36EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way);
40asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, 37EXPORT_SYMBOL_GPL(twofish_dec_blk_3way);
41 const u8 *src);
42asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
43 const u8 *src);
44
45/* 3-way parallel cipher functions */
46asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
47 const u8 *src, bool xor);
48asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
49 const u8 *src);
50 38
51static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, 39static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
52 const u8 *src) 40 const u8 *src)
@@ -60,311 +48,139 @@ static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst,
60 __twofish_enc_blk_3way(ctx, dst, src, true); 48 __twofish_enc_blk_3way(ctx, dst, src, true);
61} 49}
62 50
63static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, 51void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
64 void (*fn)(struct twofish_ctx *, u8 *, const u8 *),
65 void (*fn_3way)(struct twofish_ctx *, u8 *, const u8 *))
66{
67 struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
68 unsigned int bsize = TF_BLOCK_SIZE;
69 unsigned int nbytes;
70 int err;
71
72 err = blkcipher_walk_virt(desc, walk);
73
74 while ((nbytes = walk->nbytes)) {
75 u8 *wsrc = walk->src.virt.addr;
76 u8 *wdst = walk->dst.virt.addr;
77
78 /* Process three block batch */
79 if (nbytes >= bsize * 3) {
80 do {
81 fn_3way(ctx, wdst, wsrc);
82
83 wsrc += bsize * 3;
84 wdst += bsize * 3;
85 nbytes -= bsize * 3;
86 } while (nbytes >= bsize * 3);
87
88 if (nbytes < bsize)
89 goto done;
90 }
91
92 /* Handle leftovers */
93 do {
94 fn(ctx, wdst, wsrc);
95
96 wsrc += bsize;
97 wdst += bsize;
98 nbytes -= bsize;
99 } while (nbytes >= bsize);
100
101done:
102 err = blkcipher_walk_done(desc, walk, nbytes);
103 }
104
105 return err;
106}
107
108static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
109 struct scatterlist *src, unsigned int nbytes)
110{ 52{
111 struct blkcipher_walk walk; 53 u128 ivs[2];
112 54
113 blkcipher_walk_init(&walk, dst, src, nbytes); 55 ivs[0] = src[0];
114 return ecb_crypt(desc, &walk, twofish_enc_blk, twofish_enc_blk_3way); 56 ivs[1] = src[1];
115}
116 57
117static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 58 twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
118 struct scatterlist *src, unsigned int nbytes)
119{
120 struct blkcipher_walk walk;
121 59
122 blkcipher_walk_init(&walk, dst, src, nbytes); 60 u128_xor(&dst[1], &dst[1], &ivs[0]);
123 return ecb_crypt(desc, &walk, twofish_dec_blk, twofish_dec_blk_3way); 61 u128_xor(&dst[2], &dst[2], &ivs[1]);
124} 62}
63EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
125 64
126static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, 65void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
127 struct blkcipher_walk *walk)
128{
129 struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
130 unsigned int bsize = TF_BLOCK_SIZE;
131 unsigned int nbytes = walk->nbytes;
132 u128 *src = (u128 *)walk->src.virt.addr;
133 u128 *dst = (u128 *)walk->dst.virt.addr;
134 u128 *iv = (u128 *)walk->iv;
135
136 do {
137 u128_xor(dst, src, iv);
138 twofish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
139 iv = dst;
140
141 src += 1;
142 dst += 1;
143 nbytes -= bsize;
144 } while (nbytes >= bsize);
145
146 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
147 return nbytes;
148}
149
150static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
151 struct scatterlist *src, unsigned int nbytes)
152{ 66{
153 struct blkcipher_walk walk; 67 be128 ctrblk;
154 int err;
155 68
156 blkcipher_walk_init(&walk, dst, src, nbytes); 69 if (dst != src)
157 err = blkcipher_walk_virt(desc, &walk); 70 *dst = *src;
158 71
159 while ((nbytes = walk.nbytes)) { 72 u128_to_be128(&ctrblk, iv);
160 nbytes = __cbc_encrypt(desc, &walk); 73 u128_inc(iv);
161 err = blkcipher_walk_done(desc, &walk, nbytes);
162 }
163 74
164 return err; 75 twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
76 u128_xor(dst, dst, (u128 *)&ctrblk);
165} 77}
78EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
166 79
167static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, 80void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
168 struct blkcipher_walk *walk) 81 u128 *iv)
169{ 82{
170 struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 83 be128 ctrblks[3];
171 unsigned int bsize = TF_BLOCK_SIZE;
172 unsigned int nbytes = walk->nbytes;
173 u128 *src = (u128 *)walk->src.virt.addr;
174 u128 *dst = (u128 *)walk->dst.virt.addr;
175 u128 ivs[3 - 1];
176 u128 last_iv;
177
178 /* Start of the last block. */
179 src += nbytes / bsize - 1;
180 dst += nbytes / bsize - 1;
181
182 last_iv = *src;
183
184 /* Process three block batch */
185 if (nbytes >= bsize * 3) {
186 do {
187 nbytes -= bsize * (3 - 1);
188 src -= 3 - 1;
189 dst -= 3 - 1;
190
191 ivs[0] = src[0];
192 ivs[1] = src[1];
193
194 twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
195
196 u128_xor(dst + 1, dst + 1, ivs + 0);
197 u128_xor(dst + 2, dst + 2, ivs + 1);
198
199 nbytes -= bsize;
200 if (nbytes < bsize)
201 goto done;
202
203 u128_xor(dst, dst, src - 1);
204 src -= 1;
205 dst -= 1;
206 } while (nbytes >= bsize * 3);
207
208 if (nbytes < bsize)
209 goto done;
210 }
211
212 /* Handle leftovers */
213 for (;;) {
214 twofish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
215
216 nbytes -= bsize;
217 if (nbytes < bsize)
218 break;
219 84
220 u128_xor(dst, dst, src - 1); 85 if (dst != src) {
221 src -= 1; 86 dst[0] = src[0];
222 dst -= 1; 87 dst[1] = src[1];
88 dst[2] = src[2];
223 } 89 }
224 90
225done: 91 u128_to_be128(&ctrblks[0], iv);
226 u128_xor(dst, dst, (u128 *)walk->iv); 92 u128_inc(iv);
227 *(u128 *)walk->iv = last_iv; 93 u128_to_be128(&ctrblks[1], iv);
94 u128_inc(iv);
95 u128_to_be128(&ctrblks[2], iv);
96 u128_inc(iv);
228 97
229 return nbytes; 98 twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
230} 99}
100EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way);
101
102static const struct common_glue_ctx twofish_enc = {
103 .num_funcs = 2,
104 .fpu_blocks_limit = -1,
105
106 .funcs = { {
107 .num_blocks = 3,
108 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
109 }, {
110 .num_blocks = 1,
111 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
112 } }
113};
231 114
232static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 115static const struct common_glue_ctx twofish_ctr = {
233 struct scatterlist *src, unsigned int nbytes) 116 .num_funcs = 2,
234{ 117 .fpu_blocks_limit = -1,
235 struct blkcipher_walk walk; 118
236 int err; 119 .funcs = { {
237 120 .num_blocks = 3,
238 blkcipher_walk_init(&walk, dst, src, nbytes); 121 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr_3way) }
239 err = blkcipher_walk_virt(desc, &walk); 122 }, {
123 .num_blocks = 1,
124 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr) }
125 } }
126};
240 127
241 while ((nbytes = walk.nbytes)) { 128static const struct common_glue_ctx twofish_dec = {
242 nbytes = __cbc_decrypt(desc, &walk); 129 .num_funcs = 2,
243 err = blkcipher_walk_done(desc, &walk, nbytes); 130 .fpu_blocks_limit = -1,
244 } 131
132 .funcs = { {
133 .num_blocks = 3,
134 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
135 }, {
136 .num_blocks = 1,
137 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
138 } }
139};
245 140
246 return err; 141static const struct common_glue_ctx twofish_dec_cbc = {
247} 142 .num_funcs = 2,
143 .fpu_blocks_limit = -1,
144
145 .funcs = { {
146 .num_blocks = 3,
147 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
148 }, {
149 .num_blocks = 1,
150 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
151 } }
152};
248 153
249static inline void u128_to_be128(be128 *dst, const u128 *src) 154static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
155 struct scatterlist *src, unsigned int nbytes)
250{ 156{
251 dst->a = cpu_to_be64(src->a); 157 return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
252 dst->b = cpu_to_be64(src->b);
253} 158}
254 159
255static inline void be128_to_u128(u128 *dst, const be128 *src) 160static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
161 struct scatterlist *src, unsigned int nbytes)
256{ 162{
257 dst->a = be64_to_cpu(src->a); 163 return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
258 dst->b = be64_to_cpu(src->b);
259} 164}
260 165
261static inline void u128_inc(u128 *i) 166static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
167 struct scatterlist *src, unsigned int nbytes)
262{ 168{
263 i->b++; 169 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
264 if (!i->b) 170 dst, src, nbytes);
265 i->a++;
266} 171}
267 172
268static void ctr_crypt_final(struct blkcipher_desc *desc, 173static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
269 struct blkcipher_walk *walk) 174 struct scatterlist *src, unsigned int nbytes)
270{ 175{
271 struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 176 return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
272 u8 *ctrblk = walk->iv; 177 nbytes);
273 u8 keystream[TF_BLOCK_SIZE];
274 u8 *src = walk->src.virt.addr;
275 u8 *dst = walk->dst.virt.addr;
276 unsigned int nbytes = walk->nbytes;
277
278 twofish_enc_blk(ctx, keystream, ctrblk);
279 crypto_xor(keystream, src, nbytes);
280 memcpy(dst, keystream, nbytes);
281
282 crypto_inc(ctrblk, TF_BLOCK_SIZE);
283}
284
285static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
286 struct blkcipher_walk *walk)
287{
288 struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
289 unsigned int bsize = TF_BLOCK_SIZE;
290 unsigned int nbytes = walk->nbytes;
291 u128 *src = (u128 *)walk->src.virt.addr;
292 u128 *dst = (u128 *)walk->dst.virt.addr;
293 u128 ctrblk;
294 be128 ctrblocks[3];
295
296 be128_to_u128(&ctrblk, (be128 *)walk->iv);
297
298 /* Process three block batch */
299 if (nbytes >= bsize * 3) {
300 do {
301 if (dst != src) {
302 dst[0] = src[0];
303 dst[1] = src[1];
304 dst[2] = src[2];
305 }
306
307 /* create ctrblks for parallel encrypt */
308 u128_to_be128(&ctrblocks[0], &ctrblk);
309 u128_inc(&ctrblk);
310 u128_to_be128(&ctrblocks[1], &ctrblk);
311 u128_inc(&ctrblk);
312 u128_to_be128(&ctrblocks[2], &ctrblk);
313 u128_inc(&ctrblk);
314
315 twofish_enc_blk_xor_3way(ctx, (u8 *)dst,
316 (u8 *)ctrblocks);
317
318 src += 3;
319 dst += 3;
320 nbytes -= bsize * 3;
321 } while (nbytes >= bsize * 3);
322
323 if (nbytes < bsize)
324 goto done;
325 }
326
327 /* Handle leftovers */
328 do {
329 if (dst != src)
330 *dst = *src;
331
332 u128_to_be128(&ctrblocks[0], &ctrblk);
333 u128_inc(&ctrblk);
334
335 twofish_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
336 u128_xor(dst, dst, (u128 *)ctrblocks);
337
338 src += 1;
339 dst += 1;
340 nbytes -= bsize;
341 } while (nbytes >= bsize);
342
343done:
344 u128_to_be128((be128 *)walk->iv, &ctrblk);
345 return nbytes;
346} 178}
347 179
348static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, 180static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
349 struct scatterlist *src, unsigned int nbytes) 181 struct scatterlist *src, unsigned int nbytes)
350{ 182{
351 struct blkcipher_walk walk; 183 return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
352 int err;
353
354 blkcipher_walk_init(&walk, dst, src, nbytes);
355 err = blkcipher_walk_virt_block(desc, &walk, TF_BLOCK_SIZE);
356
357 while ((nbytes = walk.nbytes) >= TF_BLOCK_SIZE) {
358 nbytes = __ctr_crypt(desc, &walk);
359 err = blkcipher_walk_done(desc, &walk, nbytes);
360 }
361
362 if (walk.nbytes) {
363 ctr_crypt_final(desc, &walk);
364 err = blkcipher_walk_done(desc, &walk, 0);
365 }
366
367 return err;
368} 184}
369 185
370static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) 186static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
@@ -397,13 +213,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
397 twofish_dec_blk(ctx, srcdst, srcdst); 213 twofish_dec_blk(ctx, srcdst, srcdst);
398} 214}
399 215
400struct twofish_lrw_ctx { 216int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
401 struct lrw_table_ctx lrw_table; 217 unsigned int keylen)
402 struct twofish_ctx twofish_ctx;
403};
404
405static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
406 unsigned int keylen)
407{ 218{
408 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); 219 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
409 int err; 220 int err;
@@ -415,6 +226,7 @@ static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
415 226
416 return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE); 227 return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE);
417} 228}
229EXPORT_SYMBOL_GPL(lrw_twofish_setkey);
418 230
419static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 231static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
420 struct scatterlist *src, unsigned int nbytes) 232 struct scatterlist *src, unsigned int nbytes)
@@ -450,20 +262,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
450 return lrw_crypt(desc, dst, src, nbytes, &req); 262 return lrw_crypt(desc, dst, src, nbytes, &req);
451} 263}
452 264
453static void lrw_exit_tfm(struct crypto_tfm *tfm) 265void lrw_twofish_exit_tfm(struct crypto_tfm *tfm)
454{ 266{
455 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); 267 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
456 268
457 lrw_free_table(&ctx->lrw_table); 269 lrw_free_table(&ctx->lrw_table);
458} 270}
271EXPORT_SYMBOL_GPL(lrw_twofish_exit_tfm);
459 272
460struct twofish_xts_ctx { 273int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
461 struct twofish_ctx tweak_ctx; 274 unsigned int keylen)
462 struct twofish_ctx crypt_ctx;
463};
464
465static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
466 unsigned int keylen)
467{ 275{
468 struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm); 276 struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm);
469 u32 *flags = &tfm->crt_flags; 277 u32 *flags = &tfm->crt_flags;
@@ -486,6 +294,7 @@ static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
486 return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, 294 return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
487 flags); 295 flags);
488} 296}
297EXPORT_SYMBOL_GPL(xts_twofish_setkey);
489 298
490static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 299static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
491 struct scatterlist *src, unsigned int nbytes) 300 struct scatterlist *src, unsigned int nbytes)
@@ -596,7 +405,7 @@ static struct crypto_alg tf_algs[5] = { {
596 .cra_type = &crypto_blkcipher_type, 405 .cra_type = &crypto_blkcipher_type,
597 .cra_module = THIS_MODULE, 406 .cra_module = THIS_MODULE,
598 .cra_list = LIST_HEAD_INIT(tf_algs[3].cra_list), 407 .cra_list = LIST_HEAD_INIT(tf_algs[3].cra_list),
599 .cra_exit = lrw_exit_tfm, 408 .cra_exit = lrw_twofish_exit_tfm,
600 .cra_u = { 409 .cra_u = {
601 .blkcipher = { 410 .blkcipher = {
602 .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE, 411 .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE,
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index daeca56211e3..673ac9b63d6b 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -38,7 +38,7 @@
38int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) 38int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
39{ 39{
40 int err = 0; 40 int err = 0;
41 bool ia32 = is_ia32_task(); 41 bool ia32 = test_thread_flag(TIF_IA32);
42 42
43 if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) 43 if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
44 return -EFAULT; 44 return -EFAULT;
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 49331bedc158..70780689599a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -75,23 +75,54 @@ static inline int alternatives_text_reserved(void *start, void *end)
75} 75}
76#endif /* CONFIG_SMP */ 76#endif /* CONFIG_SMP */
77 77
78#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n"
79
80#define b_replacement(number) "663"#number
81#define e_replacement(number) "664"#number
82
83#define alt_slen "662b-661b"
84#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
85
86#define ALTINSTR_ENTRY(feature, number) \
87 " .long 661b - .\n" /* label */ \
88 " .long " b_replacement(number)"f - .\n" /* new instruction */ \
89 " .word " __stringify(feature) "\n" /* feature bit */ \
90 " .byte " alt_slen "\n" /* source len */ \
91 " .byte " alt_rlen(number) "\n" /* replacement len */
92
93#define DISCARD_ENTRY(number) /* rlen <= slen */ \
94 " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
95
96#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \
97 b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
98
78/* alternative assembly primitive: */ 99/* alternative assembly primitive: */
79#define ALTERNATIVE(oldinstr, newinstr, feature) \ 100#define ALTERNATIVE(oldinstr, newinstr, feature) \
80 \ 101 OLDINSTR(oldinstr) \
81 "661:\n\t" oldinstr "\n662:\n" \ 102 ".section .altinstructions,\"a\"\n" \
82 ".section .altinstructions,\"a\"\n" \ 103 ALTINSTR_ENTRY(feature, 1) \
83 " .long 661b - .\n" /* label */ \ 104 ".previous\n" \
84 " .long 663f - .\n" /* new instruction */ \ 105 ".section .discard,\"aw\",@progbits\n" \
85 " .word " __stringify(feature) "\n" /* feature bit */ \ 106 DISCARD_ENTRY(1) \
86 " .byte 662b-661b\n" /* sourcelen */ \ 107 ".previous\n" \
87 " .byte 664f-663f\n" /* replacementlen */ \ 108 ".section .altinstr_replacement, \"ax\"\n" \
88 ".previous\n" \ 109 ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
89 ".section .discard,\"aw\",@progbits\n" \ 110 ".previous"
90 " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ 111
91 ".previous\n" \ 112#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
92 ".section .altinstr_replacement, \"ax\"\n" \ 113 OLDINSTR(oldinstr) \
93 "663:\n\t" newinstr "\n664:\n" /* replacement */ \ 114 ".section .altinstructions,\"a\"\n" \
94 ".previous" 115 ALTINSTR_ENTRY(feature1, 1) \
116 ALTINSTR_ENTRY(feature2, 2) \
117 ".previous\n" \
118 ".section .discard,\"aw\",@progbits\n" \
119 DISCARD_ENTRY(1) \
120 DISCARD_ENTRY(2) \
121 ".previous\n" \
122 ".section .altinstr_replacement, \"ax\"\n" \
123 ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
124 ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
125 ".previous"
95 126
96/* 127/*
97 * This must be included *after* the definition of ALTERNATIVE due to 128 * This must be included *after* the definition of ALTERNATIVE due to
@@ -140,6 +171,19 @@ static inline int alternatives_text_reserved(void *start, void *end)
140 : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) 171 : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
141 172
142/* 173/*
174 * Like alternative_call, but there are two features and respective functions.
175 * If CPU has feature2, function2 is used.
176 * Otherwise, if CPU has feature1, function1 is used.
177 * Otherwise, old function is used.
178 */
179#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \
180 output, input...) \
181 asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
182 "call %P[new2]", feature2) \
183 : output : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
184 [new2] "i" (newfunc2), ## input)
185
186/*
143 * use this macro(s) if you need more than one output parameter 187 * use this macro(s) if you need more than one output parameter
144 * in alternative_io 188 * in alternative_io
145 */ 189 */
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 49ad773f4b9f..b3341e9cd8fd 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -26,10 +26,31 @@ struct amd_l3_cache {
26 u8 subcaches[4]; 26 u8 subcaches[4];
27}; 27};
28 28
29struct threshold_block {
30 unsigned int block;
31 unsigned int bank;
32 unsigned int cpu;
33 u32 address;
34 u16 interrupt_enable;
35 bool interrupt_capable;
36 u16 threshold_limit;
37 struct kobject kobj;
38 struct list_head miscj;
39};
40
41struct threshold_bank {
42 struct kobject *kobj;
43 struct threshold_block *blocks;
44
45 /* initialized to the number of CPUs on the node sharing this bank */
46 atomic_t cpus;
47};
48
29struct amd_northbridge { 49struct amd_northbridge {
30 struct pci_dev *misc; 50 struct pci_dev *misc;
31 struct pci_dev *link; 51 struct pci_dev *link;
32 struct amd_l3_cache l3_cache; 52 struct amd_l3_cache l3_cache;
53 struct threshold_bank *bank4;
33}; 54};
34 55
35struct amd_northbridge_info { 56struct amd_northbridge_info {
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index eaff4790ed96..f34261296ffb 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -306,7 +306,8 @@ struct apic {
306 unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); 306 unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
307 unsigned long (*check_apicid_present)(int apicid); 307 unsigned long (*check_apicid_present)(int apicid);
308 308
309 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); 309 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask,
310 const struct cpumask *mask);
310 void (*init_apic_ldr)(void); 311 void (*init_apic_ldr)(void);
311 312
312 void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); 313 void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
@@ -331,9 +332,9 @@ struct apic {
331 unsigned long (*set_apic_id)(unsigned int id); 332 unsigned long (*set_apic_id)(unsigned int id);
332 unsigned long apic_id_mask; 333 unsigned long apic_id_mask;
333 334
334 unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); 335 int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
335 unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, 336 const struct cpumask *andmask,
336 const struct cpumask *andmask); 337 unsigned int *apicid);
337 338
338 /* ipi */ 339 /* ipi */
339 void (*send_IPI_mask)(const struct cpumask *mask, int vector); 340 void (*send_IPI_mask)(const struct cpumask *mask, int vector);
@@ -464,6 +465,8 @@ static inline u32 safe_apic_wait_icr_idle(void)
464 return apic->safe_wait_icr_idle(); 465 return apic->safe_wait_icr_idle();
465} 466}
466 467
468extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v));
469
467#else /* CONFIG_X86_LOCAL_APIC */ 470#else /* CONFIG_X86_LOCAL_APIC */
468 471
469static inline u32 apic_read(u32 reg) { return 0; } 472static inline u32 apic_read(u32 reg) { return 0; }
@@ -473,6 +476,7 @@ static inline u64 apic_icr_read(void) { return 0; }
473static inline void apic_icr_write(u32 low, u32 high) { } 476static inline void apic_icr_write(u32 low, u32 high) { }
474static inline void apic_wait_icr_idle(void) { } 477static inline void apic_wait_icr_idle(void) { }
475static inline u32 safe_apic_wait_icr_idle(void) { return 0; } 478static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
479static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}
476 480
477#endif /* CONFIG_X86_LOCAL_APIC */ 481#endif /* CONFIG_X86_LOCAL_APIC */
478 482
@@ -537,7 +541,12 @@ static inline const struct cpumask *default_target_cpus(void)
537#endif 541#endif
538} 542}
539 543
540DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); 544static inline const struct cpumask *online_target_cpus(void)
545{
546 return cpu_online_mask;
547}
548
549DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
541 550
542 551
543static inline unsigned int read_apic_id(void) 552static inline unsigned int read_apic_id(void)
@@ -586,21 +595,50 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
586 595
587#endif 596#endif
588 597
589static inline unsigned int 598static inline int
590default_cpu_mask_to_apicid(const struct cpumask *cpumask) 599flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
600 const struct cpumask *andmask,
601 unsigned int *apicid)
591{ 602{
592 return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; 603 unsigned long cpu_mask = cpumask_bits(cpumask)[0] &
604 cpumask_bits(andmask)[0] &
605 cpumask_bits(cpu_online_mask)[0] &
606 APIC_ALL_CPUS;
607
608 if (likely(cpu_mask)) {
609 *apicid = (unsigned int)cpu_mask;
610 return 0;
611 } else {
612 return -EINVAL;
613 }
593} 614}
594 615
595static inline unsigned int 616extern int
596default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 617default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
597 const struct cpumask *andmask) 618 const struct cpumask *andmask,
619 unsigned int *apicid);
620
621static inline void
622flat_vector_allocation_domain(int cpu, struct cpumask *retmask,
623 const struct cpumask *mask)
598{ 624{
599 unsigned long mask1 = cpumask_bits(cpumask)[0]; 625 /* Careful. Some cpus do not strictly honor the set of cpus
600 unsigned long mask2 = cpumask_bits(andmask)[0]; 626 * specified in the interrupt destination when using lowest
601 unsigned long mask3 = cpumask_bits(cpu_online_mask)[0]; 627 * priority interrupt delivery mode.
628 *
629 * In particular there was a hyperthreading cpu observed to
630 * deliver interrupts to the wrong hyperthread when only one
631 * hyperthread was specified in the interrupt destination.
632 */
633 cpumask_clear(retmask);
634 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
635}
602 636
603 return (unsigned int)(mask1 & mask2 & mask3); 637static inline void
638default_vector_allocation_domain(int cpu, struct cpumask *retmask,
639 const struct cpumask *mask)
640{
641 cpumask_copy(retmask, cpumask_of(cpu));
604} 642}
605 643
606static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) 644static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index a6983b277220..72f5009deb5a 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -264,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
264 * This operation is non-atomic and can be reordered. 264 * This operation is non-atomic and can be reordered.
265 * If two examples of this operation race, one can appear to succeed 265 * If two examples of this operation race, one can appear to succeed
266 * but actually fail. You must protect multiple accesses with a lock. 266 * but actually fail. You must protect multiple accesses with a lock.
267 *
268 * Note: the operation is performed atomically with respect to
269 * the local CPU, but not other CPUs. Portable code should not
270 * rely on this behaviour.
271 * KVM relies on this behaviour on x86 for modifying memory that is also
272 * accessed from a hypervisor on the same CPU if running in a VM: don't change
273 * this without also updating arch/x86/kernel/kvm.c
267 */ 274 */
268static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) 275static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
269{ 276{
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index eb45aa6b1f27..2ad874cb661c 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -66,6 +66,7 @@ struct setup_header {
66 __u64 setup_data; 66 __u64 setup_data;
67 __u64 pref_address; 67 __u64 pref_address;
68 __u32 init_size; 68 __u32 init_size;
69 __u32 handover_offset;
69} __attribute__((packed)); 70} __attribute__((packed));
70 71
71struct sys_desc_table { 72struct sys_desc_table {
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 340ee49961a6..6b7ee5ff6820 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -176,7 +176,7 @@
176#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ 176#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */
177#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ 177#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */
178#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ 178#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
179#define X86_FEATURE_DTS (7*32+ 7) /* Digital Thermal Sensor */ 179#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */
180#define X86_FEATURE_HW_PSTATE (7*32+ 8) /* AMD HW-PState */ 180#define X86_FEATURE_HW_PSTATE (7*32+ 8) /* AMD HW-PState */
181 181
182/* Virtualization flags: Linux defined, word 8 */ 182/* Virtualization flags: Linux defined, word 8 */
@@ -207,6 +207,8 @@
207#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ 207#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
208#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ 208#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */
209#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ 209#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */
210#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */
211#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */
210 212
211#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 213#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
212 214
diff --git a/arch/x86/include/asm/crypto/ablk_helper.h b/arch/x86/include/asm/crypto/ablk_helper.h
new file mode 100644
index 000000000000..4f93df50c23e
--- /dev/null
+++ b/arch/x86/include/asm/crypto/ablk_helper.h
@@ -0,0 +1,31 @@
1/*
2 * Shared async block cipher helpers
3 */
4
5#ifndef _CRYPTO_ABLK_HELPER_H
6#define _CRYPTO_ABLK_HELPER_H
7
8#include <linux/crypto.h>
9#include <linux/kernel.h>
10#include <crypto/cryptd.h>
11
12struct async_helper_ctx {
13 struct cryptd_ablkcipher *cryptd_tfm;
14};
15
16extern int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
17 unsigned int key_len);
18
19extern int __ablk_encrypt(struct ablkcipher_request *req);
20
21extern int ablk_encrypt(struct ablkcipher_request *req);
22
23extern int ablk_decrypt(struct ablkcipher_request *req);
24
25extern void ablk_exit(struct crypto_tfm *tfm);
26
27extern int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name);
28
29extern int ablk_init(struct crypto_tfm *tfm);
30
31#endif /* _CRYPTO_ABLK_HELPER_H */
diff --git a/arch/x86/include/asm/aes.h b/arch/x86/include/asm/crypto/aes.h
index 80545a1cbe39..80545a1cbe39 100644
--- a/arch/x86/include/asm/aes.h
+++ b/arch/x86/include/asm/crypto/aes.h
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
new file mode 100644
index 000000000000..3e408bddc96f
--- /dev/null
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -0,0 +1,115 @@
1/*
2 * Shared glue code for 128bit block ciphers
3 */
4
5#ifndef _CRYPTO_GLUE_HELPER_H
6#define _CRYPTO_GLUE_HELPER_H
7
8#include <linux/kernel.h>
9#include <linux/crypto.h>
10#include <asm/i387.h>
11#include <crypto/b128ops.h>
12
13typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
14typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
15typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
16 u128 *iv);
17
18#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
19#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
20#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn))
21
22struct common_glue_func_entry {
23 unsigned int num_blocks; /* number of blocks that @fn will process */
24 union {
25 common_glue_func_t ecb;
26 common_glue_cbc_func_t cbc;
27 common_glue_ctr_func_t ctr;
28 } fn_u;
29};
30
31struct common_glue_ctx {
32 unsigned int num_funcs;
33 int fpu_blocks_limit; /* -1 means fpu not needed at all */
34
35 /*
36 * First funcs entry must have largest num_blocks and last funcs entry
37 * must have num_blocks == 1!
38 */
39 struct common_glue_func_entry funcs[];
40};
41
42static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit,
43 struct blkcipher_desc *desc,
44 bool fpu_enabled, unsigned int nbytes)
45{
46 if (likely(fpu_blocks_limit < 0))
47 return false;
48
49 if (fpu_enabled)
50 return true;
51
52 /*
53 * Vector-registers are only used when chunk to be processed is large
54 * enough, so do not enable FPU until it is necessary.
55 */
56 if (nbytes < bsize * (unsigned int)fpu_blocks_limit)
57 return false;
58
59 if (desc) {
60 /* prevent sleeping if FPU is in use */
61 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
62 }
63
64 kernel_fpu_begin();
65 return true;
66}
67
68static inline void glue_fpu_end(bool fpu_enabled)
69{
70 if (fpu_enabled)
71 kernel_fpu_end();
72}
73
74static inline void u128_to_be128(be128 *dst, const u128 *src)
75{
76 dst->a = cpu_to_be64(src->a);
77 dst->b = cpu_to_be64(src->b);
78}
79
80static inline void be128_to_u128(u128 *dst, const be128 *src)
81{
82 dst->a = be64_to_cpu(src->a);
83 dst->b = be64_to_cpu(src->b);
84}
85
86static inline void u128_inc(u128 *i)
87{
88 i->b++;
89 if (!i->b)
90 i->a++;
91}
92
93extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
94 struct blkcipher_desc *desc,
95 struct scatterlist *dst,
96 struct scatterlist *src, unsigned int nbytes);
97
98extern int glue_cbc_encrypt_128bit(const common_glue_func_t fn,
99 struct blkcipher_desc *desc,
100 struct scatterlist *dst,
101 struct scatterlist *src,
102 unsigned int nbytes);
103
104extern int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
105 struct blkcipher_desc *desc,
106 struct scatterlist *dst,
107 struct scatterlist *src,
108 unsigned int nbytes);
109
110extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
111 struct blkcipher_desc *desc,
112 struct scatterlist *dst,
113 struct scatterlist *src, unsigned int nbytes);
114
115#endif /* _CRYPTO_GLUE_HELPER_H */
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
new file mode 100644
index 000000000000..432deedd2945
--- /dev/null
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -0,0 +1,32 @@
1#ifndef ASM_X86_SERPENT_AVX_H
2#define ASM_X86_SERPENT_AVX_H
3
4#include <linux/crypto.h>
5#include <crypto/serpent.h>
6
7#define SERPENT_PARALLEL_BLOCKS 8
8
9asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
10 const u8 *src, bool xor);
11asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
12 const u8 *src);
13
14static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
15 const u8 *src)
16{
17 __serpent_enc_blk_8way_avx(ctx, dst, src, false);
18}
19
20static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
21 const u8 *src)
22{
23 __serpent_enc_blk_8way_avx(ctx, dst, src, true);
24}
25
26static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
27 const u8 *src)
28{
29 serpent_dec_blk_8way_avx(ctx, dst, src);
30}
31
32#endif
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/crypto/serpent-sse2.h
index d3ef63fe0c81..e6e77dffbdab 100644
--- a/arch/x86/include/asm/serpent.h
+++ b/arch/x86/include/asm/crypto/serpent-sse2.h
@@ -1,5 +1,5 @@
1#ifndef ASM_X86_SERPENT_H 1#ifndef ASM_X86_SERPENT_SSE2_H
2#define ASM_X86_SERPENT_H 2#define ASM_X86_SERPENT_SSE2_H
3 3
4#include <linux/crypto.h> 4#include <linux/crypto.h>
5#include <crypto/serpent.h> 5#include <crypto/serpent.h>
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h
new file mode 100644
index 000000000000..9d2c514bd5f9
--- /dev/null
+++ b/arch/x86/include/asm/crypto/twofish.h
@@ -0,0 +1,46 @@
1#ifndef ASM_X86_TWOFISH_H
2#define ASM_X86_TWOFISH_H
3
4#include <linux/crypto.h>
5#include <crypto/twofish.h>
6#include <crypto/lrw.h>
7#include <crypto/b128ops.h>
8
9struct twofish_lrw_ctx {
10 struct lrw_table_ctx lrw_table;
11 struct twofish_ctx twofish_ctx;
12};
13
14struct twofish_xts_ctx {
15 struct twofish_ctx tweak_ctx;
16 struct twofish_ctx crypt_ctx;
17};
18
19/* regular block cipher functions from twofish_x86_64 module */
20asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
21 const u8 *src);
22asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
23 const u8 *src);
24
25/* 3-way parallel cipher functions */
26asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
27 const u8 *src, bool xor);
28asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
29 const u8 *src);
30
31/* helpers from twofish_x86_64-3way module */
32extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
33extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
34 u128 *iv);
35extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
36 u128 *iv);
37
38extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
39 unsigned int keylen);
40
41extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm);
42
43extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
44 unsigned int keylen);
45
46#endif /* ASM_X86_TWOFISH_H */
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
index cc70c1c78ca4..75ce3f47d204 100644
--- a/arch/x86/include/asm/emergency-restart.h
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -4,9 +4,7 @@
4enum reboot_type { 4enum reboot_type {
5 BOOT_TRIPLE = 't', 5 BOOT_TRIPLE = 't',
6 BOOT_KBD = 'k', 6 BOOT_KBD = 'k',
7#ifdef CONFIG_X86_32
8 BOOT_BIOS = 'b', 7 BOOT_BIOS = 'b',
9#endif
10 BOOT_ACPI = 'a', 8 BOOT_ACPI = 'a',
11 BOOT_EFI = 'e', 9 BOOT_EFI = 'e',
12 BOOT_CF9 = 'p', 10 BOOT_CF9 = 'p',
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 0baa628e330c..40afa0005c69 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -15,15 +15,6 @@ BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) 15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) 17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
18
19.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
20 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
21.if NUM_INVALIDATE_TLB_VECTORS > \idx
22BUILD_INTERRUPT3(invalidate_interrupt\idx,
23 (INVALIDATE_TLB_VECTOR_START)+\idx,
24 smp_invalidate_interrupt)
25.endif
26.endr
27#endif 18#endif
28 19
29BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) 20BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
index dbe82a5c5eac..d3d74698dce9 100644
--- a/arch/x86/include/asm/floppy.h
+++ b/arch/x86/include/asm/floppy.h
@@ -99,7 +99,7 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id)
99 virtual_dma_residue += virtual_dma_count; 99 virtual_dma_residue += virtual_dma_count;
100 virtual_dma_count = 0; 100 virtual_dma_count = 0;
101#ifdef TRACE_FLPY_INT 101#ifdef TRACE_FLPY_INT
102 printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", 102 printk(KERN_DEBUG "count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
103 virtual_dma_count, virtual_dma_residue, calls, bytes, 103 virtual_dma_count, virtual_dma_residue, calls, bytes,
104 dma_wait); 104 dma_wait);
105 calls = 0; 105 calls = 0;
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 7a15153c675d..b518c7509933 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper;
49extern const struct hypervisor_x86 x86_hyper_vmware; 49extern const struct hypervisor_x86 x86_hyper_vmware;
50extern const struct hypervisor_x86 x86_hyper_ms_hyperv; 50extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
51extern const struct hypervisor_x86 x86_hyper_xen_hvm; 51extern const struct hypervisor_x86 x86_hyper_xen_hvm;
52extern const struct hypervisor_x86 x86_hyper_kvm;
52 53
53static inline bool hypervisor_x2apic_available(void) 54static inline bool hypervisor_x2apic_available(void)
54{ 55{
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index dffc38ee6255..345c99cef152 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -5,7 +5,6 @@ extern struct dma_map_ops nommu_dma_ops;
5extern int force_iommu, no_iommu; 5extern int force_iommu, no_iommu;
6extern int iommu_detected; 6extern int iommu_detected;
7extern int iommu_pass_through; 7extern int iommu_pass_through;
8extern int iommu_group_mf;
9 8
10/* 10 seconds */ 9/* 10 seconds */
11#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) 10#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 4b4448761e88..1508e518c7e3 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -119,17 +119,6 @@
119 */ 119 */
120#define LOCAL_TIMER_VECTOR 0xef 120#define LOCAL_TIMER_VECTOR 0xef
121 121
122/* up to 32 vectors used for spreading out TLB flushes: */
123#if NR_CPUS <= 32
124# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS)
125#else
126# define NUM_INVALIDATE_TLB_VECTORS (32)
127#endif
128
129#define INVALIDATE_TLB_VECTOR_END (0xee)
130#define INVALIDATE_TLB_VECTOR_START \
131 (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
132
133#define NR_VECTORS 256 122#define NR_VECTORS 256
134 123
135#define FPU_IRQ 13 124#define FPU_IRQ 13
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index e7d1c194d272..246617efd67f 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -12,6 +12,7 @@
12/* Select x86 specific features in <linux/kvm.h> */ 12/* Select x86 specific features in <linux/kvm.h> */
13#define __KVM_HAVE_PIT 13#define __KVM_HAVE_PIT
14#define __KVM_HAVE_IOAPIC 14#define __KVM_HAVE_IOAPIC
15#define __KVM_HAVE_IRQ_LINE
15#define __KVM_HAVE_DEVICE_ASSIGNMENT 16#define __KVM_HAVE_DEVICE_ASSIGNMENT
16#define __KVM_HAVE_MSI 17#define __KVM_HAVE_MSI
17#define __KVM_HAVE_USER_NMI 18#define __KVM_HAVE_USER_NMI
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1ac46c22dd50..c764f43b71c5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -192,8 +192,8 @@ struct x86_emulate_ops {
192 struct x86_instruction_info *info, 192 struct x86_instruction_info *info,
193 enum x86_intercept_stage stage); 193 enum x86_intercept_stage stage);
194 194
195 bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, 195 void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
196 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); 196 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
197}; 197};
198 198
199typedef u32 __attribute__((vector_size(16))) sse128_t; 199typedef u32 __attribute__((vector_size(16))) sse128_t;
@@ -280,9 +280,9 @@ struct x86_emulate_ctxt {
280 u8 modrm_seg; 280 u8 modrm_seg;
281 bool rip_relative; 281 bool rip_relative;
282 unsigned long _eip; 282 unsigned long _eip;
283 struct operand memop;
283 /* Fields above regs are cleared together. */ 284 /* Fields above regs are cleared together. */
284 unsigned long regs[NR_VCPU_REGS]; 285 unsigned long regs[NR_VCPU_REGS];
285 struct operand memop;
286 struct operand *memopp; 286 struct operand *memopp;
287 struct fetch_cache fetch; 287 struct fetch_cache fetch;
288 struct read_cache io_read; 288 struct read_cache io_read;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index db7c1f2709a2..09155d64cf7e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,12 +48,13 @@
48 48
49#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) 49#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
50#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) 50#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
51#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL
51#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 52#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
52 0xFFFFFF0000000000ULL) 53 0xFFFFFF0000000000ULL)
53#define CR4_RESERVED_BITS \ 54#define CR4_RESERVED_BITS \
54 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 55 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
55 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 56 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
56 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 57 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
57 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ 58 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
58 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 59 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
59 60
@@ -175,6 +176,13 @@ enum {
175 176
176/* apic attention bits */ 177/* apic attention bits */
177#define KVM_APIC_CHECK_VAPIC 0 178#define KVM_APIC_CHECK_VAPIC 0
179/*
180 * The following bit is set with PV-EOI, unset on EOI.
181 * We detect PV-EOI changes by guest by comparing
182 * this bit with PV-EOI in guest memory.
183 * See the implementation in apic_update_pv_eoi.
184 */
185#define KVM_APIC_PV_EOI_PENDING 1
178 186
179/* 187/*
180 * We don't want allocation failures within the mmu code, so we preallocate 188 * We don't want allocation failures within the mmu code, so we preallocate
@@ -313,8 +321,8 @@ struct kvm_pmu {
313 u64 counter_bitmask[2]; 321 u64 counter_bitmask[2];
314 u64 global_ctrl_mask; 322 u64 global_ctrl_mask;
315 u8 version; 323 u8 version;
316 struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC]; 324 struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
317 struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED]; 325 struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
318 struct irq_work irq_work; 326 struct irq_work irq_work;
319 u64 reprogram_pmi; 327 u64 reprogram_pmi;
320}; 328};
@@ -484,6 +492,11 @@ struct kvm_vcpu_arch {
484 u64 length; 492 u64 length;
485 u64 status; 493 u64 status;
486 } osvw; 494 } osvw;
495
496 struct {
497 u64 msr_val;
498 struct gfn_to_hva_cache data;
499 } pv_eoi;
487}; 500};
488 501
489struct kvm_lpage_info { 502struct kvm_lpage_info {
@@ -661,6 +674,7 @@ struct kvm_x86_ops {
661 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 674 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
662 int (*get_lpage_level)(void); 675 int (*get_lpage_level)(void);
663 bool (*rdtscp_supported)(void); 676 bool (*rdtscp_supported)(void);
677 bool (*invpcid_supported)(void);
664 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); 678 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
665 679
666 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 680 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -802,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
802void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); 816void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
803bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 817bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
804 818
805int kvm_pic_set_irq(void *opaque, int irq, int level); 819static inline int __kvm_irq_line_state(unsigned long *irq_state,
820 int irq_source_id, int level)
821{
822 /* Logical OR for level trig interrupt */
823 if (level)
824 __set_bit(irq_source_id, irq_state);
825 else
826 __clear_bit(irq_source_id, irq_state);
827
828 return !!(*irq_state);
829}
830
831int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
832void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
806 833
807void kvm_inject_nmi(struct kvm_vcpu *vcpu); 834void kvm_inject_nmi(struct kvm_vcpu *vcpu);
808 835
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 63ab1661d00e..2f7712e08b1e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -22,6 +22,7 @@
22#define KVM_FEATURE_CLOCKSOURCE2 3 22#define KVM_FEATURE_CLOCKSOURCE2 3
23#define KVM_FEATURE_ASYNC_PF 4 23#define KVM_FEATURE_ASYNC_PF 4
24#define KVM_FEATURE_STEAL_TIME 5 24#define KVM_FEATURE_STEAL_TIME 5
25#define KVM_FEATURE_PV_EOI 6
25 26
26/* The last 8 bits are used to indicate how to interpret the flags field 27/* The last 8 bits are used to indicate how to interpret the flags field
27 * in pvclock structure. If no bits are set, all flags are ignored. 28 * in pvclock structure. If no bits are set, all flags are ignored.
@@ -37,6 +38,7 @@
37#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 38#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
38#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 39#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
39#define MSR_KVM_STEAL_TIME 0x4b564d03 40#define MSR_KVM_STEAL_TIME 0x4b564d03
41#define MSR_KVM_PV_EOI_EN 0x4b564d04
40 42
41struct kvm_steal_time { 43struct kvm_steal_time {
42 __u64 steal; 44 __u64 steal;
@@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data {
89 __u32 enabled; 91 __u32 enabled;
90}; 92};
91 93
94#define KVM_PV_EOI_BIT 0
95#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT)
96#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
97#define KVM_PV_EOI_DISABLED 0x0
98
92#ifdef __KERNEL__ 99#ifdef __KERNEL__
93#include <asm/processor.h> 100#include <asm/processor.h>
94 101
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 084ef95274cd..813ed103f45e 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -115,8 +115,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
115 115
116extern unsigned long long native_read_tsc(void); 116extern unsigned long long native_read_tsc(void);
117 117
118extern int native_rdmsr_safe_regs(u32 regs[8]); 118extern int rdmsr_safe_regs(u32 regs[8]);
119extern int native_wrmsr_safe_regs(u32 regs[8]); 119extern int wrmsr_safe_regs(u32 regs[8]);
120 120
121static __always_inline unsigned long long __native_read_tsc(void) 121static __always_inline unsigned long long __native_read_tsc(void)
122{ 122{
@@ -187,43 +187,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
187 return err; 187 return err;
188} 188}
189 189
190static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
191{
192 u32 gprs[8] = { 0 };
193 int err;
194
195 gprs[1] = msr;
196 gprs[7] = 0x9c5a203a;
197
198 err = native_rdmsr_safe_regs(gprs);
199
200 *p = gprs[0] | ((u64)gprs[2] << 32);
201
202 return err;
203}
204
205static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
206{
207 u32 gprs[8] = { 0 };
208
209 gprs[0] = (u32)val;
210 gprs[1] = msr;
211 gprs[2] = val >> 32;
212 gprs[7] = 0x9c5a203a;
213
214 return native_wrmsr_safe_regs(gprs);
215}
216
217static inline int rdmsr_safe_regs(u32 regs[8])
218{
219 return native_rdmsr_safe_regs(regs);
220}
221
222static inline int wrmsr_safe_regs(u32 regs[8])
223{
224 return native_wrmsr_safe_regs(regs);
225}
226
227#define rdtscl(low) \ 190#define rdtscl(low) \
228 ((low) = (u32)__native_read_tsc()) 191 ((low) = (u32)__native_read_tsc())
229 192
@@ -237,6 +200,8 @@ do { \
237 (high) = (u32)(_l >> 32); \ 200 (high) = (u32)(_l >> 32); \
238} while (0) 201} while (0)
239 202
203#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
204
240#define rdtscp(low, high, aux) \ 205#define rdtscp(low, high, aux) \
241do { \ 206do { \
242 unsigned long long _val = native_read_tscp(&(aux)); \ 207 unsigned long long _val = native_read_tscp(&(aux)); \
@@ -248,8 +213,7 @@ do { \
248 213
249#endif /* !CONFIG_PARAVIRT */ 214#endif /* !CONFIG_PARAVIRT */
250 215
251 216#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \
252#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \
253 (u32)((val) >> 32)) 217 (u32)((val) >> 32))
254 218
255#define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2)) 219#define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2))
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index dc580c42851c..c0fa356e90de 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -44,28 +44,14 @@ struct nmiaction {
44 const char *name; 44 const char *name;
45}; 45};
46 46
47#define register_nmi_handler(t, fn, fg, n) \ 47#define register_nmi_handler(t, fn, fg, n, init...) \
48({ \ 48({ \
49 static struct nmiaction fn##_na = { \ 49 static struct nmiaction init fn##_na = { \
50 .handler = (fn), \ 50 .handler = (fn), \
51 .name = (n), \ 51 .name = (n), \
52 .flags = (fg), \ 52 .flags = (fg), \
53 }; \ 53 }; \
54 __register_nmi_handler((t), &fn##_na); \ 54 __register_nmi_handler((t), &fn##_na); \
55})
56
57/*
58 * For special handlers that register/unregister in the
59 * init section only. This should be considered rare.
60 */
61#define register_nmi_handler_initonly(t, fn, fg, n) \
62({ \
63 static struct nmiaction fn##_na __initdata = { \
64 .handler = (fn), \
65 .name = (n), \
66 .flags = (fg), \
67 }; \
68 __register_nmi_handler((t), &fn##_na); \
69}) 55})
70 56
71int __register_nmi_handler(unsigned int, struct nmiaction *); 57int __register_nmi_handler(unsigned int, struct nmiaction *);
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 6cbbabf52707..a0facf3908d7 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -128,21 +128,11 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err)
128 return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); 128 return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
129} 129}
130 130
131static inline int paravirt_rdmsr_regs(u32 *regs)
132{
133 return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs);
134}
135
136static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) 131static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
137{ 132{
138 return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); 133 return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
139} 134}
140 135
141static inline int paravirt_wrmsr_regs(u32 *regs)
142{
143 return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs);
144}
145
146/* These should all do BUG_ON(_err), but our headers are too tangled. */ 136/* These should all do BUG_ON(_err), but our headers are too tangled. */
147#define rdmsr(msr, val1, val2) \ 137#define rdmsr(msr, val1, val2) \
148do { \ 138do { \
@@ -176,9 +166,6 @@ do { \
176 _err; \ 166 _err; \
177}) 167})
178 168
179#define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs)
180#define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs)
181
182static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) 169static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
183{ 170{
184 int err; 171 int err;
@@ -186,32 +173,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
186 *p = paravirt_read_msr(msr, &err); 173 *p = paravirt_read_msr(msr, &err);
187 return err; 174 return err;
188} 175}
189static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
190{
191 u32 gprs[8] = { 0 };
192 int err;
193
194 gprs[1] = msr;
195 gprs[7] = 0x9c5a203a;
196
197 err = paravirt_rdmsr_regs(gprs);
198
199 *p = gprs[0] | ((u64)gprs[2] << 32);
200
201 return err;
202}
203
204static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
205{
206 u32 gprs[8] = { 0 };
207
208 gprs[0] = (u32)val;
209 gprs[1] = msr;
210 gprs[2] = val >> 32;
211 gprs[7] = 0x9c5a203a;
212
213 return paravirt_wrmsr_regs(gprs);
214}
215 176
216static inline u64 paravirt_read_tsc(void) 177static inline u64 paravirt_read_tsc(void)
217{ 178{
@@ -252,6 +213,8 @@ do { \
252 high = _l >> 32; \ 213 high = _l >> 32; \
253} while (0) 214} while (0)
254 215
216#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
217
255static inline unsigned long long paravirt_rdtscp(unsigned int *aux) 218static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
256{ 219{
257 return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); 220 return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
@@ -397,9 +360,10 @@ static inline void __flush_tlb_single(unsigned long addr)
397 360
398static inline void flush_tlb_others(const struct cpumask *cpumask, 361static inline void flush_tlb_others(const struct cpumask *cpumask,
399 struct mm_struct *mm, 362 struct mm_struct *mm,
400 unsigned long va) 363 unsigned long start,
364 unsigned long end)
401{ 365{
402 PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va); 366 PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
403} 367}
404 368
405static inline int paravirt_pgd_alloc(struct mm_struct *mm) 369static inline int paravirt_pgd_alloc(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 8e8b9a4987ee..142236ed83af 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -153,9 +153,7 @@ struct pv_cpu_ops {
153 /* MSR, PMC and TSR operations. 153 /* MSR, PMC and TSR operations.
154 err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ 154 err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
155 u64 (*read_msr)(unsigned int msr, int *err); 155 u64 (*read_msr)(unsigned int msr, int *err);
156 int (*rdmsr_regs)(u32 *regs);
157 int (*write_msr)(unsigned int msr, unsigned low, unsigned high); 156 int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
158 int (*wrmsr_regs)(u32 *regs);
159 157
160 u64 (*read_tsc)(void); 158 u64 (*read_tsc)(void);
161 u64 (*read_pmc)(int counter); 159 u64 (*read_pmc)(int counter);
@@ -250,7 +248,8 @@ struct pv_mmu_ops {
250 void (*flush_tlb_single)(unsigned long addr); 248 void (*flush_tlb_single)(unsigned long addr);
251 void (*flush_tlb_others)(const struct cpumask *cpus, 249 void (*flush_tlb_others)(const struct cpumask *cpus,
252 struct mm_struct *mm, 250 struct mm_struct *mm,
253 unsigned long va); 251 unsigned long start,
252 unsigned long end);
254 253
255 /* Hooks for allocating and freeing a pagetable top-level */ 254 /* Hooks for allocating and freeing a pagetable top-level */
256 int (*pgd_alloc)(struct mm_struct *mm); 255 int (*pgd_alloc)(struct mm_struct *mm);
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b3a531746026..73e8eeff22ee 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -7,9 +7,13 @@
7#undef DEBUG 7#undef DEBUG
8 8
9#ifdef DEBUG 9#ifdef DEBUG
10#define DBG(x...) printk(x) 10#define DBG(fmt, ...) printk(fmt, ##__VA_ARGS__)
11#else 11#else
12#define DBG(x...) 12#define DBG(fmt, ...) \
13do { \
14 if (0) \
15 printk(fmt, ##__VA_ARGS__); \
16} while (0)
13#endif 17#endif
14 18
15#define PCI_PROBE_BIOS 0x0001 19#define PCI_PROBE_BIOS 0x0001
@@ -100,6 +104,7 @@ struct pci_raw_ops {
100extern const struct pci_raw_ops *raw_pci_ops; 104extern const struct pci_raw_ops *raw_pci_ops;
101extern const struct pci_raw_ops *raw_pci_ext_ops; 105extern const struct pci_raw_ops *raw_pci_ext_ops;
102 106
107extern const struct pci_raw_ops pci_mmcfg;
103extern const struct pci_raw_ops pci_direct_conf1; 108extern const struct pci_raw_ops pci_direct_conf1;
104extern bool port_cf9_safe; 109extern bool port_cf9_safe;
105 110
@@ -135,6 +140,12 @@ struct pci_mmcfg_region {
135 140
136extern int __init pci_mmcfg_arch_init(void); 141extern int __init pci_mmcfg_arch_init(void);
137extern void __init pci_mmcfg_arch_free(void); 142extern void __init pci_mmcfg_arch_free(void);
143extern int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg);
144extern void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg);
145extern int __devinit pci_mmconfig_insert(struct device *dev,
146 u16 seg, u8 start,
147 u8 end, phys_addr_t addr);
148extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
138extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus); 149extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
139 150
140extern struct list_head pci_mmcfg_list; 151extern struct list_head pci_mmcfg_list;
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index d9b8e3f7f42a..1104afaba52b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -551,6 +551,12 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);
551 { [0 ... NR_CPUS-1] = _initvalue }; \ 551 { [0 ... NR_CPUS-1] = _initvalue }; \
552 __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map 552 __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
553 553
554#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \
555 DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue; \
556 __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \
557 { [0 ... NR_CPUS-1] = _initvalue }; \
558 __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
559
554#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ 560#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
555 EXPORT_PER_CPU_SYMBOL(_name) 561 EXPORT_PER_CPU_SYMBOL(_name)
556 562
@@ -559,6 +565,11 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);
559 extern __typeof__(_type) *_name##_early_ptr; \ 565 extern __typeof__(_type) *_name##_early_ptr; \
560 extern __typeof__(_type) _name##_early_map[] 566 extern __typeof__(_type) _name##_early_map[]
561 567
568#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
569 DECLARE_PER_CPU_READ_MOSTLY(_type, _name); \
570 extern __typeof__(_type) *_name##_early_ptr; \
571 extern __typeof__(_type) _name##_early_map[]
572
562#define early_per_cpu_ptr(_name) (_name##_early_ptr) 573#define early_per_cpu_ptr(_name) (_name##_early_ptr)
563#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) 574#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
564#define early_per_cpu(_name, _cpu) \ 575#define early_per_cpu(_name, _cpu) \
@@ -570,12 +581,18 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);
570#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ 581#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
571 DEFINE_PER_CPU(_type, _name) = _initvalue 582 DEFINE_PER_CPU(_type, _name) = _initvalue
572 583
584#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \
585 DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue
586
573#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ 587#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
574 EXPORT_PER_CPU_SYMBOL(_name) 588 EXPORT_PER_CPU_SYMBOL(_name)
575 589
576#define DECLARE_EARLY_PER_CPU(_type, _name) \ 590#define DECLARE_EARLY_PER_CPU(_type, _name) \
577 DECLARE_PER_CPU(_type, _name) 591 DECLARE_PER_CPU(_type, _name)
578 592
593#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
594 DECLARE_PER_CPU_READ_MOSTLY(_type, _name)
595
579#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu) 596#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
580#define early_per_cpu_ptr(_name) NULL 597#define early_per_cpu_ptr(_name) NULL
581/* no early_per_cpu_map() */ 598/* no early_per_cpu_map() */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 588f52ea810e..c78f14a0df00 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -5,11 +5,10 @@
5 * Performance event hw details: 5 * Performance event hw details:
6 */ 6 */
7 7
8#define X86_PMC_MAX_GENERIC 32 8#define INTEL_PMC_MAX_GENERIC 32
9#define X86_PMC_MAX_FIXED 3 9#define INTEL_PMC_MAX_FIXED 3
10#define INTEL_PMC_IDX_FIXED 32
10 11
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64 12#define X86_PMC_IDX_MAX 64
14 13
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 14#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
@@ -48,8 +47,7 @@
48 (X86_RAW_EVENT_MASK | \ 47 (X86_RAW_EVENT_MASK | \
49 AMD64_EVENTSEL_EVENT) 48 AMD64_EVENTSEL_EVENT)
50#define AMD64_NUM_COUNTERS 4 49#define AMD64_NUM_COUNTERS 4
51#define AMD64_NUM_COUNTERS_F15H 6 50#define AMD64_NUM_COUNTERS_CORE 6
52#define AMD64_NUM_COUNTERS_MAX AMD64_NUM_COUNTERS_F15H
53 51
54#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c 52#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
55#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) 53#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
@@ -121,16 +119,16 @@ struct x86_pmu_capability {
121 119
122/* Instr_Retired.Any: */ 120/* Instr_Retired.Any: */
123#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 121#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
124#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) 122#define INTEL_PMC_IDX_FIXED_INSTRUCTIONS (INTEL_PMC_IDX_FIXED + 0)
125 123
126/* CPU_CLK_Unhalted.Core: */ 124/* CPU_CLK_Unhalted.Core: */
127#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a 125#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
128#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) 126#define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1)
129 127
130/* CPU_CLK_Unhalted.Ref: */ 128/* CPU_CLK_Unhalted.Ref: */
131#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b 129#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
132#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2) 130#define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2)
133#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES) 131#define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES)
134 132
135/* 133/*
136 * We model BTS tracing as another fixed-mode PMC. 134 * We model BTS tracing as another fixed-mode PMC.
@@ -139,7 +137,7 @@ struct x86_pmu_capability {
139 * values are used by actual fixed events and higher values are used 137 * values are used by actual fixed events and higher values are used
140 * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. 138 * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
141 */ 139 */
142#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) 140#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16)
143 141
144/* 142/*
145 * IBS cpuid feature detection 143 * IBS cpuid feature detection
@@ -234,6 +232,7 @@ struct perf_guest_switch_msr {
234 232
235extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); 233extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
236extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); 234extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
235extern void perf_check_microcode(void);
237#else 236#else
238static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) 237static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
239{ 238{
@@ -247,6 +246,7 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
247} 246}
248 247
249static inline void perf_events_lapic_init(void) { } 248static inline void perf_events_lapic_init(void) { }
249static inline void perf_check_microcode(void) { }
250#endif 250#endif
251 251
252#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) 252#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 98391db840c6..f2b489cf1602 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -2,9 +2,9 @@
2#define _ASM_X86_PGTABLE_2LEVEL_H 2#define _ASM_X86_PGTABLE_2LEVEL_H
3 3
4#define pte_ERROR(e) \ 4#define pte_ERROR(e) \
5 printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) 5 pr_err("%s:%d: bad pte %08lx\n", __FILE__, __LINE__, (e).pte_low)
6#define pgd_ERROR(e) \ 6#define pgd_ERROR(e) \
7 printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) 7 pr_err("%s:%d: bad pgd %08lx\n", __FILE__, __LINE__, pgd_val(e))
8 8
9/* 9/*
10 * Certain architectures need to do special things when PTEs 10 * Certain architectures need to do special things when PTEs
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 43876f16caf1..4cc9f2b7cdc3 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -9,13 +9,13 @@
9 */ 9 */
10 10
11#define pte_ERROR(e) \ 11#define pte_ERROR(e) \
12 printk("%s:%d: bad pte %p(%08lx%08lx).\n", \ 12 pr_err("%s:%d: bad pte %p(%08lx%08lx)\n", \
13 __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low) 13 __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
14#define pmd_ERROR(e) \ 14#define pmd_ERROR(e) \
15 printk("%s:%d: bad pmd %p(%016Lx).\n", \ 15 pr_err("%s:%d: bad pmd %p(%016Lx)\n", \
16 __FILE__, __LINE__, &(e), pmd_val(e)) 16 __FILE__, __LINE__, &(e), pmd_val(e))
17#define pgd_ERROR(e) \ 17#define pgd_ERROR(e) \
18 printk("%s:%d: bad pgd %p(%016Lx).\n", \ 18 pr_err("%s:%d: bad pgd %p(%016Lx)\n", \
19 __FILE__, __LINE__, &(e), pgd_val(e)) 19 __FILE__, __LINE__, &(e), pgd_val(e))
20 20
21/* Rules for using set_pte: the pte being assigned *must* be 21/* Rules for using set_pte: the pte being assigned *must* be
@@ -47,16 +47,26 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
47 * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd 47 * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
48 * operations. 48 * operations.
49 * 49 *
50 * Without THP if the mmap_sem is hold for reading, the 50 * Without THP if the mmap_sem is hold for reading, the pmd can only
51 * pmd can only transition from null to not null while pmd_read_atomic runs. 51 * transition from null to not null while pmd_read_atomic runs. So
52 * So there's no need of literally reading it atomically. 52 * we can always return atomic pmd values with this function.
53 * 53 *
54 * With THP if the mmap_sem is hold for reading, the pmd can become 54 * With THP if the mmap_sem is hold for reading, the pmd can become
55 * THP or null or point to a pte (and in turn become "stable") at any 55 * trans_huge or none or point to a pte (and in turn become "stable")
56 * time under pmd_read_atomic, so it's mandatory to read it atomically 56 * at any time under pmd_read_atomic. We could read it really
57 * with cmpxchg8b. 57 * atomically here with an atomic64_read for the THP enabled case (and
58 * it would be a whole lot simpler), but to avoid using cmpxchg8b we
59 * only return an atomic pmdval if the low part of the pmdval is later
60 * found stable (i.e. pointing to a pte). And we're returning a none
61 * pmdval if the low part of the pmd is none. In some cases the high
62 * and low part of the pmdval returned may not be consistent if THP is
63 * enabled (the low part may point to previously mapped hugepage,
64 * while the high part may point to a more recently mapped hugepage),
65 * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part
66 * of the pmd to be read atomically to decide if the pmd is unstable
67 * or not, with the only exception of when the low part of the pmd is
68 * zero in which case we return a none pmd.
58 */ 69 */
59#ifndef CONFIG_TRANSPARENT_HUGEPAGE
60static inline pmd_t pmd_read_atomic(pmd_t *pmdp) 70static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
61{ 71{
62 pmdval_t ret; 72 pmdval_t ret;
@@ -74,12 +84,6 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
74 84
75 return (pmd_t) { ret }; 85 return (pmd_t) { ret };
76} 86}
77#else /* CONFIG_TRANSPARENT_HUGEPAGE */
78static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
79{
80 return (pmd_t) { atomic64_read((atomic64_t *)pmdp) };
81}
82#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
83 87
84static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) 88static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
85{ 89{
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 975f709e09ae..8251be02301e 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -26,16 +26,16 @@ extern pgd_t init_level4_pgt[];
26extern void paging_init(void); 26extern void paging_init(void);
27 27
28#define pte_ERROR(e) \ 28#define pte_ERROR(e) \
29 printk("%s:%d: bad pte %p(%016lx).\n", \ 29 pr_err("%s:%d: bad pte %p(%016lx)\n", \
30 __FILE__, __LINE__, &(e), pte_val(e)) 30 __FILE__, __LINE__, &(e), pte_val(e))
31#define pmd_ERROR(e) \ 31#define pmd_ERROR(e) \
32 printk("%s:%d: bad pmd %p(%016lx).\n", \ 32 pr_err("%s:%d: bad pmd %p(%016lx)\n", \
33 __FILE__, __LINE__, &(e), pmd_val(e)) 33 __FILE__, __LINE__, &(e), pmd_val(e))
34#define pud_ERROR(e) \ 34#define pud_ERROR(e) \
35 printk("%s:%d: bad pud %p(%016lx).\n", \ 35 pr_err("%s:%d: bad pud %p(%016lx)\n", \
36 __FILE__, __LINE__, &(e), pud_val(e)) 36 __FILE__, __LINE__, &(e), pud_val(e))
37#define pgd_ERROR(e) \ 37#define pgd_ERROR(e) \
38 printk("%s:%d: bad pgd %p(%016lx).\n", \ 38 pr_err("%s:%d: bad pgd %p(%016lx)\n", \
39 __FILE__, __LINE__, &(e), pgd_val(e)) 39 __FILE__, __LINE__, &(e), pgd_val(e))
40 40
41struct mm_struct; 41struct mm_struct;
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index f8ab3eaad128..aea1d1d848c7 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -44,6 +44,7 @@
44 */ 44 */
45#define X86_CR3_PWT 0x00000008 /* Page Write Through */ 45#define X86_CR3_PWT 0x00000008 /* Page Write Through */
46#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ 46#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
47#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */
47 48
48/* 49/*
49 * Intel CPU features in CR4 50 * Intel CPU features in CR4
@@ -61,6 +62,7 @@
61#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ 62#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
62#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ 63#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
63#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ 64#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
65#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */
64#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ 66#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
65#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ 67#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
66 68
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 39bc5777211a..d048cad9bcad 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -61,6 +61,19 @@ static inline void *current_text_addr(void)
61# define ARCH_MIN_MMSTRUCT_ALIGN 0 61# define ARCH_MIN_MMSTRUCT_ALIGN 0
62#endif 62#endif
63 63
64enum tlb_infos {
65 ENTRIES,
66 NR_INFO
67};
68
69extern u16 __read_mostly tlb_lli_4k[NR_INFO];
70extern u16 __read_mostly tlb_lli_2m[NR_INFO];
71extern u16 __read_mostly tlb_lli_4m[NR_INFO];
72extern u16 __read_mostly tlb_lld_4k[NR_INFO];
73extern u16 __read_mostly tlb_lld_2m[NR_INFO];
74extern u16 __read_mostly tlb_lld_4m[NR_INFO];
75extern s8 __read_mostly tlb_flushall_shift;
76
64/* 77/*
65 * CPU type and hardware bug flags. Kept separately for each CPU. 78 * CPU type and hardware bug flags. Kept separately for each CPU.
66 * Members of this structure are referenced in head.S, so think twice 79 * Members of this structure are referenced in head.S, so think twice
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index fce3f4ae5bd6..fe1ec5bcd846 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -21,8 +21,9 @@ struct real_mode_header {
21 u32 wakeup_header; 21 u32 wakeup_header;
22#endif 22#endif
23 /* APM/BIOS reboot */ 23 /* APM/BIOS reboot */
24#ifdef CONFIG_X86_32
25 u32 machine_real_restart_asm; 24 u32 machine_real_restart_asm;
25#ifdef CONFIG_X86_64
26 u32 machine_real_restart_seg;
26#endif 27#endif
27}; 28};
28 29
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 92f297069e87..a82c4f1b4d83 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -18,8 +18,8 @@ extern struct machine_ops machine_ops;
18 18
19void native_machine_crash_shutdown(struct pt_regs *regs); 19void native_machine_crash_shutdown(struct pt_regs *regs);
20void native_machine_shutdown(void); 20void native_machine_shutdown(void);
21void machine_real_restart(unsigned int type); 21void __noreturn machine_real_restart(unsigned int type);
22/* These must match dispatch_table in reboot_32.S */ 22/* These must match dispatch in arch/x86/realmode/rm/reboot.S */
23#define MRR_BIOS 0 23#define MRR_BIOS 0
24#define MRR_APM 1 24#define MRR_APM 1
25 25
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index f48394513c37..4f19a1526037 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -31,12 +31,12 @@ static inline bool cpu_has_ht_siblings(void)
31 return has_siblings; 31 return has_siblings;
32} 32}
33 33
34DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); 34DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
35DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); 35DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
36/* cpus sharing the last level cache: */ 36/* cpus sharing the last level cache: */
37DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); 37DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
38DECLARE_PER_CPU(u16, cpu_llc_id); 38DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
39DECLARE_PER_CPU(int, cpu_number); 39DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
40 40
41static inline struct cpumask *cpu_sibling_mask(int cpu) 41static inline struct cpumask *cpu_sibling_mask(int cpu)
42{ 42{
@@ -53,10 +53,10 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
53 return per_cpu(cpu_llc_shared_map, cpu); 53 return per_cpu(cpu_llc_shared_map, cpu);
54} 54}
55 55
56DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); 56DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
57DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); 57DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
58#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 58#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
59DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); 59DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
60#endif 60#endif
61 61
62/* Static state in head.S used to set up a CPU */ 62/* Static state in head.S used to set up a CPU */
@@ -169,11 +169,6 @@ void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);
169void smp_store_cpu_info(int id); 169void smp_store_cpu_info(int id);
170#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) 170#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
171 171
172/* We don't mark CPUs online until __cpu_up(), so we need another measure */
173static inline int num_booting_cpus(void)
174{
175 return cpumask_weight(cpu_callout_mask);
176}
177#else /* !CONFIG_SMP */ 172#else /* !CONFIG_SMP */
178#define wbinvd_on_cpu(cpu) wbinvd() 173#define wbinvd_on_cpu(cpu) wbinvd()
179static inline int wbinvd_on_all_cpus(void) 174static inline int wbinvd_on_all_cpus(void)
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 829215fef9ee..4fef20773b8f 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -4,7 +4,14 @@
4#define tlb_start_vma(tlb, vma) do { } while (0) 4#define tlb_start_vma(tlb, vma) do { } while (0)
5#define tlb_end_vma(tlb, vma) do { } while (0) 5#define tlb_end_vma(tlb, vma) do { } while (0)
6#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) 6#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
7#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) 7
8#define tlb_flush(tlb) \
9{ \
10 if (tlb->fullmm == 0) \
11 flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \
12 else \
13 flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \
14}
8 15
9#include <asm-generic/tlb.h> 16#include <asm-generic/tlb.h>
10 17
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 36a1a2ab87d2..74a44333545a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -73,14 +73,10 @@ static inline void __flush_tlb_one(unsigned long addr)
73 * - flush_tlb_page(vma, vmaddr) flushes one page 73 * - flush_tlb_page(vma, vmaddr) flushes one page
74 * - flush_tlb_range(vma, start, end) flushes a range of pages 74 * - flush_tlb_range(vma, start, end) flushes a range of pages
75 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages 75 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
76 * - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus 76 * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus
77 * 77 *
78 * ..but the i386 has somewhat limited tlb flushing capabilities, 78 * ..but the i386 has somewhat limited tlb flushing capabilities,
79 * and page-granular flushes are available only on i486 and up. 79 * and page-granular flushes are available only on i486 and up.
80 *
81 * x86-64 can only flush individual pages or full VMs. For a range flush
82 * we always do the full VM. Might be worth trying if for a small
83 * range a few INVLPGs in a row are a win.
84 */ 80 */
85 81
86#ifndef CONFIG_SMP 82#ifndef CONFIG_SMP
@@ -109,9 +105,17 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
109 __flush_tlb(); 105 __flush_tlb();
110} 106}
111 107
108static inline void flush_tlb_mm_range(struct mm_struct *mm,
109 unsigned long start, unsigned long end, unsigned long vmflag)
110{
111 if (mm == current->active_mm)
112 __flush_tlb();
113}
114
112static inline void native_flush_tlb_others(const struct cpumask *cpumask, 115static inline void native_flush_tlb_others(const struct cpumask *cpumask,
113 struct mm_struct *mm, 116 struct mm_struct *mm,
114 unsigned long va) 117 unsigned long start,
118 unsigned long end)
115{ 119{
116} 120}
117 121
@@ -119,27 +123,35 @@ static inline void reset_lazy_tlbstate(void)
119{ 123{
120} 124}
121 125
126static inline void flush_tlb_kernel_range(unsigned long start,
127 unsigned long end)
128{
129 flush_tlb_all();
130}
131
122#else /* SMP */ 132#else /* SMP */
123 133
124#include <asm/smp.h> 134#include <asm/smp.h>
125 135
126#define local_flush_tlb() __flush_tlb() 136#define local_flush_tlb() __flush_tlb()
127 137
138#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
139
140#define flush_tlb_range(vma, start, end) \
141 flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
142
128extern void flush_tlb_all(void); 143extern void flush_tlb_all(void);
129extern void flush_tlb_current_task(void); 144extern void flush_tlb_current_task(void);
130extern void flush_tlb_mm(struct mm_struct *);
131extern void flush_tlb_page(struct vm_area_struct *, unsigned long); 145extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
146extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
147 unsigned long end, unsigned long vmflag);
148extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
132 149
133#define flush_tlb() flush_tlb_current_task() 150#define flush_tlb() flush_tlb_current_task()
134 151
135static inline void flush_tlb_range(struct vm_area_struct *vma,
136 unsigned long start, unsigned long end)
137{
138 flush_tlb_mm(vma->vm_mm);
139}
140
141void native_flush_tlb_others(const struct cpumask *cpumask, 152void native_flush_tlb_others(const struct cpumask *cpumask,
142 struct mm_struct *mm, unsigned long va); 153 struct mm_struct *mm,
154 unsigned long start, unsigned long end);
143 155
144#define TLBSTATE_OK 1 156#define TLBSTATE_OK 1
145#define TLBSTATE_LAZY 2 157#define TLBSTATE_LAZY 2
@@ -159,13 +171,8 @@ static inline void reset_lazy_tlbstate(void)
159#endif /* SMP */ 171#endif /* SMP */
160 172
161#ifndef CONFIG_PARAVIRT 173#ifndef CONFIG_PARAVIRT
162#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va) 174#define flush_tlb_others(mask, mm, start, end) \
175 native_flush_tlb_others(mask, mm, start, end)
163#endif 176#endif
164 177
165static inline void flush_tlb_kernel_range(unsigned long start,
166 unsigned long end)
167{
168 flush_tlb_all();
169}
170
171#endif /* _ASM_X86_TLBFLUSH_H */ 178#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 8e796fbbf9c6..d8def8b3dba0 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -17,6 +17,8 @@
17 17
18/* Handles exceptions in both to and from, but doesn't do access_ok */ 18/* Handles exceptions in both to and from, but doesn't do access_ok */
19__must_check unsigned long 19__must_check unsigned long
20copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
21__must_check unsigned long
20copy_user_generic_string(void *to, const void *from, unsigned len); 22copy_user_generic_string(void *to, const void *from, unsigned len);
21__must_check unsigned long 23__must_check unsigned long
22copy_user_generic_unrolled(void *to, const void *from, unsigned len); 24copy_user_generic_unrolled(void *to, const void *from, unsigned len);
@@ -26,9 +28,16 @@ copy_user_generic(void *to, const void *from, unsigned len)
26{ 28{
27 unsigned ret; 29 unsigned ret;
28 30
29 alternative_call(copy_user_generic_unrolled, 31 /*
32 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
33 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
34 * Otherwise, use copy_user_generic_unrolled.
35 */
36 alternative_call_2(copy_user_generic_unrolled,
30 copy_user_generic_string, 37 copy_user_generic_string,
31 X86_FEATURE_REP_GOOD, 38 X86_FEATURE_REP_GOOD,
39 copy_user_enhanced_fast_string,
40 X86_FEATURE_ERMS,
32 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), 41 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
33 "=d" (len)), 42 "=d" (len)),
34 "1" (to), "2" (from), "3" (len) 43 "1" (to), "2" (from), "3" (len)
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 1e9bed14f7ae..f3971bbcd1de 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -48,7 +48,7 @@ struct arch_uprobe_task {
48#endif 48#endif
49}; 49};
50 50
51extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); 51extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr);
52extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); 52extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
53extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); 53extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
54extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); 54extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 3bb9491b7659..b47c2a82ff15 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -15,7 +15,8 @@ extern void uv_nmi_init(void);
15extern void uv_system_init(void); 15extern void uv_system_init(void);
16extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 16extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
17 struct mm_struct *mm, 17 struct mm_struct *mm,
18 unsigned long va, 18 unsigned long start,
19 unsigned end,
19 unsigned int cpu); 20 unsigned int cpu);
20 21
21#else /* X86_UV */ 22#else /* X86_UV */
@@ -26,7 +27,7 @@ static inline void uv_cpu_init(void) { }
26static inline void uv_system_init(void) { } 27static inline void uv_system_init(void) { }
27static inline const struct cpumask * 28static inline const struct cpumask *
28uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, 29uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
29 unsigned long va, unsigned int cpu) 30 unsigned long start, unsigned long end, unsigned int cpu)
30{ return cpumask; } 31{ return cpumask; }
31 32
32#endif /* X86_UV */ 33#endif /* X86_UV */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 6149b476d9df..a06983cdc125 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -140,6 +140,9 @@
140#define IPI_RESET_LIMIT 1 140#define IPI_RESET_LIMIT 1
141/* after this # consecutive successes, bump up the throttle if it was lowered */ 141/* after this # consecutive successes, bump up the throttle if it was lowered */
142#define COMPLETE_THRESHOLD 5 142#define COMPLETE_THRESHOLD 5
143/* after this # of giveups (fall back to kernel IPI's) disable the use of
144 the BAU for a period of time */
145#define GIVEUP_LIMIT 100
143 146
144#define UV_LB_SUBNODEID 0x10 147#define UV_LB_SUBNODEID 0x10
145 148
@@ -166,7 +169,6 @@
166#define FLUSH_RETRY_TIMEOUT 2 169#define FLUSH_RETRY_TIMEOUT 2
167#define FLUSH_GIVEUP 3 170#define FLUSH_GIVEUP 3
168#define FLUSH_COMPLETE 4 171#define FLUSH_COMPLETE 4
169#define FLUSH_RETRY_BUSYBUG 5
170 172
171/* 173/*
172 * tuning the action when the numalink network is extremely delayed 174 * tuning the action when the numalink network is extremely delayed
@@ -175,7 +177,7 @@
175 microseconds */ 177 microseconds */
176#define CONGESTED_REPS 10 /* long delays averaged over 178#define CONGESTED_REPS 10 /* long delays averaged over
177 this many broadcasts */ 179 this many broadcasts */
178#define CONGESTED_PERIOD 30 /* time for the bau to be 180#define DISABLED_PERIOD 10 /* time for the bau to be
179 disabled, in seconds */ 181 disabled, in seconds */
180/* see msg_type: */ 182/* see msg_type: */
181#define MSG_NOOP 0 183#define MSG_NOOP 0
@@ -520,6 +522,12 @@ struct ptc_stats {
520 unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ 522 unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */
521 unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ 523 unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */
522 unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ 524 unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */
525 unsigned long s_overipilimit; /* over the ipi reset limit */
526 unsigned long s_giveuplimit; /* disables, over giveup limit*/
527 unsigned long s_enters; /* entries to the driver */
528 unsigned long s_ipifordisabled; /* fall back to IPI; disabled */
529 unsigned long s_plugged; /* plugged by h/w bug*/
530 unsigned long s_congested; /* giveup on long wait */
523 /* destination statistics */ 531 /* destination statistics */
524 unsigned long d_alltlb; /* times all tlb's on this 532 unsigned long d_alltlb; /* times all tlb's on this
525 cpu were flushed */ 533 cpu were flushed */
@@ -586,8 +594,8 @@ struct bau_control {
586 int timeout_tries; 594 int timeout_tries;
587 int ipi_attempts; 595 int ipi_attempts;
588 int conseccompletes; 596 int conseccompletes;
589 int baudisabled; 597 short nobau;
590 int set_bau_off; 598 short baudisabled;
591 short cpu; 599 short cpu;
592 short osnode; 600 short osnode;
593 short uvhub_cpu; 601 short uvhub_cpu;
@@ -596,14 +604,16 @@ struct bau_control {
596 short cpus_in_socket; 604 short cpus_in_socket;
597 short cpus_in_uvhub; 605 short cpus_in_uvhub;
598 short partition_base_pnode; 606 short partition_base_pnode;
599 short using_desc; /* an index, like uvhub_cpu */ 607 short busy; /* all were busy (war) */
600 unsigned int inuse_map;
601 unsigned short message_number; 608 unsigned short message_number;
602 unsigned short uvhub_quiesce; 609 unsigned short uvhub_quiesce;
603 short socket_acknowledge_count[DEST_Q_SIZE]; 610 short socket_acknowledge_count[DEST_Q_SIZE];
604 cycles_t send_message; 611 cycles_t send_message;
612 cycles_t period_end;
613 cycles_t period_time;
605 spinlock_t uvhub_lock; 614 spinlock_t uvhub_lock;
606 spinlock_t queue_lock; 615 spinlock_t queue_lock;
616 spinlock_t disable_lock;
607 /* tunables */ 617 /* tunables */
608 int max_concurr; 618 int max_concurr;
609 int max_concurr_const; 619 int max_concurr_const;
@@ -614,9 +624,9 @@ struct bau_control {
614 int complete_threshold; 624 int complete_threshold;
615 int cong_response_us; 625 int cong_response_us;
616 int cong_reps; 626 int cong_reps;
617 int cong_period; 627 cycles_t disabled_period;
618 unsigned long clocks_per_100_usec; 628 int period_giveups;
619 cycles_t period_time; 629 int giveup_limit;
620 long period_requests; 630 long period_requests;
621 struct hub_and_pnode *thp; 631 struct hub_and_pnode *thp;
622}; 632};
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 31f180c21ce9..74fcb963595b 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -60,6 +60,7 @@
60#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 60#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
61#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 61#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
62#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 62#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
63#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
63 64
64 65
65#define PIN_BASED_EXT_INTR_MASK 0x00000001 66#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -281,6 +282,7 @@ enum vmcs_field {
281#define EXIT_REASON_EPT_MISCONFIG 49 282#define EXIT_REASON_EPT_MISCONFIG 49
282#define EXIT_REASON_WBINVD 54 283#define EXIT_REASON_WBINVD 54
283#define EXIT_REASON_XSETBV 55 284#define EXIT_REASON_XSETBV 55
285#define EXIT_REASON_INVPCID 58
284 286
285/* 287/*
286 * Interruption-information format 288 * Interruption-information format
@@ -404,6 +406,7 @@ enum vmcs_field {
404#define VMX_EPTP_WB_BIT (1ull << 14) 406#define VMX_EPTP_WB_BIT (1ull << 14)
405#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 407#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
406#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) 408#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
409#define VMX_EPT_AD_BIT (1ull << 21)
407#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 410#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
408#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 411#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
409#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 412#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -415,11 +418,14 @@ enum vmcs_field {
415#define VMX_EPT_MAX_GAW 0x4 418#define VMX_EPT_MAX_GAW 0x4
416#define VMX_EPT_MT_EPTE_SHIFT 3 419#define VMX_EPT_MT_EPTE_SHIFT 3
417#define VMX_EPT_GAW_EPTP_SHIFT 3 420#define VMX_EPT_GAW_EPTP_SHIFT 3
421#define VMX_EPT_AD_ENABLE_BIT (1ull << 6)
418#define VMX_EPT_DEFAULT_MT 0x6ull 422#define VMX_EPT_DEFAULT_MT 0x6ull
419#define VMX_EPT_READABLE_MASK 0x1ull 423#define VMX_EPT_READABLE_MASK 0x1ull
420#define VMX_EPT_WRITABLE_MASK 0x2ull 424#define VMX_EPT_WRITABLE_MASK 0x2ull
421#define VMX_EPT_EXECUTABLE_MASK 0x4ull 425#define VMX_EPT_EXECUTABLE_MASK 0x4ull
422#define VMX_EPT_IPAT_BIT (1ull << 6) 426#define VMX_EPT_IPAT_BIT (1ull << 6)
427#define VMX_EPT_ACCESS_BIT (1ull << 8)
428#define VMX_EPT_DIRTY_BIT (1ull << 9)
423 429
424#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 430#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
425 431
diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
index 92e54abf89e0..f90f0a587c66 100644
--- a/arch/x86/include/asm/x2apic.h
+++ b/arch/x86/include/asm/x2apic.h
@@ -9,15 +9,6 @@
9#include <asm/ipi.h> 9#include <asm/ipi.h>
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11 11
12/*
13 * Need to use more than cpu 0, because we need more vectors
14 * when MSI-X are used.
15 */
16static const struct cpumask *x2apic_target_cpus(void)
17{
18 return cpu_online_mask;
19}
20
21static int x2apic_apic_id_valid(int apicid) 12static int x2apic_apic_id_valid(int apicid)
22{ 13{
23 return 1; 14 return 1;
@@ -28,15 +19,6 @@ static int x2apic_apic_id_registered(void)
28 return 1; 19 return 1;
29} 20}
30 21
31/*
32 * For now each logical cpu is in its own vector allocation domain.
33 */
34static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
35{
36 cpumask_clear(retmask);
37 cpumask_set_cpu(cpu, retmask);
38}
39
40static void 22static void
41__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) 23__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
42{ 24{
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index c090af10ac7d..38155f667144 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -156,7 +156,6 @@ struct x86_cpuinit_ops {
156/** 156/**
157 * struct x86_platform_ops - platform specific runtime functions 157 * struct x86_platform_ops - platform specific runtime functions
158 * @calibrate_tsc: calibrate TSC 158 * @calibrate_tsc: calibrate TSC
159 * @wallclock_init: init the wallclock device
160 * @get_wallclock: get time from HW clock like RTC etc. 159 * @get_wallclock: get time from HW clock like RTC etc.
161 * @set_wallclock: set time back to HW clock 160 * @set_wallclock: set time back to HW clock
162 * @is_untracked_pat_range exclude from PAT logic 161 * @is_untracked_pat_range exclude from PAT logic
@@ -164,10 +163,10 @@ struct x86_cpuinit_ops {
164 * @i8042_detect pre-detect if i8042 controller exists 163 * @i8042_detect pre-detect if i8042 controller exists
165 * @save_sched_clock_state: save state for sched_clock() on suspend 164 * @save_sched_clock_state: save state for sched_clock() on suspend
166 * @restore_sched_clock_state: restore state for sched_clock() on resume 165 * @restore_sched_clock_state: restore state for sched_clock() on resume
166 * @apic_post_init: adjust apic if neeeded
167 */ 167 */
168struct x86_platform_ops { 168struct x86_platform_ops {
169 unsigned long (*calibrate_tsc)(void); 169 unsigned long (*calibrate_tsc)(void);
170 void (*wallclock_init)(void);
171 unsigned long (*get_wallclock)(void); 170 unsigned long (*get_wallclock)(void);
172 int (*set_wallclock)(unsigned long nowtime); 171 int (*set_wallclock)(unsigned long nowtime);
173 void (*iommu_shutdown)(void); 172 void (*iommu_shutdown)(void);
@@ -177,6 +176,7 @@ struct x86_platform_ops {
177 int (*i8042_detect)(void); 176 int (*i8042_detect)(void);
178 void (*save_sched_clock_state)(void); 177 void (*save_sched_clock_state)(void);
179 void (*restore_sched_clock_state)(void); 178 void (*restore_sched_clock_state)(void);
179 void (*apic_post_init)(void);
180}; 180};
181 181
182struct pci_dev; 182struct pci_dev;
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8afb69319815..b2297e58c6ed 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -422,12 +422,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
422 return 0; 422 return 0;
423 } 423 }
424 424
425 if (intsrc->source_irq == 0 && intsrc->global_irq == 2) { 425 if (intsrc->source_irq == 0) {
426 if (acpi_skip_timer_override) { 426 if (acpi_skip_timer_override) {
427 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); 427 printk(PREFIX "BIOS IRQ0 override ignored.\n");
428 return 0; 428 return 0;
429 } 429 }
430 if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { 430
431 if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
432 && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
431 intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; 433 intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
432 printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); 434 printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
433 } 435 }
@@ -1334,17 +1336,12 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
1334} 1336}
1335 1337
1336/* 1338/*
1337 * Force ignoring BIOS IRQ0 pin2 override 1339 * Force ignoring BIOS IRQ0 override
1338 */ 1340 */
1339static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) 1341static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
1340{ 1342{
1341 /*
1342 * The ati_ixp4x0_rev() early PCI quirk should have set
1343 * the acpi_skip_timer_override flag already:
1344 */
1345 if (!acpi_skip_timer_override) { 1343 if (!acpi_skip_timer_override) {
1346 WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); 1344 pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
1347 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
1348 d->ident); 1345 d->ident);
1349 acpi_skip_timer_override = 1; 1346 acpi_skip_timer_override = 1;
1350 } 1347 }
@@ -1438,7 +1435,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
1438 * is enabled. This input is incorrectly designated the 1435 * is enabled. This input is incorrectly designated the
1439 * ISA IRQ 0 via an interrupt source override even though 1436 * ISA IRQ 0 via an interrupt source override even though
1440 * it is wired to the output of the master 8259A and INTIN0 1437 * it is wired to the output of the master 8259A and INTIN0
1441 * is not connected at all. Force ignoring BIOS IRQ0 pin2 1438 * is not connected at all. Force ignoring BIOS IRQ0
1442 * override in that cases. 1439 * override in that cases.
1443 */ 1440 */
1444 { 1441 {
@@ -1473,6 +1470,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
1473 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), 1470 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
1474 }, 1471 },
1475 }, 1472 },
1473 {
1474 .callback = dmi_ignore_irq0_timer_override,
1475 .ident = "FUJITSU SIEMENS",
1476 .matches = {
1477 DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
1478 DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
1479 },
1480 },
1476 {} 1481 {}
1477}; 1482};
1478 1483
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1f84794f0759..931280ff8299 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) "SMP alternatives: " fmt
2
1#include <linux/module.h> 3#include <linux/module.h>
2#include <linux/sched.h> 4#include <linux/sched.h>
3#include <linux/mutex.h> 5#include <linux/mutex.h>
@@ -63,8 +65,11 @@ static int __init setup_noreplace_paravirt(char *str)
63__setup("noreplace-paravirt", setup_noreplace_paravirt); 65__setup("noreplace-paravirt", setup_noreplace_paravirt);
64#endif 66#endif
65 67
66#define DPRINTK(fmt, args...) if (debug_alternative) \ 68#define DPRINTK(fmt, ...) \
67 printk(KERN_DEBUG fmt, args) 69do { \
70 if (debug_alternative) \
71 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
72} while (0)
68 73
69/* 74/*
70 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes 75 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
@@ -428,7 +433,7 @@ void alternatives_smp_switch(int smp)
428 * If this still occurs then you should see a hang 433 * If this still occurs then you should see a hang
429 * or crash shortly after this line: 434 * or crash shortly after this line:
430 */ 435 */
431 printk("lockdep: fixing up alternatives.\n"); 436 pr_info("lockdep: fixing up alternatives\n");
432#endif 437#endif
433 438
434 if (noreplace_smp || smp_alt_once || skip_smp_alternatives) 439 if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
@@ -444,14 +449,14 @@ void alternatives_smp_switch(int smp)
444 if (smp == smp_mode) { 449 if (smp == smp_mode) {
445 /* nothing */ 450 /* nothing */
446 } else if (smp) { 451 } else if (smp) {
447 printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); 452 pr_info("switching to SMP code\n");
448 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 453 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
449 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 454 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
450 list_for_each_entry(mod, &smp_alt_modules, next) 455 list_for_each_entry(mod, &smp_alt_modules, next)
451 alternatives_smp_lock(mod->locks, mod->locks_end, 456 alternatives_smp_lock(mod->locks, mod->locks_end,
452 mod->text, mod->text_end); 457 mod->text, mod->text_end);
453 } else { 458 } else {
454 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 459 pr_info("switching to UP code\n");
455 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 460 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
456 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 461 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
457 list_for_each_entry(mod, &smp_alt_modules, next) 462 list_for_each_entry(mod, &smp_alt_modules, next)
@@ -546,7 +551,7 @@ void __init alternative_instructions(void)
546#ifdef CONFIG_SMP 551#ifdef CONFIG_SMP
547 if (smp_alt_once) { 552 if (smp_alt_once) {
548 if (1 == num_possible_cpus()) { 553 if (1 == num_possible_cpus()) {
549 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 554 pr_info("switching to UP code\n");
550 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 555 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
551 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 556 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
552 557
@@ -664,7 +669,7 @@ static int __kprobes stop_machine_text_poke(void *data)
664 struct text_poke_param *p; 669 struct text_poke_param *p;
665 int i; 670 int i;
666 671
667 if (atomic_dec_and_test(&stop_machine_first)) { 672 if (atomic_xchg(&stop_machine_first, 0)) {
668 for (i = 0; i < tpp->nparams; i++) { 673 for (i = 0; i < tpp->nparams; i++) {
669 p = &tpp->params[i]; 674 p = &tpp->params[i];
670 text_poke(p->addr, p->opcode, p->len); 675 text_poke(p->addr, p->opcode, p->len);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index be16854591cc..aadf3359e2a7 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -2,6 +2,9 @@
2 * Shared support code for AMD K8 northbridges and derivates. 2 * Shared support code for AMD K8 northbridges and derivates.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */ 4 */
5
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
5#include <linux/types.h> 8#include <linux/types.h>
6#include <linux/slab.h> 9#include <linux/slab.h>
7#include <linux/init.h> 10#include <linux/init.h>
@@ -16,6 +19,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, 19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, 21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
22 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
19 {} 23 {}
20}; 24};
21EXPORT_SYMBOL(amd_nb_misc_ids); 25EXPORT_SYMBOL(amd_nb_misc_ids);
@@ -258,7 +262,7 @@ void amd_flush_garts(void)
258 } 262 }
259 spin_unlock_irqrestore(&gart_lock, flags); 263 spin_unlock_irqrestore(&gart_lock, flags);
260 if (!flushed) 264 if (!flushed)
261 printk("nothing to flush?\n"); 265 pr_notice("nothing to flush?\n");
262} 266}
263EXPORT_SYMBOL_GPL(amd_flush_garts); 267EXPORT_SYMBOL_GPL(amd_flush_garts);
264 268
@@ -269,11 +273,10 @@ static __init int init_amd_nbs(void)
269 err = amd_cache_northbridges(); 273 err = amd_cache_northbridges();
270 274
271 if (err < 0) 275 if (err < 0)
272 printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); 276 pr_notice("Cannot enumerate AMD northbridges\n");
273 277
274 if (amd_cache_gart() < 0) 278 if (amd_cache_gart() < 0)
275 printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " 279 pr_notice("Cannot initialize GART flush words, GART support disabled\n");
276 "GART support disabled.\n");
277 280
278 return err; 281 return err;
279} 282}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 39a222e094af..24deb3082328 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -75,8 +75,8 @@ physid_mask_t phys_cpu_present_map;
75/* 75/*
76 * Map cpu index to physical APIC ID 76 * Map cpu index to physical APIC ID
77 */ 77 */
78DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); 78DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
79DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); 79DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);
80EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); 80EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
81EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); 81EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
82 82
@@ -88,7 +88,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
88 * used for the mapping. This is where the behaviors of x86_64 and 32 88 * used for the mapping. This is where the behaviors of x86_64 and 32
89 * actually diverge. Let's keep it ugly for now. 89 * actually diverge. Let's keep it ugly for now.
90 */ 90 */
91DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID); 91DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
92 92
93/* 93/*
94 * Knob to control our willingness to enable the local APIC. 94 * Knob to control our willingness to enable the local APIC.
@@ -2123,6 +2123,42 @@ void default_init_apic_ldr(void)
2123 apic_write(APIC_LDR, val); 2123 apic_write(APIC_LDR, val);
2124} 2124}
2125 2125
2126int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
2127 const struct cpumask *andmask,
2128 unsigned int *apicid)
2129{
2130 unsigned int cpu;
2131
2132 for_each_cpu_and(cpu, cpumask, andmask) {
2133 if (cpumask_test_cpu(cpu, cpu_online_mask))
2134 break;
2135 }
2136
2137 if (likely(cpu < nr_cpu_ids)) {
2138 *apicid = per_cpu(x86_cpu_to_apicid, cpu);
2139 return 0;
2140 }
2141
2142 return -EINVAL;
2143}
2144
2145/*
2146 * Override the generic EOI implementation with an optimized version.
2147 * Only called during early boot when only one CPU is active and with
2148 * interrupts disabled, so we know this does not race with actual APIC driver
2149 * use.
2150 */
2151void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
2152{
2153 struct apic **drv;
2154
2155 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
2156 /* Should happen once for each apic */
2157 WARN_ON((*drv)->eoi_write == eoi_write);
2158 (*drv)->eoi_write = eoi_write;
2159 }
2160}
2161
2126/* 2162/*
2127 * Power management 2163 * Power management
2128 */ 2164 */
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 0e881c46e8c8..00c77cf78e9e 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -36,25 +36,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
36 return 1; 36 return 1;
37} 37}
38 38
39static const struct cpumask *flat_target_cpus(void)
40{
41 return cpu_online_mask;
42}
43
44static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
45{
46 /* Careful. Some cpus do not strictly honor the set of cpus
47 * specified in the interrupt destination when using lowest
48 * priority interrupt delivery mode.
49 *
50 * In particular there was a hyperthreading cpu observed to
51 * deliver interrupts to the wrong hyperthread when only one
52 * hyperthread was specified in the interrupt desitination.
53 */
54 cpumask_clear(retmask);
55 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
56}
57
58/* 39/*
59 * Set up the logical destination ID. 40 * Set up the logical destination ID.
60 * 41 *
@@ -92,7 +73,7 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
92} 73}
93 74
94static void 75static void
95 flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) 76flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
96{ 77{
97 unsigned long mask = cpumask_bits(cpumask)[0]; 78 unsigned long mask = cpumask_bits(cpumask)[0];
98 int cpu = smp_processor_id(); 79 int cpu = smp_processor_id();
@@ -186,7 +167,7 @@ static struct apic apic_flat = {
186 .irq_delivery_mode = dest_LowestPrio, 167 .irq_delivery_mode = dest_LowestPrio,
187 .irq_dest_mode = 1, /* logical */ 168 .irq_dest_mode = 1, /* logical */
188 169
189 .target_cpus = flat_target_cpus, 170 .target_cpus = online_target_cpus,
190 .disable_esr = 0, 171 .disable_esr = 0,
191 .dest_logical = APIC_DEST_LOGICAL, 172 .dest_logical = APIC_DEST_LOGICAL,
192 .check_apicid_used = NULL, 173 .check_apicid_used = NULL,
@@ -210,8 +191,7 @@ static struct apic apic_flat = {
210 .set_apic_id = set_apic_id, 191 .set_apic_id = set_apic_id,
211 .apic_id_mask = 0xFFu << 24, 192 .apic_id_mask = 0xFFu << 24,
212 193
213 .cpu_mask_to_apicid = default_cpu_mask_to_apicid, 194 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
214 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
215 195
216 .send_IPI_mask = flat_send_IPI_mask, 196 .send_IPI_mask = flat_send_IPI_mask,
217 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, 197 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
@@ -262,17 +242,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
262 return 0; 242 return 0;
263} 243}
264 244
265static const struct cpumask *physflat_target_cpus(void)
266{
267 return cpu_online_mask;
268}
269
270static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
271{
272 cpumask_clear(retmask);
273 cpumask_set_cpu(cpu, retmask);
274}
275
276static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) 245static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
277{ 246{
278 default_send_IPI_mask_sequence_phys(cpumask, vector); 247 default_send_IPI_mask_sequence_phys(cpumask, vector);
@@ -294,38 +263,6 @@ static void physflat_send_IPI_all(int vector)
294 physflat_send_IPI_mask(cpu_online_mask, vector); 263 physflat_send_IPI_mask(cpu_online_mask, vector);
295} 264}
296 265
297static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
298{
299 int cpu;
300
301 /*
302 * We're using fixed IRQ delivery, can only return one phys APIC ID.
303 * May as well be the first.
304 */
305 cpu = cpumask_first(cpumask);
306 if ((unsigned)cpu < nr_cpu_ids)
307 return per_cpu(x86_cpu_to_apicid, cpu);
308 else
309 return BAD_APICID;
310}
311
312static unsigned int
313physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
314 const struct cpumask *andmask)
315{
316 int cpu;
317
318 /*
319 * We're using fixed IRQ delivery, can only return one phys APIC ID.
320 * May as well be the first.
321 */
322 for_each_cpu_and(cpu, cpumask, andmask) {
323 if (cpumask_test_cpu(cpu, cpu_online_mask))
324 break;
325 }
326 return per_cpu(x86_cpu_to_apicid, cpu);
327}
328
329static int physflat_probe(void) 266static int physflat_probe(void)
330{ 267{
331 if (apic == &apic_physflat || num_possible_cpus() > 8) 268 if (apic == &apic_physflat || num_possible_cpus() > 8)
@@ -345,13 +282,13 @@ static struct apic apic_physflat = {
345 .irq_delivery_mode = dest_Fixed, 282 .irq_delivery_mode = dest_Fixed,
346 .irq_dest_mode = 0, /* physical */ 283 .irq_dest_mode = 0, /* physical */
347 284
348 .target_cpus = physflat_target_cpus, 285 .target_cpus = online_target_cpus,
349 .disable_esr = 0, 286 .disable_esr = 0,
350 .dest_logical = 0, 287 .dest_logical = 0,
351 .check_apicid_used = NULL, 288 .check_apicid_used = NULL,
352 .check_apicid_present = NULL, 289 .check_apicid_present = NULL,
353 290
354 .vector_allocation_domain = physflat_vector_allocation_domain, 291 .vector_allocation_domain = default_vector_allocation_domain,
355 /* not needed, but shouldn't hurt: */ 292 /* not needed, but shouldn't hurt: */
356 .init_apic_ldr = flat_init_apic_ldr, 293 .init_apic_ldr = flat_init_apic_ldr,
357 294
@@ -370,8 +307,7 @@ static struct apic apic_physflat = {
370 .set_apic_id = set_apic_id, 307 .set_apic_id = set_apic_id,
371 .apic_id_mask = 0xFFu << 24, 308 .apic_id_mask = 0xFFu << 24,
372 309
373 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 310 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
374 .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
375 311
376 .send_IPI_mask = physflat_send_IPI_mask, 312 .send_IPI_mask = physflat_send_IPI_mask,
377 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, 313 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index a6e4c6e06c08..e145f28b4099 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -100,12 +100,12 @@ static unsigned long noop_check_apicid_present(int bit)
100 return physid_isset(bit, phys_cpu_present_map); 100 return physid_isset(bit, phys_cpu_present_map);
101} 101}
102 102
103static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) 103static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask,
104 const struct cpumask *mask)
104{ 105{
105 if (cpu != 0) 106 if (cpu != 0)
106 pr_warning("APIC: Vector allocated for non-BSP cpu\n"); 107 pr_warning("APIC: Vector allocated for non-BSP cpu\n");
107 cpumask_clear(retmask); 108 cpumask_copy(retmask, cpumask_of(cpu));
108 cpumask_set_cpu(cpu, retmask);
109} 109}
110 110
111static u32 noop_apic_read(u32 reg) 111static u32 noop_apic_read(u32 reg)
@@ -159,8 +159,7 @@ struct apic apic_noop = {
159 .set_apic_id = NULL, 159 .set_apic_id = NULL,
160 .apic_id_mask = 0x0F << 24, 160 .apic_id_mask = 0x0F << 24,
161 161
162 .cpu_mask_to_apicid = default_cpu_mask_to_apicid, 162 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
163 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
164 163
165 .send_IPI_mask = noop_send_IPI_mask, 164 .send_IPI_mask = noop_send_IPI_mask,
166 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, 165 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 6ec6d5d297c3..bc552cff2578 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -72,17 +72,6 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
72 return initial_apic_id >> index_msb; 72 return initial_apic_id >> index_msb;
73} 73}
74 74
75static const struct cpumask *numachip_target_cpus(void)
76{
77 return cpu_online_mask;
78}
79
80static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask)
81{
82 cpumask_clear(retmask);
83 cpumask_set_cpu(cpu, retmask);
84}
85
86static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) 75static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
87{ 76{
88 union numachip_csr_g3_ext_irq_gen int_gen; 77 union numachip_csr_g3_ext_irq_gen int_gen;
@@ -157,38 +146,6 @@ static void numachip_send_IPI_self(int vector)
157 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); 146 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
158} 147}
159 148
160static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask)
161{
162 int cpu;
163
164 /*
165 * We're using fixed IRQ delivery, can only return one phys APIC ID.
166 * May as well be the first.
167 */
168 cpu = cpumask_first(cpumask);
169 if (likely((unsigned)cpu < nr_cpu_ids))
170 return per_cpu(x86_cpu_to_apicid, cpu);
171
172 return BAD_APICID;
173}
174
175static unsigned int
176numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
177 const struct cpumask *andmask)
178{
179 int cpu;
180
181 /*
182 * We're using fixed IRQ delivery, can only return one phys APIC ID.
183 * May as well be the first.
184 */
185 for_each_cpu_and(cpu, cpumask, andmask) {
186 if (cpumask_test_cpu(cpu, cpu_online_mask))
187 break;
188 }
189 return per_cpu(x86_cpu_to_apicid, cpu);
190}
191
192static int __init numachip_probe(void) 149static int __init numachip_probe(void)
193{ 150{
194 return apic == &apic_numachip; 151 return apic == &apic_numachip;
@@ -253,13 +210,13 @@ static struct apic apic_numachip __refconst = {
253 .irq_delivery_mode = dest_Fixed, 210 .irq_delivery_mode = dest_Fixed,
254 .irq_dest_mode = 0, /* physical */ 211 .irq_dest_mode = 0, /* physical */
255 212
256 .target_cpus = numachip_target_cpus, 213 .target_cpus = online_target_cpus,
257 .disable_esr = 0, 214 .disable_esr = 0,
258 .dest_logical = 0, 215 .dest_logical = 0,
259 .check_apicid_used = NULL, 216 .check_apicid_used = NULL,
260 .check_apicid_present = NULL, 217 .check_apicid_present = NULL,
261 218
262 .vector_allocation_domain = numachip_vector_allocation_domain, 219 .vector_allocation_domain = default_vector_allocation_domain,
263 .init_apic_ldr = flat_init_apic_ldr, 220 .init_apic_ldr = flat_init_apic_ldr,
264 221
265 .ioapic_phys_id_map = NULL, 222 .ioapic_phys_id_map = NULL,
@@ -277,8 +234,7 @@ static struct apic apic_numachip __refconst = {
277 .set_apic_id = set_apic_id, 234 .set_apic_id = set_apic_id,
278 .apic_id_mask = 0xffU << 24, 235 .apic_id_mask = 0xffU << 24,
279 236
280 .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid, 237 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
281 .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and,
282 238
283 .send_IPI_mask = numachip_send_IPI_mask, 239 .send_IPI_mask = numachip_send_IPI_mask,
284 .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, 240 .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 31fbdbfbf960..d50e3640d5ae 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -26,15 +26,6 @@ static int bigsmp_apic_id_registered(void)
26 return 1; 26 return 1;
27} 27}
28 28
29static const struct cpumask *bigsmp_target_cpus(void)
30{
31#ifdef CONFIG_SMP
32 return cpu_online_mask;
33#else
34 return cpumask_of(0);
35#endif
36}
37
38static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) 29static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
39{ 30{
40 return 0; 31 return 0;
@@ -105,32 +96,6 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
105 return 1; 96 return 1;
106} 97}
107 98
108/* As we are using single CPU as destination, pick only one CPU here */
109static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
110{
111 int cpu = cpumask_first(cpumask);
112
113 if (cpu < nr_cpu_ids)
114 return cpu_physical_id(cpu);
115 return BAD_APICID;
116}
117
118static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
119 const struct cpumask *andmask)
120{
121 int cpu;
122
123 /*
124 * We're using fixed IRQ delivery, can only return one phys APIC ID.
125 * May as well be the first.
126 */
127 for_each_cpu_and(cpu, cpumask, andmask) {
128 if (cpumask_test_cpu(cpu, cpu_online_mask))
129 return cpu_physical_id(cpu);
130 }
131 return BAD_APICID;
132}
133
134static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 99static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
135{ 100{
136 return cpuid_apic >> index_msb; 101 return cpuid_apic >> index_msb;
@@ -177,12 +142,6 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
177 { } /* NULL entry stops DMI scanning */ 142 { } /* NULL entry stops DMI scanning */
178}; 143};
179 144
180static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask)
181{
182 cpumask_clear(retmask);
183 cpumask_set_cpu(cpu, retmask);
184}
185
186static int probe_bigsmp(void) 145static int probe_bigsmp(void)
187{ 146{
188 if (def_to_bigsmp) 147 if (def_to_bigsmp)
@@ -205,13 +164,13 @@ static struct apic apic_bigsmp = {
205 /* phys delivery to target CPU: */ 164 /* phys delivery to target CPU: */
206 .irq_dest_mode = 0, 165 .irq_dest_mode = 0,
207 166
208 .target_cpus = bigsmp_target_cpus, 167 .target_cpus = default_target_cpus,
209 .disable_esr = 1, 168 .disable_esr = 1,
210 .dest_logical = 0, 169 .dest_logical = 0,
211 .check_apicid_used = bigsmp_check_apicid_used, 170 .check_apicid_used = bigsmp_check_apicid_used,
212 .check_apicid_present = bigsmp_check_apicid_present, 171 .check_apicid_present = bigsmp_check_apicid_present,
213 172
214 .vector_allocation_domain = bigsmp_vector_allocation_domain, 173 .vector_allocation_domain = default_vector_allocation_domain,
215 .init_apic_ldr = bigsmp_init_apic_ldr, 174 .init_apic_ldr = bigsmp_init_apic_ldr,
216 175
217 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, 176 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
@@ -229,8 +188,7 @@ static struct apic apic_bigsmp = {
229 .set_apic_id = NULL, 188 .set_apic_id = NULL,
230 .apic_id_mask = 0xFF << 24, 189 .apic_id_mask = 0xFF << 24,
231 190
232 .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid, 191 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
233 .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
234 192
235 .send_IPI_mask = bigsmp_send_IPI_mask, 193 .send_IPI_mask = bigsmp_send_IPI_mask,
236 .send_IPI_mask_allbutself = NULL, 194 .send_IPI_mask_allbutself = NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index db4ab1be3c79..0874799a98c6 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -394,21 +394,6 @@ static void es7000_enable_apic_mode(void)
394 WARN(1, "Command failed, status = %x\n", mip_status); 394 WARN(1, "Command failed, status = %x\n", mip_status);
395} 395}
396 396
397static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask)
398{
399 /* Careful. Some cpus do not strictly honor the set of cpus
400 * specified in the interrupt destination when using lowest
401 * priority interrupt delivery mode.
402 *
403 * In particular there was a hyperthreading cpu observed to
404 * deliver interrupts to the wrong hyperthread when only one
405 * hyperthread was specified in the interrupt desitination.
406 */
407 cpumask_clear(retmask);
408 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
409}
410
411
412static void es7000_wait_for_init_deassert(atomic_t *deassert) 397static void es7000_wait_for_init_deassert(atomic_t *deassert)
413{ 398{
414 while (!atomic_read(deassert)) 399 while (!atomic_read(deassert))
@@ -540,45 +525,49 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
540 return 1; 525 return 1;
541} 526}
542 527
543static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) 528static inline int
529es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
544{ 530{
545 unsigned int round = 0; 531 unsigned int round = 0;
546 int cpu, uninitialized_var(apicid); 532 unsigned int cpu, uninitialized_var(apicid);
547 533
548 /* 534 /*
549 * The cpus in the mask must all be on the apic cluster. 535 * The cpus in the mask must all be on the apic cluster.
550 */ 536 */
551 for_each_cpu(cpu, cpumask) { 537 for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
552 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); 538 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
553 539
554 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 540 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
555 WARN(1, "Not a valid mask!"); 541 WARN(1, "Not a valid mask!");
556 542
557 return BAD_APICID; 543 return -EINVAL;
558 } 544 }
559 apicid = new_apicid; 545 apicid |= new_apicid;
560 round++; 546 round++;
561 } 547 }
562 return apicid; 548 if (!round)
549 return -EINVAL;
550 *dest_id = apicid;
551 return 0;
563} 552}
564 553
565static unsigned int 554static int
566es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, 555es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
567 const struct cpumask *andmask) 556 const struct cpumask *andmask,
557 unsigned int *apicid)
568{ 558{
569 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
570 cpumask_var_t cpumask; 559 cpumask_var_t cpumask;
560 *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
571 561
572 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 562 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
573 return apicid; 563 return 0;
574 564
575 cpumask_and(cpumask, inmask, andmask); 565 cpumask_and(cpumask, inmask, andmask);
576 cpumask_and(cpumask, cpumask, cpu_online_mask); 566 es7000_cpu_mask_to_apicid(cpumask, apicid);
577 apicid = es7000_cpu_mask_to_apicid(cpumask);
578 567
579 free_cpumask_var(cpumask); 568 free_cpumask_var(cpumask);
580 569
581 return apicid; 570 return 0;
582} 571}
583 572
584static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) 573static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -638,7 +627,7 @@ static struct apic __refdata apic_es7000_cluster = {
638 .check_apicid_used = es7000_check_apicid_used, 627 .check_apicid_used = es7000_check_apicid_used,
639 .check_apicid_present = es7000_check_apicid_present, 628 .check_apicid_present = es7000_check_apicid_present,
640 629
641 .vector_allocation_domain = es7000_vector_allocation_domain, 630 .vector_allocation_domain = flat_vector_allocation_domain,
642 .init_apic_ldr = es7000_init_apic_ldr_cluster, 631 .init_apic_ldr = es7000_init_apic_ldr_cluster,
643 632
644 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 633 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
@@ -656,7 +645,6 @@ static struct apic __refdata apic_es7000_cluster = {
656 .set_apic_id = NULL, 645 .set_apic_id = NULL,
657 .apic_id_mask = 0xFF << 24, 646 .apic_id_mask = 0xFF << 24,
658 647
659 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
660 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, 648 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
661 649
662 .send_IPI_mask = es7000_send_IPI_mask, 650 .send_IPI_mask = es7000_send_IPI_mask,
@@ -705,7 +693,7 @@ static struct apic __refdata apic_es7000 = {
705 .check_apicid_used = es7000_check_apicid_used, 693 .check_apicid_used = es7000_check_apicid_used,
706 .check_apicid_present = es7000_check_apicid_present, 694 .check_apicid_present = es7000_check_apicid_present,
707 695
708 .vector_allocation_domain = es7000_vector_allocation_domain, 696 .vector_allocation_domain = flat_vector_allocation_domain,
709 .init_apic_ldr = es7000_init_apic_ldr, 697 .init_apic_ldr = es7000_init_apic_ldr,
710 698
711 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 699 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
@@ -723,7 +711,6 @@ static struct apic __refdata apic_es7000 = {
723 .set_apic_id = NULL, 711 .set_apic_id = NULL,
724 .apic_id_mask = 0xFF << 24, 712 .apic_id_mask = 0xFF << 24,
725 713
726 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
727 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, 714 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
728 715
729 .send_IPI_mask = es7000_send_IPI_mask, 716 .send_IPI_mask = es7000_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5f0ff597437c..406eee784684 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -448,8 +448,8 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
448 448
449 entry = alloc_irq_pin_list(node); 449 entry = alloc_irq_pin_list(node);
450 if (!entry) { 450 if (!entry) {
451 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", 451 pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
452 node, apic, pin); 452 node, apic, pin);
453 return -ENOMEM; 453 return -ENOMEM;
454 } 454 }
455 entry->apic = apic; 455 entry->apic = apic;
@@ -661,7 +661,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
661 ioapic_mask_entry(apic, pin); 661 ioapic_mask_entry(apic, pin);
662 entry = ioapic_read_entry(apic, pin); 662 entry = ioapic_read_entry(apic, pin);
663 if (entry.irr) 663 if (entry.irr)
664 printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n", 664 pr_err("Unable to reset IRR for apic: %d, pin :%d\n",
665 mpc_ioapic_id(apic), pin); 665 mpc_ioapic_id(apic), pin);
666} 666}
667 667
@@ -895,7 +895,7 @@ static int irq_polarity(int idx)
895 } 895 }
896 case 2: /* reserved */ 896 case 2: /* reserved */
897 { 897 {
898 printk(KERN_WARNING "broken BIOS!!\n"); 898 pr_warn("broken BIOS!!\n");
899 polarity = 1; 899 polarity = 1;
900 break; 900 break;
901 } 901 }
@@ -906,7 +906,7 @@ static int irq_polarity(int idx)
906 } 906 }
907 default: /* invalid */ 907 default: /* invalid */
908 { 908 {
909 printk(KERN_WARNING "broken BIOS!!\n"); 909 pr_warn("broken BIOS!!\n");
910 polarity = 1; 910 polarity = 1;
911 break; 911 break;
912 } 912 }
@@ -948,7 +948,7 @@ static int irq_trigger(int idx)
948 } 948 }
949 default: 949 default:
950 { 950 {
951 printk(KERN_WARNING "broken BIOS!!\n"); 951 pr_warn("broken BIOS!!\n");
952 trigger = 1; 952 trigger = 1;
953 break; 953 break;
954 } 954 }
@@ -962,7 +962,7 @@ static int irq_trigger(int idx)
962 } 962 }
963 case 2: /* reserved */ 963 case 2: /* reserved */
964 { 964 {
965 printk(KERN_WARNING "broken BIOS!!\n"); 965 pr_warn("broken BIOS!!\n");
966 trigger = 1; 966 trigger = 1;
967 break; 967 break;
968 } 968 }
@@ -973,7 +973,7 @@ static int irq_trigger(int idx)
973 } 973 }
974 default: /* invalid */ 974 default: /* invalid */
975 { 975 {
976 printk(KERN_WARNING "broken BIOS!!\n"); 976 pr_warn("broken BIOS!!\n");
977 trigger = 0; 977 trigger = 0;
978 break; 978 break;
979 } 979 }
@@ -991,7 +991,7 @@ static int pin_2_irq(int idx, int apic, int pin)
991 * Debugging check, we are in big trouble if this message pops up! 991 * Debugging check, we are in big trouble if this message pops up!
992 */ 992 */
993 if (mp_irqs[idx].dstirq != pin) 993 if (mp_irqs[idx].dstirq != pin)
994 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 994 pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
995 995
996 if (test_bit(bus, mp_bus_not_pci)) { 996 if (test_bit(bus, mp_bus_not_pci)) {
997 irq = mp_irqs[idx].srcbusirq; 997 irq = mp_irqs[idx].srcbusirq;
@@ -1112,8 +1112,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1112 * 0x80, because int 0x80 is hm, kind of importantish. ;) 1112 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1113 */ 1113 */
1114 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; 1114 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
1115 static int current_offset = VECTOR_OFFSET_START % 8; 1115 static int current_offset = VECTOR_OFFSET_START % 16;
1116 unsigned int old_vector;
1117 int cpu, err; 1116 int cpu, err;
1118 cpumask_var_t tmp_mask; 1117 cpumask_var_t tmp_mask;
1119 1118
@@ -1123,35 +1122,45 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1123 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) 1122 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
1124 return -ENOMEM; 1123 return -ENOMEM;
1125 1124
1126 old_vector = cfg->vector;
1127 if (old_vector) {
1128 cpumask_and(tmp_mask, mask, cpu_online_mask);
1129 cpumask_and(tmp_mask, cfg->domain, tmp_mask);
1130 if (!cpumask_empty(tmp_mask)) {
1131 free_cpumask_var(tmp_mask);
1132 return 0;
1133 }
1134 }
1135
1136 /* Only try and allocate irqs on cpus that are present */ 1125 /* Only try and allocate irqs on cpus that are present */
1137 err = -ENOSPC; 1126 err = -ENOSPC;
1138 for_each_cpu_and(cpu, mask, cpu_online_mask) { 1127 cpumask_clear(cfg->old_domain);
1139 int new_cpu; 1128 cpu = cpumask_first_and(mask, cpu_online_mask);
1140 int vector, offset; 1129 while (cpu < nr_cpu_ids) {
1130 int new_cpu, vector, offset;
1141 1131
1142 apic->vector_allocation_domain(cpu, tmp_mask); 1132 apic->vector_allocation_domain(cpu, tmp_mask, mask);
1133
1134 if (cpumask_subset(tmp_mask, cfg->domain)) {
1135 err = 0;
1136 if (cpumask_equal(tmp_mask, cfg->domain))
1137 break;
1138 /*
1139 * New cpumask using the vector is a proper subset of
1140 * the current in use mask. So cleanup the vector
1141 * allocation for the members that are not used anymore.
1142 */
1143 cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
1144 cfg->move_in_progress = 1;
1145 cpumask_and(cfg->domain, cfg->domain, tmp_mask);
1146 break;
1147 }
1143 1148
1144 vector = current_vector; 1149 vector = current_vector;
1145 offset = current_offset; 1150 offset = current_offset;
1146next: 1151next:
1147 vector += 8; 1152 vector += 16;
1148 if (vector >= first_system_vector) { 1153 if (vector >= first_system_vector) {
1149 /* If out of vectors on large boxen, must share them. */ 1154 offset = (offset + 1) % 16;
1150 offset = (offset + 1) % 8;
1151 vector = FIRST_EXTERNAL_VECTOR + offset; 1155 vector = FIRST_EXTERNAL_VECTOR + offset;
1152 } 1156 }
1153 if (unlikely(current_vector == vector)) 1157
1158 if (unlikely(current_vector == vector)) {
1159 cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
1160 cpumask_andnot(tmp_mask, mask, cfg->old_domain);
1161 cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
1154 continue; 1162 continue;
1163 }
1155 1164
1156 if (test_bit(vector, used_vectors)) 1165 if (test_bit(vector, used_vectors))
1157 goto next; 1166 goto next;
@@ -1162,7 +1171,7 @@ next:
1162 /* Found one! */ 1171 /* Found one! */
1163 current_vector = vector; 1172 current_vector = vector;
1164 current_offset = offset; 1173 current_offset = offset;
1165 if (old_vector) { 1174 if (cfg->vector) {
1166 cfg->move_in_progress = 1; 1175 cfg->move_in_progress = 1;
1167 cpumask_copy(cfg->old_domain, cfg->domain); 1176 cpumask_copy(cfg->old_domain, cfg->domain);
1168 } 1177 }
@@ -1346,18 +1355,18 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
1346 1355
1347 if (!IO_APIC_IRQ(irq)) 1356 if (!IO_APIC_IRQ(irq))
1348 return; 1357 return;
1349 /*
1350 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1351 * controllers like 8259. Now that IO-APIC can handle this irq, update
1352 * the cfg->domain.
1353 */
1354 if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
1355 apic->vector_allocation_domain(0, cfg->domain);
1356 1358
1357 if (assign_irq_vector(irq, cfg, apic->target_cpus())) 1359 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1358 return; 1360 return;
1359 1361
1360 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 1362 if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(),
1363 &dest)) {
1364 pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
1365 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
1366 __clear_irq_vector(irq, cfg);
1367
1368 return;
1369 }
1361 1370
1362 apic_printk(APIC_VERBOSE,KERN_DEBUG 1371 apic_printk(APIC_VERBOSE,KERN_DEBUG
1363 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1372 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@ -1366,7 +1375,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
1366 cfg->vector, irq, attr->trigger, attr->polarity, dest); 1375 cfg->vector, irq, attr->trigger, attr->polarity, dest);
1367 1376
1368 if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { 1377 if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
1369 pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1378 pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1370 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); 1379 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
1371 __clear_irq_vector(irq, cfg); 1380 __clear_irq_vector(irq, cfg);
1372 1381
@@ -1469,9 +1478,10 @@ void setup_IO_APIC_irq_extra(u32 gsi)
1469 * Set up the timer pin, possibly with the 8259A-master behind. 1478 * Set up the timer pin, possibly with the 8259A-master behind.
1470 */ 1479 */
1471static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, 1480static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
1472 unsigned int pin, int vector) 1481 unsigned int pin, int vector)
1473{ 1482{
1474 struct IO_APIC_route_entry entry; 1483 struct IO_APIC_route_entry entry;
1484 unsigned int dest;
1475 1485
1476 if (irq_remapping_enabled) 1486 if (irq_remapping_enabled)
1477 return; 1487 return;
@@ -1482,9 +1492,13 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
1482 * We use logical delivery to get the timer IRQ 1492 * We use logical delivery to get the timer IRQ
1483 * to the first CPU. 1493 * to the first CPU.
1484 */ 1494 */
1495 if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
1496 apic->target_cpus(), &dest)))
1497 dest = BAD_APICID;
1498
1485 entry.dest_mode = apic->irq_dest_mode; 1499 entry.dest_mode = apic->irq_dest_mode;
1486 entry.mask = 0; /* don't mask IRQ for edge */ 1500 entry.mask = 0; /* don't mask IRQ for edge */
1487 entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); 1501 entry.dest = dest;
1488 entry.delivery_mode = apic->irq_delivery_mode; 1502 entry.delivery_mode = apic->irq_delivery_mode;
1489 entry.polarity = 0; 1503 entry.polarity = 0;
1490 entry.trigger = 0; 1504 entry.trigger = 0;
@@ -1521,7 +1535,6 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1521 reg_03.raw = io_apic_read(ioapic_idx, 3); 1535 reg_03.raw = io_apic_read(ioapic_idx, 3);
1522 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1536 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1523 1537
1524 printk("\n");
1525 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); 1538 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
1526 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1539 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1527 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1540 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
@@ -1578,7 +1591,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1578 i, 1591 i,
1579 ir_entry->index 1592 ir_entry->index
1580 ); 1593 );
1581 printk("%1d %1d %1d %1d %1d " 1594 pr_cont("%1d %1d %1d %1d %1d "
1582 "%1d %1d %X %02X\n", 1595 "%1d %1d %X %02X\n",
1583 ir_entry->format, 1596 ir_entry->format,
1584 ir_entry->mask, 1597 ir_entry->mask,
@@ -1598,7 +1611,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1598 i, 1611 i,
1599 entry.dest 1612 entry.dest
1600 ); 1613 );
1601 printk("%1d %1d %1d %1d %1d " 1614 pr_cont("%1d %1d %1d %1d %1d "
1602 "%1d %1d %02X\n", 1615 "%1d %1d %02X\n",
1603 entry.mask, 1616 entry.mask,
1604 entry.trigger, 1617 entry.trigger,
@@ -1651,8 +1664,8 @@ __apicdebuginit(void) print_IO_APICs(void)
1651 continue; 1664 continue;
1652 printk(KERN_DEBUG "IRQ%d ", irq); 1665 printk(KERN_DEBUG "IRQ%d ", irq);
1653 for_each_irq_pin(entry, cfg->irq_2_pin) 1666 for_each_irq_pin(entry, cfg->irq_2_pin)
1654 printk("-> %d:%d", entry->apic, entry->pin); 1667 pr_cont("-> %d:%d", entry->apic, entry->pin);
1655 printk("\n"); 1668 pr_cont("\n");
1656 } 1669 }
1657 1670
1658 printk(KERN_INFO ".................................... done.\n"); 1671 printk(KERN_INFO ".................................... done.\n");
@@ -1665,9 +1678,9 @@ __apicdebuginit(void) print_APIC_field(int base)
1665 printk(KERN_DEBUG); 1678 printk(KERN_DEBUG);
1666 1679
1667 for (i = 0; i < 8; i++) 1680 for (i = 0; i < 8; i++)
1668 printk(KERN_CONT "%08x", apic_read(base + i*0x10)); 1681 pr_cont("%08x", apic_read(base + i*0x10));
1669 1682
1670 printk(KERN_CONT "\n"); 1683 pr_cont("\n");
1671} 1684}
1672 1685
1673__apicdebuginit(void) print_local_APIC(void *dummy) 1686__apicdebuginit(void) print_local_APIC(void *dummy)
@@ -1769,7 +1782,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1769 printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); 1782 printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
1770 } 1783 }
1771 } 1784 }
1772 printk("\n"); 1785 pr_cont("\n");
1773} 1786}
1774 1787
1775__apicdebuginit(void) print_local_APICs(int maxcpu) 1788__apicdebuginit(void) print_local_APICs(int maxcpu)
@@ -2065,7 +2078,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
2065 reg_00.raw = io_apic_read(ioapic_idx, 0); 2078 reg_00.raw = io_apic_read(ioapic_idx, 0);
2066 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2079 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2067 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) 2080 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
2068 printk("could not set ID!\n"); 2081 pr_cont("could not set ID!\n");
2069 else 2082 else
2070 apic_printk(APIC_VERBOSE, " ok.\n"); 2083 apic_printk(APIC_VERBOSE, " ok.\n");
2071 } 2084 }
@@ -2210,71 +2223,6 @@ void send_cleanup_vector(struct irq_cfg *cfg)
2210 cfg->move_in_progress = 0; 2223 cfg->move_in_progress = 0;
2211} 2224}
2212 2225
2213static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
2214{
2215 int apic, pin;
2216 struct irq_pin_list *entry;
2217 u8 vector = cfg->vector;
2218
2219 for_each_irq_pin(entry, cfg->irq_2_pin) {
2220 unsigned int reg;
2221
2222 apic = entry->apic;
2223 pin = entry->pin;
2224 /*
2225 * With interrupt-remapping, destination information comes
2226 * from interrupt-remapping table entry.
2227 */
2228 if (!irq_remapped(cfg))
2229 io_apic_write(apic, 0x11 + pin*2, dest);
2230 reg = io_apic_read(apic, 0x10 + pin*2);
2231 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2232 reg |= vector;
2233 io_apic_modify(apic, 0x10 + pin*2, reg);
2234 }
2235}
2236
2237/*
2238 * Either sets data->affinity to a valid value, and returns
2239 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2240 * leaves data->affinity untouched.
2241 */
2242int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2243 unsigned int *dest_id)
2244{
2245 struct irq_cfg *cfg = data->chip_data;
2246
2247 if (!cpumask_intersects(mask, cpu_online_mask))
2248 return -1;
2249
2250 if (assign_irq_vector(data->irq, data->chip_data, mask))
2251 return -1;
2252
2253 cpumask_copy(data->affinity, mask);
2254
2255 *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
2256 return 0;
2257}
2258
2259static int
2260ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2261 bool force)
2262{
2263 unsigned int dest, irq = data->irq;
2264 unsigned long flags;
2265 int ret;
2266
2267 raw_spin_lock_irqsave(&ioapic_lock, flags);
2268 ret = __ioapic_set_affinity(data, mask, &dest);
2269 if (!ret) {
2270 /* Only the high 8 bits are valid. */
2271 dest = SET_APIC_LOGICAL_ID(dest);
2272 __target_IO_APIC_irq(irq, dest, data->chip_data);
2273 }
2274 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2275 return ret;
2276}
2277
2278asmlinkage void smp_irq_move_cleanup_interrupt(void) 2226asmlinkage void smp_irq_move_cleanup_interrupt(void)
2279{ 2227{
2280 unsigned vector, me; 2228 unsigned vector, me;
@@ -2362,6 +2310,87 @@ void irq_force_complete_move(int irq)
2362static inline void irq_complete_move(struct irq_cfg *cfg) { } 2310static inline void irq_complete_move(struct irq_cfg *cfg) { }
2363#endif 2311#endif
2364 2312
2313static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
2314{
2315 int apic, pin;
2316 struct irq_pin_list *entry;
2317 u8 vector = cfg->vector;
2318
2319 for_each_irq_pin(entry, cfg->irq_2_pin) {
2320 unsigned int reg;
2321
2322 apic = entry->apic;
2323 pin = entry->pin;
2324 /*
2325 * With interrupt-remapping, destination information comes
2326 * from interrupt-remapping table entry.
2327 */
2328 if (!irq_remapped(cfg))
2329 io_apic_write(apic, 0x11 + pin*2, dest);
2330 reg = io_apic_read(apic, 0x10 + pin*2);
2331 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2332 reg |= vector;
2333 io_apic_modify(apic, 0x10 + pin*2, reg);
2334 }
2335}
2336
2337/*
2338 * Either sets data->affinity to a valid value, and returns
2339 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2340 * leaves data->affinity untouched.
2341 */
2342int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2343 unsigned int *dest_id)
2344{
2345 struct irq_cfg *cfg = data->chip_data;
2346 unsigned int irq = data->irq;
2347 int err;
2348
2349 if (!config_enabled(CONFIG_SMP))
2350 return -1;
2351
2352 if (!cpumask_intersects(mask, cpu_online_mask))
2353 return -EINVAL;
2354
2355 err = assign_irq_vector(irq, cfg, mask);
2356 if (err)
2357 return err;
2358
2359 err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
2360 if (err) {
2361 if (assign_irq_vector(irq, cfg, data->affinity))
2362 pr_err("Failed to recover vector for irq %d\n", irq);
2363 return err;
2364 }
2365
2366 cpumask_copy(data->affinity, mask);
2367
2368 return 0;
2369}
2370
2371static int
2372ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2373 bool force)
2374{
2375 unsigned int dest, irq = data->irq;
2376 unsigned long flags;
2377 int ret;
2378
2379 if (!config_enabled(CONFIG_SMP))
2380 return -1;
2381
2382 raw_spin_lock_irqsave(&ioapic_lock, flags);
2383 ret = __ioapic_set_affinity(data, mask, &dest);
2384 if (!ret) {
2385 /* Only the high 8 bits are valid. */
2386 dest = SET_APIC_LOGICAL_ID(dest);
2387 __target_IO_APIC_irq(irq, dest, data->chip_data);
2388 ret = IRQ_SET_MASK_OK_NOCOPY;
2389 }
2390 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2391 return ret;
2392}
2393
2365static void ack_apic_edge(struct irq_data *data) 2394static void ack_apic_edge(struct irq_data *data)
2366{ 2395{
2367 irq_complete_move(data->chip_data); 2396 irq_complete_move(data->chip_data);
@@ -2541,9 +2570,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
2541 chip->irq_ack = ir_ack_apic_edge; 2570 chip->irq_ack = ir_ack_apic_edge;
2542 chip->irq_eoi = ir_ack_apic_level; 2571 chip->irq_eoi = ir_ack_apic_level;
2543 2572
2544#ifdef CONFIG_SMP
2545 chip->irq_set_affinity = set_remapped_irq_affinity; 2573 chip->irq_set_affinity = set_remapped_irq_affinity;
2546#endif
2547} 2574}
2548#endif /* CONFIG_IRQ_REMAP */ 2575#endif /* CONFIG_IRQ_REMAP */
2549 2576
@@ -2554,9 +2581,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
2554 .irq_unmask = unmask_ioapic_irq, 2581 .irq_unmask = unmask_ioapic_irq,
2555 .irq_ack = ack_apic_edge, 2582 .irq_ack = ack_apic_edge,
2556 .irq_eoi = ack_apic_level, 2583 .irq_eoi = ack_apic_level,
2557#ifdef CONFIG_SMP
2558 .irq_set_affinity = ioapic_set_affinity, 2584 .irq_set_affinity = ioapic_set_affinity,
2559#endif
2560 .irq_retrigger = ioapic_retrigger_irq, 2585 .irq_retrigger = ioapic_retrigger_irq,
2561}; 2586};
2562 2587
@@ -3038,7 +3063,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3038 if (err) 3063 if (err)
3039 return err; 3064 return err;
3040 3065
3041 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3066 err = apic->cpu_mask_to_apicid_and(cfg->domain,
3067 apic->target_cpus(), &dest);
3068 if (err)
3069 return err;
3042 3070
3043 if (irq_remapped(cfg)) { 3071 if (irq_remapped(cfg)) {
3044 compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); 3072 compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
@@ -3072,7 +3100,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3072 return err; 3100 return err;
3073} 3101}
3074 3102
3075#ifdef CONFIG_SMP
3076static int 3103static int
3077msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) 3104msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3078{ 3105{
@@ -3092,9 +3119,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3092 3119
3093 __write_msi_msg(data->msi_desc, &msg); 3120 __write_msi_msg(data->msi_desc, &msg);
3094 3121
3095 return 0; 3122 return IRQ_SET_MASK_OK_NOCOPY;
3096} 3123}
3097#endif /* CONFIG_SMP */
3098 3124
3099/* 3125/*
3100 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, 3126 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
@@ -3105,9 +3131,7 @@ static struct irq_chip msi_chip = {
3105 .irq_unmask = unmask_msi_irq, 3131 .irq_unmask = unmask_msi_irq,
3106 .irq_mask = mask_msi_irq, 3132 .irq_mask = mask_msi_irq,
3107 .irq_ack = ack_apic_edge, 3133 .irq_ack = ack_apic_edge,
3108#ifdef CONFIG_SMP
3109 .irq_set_affinity = msi_set_affinity, 3134 .irq_set_affinity = msi_set_affinity,
3110#endif
3111 .irq_retrigger = ioapic_retrigger_irq, 3135 .irq_retrigger = ioapic_retrigger_irq,
3112}; 3136};
3113 3137
@@ -3192,7 +3216,6 @@ void native_teardown_msi_irq(unsigned int irq)
3192} 3216}
3193 3217
3194#ifdef CONFIG_DMAR_TABLE 3218#ifdef CONFIG_DMAR_TABLE
3195#ifdef CONFIG_SMP
3196static int 3219static int
3197dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, 3220dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3198 bool force) 3221 bool force)
@@ -3214,19 +3237,15 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3214 3237
3215 dmar_msi_write(irq, &msg); 3238 dmar_msi_write(irq, &msg);
3216 3239
3217 return 0; 3240 return IRQ_SET_MASK_OK_NOCOPY;
3218} 3241}
3219 3242
3220#endif /* CONFIG_SMP */
3221
3222static struct irq_chip dmar_msi_type = { 3243static struct irq_chip dmar_msi_type = {
3223 .name = "DMAR_MSI", 3244 .name = "DMAR_MSI",
3224 .irq_unmask = dmar_msi_unmask, 3245 .irq_unmask = dmar_msi_unmask,
3225 .irq_mask = dmar_msi_mask, 3246 .irq_mask = dmar_msi_mask,
3226 .irq_ack = ack_apic_edge, 3247 .irq_ack = ack_apic_edge,
3227#ifdef CONFIG_SMP
3228 .irq_set_affinity = dmar_msi_set_affinity, 3248 .irq_set_affinity = dmar_msi_set_affinity,
3229#endif
3230 .irq_retrigger = ioapic_retrigger_irq, 3249 .irq_retrigger = ioapic_retrigger_irq,
3231}; 3250};
3232 3251
@@ -3247,7 +3266,6 @@ int arch_setup_dmar_msi(unsigned int irq)
3247 3266
3248#ifdef CONFIG_HPET_TIMER 3267#ifdef CONFIG_HPET_TIMER
3249 3268
3250#ifdef CONFIG_SMP
3251static int hpet_msi_set_affinity(struct irq_data *data, 3269static int hpet_msi_set_affinity(struct irq_data *data,
3252 const struct cpumask *mask, bool force) 3270 const struct cpumask *mask, bool force)
3253{ 3271{
@@ -3267,19 +3285,15 @@ static int hpet_msi_set_affinity(struct irq_data *data,
3267 3285
3268 hpet_msi_write(data->handler_data, &msg); 3286 hpet_msi_write(data->handler_data, &msg);
3269 3287
3270 return 0; 3288 return IRQ_SET_MASK_OK_NOCOPY;
3271} 3289}
3272 3290
3273#endif /* CONFIG_SMP */
3274
3275static struct irq_chip hpet_msi_type = { 3291static struct irq_chip hpet_msi_type = {
3276 .name = "HPET_MSI", 3292 .name = "HPET_MSI",
3277 .irq_unmask = hpet_msi_unmask, 3293 .irq_unmask = hpet_msi_unmask,
3278 .irq_mask = hpet_msi_mask, 3294 .irq_mask = hpet_msi_mask,
3279 .irq_ack = ack_apic_edge, 3295 .irq_ack = ack_apic_edge,
3280#ifdef CONFIG_SMP
3281 .irq_set_affinity = hpet_msi_set_affinity, 3296 .irq_set_affinity = hpet_msi_set_affinity,
3282#endif
3283 .irq_retrigger = ioapic_retrigger_irq, 3297 .irq_retrigger = ioapic_retrigger_irq,
3284}; 3298};
3285 3299
@@ -3314,8 +3328,6 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3314 */ 3328 */
3315#ifdef CONFIG_HT_IRQ 3329#ifdef CONFIG_HT_IRQ
3316 3330
3317#ifdef CONFIG_SMP
3318
3319static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) 3331static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3320{ 3332{
3321 struct ht_irq_msg msg; 3333 struct ht_irq_msg msg;
@@ -3340,25 +3352,23 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3340 return -1; 3352 return -1;
3341 3353
3342 target_ht_irq(data->irq, dest, cfg->vector); 3354 target_ht_irq(data->irq, dest, cfg->vector);
3343 return 0; 3355 return IRQ_SET_MASK_OK_NOCOPY;
3344} 3356}
3345 3357
3346#endif
3347
3348static struct irq_chip ht_irq_chip = { 3358static struct irq_chip ht_irq_chip = {
3349 .name = "PCI-HT", 3359 .name = "PCI-HT",
3350 .irq_mask = mask_ht_irq, 3360 .irq_mask = mask_ht_irq,
3351 .irq_unmask = unmask_ht_irq, 3361 .irq_unmask = unmask_ht_irq,
3352 .irq_ack = ack_apic_edge, 3362 .irq_ack = ack_apic_edge,
3353#ifdef CONFIG_SMP
3354 .irq_set_affinity = ht_set_affinity, 3363 .irq_set_affinity = ht_set_affinity,
3355#endif
3356 .irq_retrigger = ioapic_retrigger_irq, 3364 .irq_retrigger = ioapic_retrigger_irq,
3357}; 3365};
3358 3366
3359int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) 3367int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3360{ 3368{
3361 struct irq_cfg *cfg; 3369 struct irq_cfg *cfg;
3370 struct ht_irq_msg msg;
3371 unsigned dest;
3362 int err; 3372 int err;
3363 3373
3364 if (disable_apic) 3374 if (disable_apic)
@@ -3366,36 +3376,37 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3366 3376
3367 cfg = irq_cfg(irq); 3377 cfg = irq_cfg(irq);
3368 err = assign_irq_vector(irq, cfg, apic->target_cpus()); 3378 err = assign_irq_vector(irq, cfg, apic->target_cpus());
3369 if (!err) { 3379 if (err)
3370 struct ht_irq_msg msg; 3380 return err;
3371 unsigned dest; 3381
3382 err = apic->cpu_mask_to_apicid_and(cfg->domain,
3383 apic->target_cpus(), &dest);
3384 if (err)
3385 return err;
3372 3386
3373 dest = apic->cpu_mask_to_apicid_and(cfg->domain, 3387 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
3374 apic->target_cpus());
3375 3388
3376 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); 3389 msg.address_lo =
3390 HT_IRQ_LOW_BASE |
3391 HT_IRQ_LOW_DEST_ID(dest) |
3392 HT_IRQ_LOW_VECTOR(cfg->vector) |
3393 ((apic->irq_dest_mode == 0) ?
3394 HT_IRQ_LOW_DM_PHYSICAL :
3395 HT_IRQ_LOW_DM_LOGICAL) |
3396 HT_IRQ_LOW_RQEOI_EDGE |
3397 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3398 HT_IRQ_LOW_MT_FIXED :
3399 HT_IRQ_LOW_MT_ARBITRATED) |
3400 HT_IRQ_LOW_IRQ_MASKED;
3377 3401
3378 msg.address_lo = 3402 write_ht_irq_msg(irq, &msg);
3379 HT_IRQ_LOW_BASE |
3380 HT_IRQ_LOW_DEST_ID(dest) |
3381 HT_IRQ_LOW_VECTOR(cfg->vector) |
3382 ((apic->irq_dest_mode == 0) ?
3383 HT_IRQ_LOW_DM_PHYSICAL :
3384 HT_IRQ_LOW_DM_LOGICAL) |
3385 HT_IRQ_LOW_RQEOI_EDGE |
3386 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3387 HT_IRQ_LOW_MT_FIXED :
3388 HT_IRQ_LOW_MT_ARBITRATED) |
3389 HT_IRQ_LOW_IRQ_MASKED;
3390 3403
3391 write_ht_irq_msg(irq, &msg); 3404 irq_set_chip_and_handler_name(irq, &ht_irq_chip,
3405 handle_edge_irq, "edge");
3392 3406
3393 irq_set_chip_and_handler_name(irq, &ht_irq_chip, 3407 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
3394 handle_edge_irq, "edge");
3395 3408
3396 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); 3409 return 0;
3397 }
3398 return err;
3399} 3410}
3400#endif /* CONFIG_HT_IRQ */ 3411#endif /* CONFIG_HT_IRQ */
3401 3412
@@ -3563,7 +3574,8 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id)
3563 3574
3564 /* Sanity check */ 3575 /* Sanity check */
3565 if (reg_00.bits.ID != apic_id) { 3576 if (reg_00.bits.ID != apic_id) {
3566 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); 3577 pr_err("IOAPIC[%d]: Unable to change apic_id!\n",
3578 ioapic);
3567 return -1; 3579 return -1;
3568 } 3580 }
3569 } 3581 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index f00a68cca37a..d661ee95cabf 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -406,16 +406,13 @@ static inline int numaq_check_phys_apicid_present(int phys_apicid)
406 * We use physical apicids here, not logical, so just return the default 406 * We use physical apicids here, not logical, so just return the default
407 * physical broadcast to stop people from breaking us 407 * physical broadcast to stop people from breaking us
408 */ 408 */
409static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask) 409static int
410{
411 return 0x0F;
412}
413
414static inline unsigned int
415numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 410numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
416 const struct cpumask *andmask) 411 const struct cpumask *andmask,
412 unsigned int *apicid)
417{ 413{
418 return 0x0F; 414 *apicid = 0x0F;
415 return 0;
419} 416}
420 417
421/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ 418/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
@@ -441,20 +438,6 @@ static int probe_numaq(void)
441 return found_numaq; 438 return found_numaq;
442} 439}
443 440
444static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask)
445{
446 /* Careful. Some cpus do not strictly honor the set of cpus
447 * specified in the interrupt destination when using lowest
448 * priority interrupt delivery mode.
449 *
450 * In particular there was a hyperthreading cpu observed to
451 * deliver interrupts to the wrong hyperthread when only one
452 * hyperthread was specified in the interrupt desitination.
453 */
454 cpumask_clear(retmask);
455 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
456}
457
458static void numaq_setup_portio_remap(void) 441static void numaq_setup_portio_remap(void)
459{ 442{
460 int num_quads = num_online_nodes(); 443 int num_quads = num_online_nodes();
@@ -491,7 +474,7 @@ static struct apic __refdata apic_numaq = {
491 .check_apicid_used = numaq_check_apicid_used, 474 .check_apicid_used = numaq_check_apicid_used,
492 .check_apicid_present = numaq_check_apicid_present, 475 .check_apicid_present = numaq_check_apicid_present,
493 476
494 .vector_allocation_domain = numaq_vector_allocation_domain, 477 .vector_allocation_domain = flat_vector_allocation_domain,
495 .init_apic_ldr = numaq_init_apic_ldr, 478 .init_apic_ldr = numaq_init_apic_ldr,
496 479
497 .ioapic_phys_id_map = numaq_ioapic_phys_id_map, 480 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
@@ -509,7 +492,6 @@ static struct apic __refdata apic_numaq = {
509 .set_apic_id = NULL, 492 .set_apic_id = NULL,
510 .apic_id_mask = 0x0F << 24, 493 .apic_id_mask = 0x0F << 24,
511 494
512 .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
513 .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and, 495 .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
514 496
515 .send_IPI_mask = numaq_send_IPI_mask, 497 .send_IPI_mask = numaq_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 1b291da09e60..eb35ef9ee63f 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -66,21 +66,6 @@ static void setup_apic_flat_routing(void)
66#endif 66#endif
67} 67}
68 68
69static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
70{
71 /*
72 * Careful. Some cpus do not strictly honor the set of cpus
73 * specified in the interrupt destination when using lowest
74 * priority interrupt delivery mode.
75 *
76 * In particular there was a hyperthreading cpu observed to
77 * deliver interrupts to the wrong hyperthread when only one
78 * hyperthread was specified in the interrupt desitination.
79 */
80 cpumask_clear(retmask);
81 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
82}
83
84/* should be called last. */ 69/* should be called last. */
85static int probe_default(void) 70static int probe_default(void)
86{ 71{
@@ -105,7 +90,7 @@ static struct apic apic_default = {
105 .check_apicid_used = default_check_apicid_used, 90 .check_apicid_used = default_check_apicid_used,
106 .check_apicid_present = default_check_apicid_present, 91 .check_apicid_present = default_check_apicid_present,
107 92
108 .vector_allocation_domain = default_vector_allocation_domain, 93 .vector_allocation_domain = flat_vector_allocation_domain,
109 .init_apic_ldr = default_init_apic_ldr, 94 .init_apic_ldr = default_init_apic_ldr,
110 95
111 .ioapic_phys_id_map = default_ioapic_phys_id_map, 96 .ioapic_phys_id_map = default_ioapic_phys_id_map,
@@ -123,8 +108,7 @@ static struct apic apic_default = {
123 .set_apic_id = NULL, 108 .set_apic_id = NULL,
124 .apic_id_mask = 0x0F << 24, 109 .apic_id_mask = 0x0F << 24,
125 110
126 .cpu_mask_to_apicid = default_cpu_mask_to_apicid, 111 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
127 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
128 112
129 .send_IPI_mask = default_send_IPI_mask_logical, 113 .send_IPI_mask = default_send_IPI_mask_logical,
130 .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, 114 .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
@@ -208,6 +192,9 @@ void __init default_setup_apic_routing(void)
208 192
209 if (apic->setup_apic_routing) 193 if (apic->setup_apic_routing)
210 apic->setup_apic_routing(); 194 apic->setup_apic_routing();
195
196 if (x86_platform.apic_post_init)
197 x86_platform.apic_post_init();
211} 198}
212 199
213void __init generic_apic_probe(void) 200void __init generic_apic_probe(void)
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 3fe986698929..1793dba7a741 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,11 +23,6 @@
23#include <asm/ipi.h> 23#include <asm/ipi.h>
24#include <asm/setup.h> 24#include <asm/setup.h>
25 25
26static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
27{
28 return hard_smp_processor_id() >> index_msb;
29}
30
31/* 26/*
32 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 27 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
33 */ 28 */
@@ -48,10 +43,8 @@ void __init default_setup_apic_routing(void)
48 } 43 }
49 } 44 }
50 45
51 if (is_vsmp_box()) { 46 if (x86_platform.apic_post_init)
52 /* need to update phys_pkg_id */ 47 x86_platform.apic_post_init();
53 apic->phys_pkg_id = apicid_phys_pkg_id;
54 }
55} 48}
56 49
57/* Same for both flat and physical. */ 50/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 659897c00755..77c95c0e1bf7 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -26,6 +26,8 @@
26 * 26 *
27 */ 27 */
28 28
29#define pr_fmt(fmt) "summit: %s: " fmt, __func__
30
29#include <linux/mm.h> 31#include <linux/mm.h>
30#include <linux/init.h> 32#include <linux/init.h>
31#include <asm/io.h> 33#include <asm/io.h>
@@ -235,8 +237,8 @@ static int summit_apic_id_registered(void)
235 237
236static void summit_setup_apic_routing(void) 238static void summit_setup_apic_routing(void)
237{ 239{
238 printk("Enabling APIC mode: Summit. Using %d I/O APICs\n", 240 pr_info("Enabling APIC mode: Summit. Using %d I/O APICs\n",
239 nr_ioapics); 241 nr_ioapics);
240} 242}
241 243
242static int summit_cpu_present_to_apicid(int mps_cpu) 244static int summit_cpu_present_to_apicid(int mps_cpu)
@@ -263,43 +265,48 @@ static int summit_check_phys_apicid_present(int physical_apicid)
263 return 1; 265 return 1;
264} 266}
265 267
266static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) 268static inline int
269summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
267{ 270{
268 unsigned int round = 0; 271 unsigned int round = 0;
269 int cpu, apicid = 0; 272 unsigned int cpu, apicid = 0;
270 273
271 /* 274 /*
272 * The cpus in the mask must all be on the apic cluster. 275 * The cpus in the mask must all be on the apic cluster.
273 */ 276 */
274 for_each_cpu(cpu, cpumask) { 277 for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
275 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); 278 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
276 279
277 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 280 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
278 printk("%s: Not a valid mask!\n", __func__); 281 pr_err("Not a valid mask!\n");
279 return BAD_APICID; 282 return -EINVAL;
280 } 283 }
281 apicid |= new_apicid; 284 apicid |= new_apicid;
282 round++; 285 round++;
283 } 286 }
284 return apicid; 287 if (!round)
288 return -EINVAL;
289 *dest_id = apicid;
290 return 0;
285} 291}
286 292
287static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, 293static int
288 const struct cpumask *andmask) 294summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
295 const struct cpumask *andmask,
296 unsigned int *apicid)
289{ 297{
290 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
291 cpumask_var_t cpumask; 298 cpumask_var_t cpumask;
299 *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
292 300
293 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 301 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
294 return apicid; 302 return 0;
295 303
296 cpumask_and(cpumask, inmask, andmask); 304 cpumask_and(cpumask, inmask, andmask);
297 cpumask_and(cpumask, cpumask, cpu_online_mask); 305 summit_cpu_mask_to_apicid(cpumask, apicid);
298 apicid = summit_cpu_mask_to_apicid(cpumask);
299 306
300 free_cpumask_var(cpumask); 307 free_cpumask_var(cpumask);
301 308
302 return apicid; 309 return 0;
303} 310}
304 311
305/* 312/*
@@ -320,20 +327,6 @@ static int probe_summit(void)
320 return 0; 327 return 0;
321} 328}
322 329
323static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask)
324{
325 /* Careful. Some cpus do not strictly honor the set of cpus
326 * specified in the interrupt destination when using lowest
327 * priority interrupt delivery mode.
328 *
329 * In particular there was a hyperthreading cpu observed to
330 * deliver interrupts to the wrong hyperthread when only one
331 * hyperthread was specified in the interrupt desitination.
332 */
333 cpumask_clear(retmask);
334 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
335}
336
337#ifdef CONFIG_X86_SUMMIT_NUMA 330#ifdef CONFIG_X86_SUMMIT_NUMA
338static struct rio_table_hdr *rio_table_hdr; 331static struct rio_table_hdr *rio_table_hdr;
339static struct scal_detail *scal_devs[MAX_NUMNODES]; 332static struct scal_detail *scal_devs[MAX_NUMNODES];
@@ -355,7 +348,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
355 } 348 }
356 } 349 }
357 if (i == rio_table_hdr->num_rio_dev) { 350 if (i == rio_table_hdr->num_rio_dev) {
358 printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__); 351 pr_err("Couldn't find owner Cyclone for Winnipeg!\n");
359 return last_bus; 352 return last_bus;
360 } 353 }
361 354
@@ -366,7 +359,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
366 } 359 }
367 } 360 }
368 if (i == rio_table_hdr->num_scal_dev) { 361 if (i == rio_table_hdr->num_scal_dev) {
369 printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__); 362 pr_err("Couldn't find owner Twister for Cyclone!\n");
370 return last_bus; 363 return last_bus;
371 } 364 }
372 365
@@ -396,7 +389,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
396 num_buses = 9; 389 num_buses = 9;
397 break; 390 break;
398 default: 391 default:
399 printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__); 392 pr_info("Unsupported Winnipeg type!\n");
400 return last_bus; 393 return last_bus;
401 } 394 }
402 395
@@ -411,13 +404,15 @@ static int build_detail_arrays(void)
411 int i, scal_detail_size, rio_detail_size; 404 int i, scal_detail_size, rio_detail_size;
412 405
413 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { 406 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
414 printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); 407 pr_warn("MAX_NUMNODES too low! Defined as %d, but system has %d nodes\n",
408 MAX_NUMNODES, rio_table_hdr->num_scal_dev);
415 return 0; 409 return 0;
416 } 410 }
417 411
418 switch (rio_table_hdr->version) { 412 switch (rio_table_hdr->version) {
419 default: 413 default:
420 printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version); 414 pr_warn("Invalid Rio Grande Table Version: %d\n",
415 rio_table_hdr->version);
421 return 0; 416 return 0;
422 case 2: 417 case 2:
423 scal_detail_size = 11; 418 scal_detail_size = 11;
@@ -462,7 +457,7 @@ void setup_summit(void)
462 offset = *((unsigned short *)(ptr + offset)); 457 offset = *((unsigned short *)(ptr + offset));
463 } 458 }
464 if (!rio_table_hdr) { 459 if (!rio_table_hdr) {
465 printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__); 460 pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n");
466 return; 461 return;
467 } 462 }
468 463
@@ -509,7 +504,7 @@ static struct apic apic_summit = {
509 .check_apicid_used = summit_check_apicid_used, 504 .check_apicid_used = summit_check_apicid_used,
510 .check_apicid_present = summit_check_apicid_present, 505 .check_apicid_present = summit_check_apicid_present,
511 506
512 .vector_allocation_domain = summit_vector_allocation_domain, 507 .vector_allocation_domain = flat_vector_allocation_domain,
513 .init_apic_ldr = summit_init_apic_ldr, 508 .init_apic_ldr = summit_init_apic_ldr,
514 509
515 .ioapic_phys_id_map = summit_ioapic_phys_id_map, 510 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
@@ -527,7 +522,6 @@ static struct apic apic_summit = {
527 .set_apic_id = NULL, 522 .set_apic_id = NULL,
528 .apic_id_mask = 0xFF << 24, 523 .apic_id_mask = 0xFF << 24,
529 524
530 .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
531 .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and, 525 .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
532 526
533 .send_IPI_mask = summit_send_IPI_mask, 527 .send_IPI_mask = summit_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index ff35cff0e1a7..c88baa4ff0e5 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -81,7 +81,7 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
81} 81}
82 82
83static void 83static void
84 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 84x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
85{ 85{
86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); 86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
87} 87}
@@ -96,36 +96,37 @@ static void x2apic_send_IPI_all(int vector)
96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); 96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
97} 97}
98 98
99static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 99static int
100x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
101 const struct cpumask *andmask,
102 unsigned int *apicid)
100{ 103{
101 /* 104 u32 dest = 0;
102 * We're using fixed IRQ delivery, can only return one logical APIC ID. 105 u16 cluster;
103 * May as well be the first. 106 int i;
104 */
105 int cpu = cpumask_first(cpumask);
106 107
107 if ((unsigned)cpu < nr_cpu_ids) 108 for_each_cpu_and(i, cpumask, andmask) {
108 return per_cpu(x86_cpu_to_logical_apicid, cpu); 109 if (!cpumask_test_cpu(i, cpu_online_mask))
109 else 110 continue;
110 return BAD_APICID; 111 dest = per_cpu(x86_cpu_to_logical_apicid, i);
111} 112 cluster = x2apic_cluster(i);
113 break;
114 }
112 115
113static unsigned int 116 if (!dest)
114x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 117 return -EINVAL;
115 const struct cpumask *andmask)
116{
117 int cpu;
118 118
119 /* 119 for_each_cpu_and(i, cpumask, andmask) {
120 * We're using fixed IRQ delivery, can only return one logical APIC ID. 120 if (!cpumask_test_cpu(i, cpu_online_mask))
121 * May as well be the first. 121 continue;
122 */ 122 if (cluster != x2apic_cluster(i))
123 for_each_cpu_and(cpu, cpumask, andmask) { 123 continue;
124 if (cpumask_test_cpu(cpu, cpu_online_mask)) 124 dest |= per_cpu(x86_cpu_to_logical_apicid, i);
125 break;
126 } 125 }
127 126
128 return per_cpu(x86_cpu_to_logical_apicid, cpu); 127 *apicid = dest;
128
129 return 0;
129} 130}
130 131
131static void init_x2apic_ldr(void) 132static void init_x2apic_ldr(void)
@@ -208,6 +209,32 @@ static int x2apic_cluster_probe(void)
208 return 0; 209 return 0;
209} 210}
210 211
212static const struct cpumask *x2apic_cluster_target_cpus(void)
213{
214 return cpu_all_mask;
215}
216
217/*
218 * Each x2apic cluster is an allocation domain.
219 */
220static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
221 const struct cpumask *mask)
222{
223 /*
224 * To minimize vector pressure, default case of boot, device bringup
225 * etc will use a single cpu for the interrupt destination.
226 *
227 * On explicit migration requests coming from irqbalance etc,
228 * interrupts will be routed to the x2apic cluster (cluster-id
229 * derived from the first cpu in the mask) members specified
230 * in the mask.
231 */
232 if (mask == x2apic_cluster_target_cpus())
233 cpumask_copy(retmask, cpumask_of(cpu));
234 else
235 cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
236}
237
211static struct apic apic_x2apic_cluster = { 238static struct apic apic_x2apic_cluster = {
212 239
213 .name = "cluster x2apic", 240 .name = "cluster x2apic",
@@ -219,13 +246,13 @@ static struct apic apic_x2apic_cluster = {
219 .irq_delivery_mode = dest_LowestPrio, 246 .irq_delivery_mode = dest_LowestPrio,
220 .irq_dest_mode = 1, /* logical */ 247 .irq_dest_mode = 1, /* logical */
221 248
222 .target_cpus = x2apic_target_cpus, 249 .target_cpus = x2apic_cluster_target_cpus,
223 .disable_esr = 0, 250 .disable_esr = 0,
224 .dest_logical = APIC_DEST_LOGICAL, 251 .dest_logical = APIC_DEST_LOGICAL,
225 .check_apicid_used = NULL, 252 .check_apicid_used = NULL,
226 .check_apicid_present = NULL, 253 .check_apicid_present = NULL,
227 254
228 .vector_allocation_domain = x2apic_vector_allocation_domain, 255 .vector_allocation_domain = cluster_vector_allocation_domain,
229 .init_apic_ldr = init_x2apic_ldr, 256 .init_apic_ldr = init_x2apic_ldr,
230 257
231 .ioapic_phys_id_map = NULL, 258 .ioapic_phys_id_map = NULL,
@@ -243,7 +270,6 @@ static struct apic apic_x2apic_cluster = {
243 .set_apic_id = x2apic_set_apic_id, 270 .set_apic_id = x2apic_set_apic_id,
244 .apic_id_mask = 0xFFFFFFFFu, 271 .apic_id_mask = 0xFFFFFFFFu,
245 272
246 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
247 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, 273 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
248 274
249 .send_IPI_mask = x2apic_send_IPI_mask, 275 .send_IPI_mask = x2apic_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index c17e982db275..e03a1e180e81 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -76,38 +76,6 @@ static void x2apic_send_IPI_all(int vector)
76 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); 76 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
77} 77}
78 78
79static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
80{
81 /*
82 * We're using fixed IRQ delivery, can only return one phys APIC ID.
83 * May as well be the first.
84 */
85 int cpu = cpumask_first(cpumask);
86
87 if ((unsigned)cpu < nr_cpu_ids)
88 return per_cpu(x86_cpu_to_apicid, cpu);
89 else
90 return BAD_APICID;
91}
92
93static unsigned int
94x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
95 const struct cpumask *andmask)
96{
97 int cpu;
98
99 /*
100 * We're using fixed IRQ delivery, can only return one phys APIC ID.
101 * May as well be the first.
102 */
103 for_each_cpu_and(cpu, cpumask, andmask) {
104 if (cpumask_test_cpu(cpu, cpu_online_mask))
105 break;
106 }
107
108 return per_cpu(x86_cpu_to_apicid, cpu);
109}
110
111static void init_x2apic_ldr(void) 79static void init_x2apic_ldr(void)
112{ 80{
113} 81}
@@ -131,13 +99,13 @@ static struct apic apic_x2apic_phys = {
131 .irq_delivery_mode = dest_Fixed, 99 .irq_delivery_mode = dest_Fixed,
132 .irq_dest_mode = 0, /* physical */ 100 .irq_dest_mode = 0, /* physical */
133 101
134 .target_cpus = x2apic_target_cpus, 102 .target_cpus = online_target_cpus,
135 .disable_esr = 0, 103 .disable_esr = 0,
136 .dest_logical = 0, 104 .dest_logical = 0,
137 .check_apicid_used = NULL, 105 .check_apicid_used = NULL,
138 .check_apicid_present = NULL, 106 .check_apicid_present = NULL,
139 107
140 .vector_allocation_domain = x2apic_vector_allocation_domain, 108 .vector_allocation_domain = default_vector_allocation_domain,
141 .init_apic_ldr = init_x2apic_ldr, 109 .init_apic_ldr = init_x2apic_ldr,
142 110
143 .ioapic_phys_id_map = NULL, 111 .ioapic_phys_id_map = NULL,
@@ -155,8 +123,7 @@ static struct apic apic_x2apic_phys = {
155 .set_apic_id = x2apic_set_apic_id, 123 .set_apic_id = x2apic_set_apic_id,
156 .apic_id_mask = 0xFFFFFFFFu, 124 .apic_id_mask = 0xFFFFFFFFu,
157 125
158 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 126 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
159 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
160 127
161 .send_IPI_mask = x2apic_send_IPI_mask, 128 .send_IPI_mask = x2apic_send_IPI_mask,
162 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, 129 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index c6d03f7a4401..8cfade9510a4 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -185,17 +185,6 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
185unsigned long sn_rtc_cycles_per_second; 185unsigned long sn_rtc_cycles_per_second;
186EXPORT_SYMBOL(sn_rtc_cycles_per_second); 186EXPORT_SYMBOL(sn_rtc_cycles_per_second);
187 187
188static const struct cpumask *uv_target_cpus(void)
189{
190 return cpu_online_mask;
191}
192
193static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
194{
195 cpumask_clear(retmask);
196 cpumask_set_cpu(cpu, retmask);
197}
198
199static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) 188static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
200{ 189{
201#ifdef CONFIG_SMP 190#ifdef CONFIG_SMP
@@ -280,25 +269,12 @@ static void uv_init_apic_ldr(void)
280{ 269{
281} 270}
282 271
283static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) 272static int
284{
285 /*
286 * We're using fixed IRQ delivery, can only return one phys APIC ID.
287 * May as well be the first.
288 */
289 int cpu = cpumask_first(cpumask);
290
291 if ((unsigned)cpu < nr_cpu_ids)
292 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
293 else
294 return BAD_APICID;
295}
296
297static unsigned int
298uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 273uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
299 const struct cpumask *andmask) 274 const struct cpumask *andmask,
275 unsigned int *apicid)
300{ 276{
301 int cpu; 277 int unsigned cpu;
302 278
303 /* 279 /*
304 * We're using fixed IRQ delivery, can only return one phys APIC ID. 280 * We're using fixed IRQ delivery, can only return one phys APIC ID.
@@ -308,7 +284,13 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
308 if (cpumask_test_cpu(cpu, cpu_online_mask)) 284 if (cpumask_test_cpu(cpu, cpu_online_mask))
309 break; 285 break;
310 } 286 }
311 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; 287
288 if (likely(cpu < nr_cpu_ids)) {
289 *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
290 return 0;
291 }
292
293 return -EINVAL;
312} 294}
313 295
314static unsigned int x2apic_get_apic_id(unsigned long x) 296static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -362,13 +344,13 @@ static struct apic __refdata apic_x2apic_uv_x = {
362 .irq_delivery_mode = dest_Fixed, 344 .irq_delivery_mode = dest_Fixed,
363 .irq_dest_mode = 0, /* physical */ 345 .irq_dest_mode = 0, /* physical */
364 346
365 .target_cpus = uv_target_cpus, 347 .target_cpus = online_target_cpus,
366 .disable_esr = 0, 348 .disable_esr = 0,
367 .dest_logical = APIC_DEST_LOGICAL, 349 .dest_logical = APIC_DEST_LOGICAL,
368 .check_apicid_used = NULL, 350 .check_apicid_used = NULL,
369 .check_apicid_present = NULL, 351 .check_apicid_present = NULL,
370 352
371 .vector_allocation_domain = uv_vector_allocation_domain, 353 .vector_allocation_domain = default_vector_allocation_domain,
372 .init_apic_ldr = uv_init_apic_ldr, 354 .init_apic_ldr = uv_init_apic_ldr,
373 355
374 .ioapic_phys_id_map = NULL, 356 .ioapic_phys_id_map = NULL,
@@ -386,7 +368,6 @@ static struct apic __refdata apic_x2apic_uv_x = {
386 .set_apic_id = set_apic_id, 368 .set_apic_id = set_apic_id,
387 .apic_id_mask = 0xFFFFFFFFu, 369 .apic_id_mask = 0xFFFFFFFFu,
388 370
389 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
390 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, 371 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
391 372
392 .send_IPI_mask = uv_send_IPI_mask, 373 .send_IPI_mask = uv_send_IPI_mask,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 07b0c0db466c..d65464e43503 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -201,6 +201,8 @@
201 * http://www.microsoft.com/whdc/archive/amp_12.mspx] 201 * http://www.microsoft.com/whdc/archive/amp_12.mspx]
202 */ 202 */
203 203
204#define pr_fmt(fmt) "apm: " fmt
205
204#include <linux/module.h> 206#include <linux/module.h>
205 207
206#include <linux/poll.h> 208#include <linux/poll.h>
@@ -485,11 +487,11 @@ static void apm_error(char *str, int err)
485 if (error_table[i].key == err) 487 if (error_table[i].key == err)
486 break; 488 break;
487 if (i < ERROR_COUNT) 489 if (i < ERROR_COUNT)
488 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); 490 pr_notice("%s: %s\n", str, error_table[i].msg);
489 else if (err < 0) 491 else if (err < 0)
490 printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err); 492 pr_notice("%s: linux error code %i\n", str, err);
491 else 493 else
492 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", 494 pr_notice("%s: unknown error code %#2.2x\n",
493 str, err); 495 str, err);
494} 496}
495 497
@@ -1184,7 +1186,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
1184 static int notified; 1186 static int notified;
1185 1187
1186 if (notified++ == 0) 1188 if (notified++ == 0)
1187 printk(KERN_ERR "apm: an event queue overflowed\n"); 1189 pr_err("an event queue overflowed\n");
1188 if (++as->event_tail >= APM_MAX_EVENTS) 1190 if (++as->event_tail >= APM_MAX_EVENTS)
1189 as->event_tail = 0; 1191 as->event_tail = 0;
1190 } 1192 }
@@ -1447,7 +1449,7 @@ static void apm_mainloop(void)
1447static int check_apm_user(struct apm_user *as, const char *func) 1449static int check_apm_user(struct apm_user *as, const char *func)
1448{ 1450{
1449 if (as == NULL || as->magic != APM_BIOS_MAGIC) { 1451 if (as == NULL || as->magic != APM_BIOS_MAGIC) {
1450 printk(KERN_ERR "apm: %s passed bad filp\n", func); 1452 pr_err("%s passed bad filp\n", func);
1451 return 1; 1453 return 1;
1452 } 1454 }
1453 return 0; 1455 return 0;
@@ -1586,7 +1588,7 @@ static int do_release(struct inode *inode, struct file *filp)
1586 as1 = as1->next) 1588 as1 = as1->next)
1587 ; 1589 ;
1588 if (as1 == NULL) 1590 if (as1 == NULL)
1589 printk(KERN_ERR "apm: filp not in user list\n"); 1591 pr_err("filp not in user list\n");
1590 else 1592 else
1591 as1->next = as->next; 1593 as1->next = as->next;
1592 } 1594 }
@@ -1600,11 +1602,9 @@ static int do_open(struct inode *inode, struct file *filp)
1600 struct apm_user *as; 1602 struct apm_user *as;
1601 1603
1602 as = kmalloc(sizeof(*as), GFP_KERNEL); 1604 as = kmalloc(sizeof(*as), GFP_KERNEL);
1603 if (as == NULL) { 1605 if (as == NULL)
1604 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1605 sizeof(*as));
1606 return -ENOMEM; 1606 return -ENOMEM;
1607 } 1607
1608 as->magic = APM_BIOS_MAGIC; 1608 as->magic = APM_BIOS_MAGIC;
1609 as->event_tail = as->event_head = 0; 1609 as->event_tail = as->event_head = 0;
1610 as->suspends_pending = as->standbys_pending = 0; 1610 as->suspends_pending = as->standbys_pending = 0;
@@ -2313,16 +2313,16 @@ static int __init apm_init(void)
2313 } 2313 }
2314 2314
2315 if (apm_info.disabled) { 2315 if (apm_info.disabled) {
2316 printk(KERN_NOTICE "apm: disabled on user request.\n"); 2316 pr_notice("disabled on user request.\n");
2317 return -ENODEV; 2317 return -ENODEV;
2318 } 2318 }
2319 if ((num_online_cpus() > 1) && !power_off && !smp) { 2319 if ((num_online_cpus() > 1) && !power_off && !smp) {
2320 printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); 2320 pr_notice("disabled - APM is not SMP safe.\n");
2321 apm_info.disabled = 1; 2321 apm_info.disabled = 1;
2322 return -ENODEV; 2322 return -ENODEV;
2323 } 2323 }
2324 if (!acpi_disabled) { 2324 if (!acpi_disabled) {
2325 printk(KERN_NOTICE "apm: overridden by ACPI.\n"); 2325 pr_notice("overridden by ACPI.\n");
2326 apm_info.disabled = 1; 2326 apm_info.disabled = 1;
2327 return -ENODEV; 2327 return -ENODEV;
2328 } 2328 }
@@ -2356,8 +2356,7 @@ static int __init apm_init(void)
2356 2356
2357 kapmd_task = kthread_create(apm, NULL, "kapmd"); 2357 kapmd_task = kthread_create(apm, NULL, "kapmd");
2358 if (IS_ERR(kapmd_task)) { 2358 if (IS_ERR(kapmd_task)) {
2359 printk(KERN_ERR "apm: disabled - Unable to start kernel " 2359 pr_err("disabled - Unable to start kernel thread\n");
2360 "thread.\n");
2361 err = PTR_ERR(kapmd_task); 2360 err = PTR_ERR(kapmd_task);
2362 kapmd_task = NULL; 2361 kapmd_task = NULL;
2363 remove_proc_entry("apm", NULL); 2362 remove_proc_entry("apm", NULL);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 6ab6aa2fdfdd..d30a6a9a0121 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
14 14
15obj-y := intel_cacheinfo.o scattered.o topology.o 15obj-y := intel_cacheinfo.o scattered.o topology.o
16obj-y += proc.o capflags.o powerflags.o common.o 16obj-y += proc.o capflags.o powerflags.o common.o
17obj-y += vmware.o hypervisor.o sched.o mshyperv.o 17obj-y += vmware.o hypervisor.o mshyperv.o
18obj-y += rdrand.o 18obj-y += rdrand.o
19obj-y += match.o 19obj-y += match.o
20 20
@@ -32,7 +32,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
32 32
33ifdef CONFIG_PERF_EVENTS 33ifdef CONFIG_PERF_EVENTS
34obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o 34obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o
35obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o 35obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o
36obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
37obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o
36endif 38endif
37 39
38obj-$(CONFIG_X86_MCE) += mcheck/ 40obj-$(CONFIG_X86_MCE) += mcheck/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 146bb6218eec..9d92e19039f0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -19,6 +19,39 @@
19 19
20#include "cpu.h" 20#include "cpu.h"
21 21
22static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
23{
24 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
25 u32 gprs[8] = { 0 };
26 int err;
27
28 WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__);
29
30 gprs[1] = msr;
31 gprs[7] = 0x9c5a203a;
32
33 err = rdmsr_safe_regs(gprs);
34
35 *p = gprs[0] | ((u64)gprs[2] << 32);
36
37 return err;
38}
39
40static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
41{
42 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
43 u32 gprs[8] = { 0 };
44
45 WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__);
46
47 gprs[0] = (u32)val;
48 gprs[1] = msr;
49 gprs[2] = val >> 32;
50 gprs[7] = 0x9c5a203a;
51
52 return wrmsr_safe_regs(gprs);
53}
54
22#ifdef CONFIG_X86_32 55#ifdef CONFIG_X86_32
23/* 56/*
24 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 57 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
@@ -586,9 +619,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
586 !cpu_has(c, X86_FEATURE_TOPOEXT)) { 619 !cpu_has(c, X86_FEATURE_TOPOEXT)) {
587 u64 val; 620 u64 val;
588 621
589 if (!rdmsrl_amd_safe(0xc0011005, &val)) { 622 if (!rdmsrl_safe(0xc0011005, &val)) {
590 val |= 1ULL << 54; 623 val |= 1ULL << 54;
591 wrmsrl_amd_safe(0xc0011005, val); 624 wrmsrl_safe(0xc0011005, val);
592 rdmsrl(0xc0011005, val); 625 rdmsrl(0xc0011005, val);
593 if (val & (1ULL << 54)) { 626 if (val & (1ULL << 54)) {
594 set_cpu_cap(c, X86_FEATURE_TOPOEXT); 627 set_cpu_cap(c, X86_FEATURE_TOPOEXT);
@@ -679,7 +712,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
679 err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); 712 err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
680 if (err == 0) { 713 if (err == 0) {
681 mask |= (1 << 10); 714 mask |= (1 << 10);
682 checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); 715 wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);
683 } 716 }
684 } 717 }
685 718
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 46674fbb62ba..c97bb7b5a9f8 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -55,8 +55,8 @@ static void __init check_fpu(void)
55 55
56 if (!boot_cpu_data.hard_math) { 56 if (!boot_cpu_data.hard_math) {
57#ifndef CONFIG_MATH_EMULATION 57#ifndef CONFIG_MATH_EMULATION
58 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); 58 pr_emerg("No coprocessor found and no math emulation present\n");
59 printk(KERN_EMERG "Giving up.\n"); 59 pr_emerg("Giving up\n");
60 for (;;) ; 60 for (;;) ;
61#endif 61#endif
62 return; 62 return;
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
86 86
87 boot_cpu_data.fdiv_bug = fdiv_bug; 87 boot_cpu_data.fdiv_bug = fdiv_bug;
88 if (boot_cpu_data.fdiv_bug) 88 if (boot_cpu_data.fdiv_bug)
89 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); 89 pr_warn("Hmm, FPU with FDIV bug\n");
90} 90}
91 91
92static void __init check_hlt(void) 92static void __init check_hlt(void)
@@ -94,16 +94,16 @@ static void __init check_hlt(void)
94 if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) 94 if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
95 return; 95 return;
96 96
97 printk(KERN_INFO "Checking 'hlt' instruction... "); 97 pr_info("Checking 'hlt' instruction... ");
98 if (!boot_cpu_data.hlt_works_ok) { 98 if (!boot_cpu_data.hlt_works_ok) {
99 printk("disabled\n"); 99 pr_cont("disabled\n");
100 return; 100 return;
101 } 101 }
102 halt(); 102 halt();
103 halt(); 103 halt();
104 halt(); 104 halt();
105 halt(); 105 halt();
106 printk(KERN_CONT "OK.\n"); 106 pr_cont("OK\n");
107} 107}
108 108
109/* 109/*
@@ -116,7 +116,7 @@ static void __init check_popad(void)
116#ifndef CONFIG_X86_POPAD_OK 116#ifndef CONFIG_X86_POPAD_OK
117 int res, inp = (int) &res; 117 int res, inp = (int) &res;
118 118
119 printk(KERN_INFO "Checking for popad bug... "); 119 pr_info("Checking for popad bug... ");
120 __asm__ __volatile__( 120 __asm__ __volatile__(
121 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " 121 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
122 : "=&a" (res) 122 : "=&a" (res)
@@ -127,9 +127,9 @@ static void __init check_popad(void)
127 * CPU hard. Too bad. 127 * CPU hard. Too bad.
128 */ 128 */
129 if (res != 12345678) 129 if (res != 12345678)
130 printk(KERN_CONT "Buggy.\n"); 130 pr_cont("Buggy\n");
131 else 131 else
132 printk(KERN_CONT "OK.\n"); 132 pr_cont("OK\n");
133#endif 133#endif
134} 134}
135 135
@@ -161,7 +161,7 @@ void __init check_bugs(void)
161{ 161{
162 identify_boot_cpu(); 162 identify_boot_cpu();
163#ifndef CONFIG_SMP 163#ifndef CONFIG_SMP
164 printk(KERN_INFO "CPU: "); 164 pr_info("CPU: ");
165 print_cpu_info(&boot_cpu_data); 165 print_cpu_info(&boot_cpu_data);
166#endif 166#endif
167 check_config(); 167 check_config();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6b9333b429ba..46d8786d655e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -452,6 +452,35 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
452 c->x86_cache_size = l2size; 452 c->x86_cache_size = l2size;
453} 453}
454 454
455u16 __read_mostly tlb_lli_4k[NR_INFO];
456u16 __read_mostly tlb_lli_2m[NR_INFO];
457u16 __read_mostly tlb_lli_4m[NR_INFO];
458u16 __read_mostly tlb_lld_4k[NR_INFO];
459u16 __read_mostly tlb_lld_2m[NR_INFO];
460u16 __read_mostly tlb_lld_4m[NR_INFO];
461
462/*
463 * tlb_flushall_shift is the balance point for replacing a cr3 write
464 * with multiple 'invlpg' instructions. The replacement is done when
465 * flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
466 * A tlb_flushall_shift of -1 means the replacement is disabled.
467 */
468s8 __read_mostly tlb_flushall_shift = -1;
469
470void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)
471{
472 if (this_cpu->c_detect_tlb)
473 this_cpu->c_detect_tlb(c);
474
475 printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
476 "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
477 "tlb_flushall_shift is 0x%x\n",
478 tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
479 tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
480 tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
481 tlb_flushall_shift);
482}
483
455void __cpuinit detect_ht(struct cpuinfo_x86 *c) 484void __cpuinit detect_ht(struct cpuinfo_x86 *c)
456{ 485{
457#ifdef CONFIG_X86_HT 486#ifdef CONFIG_X86_HT
@@ -911,6 +940,8 @@ void __init identify_boot_cpu(void)
911#else 940#else
912 vgetcpu_set_mode(); 941 vgetcpu_set_mode();
913#endif 942#endif
943 if (boot_cpu_data.cpuid_level >= 2)
944 cpu_detect_tlb(&boot_cpu_data);
914} 945}
915 946
916void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 947void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -947,7 +978,7 @@ static void __cpuinit __print_cpu_msr(void)
947 index_max = msr_range_array[i].max; 978 index_max = msr_range_array[i].max;
948 979
949 for (index = index_min; index < index_max; index++) { 980 for (index = index_min; index < index_max; index++) {
950 if (rdmsrl_amd_safe(index, &val)) 981 if (rdmsrl_safe(index, &val))
951 continue; 982 continue;
952 printk(KERN_INFO " MSR%08x: %016llx\n", index, val); 983 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
953 } 984 }
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 8bacc7826fb3..4041c24ae7db 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -20,10 +20,19 @@ struct cpu_dev {
20 void (*c_bsp_init)(struct cpuinfo_x86 *); 20 void (*c_bsp_init)(struct cpuinfo_x86 *);
21 void (*c_init)(struct cpuinfo_x86 *); 21 void (*c_init)(struct cpuinfo_x86 *);
22 void (*c_identify)(struct cpuinfo_x86 *); 22 void (*c_identify)(struct cpuinfo_x86 *);
23 void (*c_detect_tlb)(struct cpuinfo_x86 *);
23 unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); 24 unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
24 int c_x86_vendor; 25 int c_x86_vendor;
25}; 26};
26 27
/*
 * One row of a TLB descriptor decode table.  A table is terminated by a row
 * whose descriptor and tlb_type are both 0.
 */
28struct _tlb_table {
/* descriptor byte this row describes */
29 unsigned char descriptor;
/* kind of TLB / page sizes covered (a TLB_* or STLB_* code); 0 = no entry */
30 char tlb_type;
/* number of TLB entries reported for this descriptor */
31 unsigned int entries;
32 /* unsigned int ways; */
/* human-readable description of the descriptor */
33 char info[128];
34};
35
27#define cpu_dev_register(cpu_devX) \ 36#define cpu_dev_register(cpu_devX) \
28 static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ 37 static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \ 38 __attribute__((__section__(".x86_cpu_dev.init"))) = \
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 755f64fb0743..a8f8fa9769d6 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -37,6 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
37#endif 37#endif
38 &x86_hyper_vmware, 38 &x86_hyper_vmware,
39 &x86_hyper_ms_hyperv, 39 &x86_hyper_ms_hyperv,
40#ifdef CONFIG_KVM_GUEST
41 &x86_hyper_kvm,
42#endif
40}; 43};
41 44
42const struct hypervisor_x86 *x86_hyper; 45const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 3e6ff6cbf42a..0a4ce2980a5a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -491,6 +491,181 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
491} 491}
492#endif 492#endif
493 493
494#define TLB_INST_4K 0x01
495#define TLB_INST_4M 0x02
496#define TLB_INST_2M_4M 0x03
497
498#define TLB_INST_ALL 0x05
499#define TLB_INST_1G 0x06
500
501#define TLB_DATA_4K 0x11
502#define TLB_DATA_4M 0x12
503#define TLB_DATA_2M_4M 0x13
504#define TLB_DATA_4K_4M 0x14
505
506#define TLB_DATA_1G 0x16
507
508#define TLB_DATA0_4K 0x21
509#define TLB_DATA0_4M 0x22
510#define TLB_DATA0_2M_4M 0x23
511
512#define STLB_4K 0x41
513
514static const struct _tlb_table intel_tlb_table[] __cpuinitconst = {
515 { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
516 { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
517 { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
518 { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
519 { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
520 { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
521 { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
522 { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
523 { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
524 { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
525 { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
526 { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
527 { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
528 { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
529 { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
530 { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
531 { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
532 { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
533 { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
534 { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
535 { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
536 { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
537 { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
538 { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
539 { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
540 { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
541 { 0x00, 0, 0 }
542};
543
544static void __cpuinit intel_tlb_lookup(const unsigned char desc)
545{
546 unsigned char k;
547 if (desc == 0)
548 return;
549
550 /* look up this descriptor in the table */
551 for (k = 0; intel_tlb_table[k].descriptor != desc && \
552 intel_tlb_table[k].descriptor != 0; k++)
553 ;
554
555 if (intel_tlb_table[k].tlb_type == 0)
556 return;
557
558 switch (intel_tlb_table[k].tlb_type) {
559 case STLB_4K:
560 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
561 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
562 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
563 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
564 break;
565 case TLB_INST_ALL:
566 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
567 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
568 if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
569 tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
570 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
571 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
572 break;
573 case TLB_INST_4K:
574 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
575 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
576 break;
577 case TLB_INST_4M:
578 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
579 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
580 break;
581 case TLB_INST_2M_4M:
582 if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
583 tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
584 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
585 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
586 break;
587 case TLB_DATA_4K:
588 case TLB_DATA0_4K:
589 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
590 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
591 break;
592 case TLB_DATA_4M:
593 case TLB_DATA0_4M:
594 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
595 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
596 break;
597 case TLB_DATA_2M_4M:
598 case TLB_DATA0_2M_4M:
599 if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
600 tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
601 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
602 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
603 break;
604 case TLB_DATA_4K_4M:
605 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
606 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
607 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
608 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
609 break;
610 }
611}
612
613static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
614{
615 if (!cpu_has_invlpg) {
616 tlb_flushall_shift = -1;
617 return;
618 }
619 switch ((c->x86 << 8) + c->x86_model) {
620 case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
621 case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
622 case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
623 case 0x61d: /* six-core 45 nm xeon "Dunnington" */
624 tlb_flushall_shift = -1;
625 break;
626 case 0x61a: /* 45 nm nehalem, "Bloomfield" */
627 case 0x61e: /* 45 nm nehalem, "Lynnfield" */
628 case 0x625: /* 32 nm nehalem, "Clarkdale" */
629 case 0x62c: /* 32 nm nehalem, "Gulftown" */
630 case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
631 case 0x62f: /* 32 nm Xeon E7 */
632 tlb_flushall_shift = 6;
633 break;
634 case 0x62a: /* SandyBridge */
635 case 0x62d: /* SandyBridge, "Romely-EP" */
636 tlb_flushall_shift = 5;
637 break;
638 case 0x63a: /* Ivybridge */
639 tlb_flushall_shift = 1;
640 break;
641 default:
642 tlb_flushall_shift = 6;
643 }
644}
645
646static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c)
647{
648 int i, j, n;
649 unsigned int regs[4];
650 unsigned char *desc = (unsigned char *)regs;
651 /* Number of times to iterate */
652 n = cpuid_eax(2) & 0xFF;
653
654 for (i = 0 ; i < n ; i++) {
655 cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
656
657 /* If bit 31 is set, this is an unknown format */
658 for (j = 0 ; j < 3 ; j++)
659 if (regs[j] & (1 << 31))
660 regs[j] = 0;
661
662 /* Byte 0 is level count, not a descriptor */
663 for (j = 1 ; j < 16 ; j++)
664 intel_tlb_lookup(desc[j]);
665 }
666 intel_tlb_flushall_shift_set(c);
667}
668
494static const struct cpu_dev __cpuinitconst intel_cpu_dev = { 669static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
495 .c_vendor = "Intel", 670 .c_vendor = "Intel",
496 .c_ident = { "GenuineIntel" }, 671 .c_ident = { "GenuineIntel" },
@@ -546,6 +721,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
546 }, 721 },
547 .c_size_cache = intel_size_cache, 722 .c_size_cache = intel_size_cache,
548#endif 723#endif
724 .c_detect_tlb = intel_detect_tlb,
549 .c_early_init = early_init_intel, 725 .c_early_init = early_init_intel,
550 .c_init = init_intel, 726 .c_init = init_intel,
551 .c_x86_vendor = X86_VENDOR_INTEL, 727 .c_x86_vendor = X86_VENDOR_INTEL,
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index aa7548799af4..5e095f873e3e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -7,6 +7,9 @@
7 * Copyright 2008 Intel Corporation 7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen 8 * Author: Andi Kleen
9 */ 9 */
10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
10#include <linux/thread_info.h> 13#include <linux/thread_info.h>
11#include <linux/capability.h> 14#include <linux/capability.h>
12#include <linux/miscdevice.h> 15#include <linux/miscdevice.h>
@@ -208,7 +211,7 @@ static void drain_mcelog_buffer(void)
208 cpu_relax(); 211 cpu_relax();
209 212
210 if (!m->finished && retries >= 4) { 213 if (!m->finished && retries >= 4) {
211 pr_err("MCE: skipping error being logged currently!\n"); 214 pr_err("skipping error being logged currently!\n");
212 break; 215 break;
213 } 216 }
214 } 217 }
@@ -1165,8 +1168,9 @@ int memory_failure(unsigned long pfn, int vector, int flags)
1165{ 1168{
1166 /* mce_severity() should not hand us an ACTION_REQUIRED error */ 1169 /* mce_severity() should not hand us an ACTION_REQUIRED error */
1167 BUG_ON(flags & MF_ACTION_REQUIRED); 1170 BUG_ON(flags & MF_ACTION_REQUIRED);
1168 printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" 1171 pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1169 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); 1172 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1173 pfn);
1170 1174
1171 return 0; 1175 return 0;
1172} 1176}
@@ -1184,6 +1188,7 @@ void mce_notify_process(void)
1184{ 1188{
1185 unsigned long pfn; 1189 unsigned long pfn;
1186 struct mce_info *mi = mce_find_info(); 1190 struct mce_info *mi = mce_find_info();
1191 int flags = MF_ACTION_REQUIRED;
1187 1192
1188 if (!mi) 1193 if (!mi)
1189 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); 1194 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
@@ -1198,8 +1203,9 @@ void mce_notify_process(void)
1198 * doomed. We still need to mark the page as poisoned and alert any 1203 * doomed. We still need to mark the page as poisoned and alert any
1199 * other users of the page. 1204 * other users of the page.
1200 */ 1205 */
1201 if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || 1206 if (!mi->restartable)
1202 mi->restartable == 0) { 1207 flags |= MF_MUST_KILL;
1208 if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
1203 pr_err("Memory error not recovered"); 1209 pr_err("Memory error not recovered");
1204 force_sig(SIGBUS, current); 1210 force_sig(SIGBUS, current);
1205 } 1211 }
@@ -1356,11 +1362,10 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
1356 1362
1357 b = cap & MCG_BANKCNT_MASK; 1363 b = cap & MCG_BANKCNT_MASK;
1358 if (!banks) 1364 if (!banks)
1359 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1365 pr_info("CPU supports %d MCE banks\n", b);
1360 1366
1361 if (b > MAX_NR_BANKS) { 1367 if (b > MAX_NR_BANKS) {
1362 printk(KERN_WARNING 1368 pr_warn("Using only %u machine check banks out of %u\n",
1363 "MCE: Using only %u machine check banks out of %u\n",
1364 MAX_NR_BANKS, b); 1369 MAX_NR_BANKS, b);
1365 b = MAX_NR_BANKS; 1370 b = MAX_NR_BANKS;
1366 } 1371 }
@@ -1417,7 +1422,7 @@ static void __mcheck_cpu_init_generic(void)
1417static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1422static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1418{ 1423{
1419 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1424 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1420 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1425 pr_info("unknown CPU type - not enabling MCE support\n");
1421 return -EOPNOTSUPP; 1426 return -EOPNOTSUPP;
1422 } 1427 }
1423 1428
@@ -1572,7 +1577,7 @@ static void __mcheck_cpu_init_timer(void)
1572/* Handle unconfigured int18 (should never happen) */ 1577/* Handle unconfigured int18 (should never happen) */
1573static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1578static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1574{ 1579{
1575 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 1580 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1576 smp_processor_id()); 1581 smp_processor_id());
1577} 1582}
1578 1583
@@ -1891,8 +1896,7 @@ static int __init mcheck_enable(char *str)
1891 get_option(&str, &monarch_timeout); 1896 get_option(&str, &monarch_timeout);
1892 } 1897 }
1893 } else { 1898 } else {
1894 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 1899 pr_info("mce argument %s ignored. Please use /sys\n", str);
1895 str);
1896 return 0; 1900 return 0;
1897 } 1901 }
1898 return 1; 1902 return 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index be5274490428..c4e916d77378 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,15 +1,17 @@
1/* 1/*
2 * (c) 2005, 2006 Advanced Micro Devices, Inc. 2 * (c) 2005-2012 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the 3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or 4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
6 * 6 *
7 * Written by Jacob Shin - AMD, Inc. 7 * Written by Jacob Shin - AMD, Inc.
8 * 8 *
9 * Support : jacob.shin@amd.com 9 * Support: borislav.petkov@amd.com
10 * 10 *
11 * April 2006 11 * April 2006
12 * - added support for AMD Family 0x10 processors 12 * - added support for AMD Family 0x10 processors
13 * May 2012
14 * - major scrubbing
13 * 15 *
14 * All MC4_MISCi registers are shared between multi-cores 16 * All MC4_MISCi registers are shared between multi-cores
15 */ 17 */
@@ -25,6 +27,7 @@
25#include <linux/cpu.h> 27#include <linux/cpu.h>
26#include <linux/smp.h> 28#include <linux/smp.h>
27 29
30#include <asm/amd_nb.h>
28#include <asm/apic.h> 31#include <asm/apic.h>
29#include <asm/idle.h> 32#include <asm/idle.h>
30#include <asm/mce.h> 33#include <asm/mce.h>
@@ -45,23 +48,15 @@
45#define MASK_BLKPTR_LO 0xFF000000 48#define MASK_BLKPTR_LO 0xFF000000
46#define MCG_XBLK_ADDR 0xC0000400 49#define MCG_XBLK_ADDR 0xC0000400
47 50
48struct threshold_block { 51static const char * const th_names[] = {
49 unsigned int block; 52 "load_store",
50 unsigned int bank; 53 "insn_fetch",
51 unsigned int cpu; 54 "combined_unit",
52 u32 address; 55 "",
53 u16 interrupt_enable; 56 "northbridge",
54 bool interrupt_capable; 57 "execution_unit",
55 u16 threshold_limit;
56 struct kobject kobj;
57 struct list_head miscj;
58}; 58};
59 59
60struct threshold_bank {
61 struct kobject *kobj;
62 struct threshold_block *blocks;
63 cpumask_var_t cpus;
64};
65static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); 60static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
66 61
67static unsigned char shared_bank[NR_BANKS] = { 62static unsigned char shared_bank[NR_BANKS] = {
@@ -84,6 +79,26 @@ struct thresh_restart {
84 u16 old_limit; 79 u16 old_limit;
85}; 80};
86 81
82static const char * const bank4_names(struct threshold_block *b)
83{
84 switch (b->address) {
85 /* MSR4_MISC0 */
86 case 0x00000413:
87 return "dram";
88
89 case 0xc0000408:
90 return "ht_links";
91
92 case 0xc0000409:
93 return "l3_cache";
94
95 default:
96 WARN(1, "Funny MSR: 0x%08x\n", b->address);
97 return "";
98 }
99};
100
101
87static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) 102static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
88{ 103{
89 /* 104 /*
@@ -224,8 +239,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
224 239
225 if (!block) 240 if (!block)
226 per_cpu(bank_map, cpu) |= (1 << bank); 241 per_cpu(bank_map, cpu) |= (1 << bank);
227 if (shared_bank[bank] && c->cpu_core_id)
228 break;
229 242
230 memset(&b, 0, sizeof(b)); 243 memset(&b, 0, sizeof(b));
231 b.cpu = cpu; 244 b.cpu = cpu;
@@ -326,7 +339,7 @@ struct threshold_attr {
326#define SHOW_FIELDS(name) \ 339#define SHOW_FIELDS(name) \
327static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ 340static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
328{ \ 341{ \
329 return sprintf(buf, "%lx\n", (unsigned long) b->name); \ 342 return sprintf(buf, "%lu\n", (unsigned long) b->name); \
330} 343}
331SHOW_FIELDS(interrupt_enable) 344SHOW_FIELDS(interrupt_enable)
332SHOW_FIELDS(threshold_limit) 345SHOW_FIELDS(threshold_limit)
@@ -377,38 +390,21 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
377 return size; 390 return size;
378} 391}
379 392
380struct threshold_block_cross_cpu {
381 struct threshold_block *tb;
382 long retval;
383};
384
385static void local_error_count_handler(void *_tbcc)
386{
387 struct threshold_block_cross_cpu *tbcc = _tbcc;
388 struct threshold_block *b = tbcc->tb;
389 u32 low, high;
390
391 rdmsr(b->address, low, high);
392 tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
393}
394
395static ssize_t show_error_count(struct threshold_block *b, char *buf) 393static ssize_t show_error_count(struct threshold_block *b, char *buf)
396{ 394{
397 struct threshold_block_cross_cpu tbcc = { .tb = b, }; 395 u32 lo, hi;
398 396
399 smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); 397 rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
400 return sprintf(buf, "%lx\n", tbcc.retval);
401}
402 398
403static ssize_t store_error_count(struct threshold_block *b, 399 return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
404 const char *buf, size_t count) 400 (THRESHOLD_MAX - b->threshold_limit)));
405{
406 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
407
408 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
409 return 1;
410} 401}
411 402
403static struct threshold_attr error_count = {
404 .attr = {.name = __stringify(error_count), .mode = 0444 },
405 .show = show_error_count,
406};
407
412#define RW_ATTR(val) \ 408#define RW_ATTR(val) \
413static struct threshold_attr val = { \ 409static struct threshold_attr val = { \
414 .attr = {.name = __stringify(val), .mode = 0644 }, \ 410 .attr = {.name = __stringify(val), .mode = 0644 }, \
@@ -418,7 +414,6 @@ static struct threshold_attr val = { \
418 414
419RW_ATTR(interrupt_enable); 415RW_ATTR(interrupt_enable);
420RW_ATTR(threshold_limit); 416RW_ATTR(threshold_limit);
421RW_ATTR(error_count);
422 417
423static struct attribute *default_attrs[] = { 418static struct attribute *default_attrs[] = {
424 &threshold_limit.attr, 419 &threshold_limit.attr,
@@ -517,7 +512,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
517 512
518 err = kobject_init_and_add(&b->kobj, &threshold_ktype, 513 err = kobject_init_and_add(&b->kobj, &threshold_ktype,
519 per_cpu(threshold_banks, cpu)[bank]->kobj, 514 per_cpu(threshold_banks, cpu)[bank]->kobj,
520 "misc%i", block); 515 (bank == 4 ? bank4_names(b) : th_names[bank]));
521 if (err) 516 if (err)
522 goto out_free; 517 goto out_free;
523recurse: 518recurse:
@@ -548,98 +543,91 @@ out_free:
548 return err; 543 return err;
549} 544}
550 545
551static __cpuinit long 546static __cpuinit int __threshold_add_blocks(struct threshold_bank *b)
552local_allocate_threshold_blocks(int cpu, unsigned int bank)
553{ 547{
554 return allocate_threshold_blocks(cpu, bank, 0, 548 struct list_head *head = &b->blocks->miscj;
555 MSR_IA32_MC0_MISC + bank * 4); 549 struct threshold_block *pos = NULL;
550 struct threshold_block *tmp = NULL;
551 int err = 0;
552
553 err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
554 if (err)
555 return err;
556
557 list_for_each_entry_safe(pos, tmp, head, miscj) {
558
559 err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
560 if (err) {
561 list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
562 kobject_del(&pos->kobj);
563
564 return err;
565 }
566 }
567 return err;
556} 568}
557 569
558/* symlinks sibling shared banks to first core. first core owns dir/files. */
559static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) 570static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
560{ 571{
561 int i, err = 0;
562 struct threshold_bank *b = NULL;
563 struct device *dev = per_cpu(mce_device, cpu); 572 struct device *dev = per_cpu(mce_device, cpu);
564 char name[32]; 573 struct amd_northbridge *nb = NULL;
565 574 struct threshold_bank *b = NULL;
566 sprintf(name, "threshold_bank%i", bank); 575 const char *name = th_names[bank];
576 int err = 0;
567 577
568#ifdef CONFIG_SMP 578 if (shared_bank[bank]) {
569 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
570 i = cpumask_first(cpu_llc_shared_mask(cpu));
571 579
572 /* first core not up yet */ 580 nb = node_to_amd_nb(amd_get_nb_id(cpu));
573 if (cpu_data(i).cpu_core_id) 581 WARN_ON(!nb);
574 goto out;
575 582
576 /* already linked */ 583 /* threshold descriptor already initialized on this node? */
577 if (per_cpu(threshold_banks, cpu)[bank]) 584 if (nb->bank4) {
578 goto out; 585 /* yes, use it */
586 b = nb->bank4;
587 err = kobject_add(b->kobj, &dev->kobj, name);
588 if (err)
589 goto out;
579 590
580 b = per_cpu(threshold_banks, i)[bank]; 591 per_cpu(threshold_banks, cpu)[bank] = b;
592 atomic_inc(&b->cpus);
581 593
582 if (!b) 594 err = __threshold_add_blocks(b);
583 goto out;
584 595
585 err = sysfs_create_link(&dev->kobj, b->kobj, name);
586 if (err)
587 goto out; 596 goto out;
588 597 }
589 cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
590 per_cpu(threshold_banks, cpu)[bank] = b;
591
592 goto out;
593 } 598 }
594#endif
595 599
596 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); 600 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
597 if (!b) { 601 if (!b) {
598 err = -ENOMEM; 602 err = -ENOMEM;
599 goto out; 603 goto out;
600 } 604 }
601 if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
602 kfree(b);
603 err = -ENOMEM;
604 goto out;
605 }
606 605
607 b->kobj = kobject_create_and_add(name, &dev->kobj); 606 b->kobj = kobject_create_and_add(name, &dev->kobj);
608 if (!b->kobj) 607 if (!b->kobj) {
608 err = -EINVAL;
609 goto out_free; 609 goto out_free;
610 610 }
611#ifndef CONFIG_SMP
612 cpumask_setall(b->cpus);
613#else
614 cpumask_set_cpu(cpu, b->cpus);
615#endif
616 611
617 per_cpu(threshold_banks, cpu)[bank] = b; 612 per_cpu(threshold_banks, cpu)[bank] = b;
618 613
619 err = local_allocate_threshold_blocks(cpu, bank); 614 if (shared_bank[bank]) {
620 if (err) 615 atomic_set(&b->cpus, 1);
621 goto out_free;
622
623 for_each_cpu(i, b->cpus) {
624 if (i == cpu)
625 continue;
626 616
627 dev = per_cpu(mce_device, i); 617 /* nb is already initialized, see above */
628 if (dev) 618 WARN_ON(nb->bank4);
629 err = sysfs_create_link(&dev->kobj,b->kobj, name); 619 nb->bank4 = b;
630 if (err)
631 goto out;
632
633 per_cpu(threshold_banks, i)[bank] = b;
634 } 620 }
635 621
636 goto out; 622 err = allocate_threshold_blocks(cpu, bank, 0,
623 MSR_IA32_MC0_MISC + bank * 4);
624 if (!err)
625 goto out;
637 626
638out_free: 627 out_free:
639 per_cpu(threshold_banks, cpu)[bank] = NULL;
640 free_cpumask_var(b->cpus);
641 kfree(b); 628 kfree(b);
642out: 629
630 out:
643 return err; 631 return err;
644} 632}
645 633
@@ -660,12 +648,6 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
660 return err; 648 return err;
661} 649}
662 650
663/*
664 * let's be hotplug friendly.
665 * in case of multiple core processors, the first core always takes ownership
666 * of shared sysfs dir/files, and rest of the cores will be symlinked to it.
667 */
668
669static void deallocate_threshold_block(unsigned int cpu, 651static void deallocate_threshold_block(unsigned int cpu,
670 unsigned int bank) 652 unsigned int bank)
671{ 653{
@@ -686,41 +668,42 @@ static void deallocate_threshold_block(unsigned int cpu,
686 per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; 668 per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
687} 669}
688 670
671static void __threshold_remove_blocks(struct threshold_bank *b)
672{
673 struct threshold_block *pos = NULL;
674 struct threshold_block *tmp = NULL;
675
676 kobject_del(b->kobj);
677
678 list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
679 kobject_del(&pos->kobj);
680}
681
689static void threshold_remove_bank(unsigned int cpu, int bank) 682static void threshold_remove_bank(unsigned int cpu, int bank)
690{ 683{
684 struct amd_northbridge *nb;
691 struct threshold_bank *b; 685 struct threshold_bank *b;
692 struct device *dev;
693 char name[32];
694 int i = 0;
695 686
696 b = per_cpu(threshold_banks, cpu)[bank]; 687 b = per_cpu(threshold_banks, cpu)[bank];
697 if (!b) 688 if (!b)
698 return; 689 return;
690
699 if (!b->blocks) 691 if (!b->blocks)
700 goto free_out; 692 goto free_out;
701 693
702 sprintf(name, "threshold_bank%i", bank); 694 if (shared_bank[bank]) {
703 695 if (!atomic_dec_and_test(&b->cpus)) {
704#ifdef CONFIG_SMP 696 __threshold_remove_blocks(b);
705 /* sibling symlink */ 697 per_cpu(threshold_banks, cpu)[bank] = NULL;
706 if (shared_bank[bank] && b->blocks->cpu != cpu) { 698 return;
707 dev = per_cpu(mce_device, cpu); 699 } else {
708 sysfs_remove_link(&dev->kobj, name); 700 /*
709 per_cpu(threshold_banks, cpu)[bank] = NULL; 701 * the last CPU on this node using the shared bank is
710 702 * going away, remove that bank now.
711 return; 703 */
712 } 704 nb = node_to_amd_nb(amd_get_nb_id(cpu));
713#endif 705 nb->bank4 = NULL;
714 706 }
715 /* remove all sibling symlinks before unregistering */
716 for_each_cpu(i, b->cpus) {
717 if (i == cpu)
718 continue;
719
720 dev = per_cpu(mce_device, i);
721 if (dev)
722 sysfs_remove_link(&dev->kobj, name);
723 per_cpu(threshold_banks, i)[bank] = NULL;
724 } 707 }
725 708
726 deallocate_threshold_block(cpu, bank); 709 deallocate_threshold_block(cpu, bank);
@@ -728,7 +711,6 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
728free_out: 711free_out:
729 kobject_del(b->kobj); 712 kobject_del(b->kobj);
730 kobject_put(b->kobj); 713 kobject_put(b->kobj);
731 free_cpumask_var(b->cpus);
732 kfree(b); 714 kfree(b);
733 per_cpu(threshold_banks, cpu)[bank] = NULL; 715 per_cpu(threshold_banks, cpu)[bank] = NULL;
734} 716}
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
index dfea390e1608..c7b3fe2d72e0 100644
--- a/arch/x86/kernel/cpu/mkcapflags.pl
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -1,4 +1,4 @@
1#!/usr/bin/perl 1#!/usr/bin/perl -w
2# 2#
3# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h 3# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
4# 4#
@@ -11,22 +11,35 @@ open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
11print OUT "#include <asm/cpufeature.h>\n\n"; 11print OUT "#include <asm/cpufeature.h>\n\n";
12print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; 12print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
13 13
14%features = ();
15$err = 0;
16
14while (defined($line = <IN>)) { 17while (defined($line = <IN>)) {
15 if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { 18 if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
16 $macro = $1; 19 $macro = $1;
17 $feature = $2; 20 $feature = "\L$2";
18 $tail = $3; 21 $tail = $3;
19 if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { 22 if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
20 $feature = $1; 23 $feature = "\L$1";
21 } 24 }
22 25
23 if ($feature ne '') { 26 next if ($feature eq '');
24 printf OUT "\t%-32s = \"%s\",\n", 27
25 "[$macro]", "\L$feature"; 28 if ($features{$feature}++) {
29 print STDERR "$in: duplicate feature name: $feature\n";
30 $err++;
26 } 31 }
32 printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature;
27 } 33 }
28} 34}
29print OUT "};\n"; 35print OUT "};\n";
30 36
31close(IN); 37close(IN);
32close(OUT); 38close(OUT);
39
40if ($err) {
41 unlink($out);
42 exit(1);
43}
44
45exit(0);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index bdda2e6c673b..35ffda5d0727 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -258,11 +258,11 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
258 258
259 /* Compute the maximum size with which we can make a range: */ 259 /* Compute the maximum size with which we can make a range: */
260 if (range_startk) 260 if (range_startk)
261 max_align = ffs(range_startk) - 1; 261 max_align = __ffs(range_startk);
262 else 262 else
263 max_align = 32; 263 max_align = BITS_PER_LONG - 1;
264 264
265 align = fls(range_sizek) - 1; 265 align = __fls(range_sizek);
266 if (align > max_align) 266 if (align > max_align)
267 align = max_align; 267 align = max_align;
268 268
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 75772ae6c65f..e9fe907cd249 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -361,11 +361,7 @@ static void __init print_mtrr_state(void)
361 } 361 }
362 pr_debug("MTRR variable ranges %sabled:\n", 362 pr_debug("MTRR variable ranges %sabled:\n",
363 mtrr_state.enabled & 2 ? "en" : "dis"); 363 mtrr_state.enabled & 2 ? "en" : "dis");
364 if (size_or_mask & 0xffffffffUL) 364 high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
365 high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
366 else
367 high_width = ffs(size_or_mask>>32) + 32 - 1;
368 high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
369 365
370 for (i = 0; i < num_var_ranges; ++i) { 366 for (i = 0; i < num_var_ranges; ++i) {
371 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) 367 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c4706cf9c011..29557aa06dda 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -35,17 +35,6 @@
35 35
36#include "perf_event.h" 36#include "perf_event.h"
37 37
38#if 0
39#undef wrmsrl
40#define wrmsrl(msr, val) \
41do { \
42 trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
43 (unsigned long)(val)); \
44 native_write_msr((msr), (u32)((u64)(val)), \
45 (u32)((u64)(val) >> 32)); \
46} while (0)
47#endif
48
49struct x86_pmu x86_pmu __read_mostly; 38struct x86_pmu x86_pmu __read_mostly;
50 39
51DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 40DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -74,7 +63,7 @@ u64 x86_perf_event_update(struct perf_event *event)
74 int idx = hwc->idx; 63 int idx = hwc->idx;
75 s64 delta; 64 s64 delta;
76 65
77 if (idx == X86_PMC_IDX_FIXED_BTS) 66 if (idx == INTEL_PMC_IDX_FIXED_BTS)
78 return 0; 67 return 0;
79 68
80 /* 69 /*
@@ -86,7 +75,7 @@ u64 x86_perf_event_update(struct perf_event *event)
86 */ 75 */
87again: 76again:
88 prev_raw_count = local64_read(&hwc->prev_count); 77 prev_raw_count = local64_read(&hwc->prev_count);
89 rdmsrl(hwc->event_base, new_raw_count); 78 rdpmcl(hwc->event_base_rdpmc, new_raw_count);
90 79
91 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 80 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
92 new_raw_count) != prev_raw_count) 81 new_raw_count) != prev_raw_count)
@@ -189,7 +178,7 @@ static void release_pmc_hardware(void) {}
189 178
190static bool check_hw_exists(void) 179static bool check_hw_exists(void)
191{ 180{
192 u64 val, val_new = 0; 181 u64 val, val_new = ~0;
193 int i, reg, ret = 0; 182 int i, reg, ret = 0;
194 183
195 /* 184 /*
@@ -222,8 +211,9 @@ static bool check_hw_exists(void)
222 * that don't trap on the MSR access and always return 0s. 211 * that don't trap on the MSR access and always return 0s.
223 */ 212 */
224 val = 0xabcdUL; 213 val = 0xabcdUL;
225 ret = checking_wrmsrl(x86_pmu_event_addr(0), val); 214 reg = x86_pmu_event_addr(0);
226 ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); 215 ret = wrmsrl_safe(reg, val);
216 ret |= rdmsrl_safe(reg, &val_new);
227 if (ret || val != val_new) 217 if (ret || val != val_new)
228 goto msr_fail; 218 goto msr_fail;
229 219
@@ -240,6 +230,7 @@ bios_fail:
240 230
241msr_fail: 231msr_fail:
242 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); 232 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
233 printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new);
243 234
244 return false; 235 return false;
245} 236}
@@ -388,7 +379,7 @@ int x86_pmu_hw_config(struct perf_event *event)
388 int precise = 0; 379 int precise = 0;
389 380
390 /* Support for constant skid */ 381 /* Support for constant skid */
391 if (x86_pmu.pebs_active) { 382 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
392 precise++; 383 precise++;
393 384
394 /* Support for IP fixup */ 385 /* Support for IP fixup */
@@ -637,8 +628,8 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
637 c = sched->constraints[sched->state.event]; 628 c = sched->constraints[sched->state.event];
638 629
639 /* Prefer fixed purpose counters */ 630 /* Prefer fixed purpose counters */
640 if (x86_pmu.num_counters_fixed) { 631 if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
641 idx = X86_PMC_IDX_FIXED; 632 idx = INTEL_PMC_IDX_FIXED;
642 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { 633 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
643 if (!__test_and_set_bit(idx, sched->state.used)) 634 if (!__test_and_set_bit(idx, sched->state.used))
644 goto done; 635 goto done;
@@ -646,7 +637,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
646 } 637 }
647 /* Grab the first unused counter starting with idx */ 638 /* Grab the first unused counter starting with idx */
648 idx = sched->state.counter; 639 idx = sched->state.counter;
649 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { 640 for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
650 if (!__test_and_set_bit(idx, sched->state.used)) 641 if (!__test_and_set_bit(idx, sched->state.used))
651 goto done; 642 goto done;
652 } 643 }
@@ -704,8 +695,8 @@ static bool perf_sched_next_event(struct perf_sched *sched)
704/* 695/*
705 * Assign a counter for each event. 696 * Assign a counter for each event.
706 */ 697 */
707static int perf_assign_events(struct event_constraint **constraints, int n, 698int perf_assign_events(struct event_constraint **constraints, int n,
708 int wmin, int wmax, int *assign) 699 int wmin, int wmax, int *assign)
709{ 700{
710 struct perf_sched sched; 701 struct perf_sched sched;
711 702
@@ -824,15 +815,17 @@ static inline void x86_assign_hw_event(struct perf_event *event,
824 hwc->last_cpu = smp_processor_id(); 815 hwc->last_cpu = smp_processor_id();
825 hwc->last_tag = ++cpuc->tags[i]; 816 hwc->last_tag = ++cpuc->tags[i];
826 817
827 if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { 818 if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
828 hwc->config_base = 0; 819 hwc->config_base = 0;
829 hwc->event_base = 0; 820 hwc->event_base = 0;
830 } else if (hwc->idx >= X86_PMC_IDX_FIXED) { 821 } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
831 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 822 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
832 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); 823 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
824 hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
833 } else { 825 } else {
834 hwc->config_base = x86_pmu_config_addr(hwc->idx); 826 hwc->config_base = x86_pmu_config_addr(hwc->idx);
835 hwc->event_base = x86_pmu_event_addr(hwc->idx); 827 hwc->event_base = x86_pmu_event_addr(hwc->idx);
828 hwc->event_base_rdpmc = hwc->idx;
836 } 829 }
837} 830}
838 831
@@ -930,7 +923,7 @@ int x86_perf_event_set_period(struct perf_event *event)
930 s64 period = hwc->sample_period; 923 s64 period = hwc->sample_period;
931 int ret = 0, idx = hwc->idx; 924 int ret = 0, idx = hwc->idx;
932 925
933 if (idx == X86_PMC_IDX_FIXED_BTS) 926 if (idx == INTEL_PMC_IDX_FIXED_BTS)
934 return 0; 927 return 0;
935 928
936 /* 929 /*
@@ -1316,7 +1309,6 @@ static struct attribute_group x86_pmu_format_group = {
1316static int __init init_hw_perf_events(void) 1309static int __init init_hw_perf_events(void)
1317{ 1310{
1318 struct x86_pmu_quirk *quirk; 1311 struct x86_pmu_quirk *quirk;
1319 struct event_constraint *c;
1320 int err; 1312 int err;
1321 1313
1322 pr_info("Performance Events: "); 1314 pr_info("Performance Events: ");
@@ -1347,21 +1339,8 @@ static int __init init_hw_perf_events(void)
1347 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) 1339 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
1348 quirk->func(); 1340 quirk->func();
1349 1341
1350 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 1342 if (!x86_pmu.intel_ctrl)
1351 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1343 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1352 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1353 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1354 }
1355 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1356
1357 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1358 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1359 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1360 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1361 }
1362
1363 x86_pmu.intel_ctrl |=
1364 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1365 1344
1366 perf_events_lapic_init(); 1345 perf_events_lapic_init();
1367 register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); 1346 register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
@@ -1370,22 +1349,6 @@ static int __init init_hw_perf_events(void)
1370 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 1349 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1371 0, x86_pmu.num_counters, 0); 1350 0, x86_pmu.num_counters, 0);
1372 1351
1373 if (x86_pmu.event_constraints) {
1374 /*
1375 * event on fixed counter2 (REF_CYCLES) only works on this
1376 * counter, so do not extend mask to generic counters
1377 */
1378 for_each_event_constraint(c, x86_pmu.event_constraints) {
1379 if (c->cmask != X86_RAW_EVENT_MASK
1380 || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
1381 continue;
1382 }
1383
1384 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1385 c->weight += x86_pmu.num_counters;
1386 }
1387 }
1388
1389 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ 1352 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
1390 x86_pmu_format_group.attrs = x86_pmu.format_attrs; 1353 x86_pmu_format_group.attrs = x86_pmu.format_attrs;
1391 1354
@@ -1620,8 +1583,8 @@ static int x86_pmu_event_idx(struct perf_event *event)
1620 if (!x86_pmu.attr_rdpmc) 1583 if (!x86_pmu.attr_rdpmc)
1621 return 0; 1584 return 0;
1622 1585
1623 if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { 1586 if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
1624 idx -= X86_PMC_IDX_FIXED; 1587 idx -= INTEL_PMC_IDX_FIXED;
1625 idx |= 1 << 30; 1588 idx |= 1 << 30;
1626 } 1589 }
1627 1590
@@ -1649,7 +1612,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
1649 struct device_attribute *attr, 1612 struct device_attribute *attr,
1650 const char *buf, size_t count) 1613 const char *buf, size_t count)
1651{ 1614{
1652 unsigned long val = simple_strtoul(buf, NULL, 0); 1615 unsigned long val;
1616 ssize_t ret;
1617
1618 ret = kstrtoul(buf, 0, &val);
1619 if (ret)
1620 return ret;
1653 1621
1654 if (!!val != !!x86_pmu.attr_rdpmc) { 1622 if (!!val != !!x86_pmu.attr_rdpmc) {
1655 x86_pmu.attr_rdpmc = !!val; 1623 x86_pmu.attr_rdpmc = !!val;
@@ -1682,13 +1650,20 @@ static void x86_pmu_flush_branch_stack(void)
1682 x86_pmu.flush_branch_stack(); 1650 x86_pmu.flush_branch_stack();
1683} 1651}
1684 1652
1653void perf_check_microcode(void)
1654{
1655 if (x86_pmu.check_microcode)
1656 x86_pmu.check_microcode();
1657}
1658EXPORT_SYMBOL_GPL(perf_check_microcode);
1659
1685static struct pmu pmu = { 1660static struct pmu pmu = {
1686 .pmu_enable = x86_pmu_enable, 1661 .pmu_enable = x86_pmu_enable,
1687 .pmu_disable = x86_pmu_disable, 1662 .pmu_disable = x86_pmu_disable,
1688 1663
1689 .attr_groups = x86_pmu_attr_groups, 1664 .attr_groups = x86_pmu_attr_groups,
1690 1665
1691 .event_init = x86_pmu_event_init, 1666 .event_init = x86_pmu_event_init,
1692 1667
1693 .add = x86_pmu_add, 1668 .add = x86_pmu_add,
1694 .del = x86_pmu_del, 1669 .del = x86_pmu_del,
@@ -1696,11 +1671,11 @@ static struct pmu pmu = {
1696 .stop = x86_pmu_stop, 1671 .stop = x86_pmu_stop,
1697 .read = x86_pmu_read, 1672 .read = x86_pmu_read,
1698 1673
1699 .start_txn = x86_pmu_start_txn, 1674 .start_txn = x86_pmu_start_txn,
1700 .cancel_txn = x86_pmu_cancel_txn, 1675 .cancel_txn = x86_pmu_cancel_txn,
1701 .commit_txn = x86_pmu_commit_txn, 1676 .commit_txn = x86_pmu_commit_txn,
1702 1677
1703 .event_idx = x86_pmu_event_idx, 1678 .event_idx = x86_pmu_event_idx,
1704 .flush_branch_stack = x86_pmu_flush_branch_stack, 1679 .flush_branch_stack = x86_pmu_flush_branch_stack,
1705}; 1680};
1706 1681
@@ -1863,7 +1838,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
1863 else 1838 else
1864 misc |= PERF_RECORD_MISC_GUEST_KERNEL; 1839 misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1865 } else { 1840 } else {
1866 if (user_mode(regs)) 1841 if (!kernel_ip(regs->ip))
1867 misc |= PERF_RECORD_MISC_USER; 1842 misc |= PERF_RECORD_MISC_USER;
1868 else 1843 else
1869 misc |= PERF_RECORD_MISC_KERNEL; 1844 misc |= PERF_RECORD_MISC_KERNEL;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 7241e2fc3c17..a15df4be151f 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -14,6 +14,18 @@
14 14
15#include <linux/perf_event.h> 15#include <linux/perf_event.h>
16 16
17#if 0
18#undef wrmsrl
19#define wrmsrl(msr, val) \
20do { \
21 unsigned int _msr = (msr); \
22 u64 _val = (val); \
23 trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \
24 (unsigned long long)(_val)); \
25 native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \
26} while (0)
27#endif
28
17/* 29/*
18 * | NHM/WSM | SNB | 30 * | NHM/WSM | SNB |
19 * register ------------------------------- 31 * register -------------------------------
@@ -57,7 +69,7 @@ struct amd_nb {
57}; 69};
58 70
59/* The maximal number of PEBS events: */ 71/* The maximal number of PEBS events: */
60#define MAX_PEBS_EVENTS 4 72#define MAX_PEBS_EVENTS 8
61 73
62/* 74/*
63 * A debug store configuration. 75 * A debug store configuration.
@@ -349,6 +361,8 @@ struct x86_pmu {
349 void (*cpu_starting)(int cpu); 361 void (*cpu_starting)(int cpu);
350 void (*cpu_dying)(int cpu); 362 void (*cpu_dying)(int cpu);
351 void (*cpu_dead)(int cpu); 363 void (*cpu_dead)(int cpu);
364
365 void (*check_microcode)(void);
352 void (*flush_branch_stack)(void); 366 void (*flush_branch_stack)(void);
353 367
354 /* 368 /*
@@ -360,12 +374,16 @@ struct x86_pmu {
360 /* 374 /*
361 * Intel DebugStore bits 375 * Intel DebugStore bits
362 */ 376 */
363 int bts, pebs; 377 int bts :1,
364 int bts_active, pebs_active; 378 bts_active :1,
379 pebs :1,
380 pebs_active :1,
381 pebs_broken :1;
365 int pebs_record_size; 382 int pebs_record_size;
366 void (*drain_pebs)(struct pt_regs *regs); 383 void (*drain_pebs)(struct pt_regs *regs);
367 struct event_constraint *pebs_constraints; 384 struct event_constraint *pebs_constraints;
368 void (*pebs_aliases)(struct perf_event *event); 385 void (*pebs_aliases)(struct perf_event *event);
386 int max_pebs_events;
369 387
370 /* 388 /*
371 * Intel LBR 389 * Intel LBR
@@ -468,6 +486,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
468 486
469void x86_pmu_enable_all(int added); 487void x86_pmu_enable_all(int added);
470 488
489int perf_assign_events(struct event_constraint **constraints, int n,
490 int wmin, int wmax, int *assign);
471int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); 491int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
472 492
473void x86_pmu_stop(struct perf_event *event, int flags); 493void x86_pmu_stop(struct perf_event *event, int flags);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 11a4eb9131d5..4528ae7b6ec4 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -366,7 +366,7 @@ static void amd_pmu_cpu_starting(int cpu)
366 366
367 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; 367 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
368 368
369 if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15) 369 if (boot_cpu_data.x86_max_cores < 2)
370 return; 370 return;
371 371
372 nb_id = amd_get_nb_id(cpu); 372 nb_id = amd_get_nb_id(cpu);
@@ -422,35 +422,6 @@ static struct attribute *amd_format_attr[] = {
422 NULL, 422 NULL,
423}; 423};
424 424
425static __initconst const struct x86_pmu amd_pmu = {
426 .name = "AMD",
427 .handle_irq = x86_pmu_handle_irq,
428 .disable_all = x86_pmu_disable_all,
429 .enable_all = x86_pmu_enable_all,
430 .enable = x86_pmu_enable_event,
431 .disable = x86_pmu_disable_event,
432 .hw_config = amd_pmu_hw_config,
433 .schedule_events = x86_schedule_events,
434 .eventsel = MSR_K7_EVNTSEL0,
435 .perfctr = MSR_K7_PERFCTR0,
436 .event_map = amd_pmu_event_map,
437 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
438 .num_counters = AMD64_NUM_COUNTERS,
439 .cntval_bits = 48,
440 .cntval_mask = (1ULL << 48) - 1,
441 .apic = 1,
442 /* use highest bit to detect overflow */
443 .max_period = (1ULL << 47) - 1,
444 .get_event_constraints = amd_get_event_constraints,
445 .put_event_constraints = amd_put_event_constraints,
446
447 .format_attrs = amd_format_attr,
448
449 .cpu_prepare = amd_pmu_cpu_prepare,
450 .cpu_starting = amd_pmu_cpu_starting,
451 .cpu_dead = amd_pmu_cpu_dead,
452};
453
454/* AMD Family 15h */ 425/* AMD Family 15h */
455 426
456#define AMD_EVENT_TYPE_MASK 0x000000F0ULL 427#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
@@ -597,8 +568,8 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
597 } 568 }
598} 569}
599 570
600static __initconst const struct x86_pmu amd_pmu_f15h = { 571static __initconst const struct x86_pmu amd_pmu = {
601 .name = "AMD Family 15h", 572 .name = "AMD",
602 .handle_irq = x86_pmu_handle_irq, 573 .handle_irq = x86_pmu_handle_irq,
603 .disable_all = x86_pmu_disable_all, 574 .disable_all = x86_pmu_disable_all,
604 .enable_all = x86_pmu_enable_all, 575 .enable_all = x86_pmu_enable_all,
@@ -606,50 +577,68 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
606 .disable = x86_pmu_disable_event, 577 .disable = x86_pmu_disable_event,
607 .hw_config = amd_pmu_hw_config, 578 .hw_config = amd_pmu_hw_config,
608 .schedule_events = x86_schedule_events, 579 .schedule_events = x86_schedule_events,
609 .eventsel = MSR_F15H_PERF_CTL, 580 .eventsel = MSR_K7_EVNTSEL0,
610 .perfctr = MSR_F15H_PERF_CTR, 581 .perfctr = MSR_K7_PERFCTR0,
611 .event_map = amd_pmu_event_map, 582 .event_map = amd_pmu_event_map,
612 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 583 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
613 .num_counters = AMD64_NUM_COUNTERS_F15H, 584 .num_counters = AMD64_NUM_COUNTERS,
614 .cntval_bits = 48, 585 .cntval_bits = 48,
615 .cntval_mask = (1ULL << 48) - 1, 586 .cntval_mask = (1ULL << 48) - 1,
616 .apic = 1, 587 .apic = 1,
617 /* use highest bit to detect overflow */ 588 /* use highest bit to detect overflow */
618 .max_period = (1ULL << 47) - 1, 589 .max_period = (1ULL << 47) - 1,
619 .get_event_constraints = amd_get_event_constraints_f15h, 590 .get_event_constraints = amd_get_event_constraints,
620 /* nortbridge counters not yet implemented: */
621#if 0
622 .put_event_constraints = amd_put_event_constraints, 591 .put_event_constraints = amd_put_event_constraints,
623 592
593 .format_attrs = amd_format_attr,
594
624 .cpu_prepare = amd_pmu_cpu_prepare, 595 .cpu_prepare = amd_pmu_cpu_prepare,
625 .cpu_dead = amd_pmu_cpu_dead,
626#endif
627 .cpu_starting = amd_pmu_cpu_starting, 596 .cpu_starting = amd_pmu_cpu_starting,
628 .format_attrs = amd_format_attr, 597 .cpu_dead = amd_pmu_cpu_dead,
629}; 598};
630 599
600static int setup_event_constraints(void)
601{
602 if (boot_cpu_data.x86 >= 0x15)
603 x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
604 return 0;
605}
606
607static int setup_perfctr_core(void)
608{
609 if (!cpu_has_perfctr_core) {
610 WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h,
611 KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!");
612 return -ENODEV;
613 }
614
615 WARN(x86_pmu.get_event_constraints == amd_get_event_constraints,
616 KERN_ERR "hw perf events core counters need constraints handler!");
617
618 /*
619 * If core performance counter extensions exists, we must use
620 * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
621 * x86_pmu_addr_offset().
622 */
623 x86_pmu.eventsel = MSR_F15H_PERF_CTL;
624 x86_pmu.perfctr = MSR_F15H_PERF_CTR;
625 x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
626
627 printk(KERN_INFO "perf: AMD core performance counters detected\n");
628
629 return 0;
630}
631
631__init int amd_pmu_init(void) 632__init int amd_pmu_init(void)
632{ 633{
633 /* Performance-monitoring supported from K7 and later: */ 634 /* Performance-monitoring supported from K7 and later: */
634 if (boot_cpu_data.x86 < 6) 635 if (boot_cpu_data.x86 < 6)
635 return -ENODEV; 636 return -ENODEV;
636 637
637 /* 638 x86_pmu = amd_pmu;
638 * If core performance counter extensions exists, it must be 639
639 * family 15h, otherwise fail. See x86_pmu_addr_offset(). 640 setup_event_constraints();
640 */ 641 setup_perfctr_core();
641 switch (boot_cpu_data.x86) {
642 case 0x15:
643 if (!cpu_has_perfctr_core)
644 return -ENODEV;
645 x86_pmu = amd_pmu_f15h;
646 break;
647 default:
648 if (cpu_has_perfctr_core)
649 return -ENODEV;
650 x86_pmu = amd_pmu;
651 break;
652 }
653 642
654 /* Events are common for all AMDs */ 643 /* Events are common for all AMDs */
655 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 644 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 187c294bc658..7a8b9d0abcaa 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -5,6 +5,8 @@
5 * among events on a single PMU. 5 * among events on a single PMU.
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
8#include <linux/stddef.h> 10#include <linux/stddef.h>
9#include <linux/types.h> 11#include <linux/types.h>
10#include <linux/init.h> 12#include <linux/init.h>
@@ -21,14 +23,14 @@
21 */ 23 */
22static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = 24static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
23{ 25{
24 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, 26 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
25 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 27 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
26 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, 28 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
27 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, 29 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
28 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, 30 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
29 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 31 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
30 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 32 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
31 [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ 33 [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
32}; 34};
33 35
34static struct event_constraint intel_core_event_constraints[] __read_mostly = 36static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -747,7 +749,7 @@ static void intel_pmu_disable_all(void)
747 749
748 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 750 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
749 751
750 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) 752 if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
751 intel_pmu_disable_bts(); 753 intel_pmu_disable_bts();
752 754
753 intel_pmu_pebs_disable_all(); 755 intel_pmu_pebs_disable_all();
@@ -763,9 +765,9 @@ static void intel_pmu_enable_all(int added)
763 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 765 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
764 x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); 766 x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
765 767
766 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 768 if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
767 struct perf_event *event = 769 struct perf_event *event =
768 cpuc->events[X86_PMC_IDX_FIXED_BTS]; 770 cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
769 771
770 if (WARN_ON_ONCE(!event)) 772 if (WARN_ON_ONCE(!event))
771 return; 773 return;
@@ -871,7 +873,7 @@ static inline void intel_pmu_ack_status(u64 ack)
871 873
872static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) 874static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
873{ 875{
874 int idx = hwc->idx - X86_PMC_IDX_FIXED; 876 int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
875 u64 ctrl_val, mask; 877 u64 ctrl_val, mask;
876 878
877 mask = 0xfULL << (idx * 4); 879 mask = 0xfULL << (idx * 4);
@@ -886,7 +888,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
886 struct hw_perf_event *hwc = &event->hw; 888 struct hw_perf_event *hwc = &event->hw;
887 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 889 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
888 890
889 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { 891 if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
890 intel_pmu_disable_bts(); 892 intel_pmu_disable_bts();
891 intel_pmu_drain_bts_buffer(); 893 intel_pmu_drain_bts_buffer();
892 return; 894 return;
@@ -915,7 +917,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
915 917
916static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) 918static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
917{ 919{
918 int idx = hwc->idx - X86_PMC_IDX_FIXED; 920 int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
919 u64 ctrl_val, bits, mask; 921 u64 ctrl_val, bits, mask;
920 922
921 /* 923 /*
@@ -949,7 +951,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
949 struct hw_perf_event *hwc = &event->hw; 951 struct hw_perf_event *hwc = &event->hw;
950 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 952 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
951 953
952 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { 954 if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
953 if (!__this_cpu_read(cpu_hw_events.enabled)) 955 if (!__this_cpu_read(cpu_hw_events.enabled))
954 return; 956 return;
955 957
@@ -1000,14 +1002,14 @@ static void intel_pmu_reset(void)
1000 1002
1001 local_irq_save(flags); 1003 local_irq_save(flags);
1002 1004
1003 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 1005 pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
1004 1006
1005 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1007 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1006 checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); 1008 wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
1007 checking_wrmsrl(x86_pmu_event_addr(idx), 0ull); 1009 wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
1008 } 1010 }
1009 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) 1011 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
1010 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 1012 wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1011 1013
1012 if (ds) 1014 if (ds)
1013 ds->bts_index = ds->bts_buffer_base; 1015 ds->bts_index = ds->bts_buffer_base;
@@ -1707,16 +1709,61 @@ static __init void intel_clovertown_quirk(void)
1707 * But taken together it might just make sense to not enable PEBS on 1709 * But taken together it might just make sense to not enable PEBS on
1708 * these chips. 1710 * these chips.
1709 */ 1711 */
1710 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); 1712 pr_warn("PEBS disabled due to CPU errata\n");
1711 x86_pmu.pebs = 0; 1713 x86_pmu.pebs = 0;
1712 x86_pmu.pebs_constraints = NULL; 1714 x86_pmu.pebs_constraints = NULL;
1713} 1715}
1714 1716
1717static int intel_snb_pebs_broken(int cpu)
1718{
1719 u32 rev = UINT_MAX; /* default to broken for unknown models */
1720
1721 switch (cpu_data(cpu).x86_model) {
1722 case 42: /* SNB */
1723 rev = 0x28;
1724 break;
1725
1726 case 45: /* SNB-EP */
1727 switch (cpu_data(cpu).x86_mask) {
1728 case 6: rev = 0x618; break;
1729 case 7: rev = 0x70c; break;
1730 }
1731 }
1732
1733 return (cpu_data(cpu).microcode < rev);
1734}
1735
1736static void intel_snb_check_microcode(void)
1737{
1738 int pebs_broken = 0;
1739 int cpu;
1740
1741 get_online_cpus();
1742 for_each_online_cpu(cpu) {
1743 if ((pebs_broken = intel_snb_pebs_broken(cpu)))
1744 break;
1745 }
1746 put_online_cpus();
1747
1748 if (pebs_broken == x86_pmu.pebs_broken)
1749 return;
1750
1751 /*
1752 * Serialized by the microcode lock..
1753 */
1754 if (x86_pmu.pebs_broken) {
1755 pr_info("PEBS enabled due to microcode update\n");
1756 x86_pmu.pebs_broken = 0;
1757 } else {
1758 pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
1759 x86_pmu.pebs_broken = 1;
1760 }
1761}
1762
1715static __init void intel_sandybridge_quirk(void) 1763static __init void intel_sandybridge_quirk(void)
1716{ 1764{
1717 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); 1765 x86_pmu.check_microcode = intel_snb_check_microcode;
1718 x86_pmu.pebs = 0; 1766 intel_snb_check_microcode();
1719 x86_pmu.pebs_constraints = NULL;
1720} 1767}
1721 1768
1722static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { 1769static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
@@ -1736,8 +1783,8 @@ static __init void intel_arch_events_quirk(void)
1736 /* disable event that reported as not presend by cpuid */ 1783 /* disable event that reported as not presend by cpuid */
1737 for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { 1784 for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
1738 intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; 1785 intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
1739 printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", 1786 pr_warn("CPUID marked event: \'%s\' unavailable\n",
1740 intel_arch_events_map[bit].name); 1787 intel_arch_events_map[bit].name);
1741 } 1788 }
1742} 1789}
1743 1790
@@ -1756,7 +1803,7 @@ static __init void intel_nehalem_quirk(void)
1756 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; 1803 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1757 ebx.split.no_branch_misses_retired = 0; 1804 ebx.split.no_branch_misses_retired = 0;
1758 x86_pmu.events_maskl = ebx.full; 1805 x86_pmu.events_maskl = ebx.full;
1759 printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); 1806 pr_info("CPU erratum AAJ80 worked around\n");
1760 } 1807 }
1761} 1808}
1762 1809
@@ -1765,6 +1812,7 @@ __init int intel_pmu_init(void)
1765 union cpuid10_edx edx; 1812 union cpuid10_edx edx;
1766 union cpuid10_eax eax; 1813 union cpuid10_eax eax;
1767 union cpuid10_ebx ebx; 1814 union cpuid10_ebx ebx;
1815 struct event_constraint *c;
1768 unsigned int unused; 1816 unsigned int unused;
1769 int version; 1817 int version;
1770 1818
@@ -1800,6 +1848,8 @@ __init int intel_pmu_init(void)
1800 x86_pmu.events_maskl = ebx.full; 1848 x86_pmu.events_maskl = ebx.full;
1801 x86_pmu.events_mask_len = eax.split.mask_length; 1849 x86_pmu.events_mask_len = eax.split.mask_length;
1802 1850
1851 x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
1852
1803 /* 1853 /*
1804 * Quirk: v2 perfmon does not report fixed-purpose events, so 1854 * Quirk: v2 perfmon does not report fixed-purpose events, so
1805 * assume at least 3 events: 1855 * assume at least 3 events:
@@ -1951,5 +2001,37 @@ __init int intel_pmu_init(void)
1951 } 2001 }
1952 } 2002 }
1953 2003
2004 if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
2005 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
2006 x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
2007 x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
2008 }
2009 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
2010
2011 if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
2012 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
2013 x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
2014 x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
2015 }
2016
2017 x86_pmu.intel_ctrl |=
2018 ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
2019
2020 if (x86_pmu.event_constraints) {
2021 /*
2022 * event on fixed counter2 (REF_CYCLES) only works on this
2023 * counter, so do not extend mask to generic counters
2024 */
2025 for_each_event_constraint(c, x86_pmu.event_constraints) {
2026 if (c->cmask != X86_RAW_EVENT_MASK
2027 || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) {
2028 continue;
2029 }
2030
2031 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
2032 c->weight += x86_pmu.num_counters;
2033 }
2034 }
2035
1954 return 0; 2036 return 0;
1955} 2037}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 35e2192df9f4..629ae0b7ad90 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -248,7 +248,7 @@ void reserve_ds_buffers(void)
248 */ 248 */
249 249
250struct event_constraint bts_constraint = 250struct event_constraint bts_constraint =
251 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); 251 EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
252 252
253void intel_pmu_enable_bts(u64 config) 253void intel_pmu_enable_bts(u64 config)
254{ 254{
@@ -295,7 +295,7 @@ int intel_pmu_drain_bts_buffer(void)
295 u64 to; 295 u64 to;
296 u64 flags; 296 u64 flags;
297 }; 297 };
298 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; 298 struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
299 struct bts_record *at, *top; 299 struct bts_record *at, *top;
300 struct perf_output_handle handle; 300 struct perf_output_handle handle;
301 struct perf_event_header header; 301 struct perf_event_header header;
@@ -620,7 +620,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
620 * Should not happen, we program the threshold at 1 and do not 620 * Should not happen, we program the threshold at 1 and do not
621 * set a reset value. 621 * set a reset value.
622 */ 622 */
623 WARN_ON_ONCE(n > 1); 623 WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
624 at += n - 1; 624 at += n - 1;
625 625
626 __intel_pmu_pebs_event(event, iregs, at); 626 __intel_pmu_pebs_event(event, iregs, at);
@@ -651,10 +651,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
651 * Should not happen, we program the threshold at 1 and do not 651 * Should not happen, we program the threshold at 1 and do not
652 * set a reset value. 652 * set a reset value.
653 */ 653 */
654 WARN_ON_ONCE(n > MAX_PEBS_EVENTS); 654 WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs records %d\n", n);
655 655
656 for ( ; at < top; at++) { 656 for ( ; at < top; at++) {
657 for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) { 657 for_each_set_bit(bit, (unsigned long *)&at->status, x86_pmu.max_pebs_events) {
658 event = cpuc->events[bit]; 658 event = cpuc->events[bit];
659 if (!test_bit(bit, cpuc->active_mask)) 659 if (!test_bit(bit, cpuc->active_mask))
660 continue; 660 continue;
@@ -670,7 +670,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
670 break; 670 break;
671 } 671 }
672 672
673 if (!event || bit >= MAX_PEBS_EVENTS) 673 if (!event || bit >= x86_pmu.max_pebs_events)
674 continue; 674 continue;
675 675
676 __intel_pmu_pebs_event(event, iregs, at); 676 __intel_pmu_pebs_event(event, iregs, at);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
new file mode 100644
index 000000000000..19faffc60886
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -0,0 +1,1850 @@
1#include "perf_event_intel_uncore.h"
2
/* NULL-only list used as the initial value for the uncore type tables below */
3static struct intel_uncore_type *empty_uncore[] = { NULL, };
/* NULL-terminated lists of uncore types; both start out pointing at the empty list */
4static struct intel_uncore_type **msr_uncores = empty_uncore;
5static struct intel_uncore_type **pci_uncores = empty_uncore;
6/* pci bus to socket mapping */
/* indexed by PCI bus number; -1 means no mapping has been recorded for that bus */
7static int pcibus_to_physid[256] = { [0 ... 255] = -1, };
8
9static DEFINE_RAW_SPINLOCK(uncore_box_lock);
10
11/* mask of cpus that collect uncore events */
12static cpumask_t uncore_cpu_mask;
13
14/* constraint for the fixed counter */
15static struct event_constraint constraint_fixed =
16 EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
/* constraint with an empty counter mask, returned when an event cannot be scheduled */
17static struct event_constraint constraint_empty =
18 EVENT_CONSTRAINT(0, 0, 0);
19
/*
 * Format attribute definitions.  Each entry names a field (second
 * argument) and the config/config1 bit range it occupies (third
 * argument); the first argument is the suffix of the resulting
 * format_attr_<name> object referenced by the per-box attribute arrays.
 */
20DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
21DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
22DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
23DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
24DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
25DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
26DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
27DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
28DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
29DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
30DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
/*
 * NOTE(review): the occ_edge range 14-51 overlaps occ_sel (14-15) and
 * the other fields above, which looks unintended for a single "edge"
 * flag -- confirm the intended bit against the PCU control register
 * layout before relying on it.
 */
31DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
/* Filter fields live in config1 rather than config. */
32DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
33DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
34DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
35DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
36DEFINE_UNCORE_FORMAT_ATTR(filter_brand0, filter_brand0, "config1:0-7");
37DEFINE_UNCORE_FORMAT_ATTR(filter_brand1, filter_brand1, "config1:8-15");
38DEFINE_UNCORE_FORMAT_ATTR(filter_brand2, filter_brand2, "config1:16-23");
39DEFINE_UNCORE_FORMAT_ATTR(filter_brand3, filter_brand3, "config1:24-31");
40
41/* Sandy Bridge-EP uncore support */
42static struct intel_uncore_type snbep_uncore_cbox;
43static struct intel_uncore_type snbep_uncore_pcu;
44
45static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
46{
47 struct pci_dev *pdev = box->pci_dev;
48 int box_ctl = uncore_pci_box_ctl(box);
49 u32 config;
50
51 pci_read_config_dword(pdev, box_ctl, &config);
52 config |= SNBEP_PMON_BOX_CTL_FRZ;
53 pci_write_config_dword(pdev, box_ctl, config);
54}
55
56static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
57{
58 struct pci_dev *pdev = box->pci_dev;
59 int box_ctl = uncore_pci_box_ctl(box);
60 u32 config;
61
62 pci_read_config_dword(pdev, box_ctl, &config);
63 config &= ~SNBEP_PMON_BOX_CTL_FRZ;
64 pci_write_config_dword(pdev, box_ctl, config);
65}
66
67static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box,
68 struct perf_event *event)
69{
70 struct pci_dev *pdev = box->pci_dev;
71 struct hw_perf_event *hwc = &event->hw;
72
73 pci_write_config_dword(pdev, hwc->config_base, hwc->config |
74 SNBEP_PMON_CTL_EN);
75}
76
77static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box,
78 struct perf_event *event)
79{
80 struct pci_dev *pdev = box->pci_dev;
81 struct hw_perf_event *hwc = &event->hw;
82
83 pci_write_config_dword(pdev, hwc->config_base, hwc->config);
84}
85
86static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box,
87 struct perf_event *event)
88{
89 struct pci_dev *pdev = box->pci_dev;
90 struct hw_perf_event *hwc = &event->hw;
91 u64 count;
92
93 pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
94 pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
95 return count;
96}
97
98static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
99{
100 struct pci_dev *pdev = box->pci_dev;
101 pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL,
102 SNBEP_PMON_BOX_CTL_INT);
103}
104
105static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
106{
107 u64 config;
108 unsigned msr;
109
110 msr = uncore_msr_box_ctl(box);
111 if (msr) {
112 rdmsrl(msr, config);
113 config |= SNBEP_PMON_BOX_CTL_FRZ;
114 wrmsrl(msr, config);
115 return;
116 }
117}
118
119static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
120{
121 u64 config;
122 unsigned msr;
123
124 msr = uncore_msr_box_ctl(box);
125 if (msr) {
126 rdmsrl(msr, config);
127 config &= ~SNBEP_PMON_BOX_CTL_FRZ;
128 wrmsrl(msr, config);
129 return;
130 }
131}
132
133static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box,
134 struct perf_event *event)
135{
136 struct hw_perf_event *hwc = &event->hw;
137 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
138
139 if (reg1->idx != EXTRA_REG_NONE)
140 wrmsrl(reg1->reg, reg1->config);
141
142 wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
143}
144
145static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
146 struct perf_event *event)
147{
148 struct hw_perf_event *hwc = &event->hw;
149
150 wrmsrl(hwc->config_base, hwc->config);
151}
152
153static u64 snbep_uncore_msr_read_counter(struct intel_uncore_box *box,
154 struct perf_event *event)
155{
156 struct hw_perf_event *hwc = &event->hw;
157 u64 count;
158
159 rdmsrl(hwc->event_base, count);
160 return count;
161}
162
163static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
164{
165 unsigned msr = uncore_msr_box_ctl(box);
166 if (msr)
167 wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
168}
169
170static struct event_constraint *
171snbep_uncore_get_constraint(struct intel_uncore_box *box,
172 struct perf_event *event)
173{
174 struct intel_uncore_extra_reg *er;
175 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
176 unsigned long flags;
177 bool ok = false;
178
179 if (reg1->idx == EXTRA_REG_NONE || (box->phys_id >= 0 && reg1->alloc))
180 return NULL;
181
182 er = &box->shared_regs[reg1->idx];
183 raw_spin_lock_irqsave(&er->lock, flags);
184 if (!atomic_read(&er->ref) || er->config1 == reg1->config) {
185 atomic_inc(&er->ref);
186 er->config1 = reg1->config;
187 ok = true;
188 }
189 raw_spin_unlock_irqrestore(&er->lock, flags);
190
191 if (ok) {
192 if (box->phys_id >= 0)
193 reg1->alloc = 1;
194 return NULL;
195 }
196 return &constraint_empty;
197}
198
199static void snbep_uncore_put_constraint(struct intel_uncore_box *box,
200 struct perf_event *event)
201{
202 struct intel_uncore_extra_reg *er;
203 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
204
205 if (box->phys_id < 0 || !reg1->alloc)
206 return;
207
208 er = &box->shared_regs[reg1->idx];
209 atomic_dec(&er->ref);
210 reg1->alloc = 0;
211}
212
213static int snbep_uncore_hw_config(struct intel_uncore_box *box,
214 struct perf_event *event)
215{
216 struct hw_perf_event *hwc = &event->hw;
217 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
218
219 if (box->pmu->type == &snbep_uncore_cbox) {
220 reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
221 SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
222 reg1->config = event->attr.config1 &
223 SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK;
224 } else if (box->pmu->type == &snbep_uncore_pcu) {
225 reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
226 reg1->config = event->attr.config1 &
227 SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK;
228 } else {
229 return 0;
230 }
231 reg1->idx = 0;
232 return 0;
233}
234
235static struct attribute *snbep_uncore_formats_attr[] = {
236 &format_attr_event.attr,
237 &format_attr_umask.attr,
238 &format_attr_edge.attr,
239 &format_attr_inv.attr,
240 &format_attr_thresh8.attr,
241 NULL,
242};
243
244static struct attribute *snbep_uncore_ubox_formats_attr[] = {
245 &format_attr_event.attr,
246 &format_attr_umask.attr,
247 &format_attr_edge.attr,
248 &format_attr_inv.attr,
249 &format_attr_thresh5.attr,
250 NULL,
251};
252
253static struct attribute *snbep_uncore_cbox_formats_attr[] = {
254 &format_attr_event.attr,
255 &format_attr_umask.attr,
256 &format_attr_edge.attr,
257 &format_attr_tid_en.attr,
258 &format_attr_inv.attr,
259 &format_attr_thresh8.attr,
260 &format_attr_filter_tid.attr,
261 &format_attr_filter_nid.attr,
262 &format_attr_filter_state.attr,
263 &format_attr_filter_opc.attr,
264 NULL,
265};
266
267static struct attribute *snbep_uncore_pcu_formats_attr[] = {
268 &format_attr_event.attr,
269 &format_attr_occ_sel.attr,
270 &format_attr_edge.attr,
271 &format_attr_inv.attr,
272 &format_attr_thresh5.attr,
273 &format_attr_occ_invert.attr,
274 &format_attr_occ_edge.attr,
275 &format_attr_filter_brand0.attr,
276 &format_attr_filter_brand1.attr,
277 &format_attr_filter_brand2.attr,
278 &format_attr_filter_brand3.attr,
279 NULL,
280};
281
282static struct uncore_event_desc snbep_uncore_imc_events[] = {
283 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
284 INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
285 INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
286 { /* end: all zeroes */ },
287};
288
289static struct uncore_event_desc snbep_uncore_qpi_events[] = {
290 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
291 INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
292 INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"),
293 INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"),
294 { /* end: all zeroes */ },
295};
296
297static struct attribute_group snbep_uncore_format_group = {
298 .name = "format",
299 .attrs = snbep_uncore_formats_attr,
300};
301
302static struct attribute_group snbep_uncore_ubox_format_group = {
303 .name = "format",
304 .attrs = snbep_uncore_ubox_formats_attr,
305};
306
307static struct attribute_group snbep_uncore_cbox_format_group = {
308 .name = "format",
309 .attrs = snbep_uncore_cbox_formats_attr,
310};
311
312static struct attribute_group snbep_uncore_pcu_format_group = {
313 .name = "format",
314 .attrs = snbep_uncore_pcu_formats_attr,
315};
316
317static struct intel_uncore_ops snbep_uncore_msr_ops = {
318 .init_box = snbep_uncore_msr_init_box,
319 .disable_box = snbep_uncore_msr_disable_box,
320 .enable_box = snbep_uncore_msr_enable_box,
321 .disable_event = snbep_uncore_msr_disable_event,
322 .enable_event = snbep_uncore_msr_enable_event,
323 .read_counter = snbep_uncore_msr_read_counter,
324 .get_constraint = snbep_uncore_get_constraint,
325 .put_constraint = snbep_uncore_put_constraint,
326 .hw_config = snbep_uncore_hw_config,
327};
328
329static struct intel_uncore_ops snbep_uncore_pci_ops = {
330 .init_box = snbep_uncore_pci_init_box,
331 .disable_box = snbep_uncore_pci_disable_box,
332 .enable_box = snbep_uncore_pci_enable_box,
333 .disable_event = snbep_uncore_pci_disable_event,
334 .enable_event = snbep_uncore_pci_enable_event,
335 .read_counter = snbep_uncore_pci_read_counter,
336};
337
338static struct event_constraint snbep_uncore_cbox_constraints[] = {
339 UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
340 UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
341 UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
342 UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
343 UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
344 UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
345 UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
346 UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
347 UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
348 UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
349 UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
350 UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
351 EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
352 UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
353 UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
354 UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
355 UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
356 UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
357 UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
358 UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
359 UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
360 UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
361 UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
362 UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
363 UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
364 EVENT_CONSTRAINT_END
365};
366
367static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
368 UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
369 UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
370 UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
371 UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
372 UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
373 UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
374 UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
375 UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
376 UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
377 UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
378 EVENT_CONSTRAINT_END
379};
380
381static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
382 UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
383 UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
384 UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
385 UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
386 UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
387 UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
388 UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
389 UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
390 UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
391 UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
392 UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
393 UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
394 UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
395 UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
396 UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
397 UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
398 UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
399 UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
400 EVENT_CONSTRAINT_END
401};
402
403static struct intel_uncore_type snbep_uncore_ubox = {
404 .name = "ubox",
405 .num_counters = 2,
406 .num_boxes = 1,
407 .perf_ctr_bits = 44,
408 .fixed_ctr_bits = 48,
409 .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
410 .event_ctl = SNBEP_U_MSR_PMON_CTL0,
411 .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
412 .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
413 .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
414 .ops = &snbep_uncore_msr_ops,
415 .format_group = &snbep_uncore_ubox_format_group,
416};
417
418static struct intel_uncore_type snbep_uncore_cbox = {
419 .name = "cbox",
420 .num_counters = 4,
421 .num_boxes = 8,
422 .perf_ctr_bits = 44,
423 .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
424 .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
425 .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
426 .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
427 .msr_offset = SNBEP_CBO_MSR_OFFSET,
428 .num_shared_regs = 1,
429 .constraints = snbep_uncore_cbox_constraints,
430 .ops = &snbep_uncore_msr_ops,
431 .format_group = &snbep_uncore_cbox_format_group,
432};
433
434static struct intel_uncore_type snbep_uncore_pcu = {
435 .name = "pcu",
436 .num_counters = 4,
437 .num_boxes = 1,
438 .perf_ctr_bits = 48,
439 .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
440 .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
441 .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
442 .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
443 .num_shared_regs = 1,
444 .ops = &snbep_uncore_msr_ops,
445 .format_group = &snbep_uncore_pcu_format_group,
446};
447
448static struct intel_uncore_type *snbep_msr_uncores[] = {
449 &snbep_uncore_ubox,
450 &snbep_uncore_cbox,
451 &snbep_uncore_pcu,
452 NULL,
453};
454
455#define SNBEP_UNCORE_PCI_COMMON_INIT() \
456 .perf_ctr = SNBEP_PCI_PMON_CTR0, \
457 .event_ctl = SNBEP_PCI_PMON_CTL0, \
458 .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
459 .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
460 .ops = &snbep_uncore_pci_ops, \
461 .format_group = &snbep_uncore_format_group
462
463static struct intel_uncore_type snbep_uncore_ha = {
464 .name = "ha",
465 .num_counters = 4,
466 .num_boxes = 1,
467 .perf_ctr_bits = 48,
468 SNBEP_UNCORE_PCI_COMMON_INIT(),
469};
470
471static struct intel_uncore_type snbep_uncore_imc = {
472 .name = "imc",
473 .num_counters = 4,
474 .num_boxes = 4,
475 .perf_ctr_bits = 48,
476 .fixed_ctr_bits = 48,
477 .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
478 .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
479 .event_descs = snbep_uncore_imc_events,
480 SNBEP_UNCORE_PCI_COMMON_INIT(),
481};
482
483static struct intel_uncore_type snbep_uncore_qpi = {
484 .name = "qpi",
485 .num_counters = 4,
486 .num_boxes = 2,
487 .perf_ctr_bits = 48,
488 .event_descs = snbep_uncore_qpi_events,
489 SNBEP_UNCORE_PCI_COMMON_INIT(),
490};
491
492
493static struct intel_uncore_type snbep_uncore_r2pcie = {
494 .name = "r2pcie",
495 .num_counters = 4,
496 .num_boxes = 1,
497 .perf_ctr_bits = 44,
498 .constraints = snbep_uncore_r2pcie_constraints,
499 SNBEP_UNCORE_PCI_COMMON_INIT(),
500};
501
502static struct intel_uncore_type snbep_uncore_r3qpi = {
503 .name = "r3qpi",
504 .num_counters = 3,
505 .num_boxes = 2,
506 .perf_ctr_bits = 44,
507 .constraints = snbep_uncore_r3qpi_constraints,
508 SNBEP_UNCORE_PCI_COMMON_INIT(),
509};
510
511static struct intel_uncore_type *snbep_pci_uncores[] = {
512 &snbep_uncore_ha,
513 &snbep_uncore_imc,
514 &snbep_uncore_qpi,
515 &snbep_uncore_r2pcie,
516 &snbep_uncore_r3qpi,
517 NULL,
518};
519
520static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = {
521 { /* Home Agent */
522 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
523 .driver_data = (unsigned long)&snbep_uncore_ha,
524 },
525 { /* MC Channel 0 */
526 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
527 .driver_data = (unsigned long)&snbep_uncore_imc,
528 },
529 { /* MC Channel 1 */
530 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
531 .driver_data = (unsigned long)&snbep_uncore_imc,
532 },
533 { /* MC Channel 2 */
534 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
535 .driver_data = (unsigned long)&snbep_uncore_imc,
536 },
537 { /* MC Channel 3 */
538 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
539 .driver_data = (unsigned long)&snbep_uncore_imc,
540 },
541 { /* QPI Port 0 */
542 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
543 .driver_data = (unsigned long)&snbep_uncore_qpi,
544 },
545 { /* QPI Port 1 */
546 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
547 .driver_data = (unsigned long)&snbep_uncore_qpi,
548 },
549 { /* P2PCIe */
550 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
551 .driver_data = (unsigned long)&snbep_uncore_r2pcie,
552 },
553 { /* R3QPI Link 0 */
554 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
555 .driver_data = (unsigned long)&snbep_uncore_r3qpi,
556 },
557 { /* R3QPI Link 1 */
558 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
559 .driver_data = (unsigned long)&snbep_uncore_r3qpi,
560 },
561 { /* end: all zeroes */ }
562};
563
564static struct pci_driver snbep_uncore_pci_driver = {
565 .name = "snbep_uncore",
566 .id_table = snbep_uncore_pci_ids,
567};
568
569/*
570 * build pci bus to socket mapping
571 */
572static void snbep_pci2phy_map_init(void)
573{
574 struct pci_dev *ubox_dev = NULL;
575 int i, bus, nodeid;
576 u32 config;
577
578 while (1) {
579 /* find the UBOX device */
580 ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL,
581 PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX,
582 ubox_dev);
583 if (!ubox_dev)
584 break;
585 bus = ubox_dev->bus->number;
586 /* get the Node ID of the local register */
587 pci_read_config_dword(ubox_dev, 0x40, &config);
588 nodeid = config;
589 /* get the Node ID mapping */
590 pci_read_config_dword(ubox_dev, 0x54, &config);
591 /*
592 * every three bits in the Node ID mapping register maps
593 * to a particular node.
594 */
595 for (i = 0; i < 8; i++) {
596 if (nodeid == ((config >> (3 * i)) & 0x7)) {
597 pcibus_to_physid[bus] = i;
598 break;
599 }
600 }
601 };
602 return;
603}
604/* end of Sandy Bridge-EP uncore support */
605
606
607/* Sandy Bridge uncore support */
608static void snb_uncore_msr_enable_event(struct intel_uncore_box *box,
609 struct perf_event *event)
610{
611 struct hw_perf_event *hwc = &event->hw;
612
613 if (hwc->idx < UNCORE_PMC_IDX_FIXED)
614 wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
615 else
616 wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
617}
618
619static void snb_uncore_msr_disable_event(struct intel_uncore_box *box,
620 struct perf_event *event)
621{
622 wrmsrl(event->hw.config_base, 0);
623}
624
625static u64 snb_uncore_msr_read_counter(struct intel_uncore_box *box,
626 struct perf_event *event)
627{
628 u64 count;
629 rdmsrl(event->hw.event_base, count);
630 return count;
631}
632
633static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
634{
635 if (box->pmu->pmu_idx == 0) {
636 wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
637 SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
638 }
639}
640
641static struct attribute *snb_uncore_formats_attr[] = {
642 &format_attr_event.attr,
643 &format_attr_umask.attr,
644 &format_attr_edge.attr,
645 &format_attr_inv.attr,
646 &format_attr_cmask5.attr,
647 NULL,
648};
649
650static struct attribute_group snb_uncore_format_group = {
651 .name = "format",
652 .attrs = snb_uncore_formats_attr,
653};
654
655static struct intel_uncore_ops snb_uncore_msr_ops = {
656 .init_box = snb_uncore_msr_init_box,
657 .disable_event = snb_uncore_msr_disable_event,
658 .enable_event = snb_uncore_msr_enable_event,
659 .read_counter = snb_uncore_msr_read_counter,
660};
661
662static struct event_constraint snb_uncore_cbox_constraints[] = {
663 UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
664 UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
665 EVENT_CONSTRAINT_END
666};
667
668static struct intel_uncore_type snb_uncore_cbox = {
669 .name = "cbox",
670 .num_counters = 2,
671 .num_boxes = 4,
672 .perf_ctr_bits = 44,
673 .fixed_ctr_bits = 48,
674 .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
675 .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
676 .fixed_ctr = SNB_UNC_FIXED_CTR,
677 .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
678 .single_fixed = 1,
679 .event_mask = SNB_UNC_RAW_EVENT_MASK,
680 .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
681 .constraints = snb_uncore_cbox_constraints,
682 .ops = &snb_uncore_msr_ops,
683 .format_group = &snb_uncore_format_group,
684};
685
686static struct intel_uncore_type *snb_msr_uncores[] = {
687 &snb_uncore_cbox,
688 NULL,
689};
690/* end of Sandy Bridge uncore support */
691
692/* Nehalem uncore support */
693static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
694{
695 wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
696}
697
698static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
699{
700 wrmsrl(NHM_UNC_PERF_GLOBAL_CTL,
701 NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
702}
703
704static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box,
705 struct perf_event *event)
706{
707 struct hw_perf_event *hwc = &event->hw;
708
709 if (hwc->idx < UNCORE_PMC_IDX_FIXED)
710 wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
711 else
712 wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
713}
714
715static struct attribute *nhm_uncore_formats_attr[] = {
716 &format_attr_event.attr,
717 &format_attr_umask.attr,
718 &format_attr_edge.attr,
719 &format_attr_inv.attr,
720 &format_attr_cmask8.attr,
721 NULL,
722};
723
724static struct attribute_group nhm_uncore_format_group = {
725 .name = "format",
726 .attrs = nhm_uncore_formats_attr,
727};
728
729static struct uncore_event_desc nhm_uncore_events[] = {
730 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
731 INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
732 INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
733 INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
734 INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
735 INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
736 INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
737 INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
738 INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
739 { /* end: all zeroes */ },
740};
741
742static struct intel_uncore_ops nhm_uncore_msr_ops = {
743 .disable_box = nhm_uncore_msr_disable_box,
744 .enable_box = nhm_uncore_msr_enable_box,
745 .disable_event = snb_uncore_msr_disable_event,
746 .enable_event = nhm_uncore_msr_enable_event,
747 .read_counter = snb_uncore_msr_read_counter,
748};
749
750static struct intel_uncore_type nhm_uncore = {
751 .name = "",
752 .num_counters = 8,
753 .num_boxes = 1,
754 .perf_ctr_bits = 48,
755 .fixed_ctr_bits = 48,
756 .event_ctl = NHM_UNC_PERFEVTSEL0,
757 .perf_ctr = NHM_UNC_UNCORE_PMC0,
758 .fixed_ctr = NHM_UNC_FIXED_CTR,
759 .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
760 .event_mask = NHM_UNC_RAW_EVENT_MASK,
761 .event_descs = nhm_uncore_events,
762 .ops = &nhm_uncore_msr_ops,
763 .format_group = &nhm_uncore_format_group,
764};
765
766static struct intel_uncore_type *nhm_msr_uncores[] = {
767 &nhm_uncore,
768 NULL,
769};
770/* end of Nehalem uncore support */
771
772static void uncore_assign_hw_event(struct intel_uncore_box *box,
773 struct perf_event *event, int idx)
774{
775 struct hw_perf_event *hwc = &event->hw;
776
777 hwc->idx = idx;
778 hwc->last_tag = ++box->tags[idx];
779
780 if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
781 hwc->event_base = uncore_fixed_ctr(box);
782 hwc->config_base = uncore_fixed_ctl(box);
783 return;
784 }
785
786 hwc->config_base = uncore_event_ctl(box, hwc->idx);
787 hwc->event_base = uncore_perf_ctr(box, hwc->idx);
788}
789
790static void uncore_perf_event_update(struct intel_uncore_box *box,
791 struct perf_event *event)
792{
793 u64 prev_count, new_count, delta;
794 int shift;
795
796 if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
797 shift = 64 - uncore_fixed_ctr_bits(box);
798 else
799 shift = 64 - uncore_perf_ctr_bits(box);
800
801 /* the hrtimer might modify the previous event value */
802again:
803 prev_count = local64_read(&event->hw.prev_count);
804 new_count = uncore_read_counter(box, event);
805 if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
806 goto again;
807
808 delta = (new_count << shift) - (prev_count << shift);
809 delta >>= shift;
810
811 local64_add(delta, &event->count);
812}
813
814/*
815 * The overflow interrupt is unavailable for SandyBridge-EP, is broken
816 * for SandyBridge. So we use hrtimer to periodically poll the counter
817 * to avoid overflow.
818 */
819static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
820{
821 struct intel_uncore_box *box;
822 unsigned long flags;
823 int bit;
824
825 box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
826 if (!box->n_active || box->cpu != smp_processor_id())
827 return HRTIMER_NORESTART;
828 /*
829 * disable local interrupt to prevent uncore_pmu_event_start/stop
830 * to interrupt the update process
831 */
832 local_irq_save(flags);
833
834 for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
835 uncore_perf_event_update(box, box->events[bit]);
836
837 local_irq_restore(flags);
838
839 hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL));
840 return HRTIMER_RESTART;
841}
842
843static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
844{
845 __hrtimer_start_range_ns(&box->hrtimer,
846 ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0,
847 HRTIMER_MODE_REL_PINNED, 0);
848}
849
850static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
851{
852 hrtimer_cancel(&box->hrtimer);
853}
854
855static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
856{
857 hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
858 box->hrtimer.function = uncore_pmu_hrtimer;
859}
860
861struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
862 int cpu)
863{
864 struct intel_uncore_box *box;
865 int i, size;
866
867 size = sizeof(*box) + type->num_shared_regs *
868 sizeof(struct intel_uncore_extra_reg);
869
870 box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu));
871 if (!box)
872 return NULL;
873
874 for (i = 0; i < type->num_shared_regs; i++)
875 raw_spin_lock_init(&box->shared_regs[i].lock);
876
877 uncore_pmu_init_hrtimer(box);
878 atomic_set(&box->refcnt, 1);
879 box->cpu = -1;
880 box->phys_id = -1;
881
882 return box;
883}
884
885static struct intel_uncore_box *
886uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
887{
888 static struct intel_uncore_box *box;
889
890 box = *per_cpu_ptr(pmu->box, cpu);
891 if (box)
892 return box;
893
894 raw_spin_lock(&uncore_box_lock);
895 list_for_each_entry(box, &pmu->box_list, list) {
896 if (box->phys_id == topology_physical_package_id(cpu)) {
897 atomic_inc(&box->refcnt);
898 *per_cpu_ptr(pmu->box, cpu) = box;
899 break;
900 }
901 }
902 raw_spin_unlock(&uncore_box_lock);
903
904 return *per_cpu_ptr(pmu->box, cpu);
905}
906
907static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
908{
909 return container_of(event->pmu, struct intel_uncore_pmu, pmu);
910}
911
/*
 * Map a perf event to the box serving the current cpu.
 *
 * perf core schedules events per cpu, while uncore events are collected
 * by one of the cpus inside a physical package, hence the lookup by
 * smp_processor_id().
 */
static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
{
	struct intel_uncore_pmu *pmu = uncore_event_to_pmu(event);

	return uncore_pmu_to_box(pmu, smp_processor_id());
}
921
922static int uncore_collect_events(struct intel_uncore_box *box,
923 struct perf_event *leader, bool dogrp)
924{
925 struct perf_event *event;
926 int n, max_count;
927
928 max_count = box->pmu->type->num_counters;
929 if (box->pmu->type->fixed_ctl)
930 max_count++;
931
932 if (box->n_events >= max_count)
933 return -EINVAL;
934
935 n = box->n_events;
936 box->event_list[n] = leader;
937 n++;
938 if (!dogrp)
939 return n;
940
941 list_for_each_entry(event, &leader->sibling_list, group_entry) {
942 if (event->state <= PERF_EVENT_STATE_OFF)
943 continue;
944
945 if (n >= max_count)
946 return -EINVAL;
947
948 box->event_list[n] = event;
949 n++;
950 }
951 return n;
952}
953
954static struct event_constraint *
955uncore_get_event_constraint(struct intel_uncore_box *box,
956 struct perf_event *event)
957{
958 struct intel_uncore_type *type = box->pmu->type;
959 struct event_constraint *c;
960
961 if (type->ops->get_constraint) {
962 c = type->ops->get_constraint(box, event);
963 if (c)
964 return c;
965 }
966
967 if (event->hw.config == ~0ULL)
968 return &constraint_fixed;
969
970 if (type->constraints) {
971 for_each_event_constraint(c, type->constraints) {
972 if ((event->hw.config & c->cmask) == c->code)
973 return c;
974 }
975 }
976
977 return &type->unconstrainted;
978}
979
980static void uncore_put_event_constraint(struct intel_uncore_box *box,
981 struct perf_event *event)
982{
983 if (box->pmu->type->ops->put_constraint)
984 box->pmu->type->ops->put_constraint(box, event);
985}
986
987static int uncore_assign_events(struct intel_uncore_box *box,
988 int assign[], int n)
989{
990 unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
991 struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX];
992 int i, wmin, wmax, ret = 0;
993 struct hw_perf_event *hwc;
994
995 bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
996
997 for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
998 c = uncore_get_event_constraint(box, box->event_list[i]);
999 constraints[i] = c;
1000 wmin = min(wmin, c->weight);
1001 wmax = max(wmax, c->weight);
1002 }
1003
1004 /* fastpath, try to reuse previous register */
1005 for (i = 0; i < n; i++) {
1006 hwc = &box->event_list[i]->hw;
1007 c = constraints[i];
1008
1009 /* never assigned */
1010 if (hwc->idx == -1)
1011 break;
1012
1013 /* constraint still honored */
1014 if (!test_bit(hwc->idx, c->idxmsk))
1015 break;
1016
1017 /* not already used */
1018 if (test_bit(hwc->idx, used_mask))
1019 break;
1020
1021 __set_bit(hwc->idx, used_mask);
1022 if (assign)
1023 assign[i] = hwc->idx;
1024 }
1025 /* slow path */
1026 if (i != n)
1027 ret = perf_assign_events(constraints, n, wmin, wmax, assign);
1028
1029 if (!assign || ret) {
1030 for (i = 0; i < n; i++)
1031 uncore_put_event_constraint(box, box->event_list[i]);
1032 }
1033 return ret ? -EINVAL : 0;
1034}
1035
1036static void uncore_pmu_event_start(struct perf_event *event, int flags)
1037{
1038 struct intel_uncore_box *box = uncore_event_to_box(event);
1039 int idx = event->hw.idx;
1040
1041 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1042 return;
1043
1044 if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
1045 return;
1046
1047 event->hw.state = 0;
1048 box->events[idx] = event;
1049 box->n_active++;
1050 __set_bit(idx, box->active_mask);
1051
1052 local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
1053 uncore_enable_event(box, event);
1054
1055 if (box->n_active == 1) {
1056 uncore_enable_box(box);
1057 uncore_pmu_start_hrtimer(box);
1058 }
1059}
1060
1061static void uncore_pmu_event_stop(struct perf_event *event, int flags)
1062{
1063 struct intel_uncore_box *box = uncore_event_to_box(event);
1064 struct hw_perf_event *hwc = &event->hw;
1065
1066 if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
1067 uncore_disable_event(box, event);
1068 box->n_active--;
1069 box->events[hwc->idx] = NULL;
1070 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1071 hwc->state |= PERF_HES_STOPPED;
1072
1073 if (box->n_active == 0) {
1074 uncore_disable_box(box);
1075 uncore_pmu_cancel_hrtimer(box);
1076 }
1077 }
1078
1079 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1080 /*
1081 * Drain the remaining delta count out of a event
1082 * that we are disabling:
1083 */
1084 uncore_perf_event_update(box, event);
1085 hwc->state |= PERF_HES_UPTODATE;
1086 }
1087}
1088
1089static int uncore_pmu_event_add(struct perf_event *event, int flags)
1090{
1091 struct intel_uncore_box *box = uncore_event_to_box(event);
1092 struct hw_perf_event *hwc = &event->hw;
1093 int assign[UNCORE_PMC_IDX_MAX];
1094 int i, n, ret;
1095
1096 if (!box)
1097 return -ENODEV;
1098
1099 ret = n = uncore_collect_events(box, event, false);
1100 if (ret < 0)
1101 return ret;
1102
1103 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1104 if (!(flags & PERF_EF_START))
1105 hwc->state |= PERF_HES_ARCH;
1106
1107 ret = uncore_assign_events(box, assign, n);
1108 if (ret)
1109 return ret;
1110
1111 /* save events moving to new counters */
1112 for (i = 0; i < box->n_events; i++) {
1113 event = box->event_list[i];
1114 hwc = &event->hw;
1115
1116 if (hwc->idx == assign[i] &&
1117 hwc->last_tag == box->tags[assign[i]])
1118 continue;
1119 /*
1120 * Ensure we don't accidentally enable a stopped
1121 * counter simply because we rescheduled.
1122 */
1123 if (hwc->state & PERF_HES_STOPPED)
1124 hwc->state |= PERF_HES_ARCH;
1125
1126 uncore_pmu_event_stop(event, PERF_EF_UPDATE);
1127 }
1128
1129 /* reprogram moved events into new counters */
1130 for (i = 0; i < n; i++) {
1131 event = box->event_list[i];
1132 hwc = &event->hw;
1133
1134 if (hwc->idx != assign[i] ||
1135 hwc->last_tag != box->tags[assign[i]])
1136 uncore_assign_hw_event(box, event, assign[i]);
1137 else if (i < box->n_events)
1138 continue;
1139
1140 if (hwc->state & PERF_HES_ARCH)
1141 continue;
1142
1143 uncore_pmu_event_start(event, 0);
1144 }
1145 box->n_events = n;
1146
1147 return 0;
1148}
1149
1150static void uncore_pmu_event_del(struct perf_event *event, int flags)
1151{
1152 struct intel_uncore_box *box = uncore_event_to_box(event);
1153 int i;
1154
1155 uncore_pmu_event_stop(event, PERF_EF_UPDATE);
1156
1157 for (i = 0; i < box->n_events; i++) {
1158 if (event == box->event_list[i]) {
1159 uncore_put_event_constraint(box, event);
1160
1161 while (++i < box->n_events)
1162 box->event_list[i - 1] = box->event_list[i];
1163
1164 --box->n_events;
1165 break;
1166 }
1167 }
1168
1169 event->hw.idx = -1;
1170 event->hw.last_tag = ~0ULL;
1171}
1172
/* pmu::read callback: bring event->count up to date with the hardware. */
static void uncore_pmu_event_read(struct perf_event *event)
{
	uncore_perf_event_update(uncore_event_to_box(event), event);
}
1178
1179/*
1180 * validation ensures the group can be loaded onto the
1181 * PMU if it was the only group available.
1182 */
1183static int uncore_validate_group(struct intel_uncore_pmu *pmu,
1184 struct perf_event *event)
1185{
1186 struct perf_event *leader = event->group_leader;
1187 struct intel_uncore_box *fake_box;
1188 int ret = -EINVAL, n;
1189
1190 fake_box = uncore_alloc_box(pmu->type, smp_processor_id());
1191 if (!fake_box)
1192 return -ENOMEM;
1193
1194 fake_box->pmu = pmu;
1195 /*
1196 * the event is not yet connected with its
1197 * siblings therefore we must first collect
1198 * existing siblings, then add the new event
1199 * before we can simulate the scheduling
1200 */
1201 n = uncore_collect_events(fake_box, leader, true);
1202 if (n < 0)
1203 goto out;
1204
1205 fake_box->n_events = n;
1206 n = uncore_collect_events(fake_box, event, false);
1207 if (n < 0)
1208 goto out;
1209
1210 fake_box->n_events = n;
1211
1212 ret = uncore_assign_events(fake_box, NULL, n);
1213out:
1214 kfree(fake_box);
1215 return ret;
1216}
1217
1218int uncore_pmu_event_init(struct perf_event *event)
1219{
1220 struct intel_uncore_pmu *pmu;
1221 struct intel_uncore_box *box;
1222 struct hw_perf_event *hwc = &event->hw;
1223 int ret;
1224
1225 if (event->attr.type != event->pmu->type)
1226 return -ENOENT;
1227
1228 pmu = uncore_event_to_pmu(event);
1229 /* no device found for this pmu */
1230 if (pmu->func_id < 0)
1231 return -ENOENT;
1232
1233 /*
1234 * Uncore PMU does measure at all privilege level all the time.
1235 * So it doesn't make sense to specify any exclude bits.
1236 */
1237 if (event->attr.exclude_user || event->attr.exclude_kernel ||
1238 event->attr.exclude_hv || event->attr.exclude_idle)
1239 return -EINVAL;
1240
1241 /* Sampling not supported yet */
1242 if (hwc->sample_period)
1243 return -EINVAL;
1244
1245 /*
1246 * Place all uncore events for a particular physical package
1247 * onto a single cpu
1248 */
1249 if (event->cpu < 0)
1250 return -EINVAL;
1251 box = uncore_pmu_to_box(pmu, event->cpu);
1252 if (!box || box->cpu < 0)
1253 return -EINVAL;
1254 event->cpu = box->cpu;
1255
1256 event->hw.idx = -1;
1257 event->hw.last_tag = ~0ULL;
1258 event->hw.extra_reg.idx = EXTRA_REG_NONE;
1259
1260 if (event->attr.config == UNCORE_FIXED_EVENT) {
1261 /* no fixed counter */
1262 if (!pmu->type->fixed_ctl)
1263 return -EINVAL;
1264 /*
1265 * if there is only one fixed counter, only the first pmu
1266 * can access the fixed counter
1267 */
1268 if (pmu->type->single_fixed && pmu->pmu_idx > 0)
1269 return -EINVAL;
1270 hwc->config = ~0ULL;
1271 } else {
1272 hwc->config = event->attr.config & pmu->type->event_mask;
1273 if (pmu->type->ops->hw_config) {
1274 ret = pmu->type->ops->hw_config(box, event);
1275 if (ret)
1276 return ret;
1277 }
1278 }
1279
1280 if (event->group_leader != event)
1281 ret = uncore_validate_group(pmu, event);
1282 else
1283 ret = 0;
1284
1285 return ret;
1286}
1287
1288static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
1289{
1290 int ret;
1291
1292 pmu->pmu = (struct pmu) {
1293 .attr_groups = pmu->type->attr_groups,
1294 .task_ctx_nr = perf_invalid_context,
1295 .event_init = uncore_pmu_event_init,
1296 .add = uncore_pmu_event_add,
1297 .del = uncore_pmu_event_del,
1298 .start = uncore_pmu_event_start,
1299 .stop = uncore_pmu_event_stop,
1300 .read = uncore_pmu_event_read,
1301 };
1302
1303 if (pmu->type->num_boxes == 1) {
1304 if (strlen(pmu->type->name) > 0)
1305 sprintf(pmu->name, "uncore_%s", pmu->type->name);
1306 else
1307 sprintf(pmu->name, "uncore");
1308 } else {
1309 sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
1310 pmu->pmu_idx);
1311 }
1312
1313 ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
1314 return ret;
1315}
1316
1317static void __init uncore_type_exit(struct intel_uncore_type *type)
1318{
1319 int i;
1320
1321 for (i = 0; i < type->num_boxes; i++)
1322 free_percpu(type->pmus[i].box);
1323 kfree(type->pmus);
1324 type->pmus = NULL;
1325 kfree(type->attr_groups[1]);
1326 type->attr_groups[1] = NULL;
1327}
1328
1329static void uncore_types_exit(struct intel_uncore_type **types)
1330{
1331 int i;
1332 for (i = 0; types[i]; i++)
1333 uncore_type_exit(types[i]);
1334}
1335
1336static int __init uncore_type_init(struct intel_uncore_type *type)
1337{
1338 struct intel_uncore_pmu *pmus;
1339 struct attribute_group *events_group;
1340 struct attribute **attrs;
1341 int i, j;
1342
1343 pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
1344 if (!pmus)
1345 return -ENOMEM;
1346
1347 type->unconstrainted = (struct event_constraint)
1348 __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
1349 0, type->num_counters, 0);
1350
1351 for (i = 0; i < type->num_boxes; i++) {
1352 pmus[i].func_id = -1;
1353 pmus[i].pmu_idx = i;
1354 pmus[i].type = type;
1355 INIT_LIST_HEAD(&pmus[i].box_list);
1356 pmus[i].box = alloc_percpu(struct intel_uncore_box *);
1357 if (!pmus[i].box)
1358 goto fail;
1359 }
1360
1361 if (type->event_descs) {
1362 i = 0;
1363 while (type->event_descs[i].attr.attr.name)
1364 i++;
1365
1366 events_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
1367 sizeof(*events_group), GFP_KERNEL);
1368 if (!events_group)
1369 goto fail;
1370
1371 attrs = (struct attribute **)(events_group + 1);
1372 events_group->name = "events";
1373 events_group->attrs = attrs;
1374
1375 for (j = 0; j < i; j++)
1376 attrs[j] = &type->event_descs[j].attr.attr;
1377
1378 type->attr_groups[1] = events_group;
1379 }
1380
1381 type->pmus = pmus;
1382 return 0;
1383fail:
1384 uncore_type_exit(type);
1385 return -ENOMEM;
1386}
1387
1388static int __init uncore_types_init(struct intel_uncore_type **types)
1389{
1390 int i, ret;
1391
1392 for (i = 0; types[i]; i++) {
1393 ret = uncore_type_init(types[i]);
1394 if (ret)
1395 goto fail;
1396 }
1397 return 0;
1398fail:
1399 while (--i >= 0)
1400 uncore_type_exit(types[i]);
1401 return ret;
1402}
1403
1404static struct pci_driver *uncore_pci_driver;
1405static bool pcidrv_registered;
1406
1407/*
1408 * add a pci uncore device
1409 */
1410static int __devinit uncore_pci_add(struct intel_uncore_type *type,
1411 struct pci_dev *pdev)
1412{
1413 struct intel_uncore_pmu *pmu;
1414 struct intel_uncore_box *box;
1415 int i, phys_id;
1416
1417 phys_id = pcibus_to_physid[pdev->bus->number];
1418 if (phys_id < 0)
1419 return -ENODEV;
1420
1421 box = uncore_alloc_box(type, 0);
1422 if (!box)
1423 return -ENOMEM;
1424
1425 /*
1426 * for performance monitoring unit with multiple boxes,
1427 * each box has a different function id.
1428 */
1429 for (i = 0; i < type->num_boxes; i++) {
1430 pmu = &type->pmus[i];
1431 if (pmu->func_id == pdev->devfn)
1432 break;
1433 if (pmu->func_id < 0) {
1434 pmu->func_id = pdev->devfn;
1435 break;
1436 }
1437 pmu = NULL;
1438 }
1439
1440 if (!pmu) {
1441 kfree(box);
1442 return -EINVAL;
1443 }
1444
1445 box->phys_id = phys_id;
1446 box->pci_dev = pdev;
1447 box->pmu = pmu;
1448 uncore_box_init(box);
1449 pci_set_drvdata(pdev, box);
1450
1451 raw_spin_lock(&uncore_box_lock);
1452 list_add_tail(&box->list, &pmu->box_list);
1453 raw_spin_unlock(&uncore_box_lock);
1454
1455 return 0;
1456}
1457
1458static void uncore_pci_remove(struct pci_dev *pdev)
1459{
1460 struct intel_uncore_box *box = pci_get_drvdata(pdev);
1461 struct intel_uncore_pmu *pmu = box->pmu;
1462 int cpu, phys_id = pcibus_to_physid[pdev->bus->number];
1463
1464 if (WARN_ON_ONCE(phys_id != box->phys_id))
1465 return;
1466
1467 raw_spin_lock(&uncore_box_lock);
1468 list_del(&box->list);
1469 raw_spin_unlock(&uncore_box_lock);
1470
1471 for_each_possible_cpu(cpu) {
1472 if (*per_cpu_ptr(pmu->box, cpu) == box) {
1473 *per_cpu_ptr(pmu->box, cpu) = NULL;
1474 atomic_dec(&box->refcnt);
1475 }
1476 }
1477
1478 WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
1479 kfree(box);
1480}
1481
1482static int __devinit uncore_pci_probe(struct pci_dev *pdev,
1483 const struct pci_device_id *id)
1484{
1485 struct intel_uncore_type *type;
1486
1487 type = (struct intel_uncore_type *)id->driver_data;
1488 return uncore_pci_add(type, pdev);
1489}
1490
1491static int __init uncore_pci_init(void)
1492{
1493 int ret;
1494
1495 switch (boot_cpu_data.x86_model) {
1496 case 45: /* Sandy Bridge-EP */
1497 pci_uncores = snbep_pci_uncores;
1498 uncore_pci_driver = &snbep_uncore_pci_driver;
1499 snbep_pci2phy_map_init();
1500 break;
1501 default:
1502 return 0;
1503 }
1504
1505 ret = uncore_types_init(pci_uncores);
1506 if (ret)
1507 return ret;
1508
1509 uncore_pci_driver->probe = uncore_pci_probe;
1510 uncore_pci_driver->remove = uncore_pci_remove;
1511
1512 ret = pci_register_driver(uncore_pci_driver);
1513 if (ret == 0)
1514 pcidrv_registered = true;
1515 else
1516 uncore_types_exit(pci_uncores);
1517
1518 return ret;
1519}
1520
1521static void __init uncore_pci_exit(void)
1522{
1523 if (pcidrv_registered) {
1524 pcidrv_registered = false;
1525 pci_unregister_driver(uncore_pci_driver);
1526 uncore_types_exit(pci_uncores);
1527 }
1528}
1529
1530static void __cpuinit uncore_cpu_dying(int cpu)
1531{
1532 struct intel_uncore_type *type;
1533 struct intel_uncore_pmu *pmu;
1534 struct intel_uncore_box *box;
1535 int i, j;
1536
1537 for (i = 0; msr_uncores[i]; i++) {
1538 type = msr_uncores[i];
1539 for (j = 0; j < type->num_boxes; j++) {
1540 pmu = &type->pmus[j];
1541 box = *per_cpu_ptr(pmu->box, cpu);
1542 *per_cpu_ptr(pmu->box, cpu) = NULL;
1543 if (box && atomic_dec_and_test(&box->refcnt))
1544 kfree(box);
1545 }
1546 }
1547}
1548
1549static int __cpuinit uncore_cpu_starting(int cpu)
1550{
1551 struct intel_uncore_type *type;
1552 struct intel_uncore_pmu *pmu;
1553 struct intel_uncore_box *box, *exist;
1554 int i, j, k, phys_id;
1555
1556 phys_id = topology_physical_package_id(cpu);
1557
1558 for (i = 0; msr_uncores[i]; i++) {
1559 type = msr_uncores[i];
1560 for (j = 0; j < type->num_boxes; j++) {
1561 pmu = &type->pmus[j];
1562 box = *per_cpu_ptr(pmu->box, cpu);
1563 /* called by uncore_cpu_init? */
1564 if (box && box->phys_id >= 0) {
1565 uncore_box_init(box);
1566 continue;
1567 }
1568
1569 for_each_online_cpu(k) {
1570 exist = *per_cpu_ptr(pmu->box, k);
1571 if (exist && exist->phys_id == phys_id) {
1572 atomic_inc(&exist->refcnt);
1573 *per_cpu_ptr(pmu->box, cpu) = exist;
1574 kfree(box);
1575 box = NULL;
1576 break;
1577 }
1578 }
1579
1580 if (box) {
1581 box->phys_id = phys_id;
1582 uncore_box_init(box);
1583 }
1584 }
1585 }
1586 return 0;
1587}
1588
1589static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id)
1590{
1591 struct intel_uncore_type *type;
1592 struct intel_uncore_pmu *pmu;
1593 struct intel_uncore_box *box;
1594 int i, j;
1595
1596 for (i = 0; msr_uncores[i]; i++) {
1597 type = msr_uncores[i];
1598 for (j = 0; j < type->num_boxes; j++) {
1599 pmu = &type->pmus[j];
1600 if (pmu->func_id < 0)
1601 pmu->func_id = j;
1602
1603 box = uncore_alloc_box(type, cpu);
1604 if (!box)
1605 return -ENOMEM;
1606
1607 box->pmu = pmu;
1608 box->phys_id = phys_id;
1609 *per_cpu_ptr(pmu->box, cpu) = box;
1610 }
1611 }
1612 return 0;
1613}
1614
1615static void __cpuinit uncore_change_context(struct intel_uncore_type **uncores,
1616 int old_cpu, int new_cpu)
1617{
1618 struct intel_uncore_type *type;
1619 struct intel_uncore_pmu *pmu;
1620 struct intel_uncore_box *box;
1621 int i, j;
1622
1623 for (i = 0; uncores[i]; i++) {
1624 type = uncores[i];
1625 for (j = 0; j < type->num_boxes; j++) {
1626 pmu = &type->pmus[j];
1627 if (old_cpu < 0)
1628 box = uncore_pmu_to_box(pmu, new_cpu);
1629 else
1630 box = uncore_pmu_to_box(pmu, old_cpu);
1631 if (!box)
1632 continue;
1633
1634 if (old_cpu < 0) {
1635 WARN_ON_ONCE(box->cpu != -1);
1636 box->cpu = new_cpu;
1637 continue;
1638 }
1639
1640 WARN_ON_ONCE(box->cpu != old_cpu);
1641 if (new_cpu >= 0) {
1642 uncore_pmu_cancel_hrtimer(box);
1643 perf_pmu_migrate_context(&pmu->pmu,
1644 old_cpu, new_cpu);
1645 box->cpu = new_cpu;
1646 } else {
1647 box->cpu = -1;
1648 }
1649 }
1650 }
1651}
1652
1653static void __cpuinit uncore_event_exit_cpu(int cpu)
1654{
1655 int i, phys_id, target;
1656
1657 /* if exiting cpu is used for collecting uncore events */
1658 if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
1659 return;
1660
1661 /* find a new cpu to collect uncore events */
1662 phys_id = topology_physical_package_id(cpu);
1663 target = -1;
1664 for_each_online_cpu(i) {
1665 if (i == cpu)
1666 continue;
1667 if (phys_id == topology_physical_package_id(i)) {
1668 target = i;
1669 break;
1670 }
1671 }
1672
1673 /* migrate uncore events to the new cpu */
1674 if (target >= 0)
1675 cpumask_set_cpu(target, &uncore_cpu_mask);
1676
1677 uncore_change_context(msr_uncores, cpu, target);
1678 uncore_change_context(pci_uncores, cpu, target);
1679}
1680
1681static void __cpuinit uncore_event_init_cpu(int cpu)
1682{
1683 int i, phys_id;
1684
1685 phys_id = topology_physical_package_id(cpu);
1686 for_each_cpu(i, &uncore_cpu_mask) {
1687 if (phys_id == topology_physical_package_id(i))
1688 return;
1689 }
1690
1691 cpumask_set_cpu(cpu, &uncore_cpu_mask);
1692
1693 uncore_change_context(msr_uncores, -1, cpu);
1694 uncore_change_context(pci_uncores, -1, cpu);
1695}
1696
1697static int __cpuinit uncore_cpu_notifier(struct notifier_block *self,
1698 unsigned long action, void *hcpu)
1699{
1700 unsigned int cpu = (long)hcpu;
1701
1702 /* allocate/free data structure for uncore box */
1703 switch (action & ~CPU_TASKS_FROZEN) {
1704 case CPU_UP_PREPARE:
1705 uncore_cpu_prepare(cpu, -1);
1706 break;
1707 case CPU_STARTING:
1708 uncore_cpu_starting(cpu);
1709 break;
1710 case CPU_UP_CANCELED:
1711 case CPU_DYING:
1712 uncore_cpu_dying(cpu);
1713 break;
1714 default:
1715 break;
1716 }
1717
1718 /* select the cpu that collects uncore events */
1719 switch (action & ~CPU_TASKS_FROZEN) {
1720 case CPU_DOWN_FAILED:
1721 case CPU_STARTING:
1722 uncore_event_init_cpu(cpu);
1723 break;
1724 case CPU_DOWN_PREPARE:
1725 uncore_event_exit_cpu(cpu);
1726 break;
1727 default:
1728 break;
1729 }
1730
1731 return NOTIFY_OK;
1732}
1733
1734static struct notifier_block uncore_cpu_nb __cpuinitdata = {
1735 .notifier_call = uncore_cpu_notifier,
1736 /*
1737 * to migrate uncore events, our notifier should be executed
1738 * before perf core's notifier.
1739 */
1740 .priority = CPU_PRI_PERF + 1,
1741};
1742
1743static void __init uncore_cpu_setup(void *dummy)
1744{
1745 uncore_cpu_starting(smp_processor_id());
1746}
1747
1748static int __init uncore_cpu_init(void)
1749{
1750 int ret, cpu, max_cores;
1751
1752 max_cores = boot_cpu_data.x86_max_cores;
1753 switch (boot_cpu_data.x86_model) {
1754 case 26: /* Nehalem */
1755 case 30:
1756 case 37: /* Westmere */
1757 case 44:
1758 msr_uncores = nhm_msr_uncores;
1759 break;
1760 case 42: /* Sandy Bridge */
1761 if (snb_uncore_cbox.num_boxes > max_cores)
1762 snb_uncore_cbox.num_boxes = max_cores;
1763 msr_uncores = snb_msr_uncores;
1764 break;
1765 case 45: /* Sandy Birdge-EP */
1766 if (snbep_uncore_cbox.num_boxes > max_cores)
1767 snbep_uncore_cbox.num_boxes = max_cores;
1768 msr_uncores = snbep_msr_uncores;
1769 break;
1770 default:
1771 return 0;
1772 }
1773
1774 ret = uncore_types_init(msr_uncores);
1775 if (ret)
1776 return ret;
1777
1778 get_online_cpus();
1779
1780 for_each_online_cpu(cpu) {
1781 int i, phys_id = topology_physical_package_id(cpu);
1782
1783 for_each_cpu(i, &uncore_cpu_mask) {
1784 if (phys_id == topology_physical_package_id(i)) {
1785 phys_id = -1;
1786 break;
1787 }
1788 }
1789 if (phys_id < 0)
1790 continue;
1791
1792 uncore_cpu_prepare(cpu, phys_id);
1793 uncore_event_init_cpu(cpu);
1794 }
1795 on_each_cpu(uncore_cpu_setup, NULL, 1);
1796
1797 register_cpu_notifier(&uncore_cpu_nb);
1798
1799 put_online_cpus();
1800
1801 return 0;
1802}
1803
1804static int __init uncore_pmus_register(void)
1805{
1806 struct intel_uncore_pmu *pmu;
1807 struct intel_uncore_type *type;
1808 int i, j;
1809
1810 for (i = 0; msr_uncores[i]; i++) {
1811 type = msr_uncores[i];
1812 for (j = 0; j < type->num_boxes; j++) {
1813 pmu = &type->pmus[j];
1814 uncore_pmu_register(pmu);
1815 }
1816 }
1817
1818 for (i = 0; pci_uncores[i]; i++) {
1819 type = pci_uncores[i];
1820 for (j = 0; j < type->num_boxes; j++) {
1821 pmu = &type->pmus[j];
1822 uncore_pmu_register(pmu);
1823 }
1824 }
1825
1826 return 0;
1827}
1828
1829static int __init intel_uncore_init(void)
1830{
1831 int ret;
1832
1833 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
1834 return -ENODEV;
1835
1836 ret = uncore_pci_init();
1837 if (ret)
1838 goto fail;
1839 ret = uncore_cpu_init();
1840 if (ret) {
1841 uncore_pci_exit();
1842 goto fail;
1843 }
1844
1845 uncore_pmus_register();
1846 return 0;
1847fail:
1848 return ret;
1849}
1850device_initcall(intel_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
new file mode 100644
index 000000000000..b13e9ea81def
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -0,0 +1,424 @@
1#include <linux/module.h>
2#include <linux/slab.h>
3#include <linux/pci.h>
4#include <linux/perf_event.h>
5#include "perf_event.h"
6
/* Size of the intel_uncore_pmu.name buffer. */
7#define UNCORE_PMU_NAME_LEN 32
8#define UNCORE_BOX_HASH_SIZE 8
9
/*
 * Interval expressed in nanoseconds (60 * NSEC_PER_SEC). The LL suffix forces
 * 64-bit arithmetic: NSEC_PER_SEC is a long constant, so a plain
 * 60 * NSEC_PER_SEC overflows wherever long is 32 bits wide.
 */
10#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
11
12#define UNCORE_FIXED_EVENT 0xff
/*
 * Counter slot numbering: generic counters occupy 0..MAX_GENERIC-1, the fixed
 * counter takes the next slot, and UNCORE_PMC_IDX_MAX sizes the per-box
 * arrays in struct intel_uncore_box.
 */
13#define UNCORE_PMC_IDX_MAX_GENERIC 8
14#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
15#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1)
16
/* EVENT_CONSTRAINT() with the third argument fixed to 0xff. */
17#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
18
19/* SNB event control (the NHM-specific bits are defined alongside) */
20#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
21#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
22#define SNB_UNC_CTL_EDGE_DET (1 << 18)
23#define SNB_UNC_CTL_EN (1 << 22)
24#define SNB_UNC_CTL_INVERT (1 << 23)
25#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
26#define NHM_UNC_CTL_CMASK_MASK 0xff000000
27#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
28
/*
 * Event-control bits collected for SNB: event select, umask, edge detect,
 * invert and the 5-bit cmask field. SNB_UNC_CTL_EN is not part of the mask.
 */
29#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
30 SNB_UNC_CTL_UMASK_MASK | \
31 SNB_UNC_CTL_EDGE_DET | \
32 SNB_UNC_CTL_INVERT | \
33 SNB_UNC_CTL_CMASK_MASK)
34
/* Same as the SNB mask except that the cmask field is 8 bits wide. */
35#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
36 SNB_UNC_CTL_UMASK_MASK | \
37 SNB_UNC_CTL_EDGE_DET | \
38 SNB_UNC_CTL_INVERT | \
39 NHM_UNC_CTL_CMASK_MASK)
40
41/* SNB global control register */
42#define SNB_UNC_PERF_GLOBAL_CTL 0x391
43#define SNB_UNC_FIXED_CTR_CTRL 0x394
44#define SNB_UNC_FIXED_CTR 0x395
45
46/* SNB uncore global control */
47#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
48#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
49
50/* SNB Cbo register */
51#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
52#define SNB_UNC_CBO_0_PER_CTR0 0x706
53#define SNB_UNC_CBO_MSR_OFFSET 0x10
54
55/* NHM global control register */
/*
 * NOTE(review): the NHM fixed counter (0x394) and its control (0x395) are the
 * reverse of the SNB assignment above (control at 0x394, counter at 0x395) --
 * confirm against the hardware documentation.
 */
56#define NHM_UNC_PERF_GLOBAL_CTL 0x391
57#define NHM_UNC_FIXED_CTR 0x394
58#define NHM_UNC_FIXED_CTR_CTRL 0x395
59
60/* NHM uncore global control (64-bit values, hence the ULL constants) */
61#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
62#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
63
64/* NHM uncore register */
65#define NHM_UNC_PERFEVTSEL0 0x3c0
66#define NHM_UNC_UNCORE_PMC0 0x3b0
67
68/* SNB-EP Box level control */
69#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
70#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
71#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8)
72#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16)
/* Combination of both reset bits plus freeze-enable (FRZ itself is not set). */
73#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
74 SNBEP_PMON_BOX_CTL_RST_CTRS | \
75 SNBEP_PMON_BOX_CTL_FRZ_EN)
76/* SNB-EP event control */
77#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff
78#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00
79#define SNBEP_PMON_CTL_RST (1 << 17)
80#define SNBEP_PMON_CTL_EDGE_DET (1 << 18)
81#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) /* only for QPI */
82#define SNBEP_PMON_CTL_EN (1 << 22)
83#define SNBEP_PMON_CTL_INVERT (1 << 23)
84#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000
/*
 * Generic SNB-EP event mask: event select, umask, edge detect, invert and the
 * 8-bit threshold. The RST, EV_SEL_EXT and EN bits are not included.
 */
85#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
86 SNBEP_PMON_CTL_UMASK_MASK | \
87 SNBEP_PMON_CTL_EDGE_DET | \
88 SNBEP_PMON_CTL_INVERT | \
89 SNBEP_PMON_CTL_TRESH_MASK)
90
91/* SNB-EP Ubox event control: like the generic mask but with a 5-bit threshold */
92#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000
93#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \
94 (SNBEP_PMON_CTL_EV_SEL_MASK | \
95 SNBEP_PMON_CTL_UMASK_MASK | \
96 SNBEP_PMON_CTL_EDGE_DET | \
97 SNBEP_PMON_CTL_INVERT | \
98 SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
99
/* SNB-EP Cbo event control: the generic mask plus the TID enable bit */
100#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19)
101#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
102 SNBEP_CBO_PMON_CTL_TID_EN)
103
104/* SNB-EP PCU event control */
105#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000
106#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000
107#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30)
/* Bit 31 needs an unsigned constant: 1 << 31 overflows a 32-bit signed int. */
108#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1U << 31)
/*
 * PCU event mask: event select, occupancy select, edge detect, invert, 5-bit
 * threshold and the two occupancy modifier bits. The generic umask field is
 * not part of it.
 */
109#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
110 (SNBEP_PMON_CTL_EV_SEL_MASK | \
111 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
112 SNBEP_PMON_CTL_EDGE_DET | \
113 SNBEP_PMON_CTL_INVERT | \
114 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
115 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
116 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
117
118/* SNB-EP pci control register */
119#define SNBEP_PCI_PMON_BOX_CTL 0xf4
120#define SNBEP_PCI_PMON_CTL0 0xd8
121/* SNB-EP pci counter register */
122#define SNBEP_PCI_PMON_CTR0 0xa0
123
124/* SNB-EP home agent register */
125#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40
126#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44
127#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48
128/* SNB-EP memory controller register */
129#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0
130#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0
131/* SNB-EP QPI register */
132#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228
133#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c
134#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238
135#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c
136
137/* SNB-EP Ubox register */
138#define SNBEP_U_MSR_PMON_CTR0 0xc16
139#define SNBEP_U_MSR_PMON_CTL0 0xc10
140
141#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08
142#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09
143
144/* SNB-EP Cbo register (values for Cbo 0) */
145#define SNBEP_C0_MSR_PMON_CTR0 0xd16
146#define SNBEP_C0_MSR_PMON_CTL0 0xd10
147#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04
148#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14
/* NOTE(review): spelled "CB0" (digit zero), unlike the "CBO" names nearby. */
149#define SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK 0xfffffc1f
/* presumably the MSR distance from one Cbo's registers to the next -- TODO confirm */
150#define SNBEP_CBO_MSR_OFFSET 0x20
151
152/* SNB-EP PCU register */
153#define SNBEP_PCU_MSR_PMON_CTR0 0xc36
154#define SNBEP_PCU_MSR_PMON_CTL0 0xc30
155#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24
156#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34
157#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff
158#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc
159#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd
160
161struct intel_uncore_ops;
162struct intel_uncore_pmu;
163struct intel_uncore_box;
164struct uncore_event_desc;
165
/*
 * Description of one kind of uncore unit: counter geometry, register
 * locations and the callbacks shared by all of its boxes. The inline
 * accessors further down in this header read these fields through
 * box->pmu->type.
 */
166struct intel_uncore_type {
167 const char *name;
168 int num_counters; /* returned by uncore_num_counters() */
169 int num_boxes; /* number of entries in pmus[] */
170 int perf_ctr_bits; /* returned by uncore_perf_ctr_bits() */
171 int fixed_ctr_bits; /* returned by uncore_fixed_ctr_bits() */
172 unsigned perf_ctr; /* location of counter 0 */
173 unsigned event_ctl; /* location of event control 0 */
174 unsigned event_mask;
175 unsigned fixed_ctr; /* location of the fixed counter */
176 unsigned fixed_ctl; /* fixed counter control; 0 is treated as "none" by the MSR accessor */
177 unsigned box_ctl; /* box control; 0 is treated as "none" by the MSR accessor */
178 unsigned msr_offset; /* MSR stride between boxes, multiplied by pmu_idx */
179 unsigned num_shared_regs:8; /* presumably the length of intel_uncore_box.shared_regs[] -- TODO confirm */
180 unsigned single_fixed:1;
181 struct event_constraint unconstrainted;
182 struct event_constraint *constraints;
183 struct intel_uncore_pmu *pmus; /* array of num_boxes PMUs */
184 struct intel_uncore_ops *ops; /* hardware callbacks, see struct intel_uncore_ops */
185 struct uncore_event_desc *event_descs;
186 const struct attribute_group *attr_groups[3];
187};
188
/*
 * Alias for the first attribute-group slot. This is an object-like macro, so
 * every later occurrence of the token format_group is rewritten.
 */
189#define format_group attr_groups[0]
190
/*
 * Per-type hardware callbacks.
 *
 * The wrappers in this header treat init_box, disable_box and enable_box as
 * optional (they are skipped when NULL), while disable_event, enable_event
 * and read_counter are called unconditionally and therefore must be set.
 */
191struct intel_uncore_ops {
192 void (*init_box)(struct intel_uncore_box *);
193 void (*disable_box)(struct intel_uncore_box *);
194 void (*enable_box)(struct intel_uncore_box *);
195 void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
196 void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
197 u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
198 int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
199 struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
200 struct perf_event *);
201 void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
202};
203
/*
 * One registered PMU instance of an uncore type (one entry of
 * intel_uncore_type.pmus[]).
 */
204struct intel_uncore_pmu {
205 struct pmu pmu;
206 char name[UNCORE_PMU_NAME_LEN];
207 int pmu_idx; /* scales type->msr_offset in the MSR accessors */
208 int func_id;
209 struct intel_uncore_type *type; /* owning type description */
/*
 * Per-cpu slot holding a box pointer. The annotation sits between the two
 * stars so that it qualifies the pointed-to per-cpu slot rather than the
 * member itself; __percpu only matters to sparse, so generated code is
 * unchanged.
 */
210 struct intel_uncore_box * __percpu *box;
211 struct list_head box_list;
212};
213
/*
 * Element type of the trailing shared_regs array in struct intel_uncore_box.
 * NOTE(review): presumably tracks one extra register shared between events
 * (value in config1, users counted in ref, both guarded by lock) -- confirm
 * against the users in the .c file.
 */
214struct intel_uncore_extra_reg {
215 raw_spinlock_t lock;
216 u64 config1;
217 atomic_t ref;
218};
219
/*
 * Run-time state of one uncore box. The per-counter arrays are sized by
 * UNCORE_PMC_IDX_MAX, and the structure ends in a variable-length array of
 * shared registers.
 */
220struct intel_uncore_box {
221 int phys_id;
222 int n_active; /* number of active events */
223 int n_events;
224 int cpu; /* cpu to collect events */
225 unsigned long flags; /* bit numbers: UNCORE_BOX_FLAG_* */
226 atomic_t refcnt;
227 struct perf_event *events[UNCORE_PMC_IDX_MAX];
228 struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
229 unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
230 u64 tags[UNCORE_PMC_IDX_MAX];
231 struct pci_dev *pci_dev; /* non-NULL selects the PCI accessors, NULL the MSR ones */
232 struct intel_uncore_pmu *pmu; /* used to reach pmu->type and pmu->pmu_idx */
233 struct hrtimer hrtimer;
234 struct list_head list;
/* C99 flexible array member (was a zero-length array); must stay last. */
235 struct intel_uncore_extra_reg shared_regs[];
236};
237
/* Bit number in intel_uncore_box.flags, set once by uncore_box_init(). */
238#define UNCORE_BOX_FLAG_INITIATED 0
239
/*
 * A named event exposed as a read-only attribute; config is the string that
 * uncore_event_show() prints for it.
 */
240struct uncore_event_desc {
241 struct kobj_attribute attr;
242 const char *config;
243};
244
/*
 * Initializer for one struct uncore_event_desc: a 0444 attribute called
 * _name whose show routine is uncore_event_show() and which has no store
 * routine.
 */
245#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
246{ \
247 .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
248 .config = _config, \
249}
250
/*
 * Define a read-only (0444) kobj_attribute named format_attr_<_var>, shown
 * under the name _name, whose show routine writes the literal _format
 * followed by a newline into the page buffer.
 *
 * _format must be a string literal: it is concatenated with "\n" and used as
 * the sprintf() format itself, so a '%' inside it would be interpreted as a
 * conversion. The BUILD_BUG_ON rejects literals of PAGE_SIZE bytes or more at
 * compile time.
 */
251#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
252static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
253 struct kobj_attribute *attr, \
254 char *page) \
255{ \
256 BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
257 return sprintf(page, _format "\n"); \
258} \
259static struct kobj_attribute format_attr_##_var = \
260 __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
261
262
/*
 * Show routine used by INTEL_UNCORE_EVENT_DESC(): recovers the enclosing
 * struct uncore_event_desc from @attr and copies its config string into @buf
 * without appending a newline. @kobj is unused. Returns what sprintf()
 * returns, i.e. the number of characters written.
 *
 * NOTE(review): this is a plain static (not inline) function defined in a
 * header, so every file that includes the header gets its own copy.
 */
263static ssize_t uncore_event_show(struct kobject *kobj,
264 struct kobj_attribute *attr, char *buf)
265{
266 struct uncore_event_desc *event =
267 container_of(attr, struct uncore_event_desc, attr);
268 return sprintf(buf, "%s", event->config);
269}
270
/* PCI variant: the type's box_ctl value is returned as is, with no per-box offset. */
271static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
272{
273 return box->pmu->type->box_ctl;
274}
275
/* PCI variant: the type's fixed_ctl value is returned as is, with no per-box offset. */
276static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
277{
278 return box->pmu->type->fixed_ctl;
279}
280
/* PCI variant: the type's fixed_ctr value is returned as is, with no per-box offset. */
281static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
282{
283 return box->pmu->type->fixed_ctr;
284}
285
/*
 * PCI variant: location of event control @idx, i.e. the type's event_ctl
 * plus 4 per index (presumably byte offsets of 32-bit registers -- confirm).
 */
286static inline
287unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
288{
289 return idx * 4 + box->pmu->type->event_ctl;
290}
291
/*
 * PCI variant: location of counter @idx, i.e. the type's perf_ctr plus 8 per
 * index (presumably byte offsets of 64-bit registers -- confirm).
 */
292static inline
293unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
294{
295 return idx * 8 + box->pmu->type->perf_ctr;
296}
297
/*
 * MSR variant: box control register of this box, or 0 when the type defines
 * no box_ctl. The per-box address is box_ctl + msr_offset * pmu_idx.
 */
298static inline
299unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
300{
301 if (!box->pmu->type->box_ctl)
302 return 0;
303 return box->pmu->type->box_ctl +
304 box->pmu->type->msr_offset * box->pmu->pmu_idx;
305}
306
/*
 * MSR variant: fixed counter control register of this box, or 0 when the
 * type defines no fixed_ctl. The per-box address is
 * fixed_ctl + msr_offset * pmu_idx.
 */
307static inline
308unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
309{
310 if (!box->pmu->type->fixed_ctl)
311 return 0;
312 return box->pmu->type->fixed_ctl +
313 box->pmu->type->msr_offset * box->pmu->pmu_idx;
314}
315
/*
 * MSR variant: fixed counter register of this box,
 * fixed_ctr + msr_offset * pmu_idx.
 *
 * NOTE(review): unlike uncore_msr_box_ctl() and uncore_msr_fixed_ctl() there
 * is no zero check here, so a type with fixed_ctr == 0 yields
 * msr_offset * pmu_idx rather than 0.
 */
316static inline
317unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
318{
319 return box->pmu->type->fixed_ctr +
320 box->pmu->type->msr_offset * box->pmu->pmu_idx;
321}
322
/*
 * MSR variant: event control register @idx of this box; consecutive indexes
 * are one MSR apart and boxes are msr_offset apart.
 */
323static inline
324unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
325{
326 return idx + box->pmu->type->event_ctl +
327 box->pmu->type->msr_offset * box->pmu->pmu_idx;
328}
329
/*
 * MSR variant: counter register @idx of this box; consecutive indexes are
 * one MSR apart and boxes are msr_offset apart.
 */
330static inline
331unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
332{
333 return idx + box->pmu->type->perf_ctr +
334 box->pmu->type->msr_offset * box->pmu->pmu_idx;
335}
336
/* Fixed counter control location: PCI variant when box->pci_dev is set, MSR variant otherwise. */
337static inline
338unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
339{
340 if (box->pci_dev)
341 return uncore_pci_fixed_ctl(box);
342 else
343 return uncore_msr_fixed_ctl(box);
344}
345
/* Fixed counter location: PCI variant when box->pci_dev is set, MSR variant otherwise. */
346static inline
347unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
348{
349 if (box->pci_dev)
350 return uncore_pci_fixed_ctr(box);
351 else
352 return uncore_msr_fixed_ctr(box);
353}
354
/* Event control @idx location: PCI variant when box->pci_dev is set, MSR variant otherwise. */
355static inline
356unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
357{
358 if (box->pci_dev)
359 return uncore_pci_event_ctl(box, idx);
360 else
361 return uncore_msr_event_ctl(box, idx);
362}
363
/* Counter @idx location: PCI variant when box->pci_dev is set, MSR variant otherwise. */
364static inline
365unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
366{
367 if (box->pci_dev)
368 return uncore_pci_perf_ctr(box, idx);
369 else
370 return uncore_msr_perf_ctr(box, idx);
371}
372
/* Returns the perf_ctr_bits value of the box's type. */
373static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
374{
375 return box->pmu->type->perf_ctr_bits;
376}
377
/* Returns the fixed_ctr_bits value of the box's type. */
378static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
379{
380 return box->pmu->type->fixed_ctr_bits;
381}
382
/* Returns the num_counters value of the box's type. */
383static inline int uncore_num_counters(struct intel_uncore_box *box)
384{
385 return box->pmu->type->num_counters;
386}
387
/* Invoke the type's disable_box callback; a NULL callback makes this a no-op. */
388static inline void uncore_disable_box(struct intel_uncore_box *box)
389{
390 if (box->pmu->type->ops->disable_box)
391 box->pmu->type->ops->disable_box(box);
392}
393
/* Invoke the type's enable_box callback; a NULL callback makes this a no-op. */
394static inline void uncore_enable_box(struct intel_uncore_box *box)
395{
396 if (box->pmu->type->ops->enable_box)
397 box->pmu->type->ops->enable_box(box);
398}
399
/* Invoke the type's disable_event callback, which is called unconditionally and so must be non-NULL. */
400static inline void uncore_disable_event(struct intel_uncore_box *box,
401 struct perf_event *event)
402{
403 box->pmu->type->ops->disable_event(box, event);
404}
405
/* Invoke the type's enable_event callback, which is called unconditionally and so must be non-NULL. */
406static inline void uncore_enable_event(struct intel_uncore_box *box,
407 struct perf_event *event)
408{
409 box->pmu->type->ops->enable_event(box, event);
410}
411
/*
 * Returns the value produced by the type's read_counter callback for @event;
 * the callback is called unconditionally and so must be non-NULL.
 */
412static inline u64 uncore_read_counter(struct intel_uncore_box *box,
413 struct perf_event *event)
414{
415 return box->pmu->type->ops->read_counter(box, event);
416}
417
/*
 * Run the type's optional init_box callback at most once per box: only the
 * caller that flips UNCORE_BOX_FLAG_INITIATED from clear to set in
 * box->flags goes on to call it.
 */
418static inline void uncore_box_init(struct intel_uncore_box *box)
419{
420 if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
421 if (box->pmu->type->ops->init_box)
422 box->pmu->type->ops->init_box(box);
423 }
424}
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 47124a73dd73..92c7e39a079f 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -895,8 +895,8 @@ static void p4_pmu_disable_pebs(void)
895 * So at moment let leave metrics turned on forever -- it's 895 * So at moment let leave metrics turned on forever -- it's
896 * ok for now but need to be revisited! 896 * ok for now but need to be revisited!
897 * 897 *
898 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); 898 * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)0);
899 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); 899 * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
900 */ 900 */
901} 901}
902 902
@@ -909,7 +909,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
909 * state we need to clear P4_CCCR_OVF, otherwise interrupt get 909 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
910 * asserted again and again 910 * asserted again and again
911 */ 911 */
912 (void)checking_wrmsrl(hwc->config_base, 912 (void)wrmsrl_safe(hwc->config_base,
913 (u64)(p4_config_unpack_cccr(hwc->config)) & 913 (u64)(p4_config_unpack_cccr(hwc->config)) &
914 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); 914 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
915} 915}
@@ -943,8 +943,8 @@ static void p4_pmu_enable_pebs(u64 config)
943 943
944 bind = &p4_pebs_bind_map[idx]; 944 bind = &p4_pebs_bind_map[idx];
945 945
946 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); 946 (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
947 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); 947 (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
948} 948}
949 949
950static void p4_pmu_enable_event(struct perf_event *event) 950static void p4_pmu_enable_event(struct perf_event *event)
@@ -978,8 +978,8 @@ static void p4_pmu_enable_event(struct perf_event *event)
978 */ 978 */
979 p4_pmu_enable_pebs(hwc->config); 979 p4_pmu_enable_pebs(hwc->config);
980 980
981 (void)checking_wrmsrl(escr_addr, escr_conf); 981 (void)wrmsrl_safe(escr_addr, escr_conf);
982 (void)checking_wrmsrl(hwc->config_base, 982 (void)wrmsrl_safe(hwc->config_base,
983 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); 983 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
984} 984}
985 985
@@ -1325,7 +1325,7 @@ __init int p4_pmu_init(void)
1325 unsigned int low, high; 1325 unsigned int low, high;
1326 1326
1327 /* If we get stripped -- indexing fails */ 1327 /* If we get stripped -- indexing fails */
1328 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); 1328 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
1329 1329
1330 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 1330 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
1331 if (!(low & (1 << 7))) { 1331 if (!(low & (1 << 7))) {
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 32bcfc7dd230..e4dd0f7a0453 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -71,7 +71,7 @@ p6_pmu_disable_event(struct perf_event *event)
71 if (cpuc->enabled) 71 if (cpuc->enabled)
72 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 72 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
73 73
74 (void)checking_wrmsrl(hwc->config_base, val); 74 (void)wrmsrl_safe(hwc->config_base, val);
75} 75}
76 76
77static void p6_pmu_enable_event(struct perf_event *event) 77static void p6_pmu_enable_event(struct perf_event *event)
@@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
84 if (cpuc->enabled) 84 if (cpuc->enabled)
85 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 85 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
86 86
87 (void)checking_wrmsrl(hwc->config_base, val); 87 (void)wrmsrl_safe(hwc->config_base, val);
88} 88}
89 89
90PMU_FORMAT_ATTR(event, "config:0-7" ); 90PMU_FORMAT_ATTR(event, "config:0-7" );
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index addf9e82a7f2..ee8e9abc859f 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,7 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
31 const struct cpuid_bit *cb; 31 const struct cpuid_bit *cb;
32 32
33 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { 33 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
34 { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 }, 34 { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 },
35 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, 35 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
36 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, 36 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
37 { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, 37 { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
deleted file mode 100644
index a640ae5ad201..000000000000
--- a/arch/x86/kernel/cpu/sched.c
+++ /dev/null
@@ -1,55 +0,0 @@
1#include <linux/sched.h>
2#include <linux/math64.h>
3#include <linux/percpu.h>
4#include <linux/irqflags.h>
5
6#include <asm/cpufeature.h>
7#include <asm/processor.h>
8
9#ifdef CONFIG_SMP
10
11static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
12
13static unsigned long scale_aperfmperf(void)
14{
15 struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
16 unsigned long ratio, flags;
17
18 local_irq_save(flags);
19 get_aperfmperf(&val);
20 local_irq_restore(flags);
21
22 ratio = calc_aperfmperf_ratio(old, &val);
23 *old = val;
24
25 return ratio;
26}
27
28unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
29{
30 /*
31 * do aperf/mperf on the cpu level because it includes things
32 * like turbo mode, which are relevant to full cores.
33 */
34 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
35 return scale_aperfmperf();
36
37 /*
38 * maybe have something cpufreq here
39 */
40
41 return default_scale_freq_power(sd, cpu);
42}
43
44unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
45{
46 /*
47 * aperf/mperf already includes the smt gain
48 */
49 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
50 return SCHED_LOAD_SCALE;
51
52 return default_scale_smt_power(sd, cpu);
53}
54
55#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 571246d81edf..ae42418bc50f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,8 +27,8 @@ static int die_counter;
27 27
28void printk_address(unsigned long address, int reliable) 28void printk_address(unsigned long address, int reliable)
29{ 29{
30 printk(" [<%p>] %s%pB\n", (void *) address, 30 pr_cont(" [<%p>] %s%pB\n",
31 reliable ? "" : "? ", (void *) address); 31 (void *)address, reliable ? "" : "? ", (void *)address);
32} 32}
33 33
34#ifdef CONFIG_FUNCTION_GRAPH_TRACER 34#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -271,6 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
271 current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) 271 current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
272 return 1; 272 return 1;
273 273
274 print_modules();
274 show_regs(regs); 275 show_regs(regs);
275#ifdef CONFIG_X86_32 276#ifdef CONFIG_X86_32
276 if (user_mode_vm(regs)) { 277 if (user_mode_vm(regs)) {
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index e0b1d783daab..1038a417ea53 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -73,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
73 if (kstack_end(stack)) 73 if (kstack_end(stack))
74 break; 74 break;
75 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 75 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
76 printk(KERN_CONT "\n"); 76 pr_cont("\n");
77 printk(KERN_CONT " %08lx", *stack++); 77 pr_cont(" %08lx", *stack++);
78 touch_nmi_watchdog(); 78 touch_nmi_watchdog();
79 } 79 }
80 printk(KERN_CONT "\n"); 80 pr_cont("\n");
81 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 81 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
82} 82}
83 83
@@ -86,12 +86,11 @@ void show_regs(struct pt_regs *regs)
86{ 86{
87 int i; 87 int i;
88 88
89 print_modules();
90 __show_regs(regs, !user_mode_vm(regs)); 89 __show_regs(regs, !user_mode_vm(regs));
91 90
92 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", 91 pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
93 TASK_COMM_LEN, current->comm, task_pid_nr(current), 92 TASK_COMM_LEN, current->comm, task_pid_nr(current),
94 current_thread_info(), current, task_thread_info(current)); 93 current_thread_info(), current, task_thread_info(current));
95 /* 94 /*
96 * When in-kernel, we also print out the stack and code at the 95 * When in-kernel, we also print out the stack and code at the
97 * time of the fault.. 96 * time of the fault..
@@ -102,10 +101,10 @@ void show_regs(struct pt_regs *regs)
102 unsigned char c; 101 unsigned char c;
103 u8 *ip; 102 u8 *ip;
104 103
105 printk(KERN_EMERG "Stack:\n"); 104 pr_emerg("Stack:\n");
106 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG); 105 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
107 106
108 printk(KERN_EMERG "Code: "); 107 pr_emerg("Code:");
109 108
110 ip = (u8 *)regs->ip - code_prologue; 109 ip = (u8 *)regs->ip - code_prologue;
111 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { 110 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
@@ -116,16 +115,16 @@ void show_regs(struct pt_regs *regs)
116 for (i = 0; i < code_len; i++, ip++) { 115 for (i = 0; i < code_len; i++, ip++) {
117 if (ip < (u8 *)PAGE_OFFSET || 116 if (ip < (u8 *)PAGE_OFFSET ||
118 probe_kernel_address(ip, c)) { 117 probe_kernel_address(ip, c)) {
119 printk(KERN_CONT " Bad EIP value."); 118 pr_cont(" Bad EIP value.");
120 break; 119 break;
121 } 120 }
122 if (ip == (u8 *)regs->ip) 121 if (ip == (u8 *)regs->ip)
123 printk(KERN_CONT "<%02x> ", c); 122 pr_cont(" <%02x>", c);
124 else 123 else
125 printk(KERN_CONT "%02x ", c); 124 pr_cont(" %02x", c);
126 } 125 }
127 } 126 }
128 printk(KERN_CONT "\n"); 127 pr_cont("\n");
129} 128}
130 129
131int is_valid_bugaddr(unsigned long ip) 130int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 791b76122aa8..b653675d5288 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -228,20 +228,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
228 if (stack >= irq_stack && stack <= irq_stack_end) { 228 if (stack >= irq_stack && stack <= irq_stack_end) {
229 if (stack == irq_stack_end) { 229 if (stack == irq_stack_end) {
230 stack = (unsigned long *) (irq_stack_end[-1]); 230 stack = (unsigned long *) (irq_stack_end[-1]);
231 printk(KERN_CONT " <EOI> "); 231 pr_cont(" <EOI> ");
232 } 232 }
233 } else { 233 } else {
234 if (((long) stack & (THREAD_SIZE-1)) == 0) 234 if (((long) stack & (THREAD_SIZE-1)) == 0)
235 break; 235 break;
236 } 236 }
237 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 237 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
238 printk(KERN_CONT "\n"); 238 pr_cont("\n");
239 printk(KERN_CONT " %016lx", *stack++); 239 pr_cont(" %016lx", *stack++);
240 touch_nmi_watchdog(); 240 touch_nmi_watchdog();
241 } 241 }
242 preempt_enable(); 242 preempt_enable();
243 243
244 printk(KERN_CONT "\n"); 244 pr_cont("\n");
245 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 245 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
246} 246}
247 247
@@ -254,10 +254,9 @@ void show_regs(struct pt_regs *regs)
254 254
255 sp = regs->sp; 255 sp = regs->sp;
256 printk("CPU %d ", cpu); 256 printk("CPU %d ", cpu);
257 print_modules();
258 __show_regs(regs, 1); 257 __show_regs(regs, 1);
259 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 258 printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n",
260 cur->comm, cur->pid, task_thread_info(cur), cur); 259 cur->comm, cur->pid, task_thread_info(cur), cur);
261 260
262 /* 261 /*
263 * When in-kernel, we also print out the stack and code at the 262 * When in-kernel, we also print out the stack and code at the
@@ -284,16 +283,16 @@ void show_regs(struct pt_regs *regs)
284 for (i = 0; i < code_len; i++, ip++) { 283 for (i = 0; i < code_len; i++, ip++) {
285 if (ip < (u8 *)PAGE_OFFSET || 284 if (ip < (u8 *)PAGE_OFFSET ||
286 probe_kernel_address(ip, c)) { 285 probe_kernel_address(ip, c)) {
287 printk(KERN_CONT " Bad RIP value."); 286 pr_cont(" Bad RIP value.");
288 break; 287 break;
289 } 288 }
290 if (ip == (u8 *)regs->ip) 289 if (ip == (u8 *)regs->ip)
291 printk(KERN_CONT "<%02x> ", c); 290 pr_cont("<%02x> ", c);
292 else 291 else
293 printk(KERN_CONT "%02x ", c); 292 pr_cont("%02x ", c);
294 } 293 }
295 } 294 }
296 printk(KERN_CONT "\n"); 295 pr_cont("\n");
297} 296}
298 297
299int is_valid_bugaddr(unsigned long ip) 298int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 7d65133b51be..69babd8c834f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1048,24 +1048,6 @@ apicinterrupt LOCAL_TIMER_VECTOR \
1048apicinterrupt X86_PLATFORM_IPI_VECTOR \ 1048apicinterrupt X86_PLATFORM_IPI_VECTOR \
1049 x86_platform_ipi smp_x86_platform_ipi 1049 x86_platform_ipi smp_x86_platform_ipi
1050 1050
1051#ifdef CONFIG_SMP
1052 ALIGN
1053 INTR_FRAME
1054.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
1055 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
1056.if NUM_INVALIDATE_TLB_VECTORS > \idx
1057ENTRY(invalidate_interrupt\idx)
1058 pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
1059 jmp .Lcommon_invalidate_interrupt0
1060 CFI_ADJUST_CFA_OFFSET -8
1061END(invalidate_interrupt\idx)
1062.endif
1063.endr
1064 CFI_ENDPROC
1065apicinterrupt INVALIDATE_TLB_VECTOR_START, \
1066 invalidate_interrupt0, smp_invalidate_interrupt
1067#endif
1068
1069apicinterrupt THRESHOLD_APIC_VECTOR \ 1051apicinterrupt THRESHOLD_APIC_VECTOR \
1070 threshold_interrupt smp_threshold_interrupt 1052 threshold_interrupt smp_threshold_interrupt
1071apicinterrupt THERMAL_APIC_VECTOR \ 1053apicinterrupt THERMAL_APIC_VECTOR \
@@ -1758,10 +1740,30 @@ end_repeat_nmi:
1758 */ 1740 */
1759 call save_paranoid 1741 call save_paranoid
1760 DEFAULT_FRAME 0 1742 DEFAULT_FRAME 0
1743
1744 /*
1745 * Save off the CR2 register. If we take a page fault in the NMI then
1746 * it could corrupt the CR2 value. If the NMI preempts a page fault
1747 * handler before it was able to read the CR2 register, and then the
1748 * NMI itself takes a page fault, the page fault that was preempted
1749 * will read the information from the NMI page fault and not the
1750 * origin fault. Save it off and restore it if it changes.
1751 * Use the r12 callee-saved register.
1752 */
1753 movq %cr2, %r12
1754
1761 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1755 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1762 movq %rsp,%rdi 1756 movq %rsp,%rdi
1763 movq $-1,%rsi 1757 movq $-1,%rsi
1764 call do_nmi 1758 call do_nmi
1759
1760 /* Did the NMI take a page fault? Restore cr2 if it did */
1761 movq %cr2, %rcx
1762 cmpq %rcx, %r12
1763 je 1f
1764 movq %r12, %cr2
17651:
1766
1765 testl %ebx,%ebx /* swapgs needed? */ 1767 testl %ebx,%ebx /* swapgs needed? */
1766 jnz nmi_restore 1768 jnz nmi_restore
1767nmi_swapgs: 1769nmi_swapgs:
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3dafc6003b7c..1f5f1d5d2a02 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -294,9 +294,9 @@ void fixup_irqs(void)
294 raw_spin_unlock(&desc->lock); 294 raw_spin_unlock(&desc->lock);
295 295
296 if (break_affinity && set_affinity) 296 if (break_affinity && set_affinity)
297 printk("Broke affinity for irq %i\n", irq); 297 pr_notice("Broke affinity for irq %i\n", irq);
298 else if (!set_affinity) 298 else if (!set_affinity)
299 printk("Cannot set affinity for irq %i\n", irq); 299 pr_notice("Cannot set affinity for irq %i\n", irq);
300 } 300 }
301 301
302 /* 302 /*
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 252981afd6c4..6e03b0d69138 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -171,79 +171,6 @@ static void __init smp_intr_init(void)
171 */ 171 */
172 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 172 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
173 173
174 /* IPIs for invalidation */
175#define ALLOC_INVTLB_VEC(NR) \
176 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
177 invalidate_interrupt##NR)
178
179 switch (NUM_INVALIDATE_TLB_VECTORS) {
180 default:
181 ALLOC_INVTLB_VEC(31);
182 case 31:
183 ALLOC_INVTLB_VEC(30);
184 case 30:
185 ALLOC_INVTLB_VEC(29);
186 case 29:
187 ALLOC_INVTLB_VEC(28);
188 case 28:
189 ALLOC_INVTLB_VEC(27);
190 case 27:
191 ALLOC_INVTLB_VEC(26);
192 case 26:
193 ALLOC_INVTLB_VEC(25);
194 case 25:
195 ALLOC_INVTLB_VEC(24);
196 case 24:
197 ALLOC_INVTLB_VEC(23);
198 case 23:
199 ALLOC_INVTLB_VEC(22);
200 case 22:
201 ALLOC_INVTLB_VEC(21);
202 case 21:
203 ALLOC_INVTLB_VEC(20);
204 case 20:
205 ALLOC_INVTLB_VEC(19);
206 case 19:
207 ALLOC_INVTLB_VEC(18);
208 case 18:
209 ALLOC_INVTLB_VEC(17);
210 case 17:
211 ALLOC_INVTLB_VEC(16);
212 case 16:
213 ALLOC_INVTLB_VEC(15);
214 case 15:
215 ALLOC_INVTLB_VEC(14);
216 case 14:
217 ALLOC_INVTLB_VEC(13);
218 case 13:
219 ALLOC_INVTLB_VEC(12);
220 case 12:
221 ALLOC_INVTLB_VEC(11);
222 case 11:
223 ALLOC_INVTLB_VEC(10);
224 case 10:
225 ALLOC_INVTLB_VEC(9);
226 case 9:
227 ALLOC_INVTLB_VEC(8);
228 case 8:
229 ALLOC_INVTLB_VEC(7);
230 case 7:
231 ALLOC_INVTLB_VEC(6);
232 case 6:
233 ALLOC_INVTLB_VEC(5);
234 case 5:
235 ALLOC_INVTLB_VEC(4);
236 case 4:
237 ALLOC_INVTLB_VEC(3);
238 case 3:
239 ALLOC_INVTLB_VEC(2);
240 case 2:
241 ALLOC_INVTLB_VEC(1);
242 case 1:
243 ALLOC_INVTLB_VEC(0);
244 break;
245 }
246
247 /* IPI for generic function call */ 174 /* IPI for generic function call */
248 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 175 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
249 176
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8bfb6146f753..3f61904365cf 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -444,12 +444,12 @@ void kgdb_roundup_cpus(unsigned long flags)
444 444
445/** 445/**
446 * kgdb_arch_handle_exception - Handle architecture specific GDB packets. 446 * kgdb_arch_handle_exception - Handle architecture specific GDB packets.
447 * @vector: The error vector of the exception that happened. 447 * @e_vector: The error vector of the exception that happened.
448 * @signo: The signal number of the exception that happened. 448 * @signo: The signal number of the exception that happened.
449 * @err_code: The error code of the exception that happened. 449 * @err_code: The error code of the exception that happened.
450 * @remcom_in_buffer: The buffer of the packet we have read. 450 * @remcomInBuffer: The buffer of the packet we have read.
451 * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into. 451 * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into.
452 * @regs: The &struct pt_regs of the current process. 452 * @linux_regs: The &struct pt_regs of the current process.
453 * 453 *
454 * This function MUST handle the 'c' and 's' command packets, 454 * This function MUST handle the 'c' and 's' command packets,
455 * as well packets to set / remove a hardware breakpoint, if used. 455 * as well packets to set / remove a hardware breakpoint, if used.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e554e5ad2fe8..c1d61ee4b4f1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,6 +39,9 @@
39#include <asm/desc.h> 39#include <asm/desc.h>
40#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
41#include <asm/idle.h> 41#include <asm/idle.h>
42#include <asm/apic.h>
43#include <asm/apicdef.h>
44#include <asm/hypervisor.h>
42 45
43static int kvmapf = 1; 46static int kvmapf = 1;
44 47
@@ -283,6 +286,22 @@ static void kvm_register_steal_time(void)
283 cpu, __pa(st)); 286 cpu, __pa(st));
284} 287}
285 288
289static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
290
291static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
292{
293 /**
294 * This relies on __test_and_clear_bit to modify the memory
295 * in a way that is atomic with respect to the local CPU.
296 * The hypervisor only accesses this memory from the local CPU so
297 * there's no need for lock or memory barriers.
298 * An optimization barrier is implied in apic write.
299 */
300 if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi)))
301 return;
302 apic_write(APIC_EOI, APIC_EOI_ACK);
303}
304
286void __cpuinit kvm_guest_cpu_init(void) 305void __cpuinit kvm_guest_cpu_init(void)
287{ 306{
288 if (!kvm_para_available()) 307 if (!kvm_para_available())
@@ -300,11 +319,20 @@ void __cpuinit kvm_guest_cpu_init(void)
300 smp_processor_id()); 319 smp_processor_id());
301 } 320 }
302 321
322 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
323 unsigned long pa;
324 /* Size alignment is implied but just to make it explicit. */
325 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
326 __get_cpu_var(kvm_apic_eoi) = 0;
327 pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED;
328 wrmsrl(MSR_KVM_PV_EOI_EN, pa);
329 }
330
303 if (has_steal_clock) 331 if (has_steal_clock)
304 kvm_register_steal_time(); 332 kvm_register_steal_time();
305} 333}
306 334
307static void kvm_pv_disable_apf(void *unused) 335static void kvm_pv_disable_apf(void)
308{ 336{
309 if (!__get_cpu_var(apf_reason).enabled) 337 if (!__get_cpu_var(apf_reason).enabled)
310 return; 338 return;
@@ -316,11 +344,23 @@ static void kvm_pv_disable_apf(void *unused)
316 smp_processor_id()); 344 smp_processor_id());
317} 345}
318 346
347static void kvm_pv_guest_cpu_reboot(void *unused)
348{
349 /*
350 * We disable PV EOI before we load a new kernel by kexec,
351 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
352 * New kernel can re-enable when it boots.
353 */
354 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
355 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
356 kvm_pv_disable_apf();
357}
358
319static int kvm_pv_reboot_notify(struct notifier_block *nb, 359static int kvm_pv_reboot_notify(struct notifier_block *nb,
320 unsigned long code, void *unused) 360 unsigned long code, void *unused)
321{ 361{
322 if (code == SYS_RESTART) 362 if (code == SYS_RESTART)
323 on_each_cpu(kvm_pv_disable_apf, NULL, 1); 363 on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
324 return NOTIFY_DONE; 364 return NOTIFY_DONE;
325} 365}
326 366
@@ -371,7 +411,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
371static void kvm_guest_cpu_offline(void *dummy) 411static void kvm_guest_cpu_offline(void *dummy)
372{ 412{
373 kvm_disable_steal_time(); 413 kvm_disable_steal_time();
374 kvm_pv_disable_apf(NULL); 414 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
415 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
416 kvm_pv_disable_apf();
375 apf_task_wake_all(); 417 apf_task_wake_all();
376} 418}
377 419
@@ -424,6 +466,9 @@ void __init kvm_guest_init(void)
424 pv_time_ops.steal_clock = kvm_steal_clock; 466 pv_time_ops.steal_clock = kvm_steal_clock;
425 } 467 }
426 468
469 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
470 apic_set_eoi_write(kvm_guest_apic_eoi_write);
471
427#ifdef CONFIG_SMP 472#ifdef CONFIG_SMP
428 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 473 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
429 register_cpu_notifier(&kvm_cpu_notifier); 474 register_cpu_notifier(&kvm_cpu_notifier);
@@ -432,6 +477,19 @@ void __init kvm_guest_init(void)
432#endif 477#endif
433} 478}
434 479
480static bool __init kvm_detect(void)
481{
482 if (!kvm_para_available())
483 return false;
484 return true;
485}
486
487const struct hypervisor_x86 x86_hyper_kvm __refconst = {
488 .name = "KVM",
489 .detect = kvm_detect,
490};
491EXPORT_SYMBOL_GPL(x86_hyper_kvm);
492
435static __init int activate_jump_labels(void) 493static __init int activate_jump_labels(void)
436{ 494{
437 if (has_steal_clock) { 495 if (has_steal_clock) {
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fbdfc6917180..4873e62db6a1 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -87,6 +87,7 @@
87#include <asm/microcode.h> 87#include <asm/microcode.h>
88#include <asm/processor.h> 88#include <asm/processor.h>
89#include <asm/cpu_device_id.h> 89#include <asm/cpu_device_id.h>
90#include <asm/perf_event.h>
90 91
91MODULE_DESCRIPTION("Microcode Update Driver"); 92MODULE_DESCRIPTION("Microcode Update Driver");
92MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 93MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -277,7 +278,6 @@ static int reload_for_cpu(int cpu)
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 278 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278 int err = 0; 279 int err = 0;
279 280
280 mutex_lock(&microcode_mutex);
281 if (uci->valid) { 281 if (uci->valid) {
282 enum ucode_state ustate; 282 enum ucode_state ustate;
283 283
@@ -288,7 +288,6 @@ static int reload_for_cpu(int cpu)
288 if (ustate == UCODE_ERROR) 288 if (ustate == UCODE_ERROR)
289 err = -EINVAL; 289 err = -EINVAL;
290 } 290 }
291 mutex_unlock(&microcode_mutex);
292 291
293 return err; 292 return err;
294} 293}
@@ -298,19 +297,31 @@ static ssize_t reload_store(struct device *dev,
298 const char *buf, size_t size) 297 const char *buf, size_t size)
299{ 298{
300 unsigned long val; 299 unsigned long val;
301 int cpu = dev->id; 300 int cpu;
302 ssize_t ret = 0; 301 ssize_t ret = 0, tmp_ret;
303 302
304 ret = kstrtoul(buf, 0, &val); 303 ret = kstrtoul(buf, 0, &val);
305 if (ret) 304 if (ret)
306 return ret; 305 return ret;
307 306
308 if (val == 1) { 307 if (val != 1)
309 get_online_cpus(); 308 return size;
310 if (cpu_online(cpu)) 309
311 ret = reload_for_cpu(cpu); 310 get_online_cpus();
312 put_online_cpus(); 311 mutex_lock(&microcode_mutex);
312 for_each_online_cpu(cpu) {
313 tmp_ret = reload_for_cpu(cpu);
314 if (tmp_ret != 0)
315 pr_warn("Error reloading microcode on CPU %d\n", cpu);
316
317 /* save retval of the first encountered reload error */
318 if (!ret)
319 ret = tmp_ret;
313 } 320 }
321 if (!ret)
322 perf_check_microcode();
323 mutex_unlock(&microcode_mutex);
324 put_online_cpus();
314 325
315 if (!ret) 326 if (!ret)
316 ret = size; 327 ret = size;
@@ -339,7 +350,6 @@ static DEVICE_ATTR(version, 0400, version_show, NULL);
339static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); 350static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
340 351
341static struct attribute *mc_default_attrs[] = { 352static struct attribute *mc_default_attrs[] = {
342 &dev_attr_reload.attr,
343 &dev_attr_version.attr, 353 &dev_attr_version.attr,
344 &dev_attr_processor_flags.attr, 354 &dev_attr_processor_flags.attr,
345 NULL 355 NULL
@@ -504,7 +514,7 @@ static struct notifier_block __refdata mc_cpu_notifier = {
504 514
505#ifdef MODULE 515#ifdef MODULE
506/* Autoload on Intel and AMD systems */ 516/* Autoload on Intel and AMD systems */
507static const struct x86_cpu_id microcode_id[] = { 517static const struct x86_cpu_id __initconst microcode_id[] = {
508#ifdef CONFIG_MICROCODE_INTEL 518#ifdef CONFIG_MICROCODE_INTEL
509 { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, 519 { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
510#endif 520#endif
@@ -516,6 +526,16 @@ static const struct x86_cpu_id microcode_id[] = {
516MODULE_DEVICE_TABLE(x86cpu, microcode_id); 526MODULE_DEVICE_TABLE(x86cpu, microcode_id);
517#endif 527#endif
518 528
529static struct attribute *cpu_root_microcode_attrs[] = {
530 &dev_attr_reload.attr,
531 NULL
532};
533
534static struct attribute_group cpu_root_microcode_group = {
535 .name = "microcode",
536 .attrs = cpu_root_microcode_attrs,
537};
538
519static int __init microcode_init(void) 539static int __init microcode_init(void)
520{ 540{
521 struct cpuinfo_x86 *c = &cpu_data(0); 541 struct cpuinfo_x86 *c = &cpu_data(0);
@@ -540,16 +560,25 @@ static int __init microcode_init(void)
540 mutex_lock(&microcode_mutex); 560 mutex_lock(&microcode_mutex);
541 561
542 error = subsys_interface_register(&mc_cpu_interface); 562 error = subsys_interface_register(&mc_cpu_interface);
543 563 if (!error)
564 perf_check_microcode();
544 mutex_unlock(&microcode_mutex); 565 mutex_unlock(&microcode_mutex);
545 put_online_cpus(); 566 put_online_cpus();
546 567
547 if (error) 568 if (error)
548 goto out_pdev; 569 goto out_pdev;
549 570
571 error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
572 &cpu_root_microcode_group);
573
574 if (error) {
575 pr_err("Error creating microcode group!\n");
576 goto out_driver;
577 }
578
550 error = microcode_dev_init(); 579 error = microcode_dev_init();
551 if (error) 580 if (error)
552 goto out_driver; 581 goto out_ucode_group;
553 582
554 register_syscore_ops(&mc_syscore_ops); 583 register_syscore_ops(&mc_syscore_ops);
555 register_hotcpu_notifier(&mc_cpu_notifier); 584 register_hotcpu_notifier(&mc_cpu_notifier);
@@ -559,7 +588,11 @@ static int __init microcode_init(void)
559 588
560 return 0; 589 return 0;
561 590
562out_driver: 591 out_ucode_group:
592 sysfs_remove_group(&cpu_subsys.dev_root->kobj,
593 &cpu_root_microcode_group);
594
595 out_driver:
563 get_online_cpus(); 596 get_online_cpus();
564 mutex_lock(&microcode_mutex); 597 mutex_lock(&microcode_mutex);
565 598
@@ -568,7 +601,7 @@ out_driver:
568 mutex_unlock(&microcode_mutex); 601 mutex_unlock(&microcode_mutex);
569 put_online_cpus(); 602 put_online_cpus();
570 603
571out_pdev: 604 out_pdev:
572 platform_device_unregister(microcode_pdev); 605 platform_device_unregister(microcode_pdev);
573 return error; 606 return error;
574 607
@@ -584,6 +617,9 @@ static void __exit microcode_exit(void)
584 unregister_hotcpu_notifier(&mc_cpu_notifier); 617 unregister_hotcpu_notifier(&mc_cpu_notifier);
585 unregister_syscore_ops(&mc_syscore_ops); 618 unregister_syscore_ops(&mc_syscore_ops);
586 619
620 sysfs_remove_group(&cpu_subsys.dev_root->kobj,
621 &cpu_root_microcode_group);
622
587 get_online_cpus(); 623 get_online_cpus();
588 mutex_lock(&microcode_mutex); 624 mutex_lock(&microcode_mutex);
589 625
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f21fd94ac897..216a4d754b0c 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -15,6 +15,9 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
18#include <linux/moduleloader.h> 21#include <linux/moduleloader.h>
19#include <linux/elf.h> 22#include <linux/elf.h>
20#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
@@ -30,9 +33,14 @@
30#include <asm/pgtable.h> 33#include <asm/pgtable.h>
31 34
32#if 0 35#if 0
33#define DEBUGP printk 36#define DEBUGP(fmt, ...) \
37 printk(KERN_DEBUG fmt, ##__VA_ARGS__)
34#else 38#else
35#define DEBUGP(fmt...) 39#define DEBUGP(fmt, ...) \
40do { \
41 if (0) \
42 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
43} while (0)
36#endif 44#endif
37 45
38void *module_alloc(unsigned long size) 46void *module_alloc(unsigned long size)
@@ -56,8 +64,8 @@ int apply_relocate(Elf32_Shdr *sechdrs,
56 Elf32_Sym *sym; 64 Elf32_Sym *sym;
57 uint32_t *location; 65 uint32_t *location;
58 66
59 DEBUGP("Applying relocate section %u to %u\n", relsec, 67 DEBUGP("Applying relocate section %u to %u\n",
60 sechdrs[relsec].sh_info); 68 relsec, sechdrs[relsec].sh_info);
61 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { 69 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
62 /* This is where to make the change */ 70 /* This is where to make the change */
63 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr 71 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -73,11 +81,11 @@ int apply_relocate(Elf32_Shdr *sechdrs,
73 *location += sym->st_value; 81 *location += sym->st_value;
74 break; 82 break;
75 case R_386_PC32: 83 case R_386_PC32:
76 /* Add the value, subtract its postition */ 84 /* Add the value, subtract its position */
77 *location += sym->st_value - (uint32_t)location; 85 *location += sym->st_value - (uint32_t)location;
78 break; 86 break;
79 default: 87 default:
80 printk(KERN_ERR "module %s: Unknown relocation: %u\n", 88 pr_err("%s: Unknown relocation: %u\n",
81 me->name, ELF32_R_TYPE(rel[i].r_info)); 89 me->name, ELF32_R_TYPE(rel[i].r_info));
82 return -ENOEXEC; 90 return -ENOEXEC;
83 } 91 }
@@ -97,8 +105,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
97 void *loc; 105 void *loc;
98 u64 val; 106 u64 val;
99 107
100 DEBUGP("Applying relocate section %u to %u\n", relsec, 108 DEBUGP("Applying relocate section %u to %u\n",
101 sechdrs[relsec].sh_info); 109 relsec, sechdrs[relsec].sh_info);
102 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { 110 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
103 /* This is where to make the change */ 111 /* This is where to make the change */
104 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr 112 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -110,8 +118,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
110 + ELF64_R_SYM(rel[i].r_info); 118 + ELF64_R_SYM(rel[i].r_info);
111 119
112 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", 120 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
113 (int)ELF64_R_TYPE(rel[i].r_info), 121 (int)ELF64_R_TYPE(rel[i].r_info),
114 sym->st_value, rel[i].r_addend, (u64)loc); 122 sym->st_value, rel[i].r_addend, (u64)loc);
115 123
116 val = sym->st_value + rel[i].r_addend; 124 val = sym->st_value + rel[i].r_addend;
117 125
@@ -140,7 +148,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
140#endif 148#endif
141 break; 149 break;
142 default: 150 default:
143 printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", 151 pr_err("%s: Unknown rela relocation: %llu\n",
144 me->name, ELF64_R_TYPE(rel[i].r_info)); 152 me->name, ELF64_R_TYPE(rel[i].r_info));
145 return -ENOEXEC; 153 return -ENOEXEC;
146 } 154 }
@@ -148,9 +156,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
148 return 0; 156 return 0;
149 157
150overflow: 158overflow:
151 printk(KERN_ERR "overflow in relocation type %d val %Lx\n", 159 pr_err("overflow in relocation type %d val %Lx\n",
152 (int)ELF64_R_TYPE(rel[i].r_info), val); 160 (int)ELF64_R_TYPE(rel[i].r_info), val);
153 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", 161 pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
154 me->name); 162 me->name);
155 return -ENOEXEC; 163 return -ENOEXEC;
156} 164}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index a0b2f84457be..f84f5c57de35 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -365,8 +365,9 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
365#ifdef CONFIG_X86_32 365#ifdef CONFIG_X86_32
366/* 366/*
367 * For i386, NMIs use the same stack as the kernel, and we can 367 * For i386, NMIs use the same stack as the kernel, and we can
368 * add a workaround to the iret problem in C. Simply have 3 states 368 * add a workaround to the iret problem in C (preventing nested
369 * the NMI can be in. 369 * NMIs if an NMI takes a trap). Simply have 3 states the NMI
370 * can be in:
370 * 371 *
371 * 1) not running 372 * 1) not running
372 * 2) executing 373 * 2) executing
@@ -383,32 +384,50 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
383 * If an NMI hits a breakpoint that executes an iret, another 384 * If an NMI hits a breakpoint that executes an iret, another
384 * NMI can preempt it. We do not want to allow this new NMI 385 * NMI can preempt it. We do not want to allow this new NMI
385 * to run, but we want to execute it when the first one finishes. 386 * to run, but we want to execute it when the first one finishes.
386 * We set the state to "latched", and the first NMI will perform 387 * We set the state to "latched", and the exit of the first NMI will
387 * an cmpxchg on the state, and if it doesn't successfully 388 * perform a dec_return, if the result is zero (NOT_RUNNING), then
388 * reset the state to "not running" it will restart the next 389 * it will simply exit the NMI handler. If not, the dec_return
389 * NMI. 390 * would have set the state to NMI_EXECUTING (what we want it to
391 * be when we are running). In this case, we simply jump back
392 * to rerun the NMI handler again, and restart the 'latched' NMI.
393 *
394 * No trap (breakpoint or page fault) should be hit before nmi_restart,
395 * thus there is no race between the first check of state for NOT_RUNNING
396 * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
397 * at this point.
398 *
399 * In case the NMI takes a page fault, we need to save off the CR2
400 * because the NMI could have preempted another page fault and corrupt
401 * the CR2 that is about to be read. As nested NMIs must be restarted
402 * and they can not take breakpoints or page faults, the update of the
403 * CR2 must be done before converting the nmi state back to NOT_RUNNING.
404 * Otherwise, there would be a race of another nested NMI coming in
405 * after setting state to NOT_RUNNING but before updating the nmi_cr2.
390 */ 406 */
391enum nmi_states { 407enum nmi_states {
392 NMI_NOT_RUNNING, 408 NMI_NOT_RUNNING = 0,
393 NMI_EXECUTING, 409 NMI_EXECUTING,
394 NMI_LATCHED, 410 NMI_LATCHED,
395}; 411};
396static DEFINE_PER_CPU(enum nmi_states, nmi_state); 412static DEFINE_PER_CPU(enum nmi_states, nmi_state);
413static DEFINE_PER_CPU(unsigned long, nmi_cr2);
397 414
398#define nmi_nesting_preprocess(regs) \ 415#define nmi_nesting_preprocess(regs) \
399 do { \ 416 do { \
400 if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ 417 if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
401 __get_cpu_var(nmi_state) = NMI_LATCHED; \ 418 this_cpu_write(nmi_state, NMI_LATCHED); \
402 return; \ 419 return; \
403 } \ 420 } \
404 nmi_restart: \ 421 this_cpu_write(nmi_state, NMI_EXECUTING); \
405 __get_cpu_var(nmi_state) = NMI_EXECUTING; \ 422 this_cpu_write(nmi_cr2, read_cr2()); \
406 } while (0) 423 } while (0); \
424 nmi_restart:
407 425
408#define nmi_nesting_postprocess() \ 426#define nmi_nesting_postprocess() \
409 do { \ 427 do { \
410 if (cmpxchg(&__get_cpu_var(nmi_state), \ 428 if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
411 NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ 429 write_cr2(this_cpu_read(nmi_cr2)); \
430 if (this_cpu_dec_return(nmi_state)) \
412 goto nmi_restart; \ 431 goto nmi_restart; \
413 } while (0) 432 } while (0)
414#else /* x86_64 */ 433#else /* x86_64 */
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 149b8d9c6ad4..6d9582ec0324 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -42,7 +42,8 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
42static void __init init_nmi_testsuite(void) 42static void __init init_nmi_testsuite(void)
43{ 43{
44 /* trap all the unknown NMIs we may generate */ 44 /* trap all the unknown NMIs we may generate */
45 register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); 45 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk",
46 __initdata);
46} 47}
47 48
48static void __init cleanup_nmi_testsuite(void) 49static void __init cleanup_nmi_testsuite(void)
@@ -64,8 +65,8 @@ static void __init test_nmi_ipi(struct cpumask *mask)
64{ 65{
65 unsigned long timeout; 66 unsigned long timeout;
66 67
67 if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback, 68 if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
68 NMI_FLAG_FIRST, "nmi_selftest")) { 69 NMI_FLAG_FIRST, "nmi_selftest", __initdata)) {
69 nmi_fail = FAILURE; 70 nmi_fail = FAILURE;
70 return; 71 return;
71 } 72 }
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 9ce885996fd7..17fff18a1031 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -352,9 +352,7 @@ struct pv_cpu_ops pv_cpu_ops = {
352#endif 352#endif
353 .wbinvd = native_wbinvd, 353 .wbinvd = native_wbinvd,
354 .read_msr = native_read_msr_safe, 354 .read_msr = native_read_msr_safe,
355 .rdmsr_regs = native_rdmsr_safe_regs,
356 .write_msr = native_write_msr_safe, 355 .write_msr = native_write_msr_safe,
357 .wrmsr_regs = native_wrmsr_safe_regs,
358 .read_tsc = native_read_tsc, 356 .read_tsc = native_read_tsc,
359 .read_pmc = native_read_pmc, 357 .read_pmc = native_read_pmc,
360 .read_tscp = native_read_tscp, 358 .read_tscp = native_read_tscp,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index b72838bae64a..299d49302e7d 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -22,6 +22,8 @@
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */ 23 */
24 24
25#define pr_fmt(fmt) "Calgary: " fmt
26
25#include <linux/kernel.h> 27#include <linux/kernel.h>
26#include <linux/init.h> 28#include <linux/init.h>
27#include <linux/types.h> 29#include <linux/types.h>
@@ -245,7 +247,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
245 offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, 247 offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
246 npages, 0, boundary_size, 0); 248 npages, 0, boundary_size, 0);
247 if (offset == ~0UL) { 249 if (offset == ~0UL) {
248 printk(KERN_WARNING "Calgary: IOMMU full.\n"); 250 pr_warn("IOMMU full\n");
249 spin_unlock_irqrestore(&tbl->it_lock, flags); 251 spin_unlock_irqrestore(&tbl->it_lock, flags);
250 if (panic_on_overflow) 252 if (panic_on_overflow)
251 panic("Calgary: fix the allocator.\n"); 253 panic("Calgary: fix the allocator.\n");
@@ -271,8 +273,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
271 entry = iommu_range_alloc(dev, tbl, npages); 273 entry = iommu_range_alloc(dev, tbl, npages);
272 274
273 if (unlikely(entry == DMA_ERROR_CODE)) { 275 if (unlikely(entry == DMA_ERROR_CODE)) {
274 printk(KERN_WARNING "Calgary: failed to allocate %u pages in " 276 pr_warn("failed to allocate %u pages in iommu %p\n",
275 "iommu %p\n", npages, tbl); 277 npages, tbl);
276 return DMA_ERROR_CODE; 278 return DMA_ERROR_CODE;
277 } 279 }
278 280
@@ -561,8 +563,7 @@ static void calgary_tce_cache_blast(struct iommu_table *tbl)
561 i++; 563 i++;
562 } while ((val & 0xff) != 0xff && i < 100); 564 } while ((val & 0xff) != 0xff && i < 100);
563 if (i == 100) 565 if (i == 100)
564 printk(KERN_WARNING "Calgary: PCI bus not quiesced, " 566 pr_warn("PCI bus not quiesced, continuing anyway\n");
565 "continuing anyway\n");
566 567
567 /* invalidate TCE cache */ 568 /* invalidate TCE cache */
568 target = calgary_reg(bbar, tar_offset(tbl->it_busno)); 569 target = calgary_reg(bbar, tar_offset(tbl->it_busno));
@@ -604,8 +605,7 @@ begin:
604 i++; 605 i++;
605 } while ((val64 & 0xff) != 0xff && i < 100); 606 } while ((val64 & 0xff) != 0xff && i < 100);
606 if (i == 100) 607 if (i == 100)
607 printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " 608 pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n");
608 "continuing anyway\n");
609 609
610 /* 3. poll Page Migration DEBUG for SoftStopFault */ 610 /* 3. poll Page Migration DEBUG for SoftStopFault */
611 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); 611 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
@@ -617,8 +617,7 @@ begin:
617 if (++count < 100) 617 if (++count < 100)
618 goto begin; 618 goto begin;
619 else { 619 else {
620 printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " 620 pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n");
621 "aborting TCE cache flush sequence!\n");
622 return; /* pray for the best */ 621 return; /* pray for the best */
623 } 622 }
624 } 623 }
@@ -840,8 +839,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl)
840 plssr = be32_to_cpu(readl(target)); 839 plssr = be32_to_cpu(readl(target));
841 840
842 /* If no error, the agent ID in the CSR is not valid */ 841 /* If no error, the agent ID in the CSR is not valid */
843 printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " 842 pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n",
844 "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); 843 tbl->it_busno, csr, plssr);
845} 844}
846 845
847static void calioc2_dump_error_regs(struct iommu_table *tbl) 846static void calioc2_dump_error_regs(struct iommu_table *tbl)
@@ -867,22 +866,21 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl)
867 target = calgary_reg(bbar, phboff | 0x800); 866 target = calgary_reg(bbar, phboff | 0x800);
868 mck = be32_to_cpu(readl(target)); 867 mck = be32_to_cpu(readl(target));
869 868
870 printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", 869 pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno);
871 tbl->it_busno);
872 870
873 printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", 871 pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
874 csr, plssr, csmr, mck); 872 csr, plssr, csmr, mck);
875 873
876 /* dump rest of error regs */ 874 /* dump rest of error regs */
877 printk(KERN_EMERG "Calgary: "); 875 pr_emerg("");
878 for (i = 0; i < ARRAY_SIZE(errregs); i++) { 876 for (i = 0; i < ARRAY_SIZE(errregs); i++) {
879 /* err regs are at 0x810 - 0x870 */ 877 /* err regs are at 0x810 - 0x870 */
880 erroff = (0x810 + (i * 0x10)); 878 erroff = (0x810 + (i * 0x10));
881 target = calgary_reg(bbar, phboff | erroff); 879 target = calgary_reg(bbar, phboff | erroff);
882 errregs[i] = be32_to_cpu(readl(target)); 880 errregs[i] = be32_to_cpu(readl(target));
883 printk("0x%08x@0x%lx ", errregs[i], erroff); 881 pr_cont("0x%08x@0x%lx ", errregs[i], erroff);
884 } 882 }
885 printk("\n"); 883 pr_cont("\n");
886 884
887 /* root complex status */ 885 /* root complex status */
888 target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); 886 target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index c0f420f76cd3..de2b7ad70273 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,15 +45,6 @@ int iommu_detected __read_mostly = 0;
45 */ 45 */
46int iommu_pass_through __read_mostly; 46int iommu_pass_through __read_mostly;
47 47
48/*
49 * Group multi-function PCI devices into a single device-group for the
50 * iommu_device_group interface. This tells the iommu driver to pretend
51 * it cannot distinguish between functions of a device, exposing only one
52 * group for the device. Useful for disallowing use of individual PCI
53 * functions from userspace drivers.
54 */
55int iommu_group_mf __read_mostly;
56
57extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; 48extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
58 49
59/* Dummy device used for NULL arguments (normally ISA). */ 50/* Dummy device used for NULL arguments (normally ISA). */
@@ -194,8 +185,6 @@ static __init int iommu_setup(char *p)
194#endif 185#endif
195 if (!strncmp(p, "pt", 2)) 186 if (!strncmp(p, "pt", 2))
196 iommu_pass_through = 1; 187 iommu_pass_through = 1;
197 if (!strncmp(p, "group_mf", 8))
198 iommu_group_mf = 1;
199 188
200 gart_parse_options(p); 189 gart_parse_options(p);
201 190
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 735279e54e59..ef6a8456f719 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/errno.h> 3#include <linux/errno.h>
2#include <linux/kernel.h> 4#include <linux/kernel.h>
3#include <linux/mm.h> 5#include <linux/mm.h>
@@ -145,16 +147,14 @@ void show_regs_common(void)
145 /* Board Name is optional */ 147 /* Board Name is optional */
146 board = dmi_get_system_info(DMI_BOARD_NAME); 148 board = dmi_get_system_info(DMI_BOARD_NAME);
147 149
148 printk(KERN_CONT "\n"); 150 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n",
149 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s", 151 current->pid, current->comm, print_tainted(),
150 current->pid, current->comm, print_tainted(), 152 init_utsname()->release,
151 init_utsname()->release, 153 (int)strcspn(init_utsname()->version, " "),
152 (int)strcspn(init_utsname()->version, " "), 154 init_utsname()->version,
153 init_utsname()->version); 155 vendor, product,
154 printk(KERN_CONT " %s %s", vendor, product); 156 board ? "/" : "",
155 if (board) 157 board ? board : "");
156 printk(KERN_CONT "/%s", board);
157 printk(KERN_CONT "\n");
158} 158}
159 159
160void flush_thread(void) 160void flush_thread(void)
@@ -645,7 +645,7 @@ static void amd_e400_idle(void)
645 amd_e400_c1e_detected = true; 645 amd_e400_c1e_detected = true;
646 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 646 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
647 mark_tsc_unstable("TSC halt in AMD C1E"); 647 mark_tsc_unstable("TSC halt in AMD C1E");
648 printk(KERN_INFO "System has AMD C1E enabled\n"); 648 pr_info("System has AMD C1E enabled\n");
649 } 649 }
650 } 650 }
651 651
@@ -659,8 +659,7 @@ static void amd_e400_idle(void)
659 */ 659 */
660 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, 660 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
661 &cpu); 661 &cpu);
662 printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", 662 pr_info("Switch to broadcast mode on CPU%d\n", cpu);
663 cpu);
664 } 663 }
665 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 664 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
666 665
@@ -681,8 +680,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
681{ 680{
682#ifdef CONFIG_SMP 681#ifdef CONFIG_SMP
683 if (pm_idle == poll_idle && smp_num_siblings > 1) { 682 if (pm_idle == poll_idle && smp_num_siblings > 1) {
684 printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," 683 pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
685 " performance may degrade.\n");
686 } 684 }
687#endif 685#endif
688 if (pm_idle) 686 if (pm_idle)
@@ -692,11 +690,11 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
692 /* 690 /*
693 * One CPU supports mwait => All CPUs supports mwait 691 * One CPU supports mwait => All CPUs supports mwait
694 */ 692 */
695 printk(KERN_INFO "using mwait in idle threads.\n"); 693 pr_info("using mwait in idle threads\n");
696 pm_idle = mwait_idle; 694 pm_idle = mwait_idle;
697 } else if (cpu_has_amd_erratum(amd_erratum_400)) { 695 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
698 /* E400: APIC timer interrupt does not wake up CPU from C1e */ 696 /* E400: APIC timer interrupt does not wake up CPU from C1e */
699 printk(KERN_INFO "using AMD E400 aware idle routine\n"); 697 pr_info("using AMD E400 aware idle routine\n");
700 pm_idle = amd_e400_idle; 698 pm_idle = amd_e400_idle;
701 } else 699 } else
702 pm_idle = default_idle; 700 pm_idle = default_idle;
@@ -715,7 +713,7 @@ static int __init idle_setup(char *str)
715 return -EINVAL; 713 return -EINVAL;
716 714
717 if (!strcmp(str, "poll")) { 715 if (!strcmp(str, "poll")) {
718 printk("using polling idle threads.\n"); 716 pr_info("using polling idle threads\n");
719 pm_idle = poll_idle; 717 pm_idle = poll_idle;
720 boot_option_idle_override = IDLE_POLL; 718 boot_option_idle_override = IDLE_POLL;
721 } else if (!strcmp(str, "mwait")) { 719 } else if (!strcmp(str, "mwait")) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 61cdf7fdf099..0a980c9d7cb8 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -117,10 +117,10 @@ void release_thread(struct task_struct *dead_task)
117{ 117{
118 if (dead_task->mm) { 118 if (dead_task->mm) {
119 if (dead_task->mm->context.size) { 119 if (dead_task->mm->context.size) {
120 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", 120 pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n",
121 dead_task->comm, 121 dead_task->comm,
122 dead_task->mm->context.ldt, 122 dead_task->mm->context.ldt,
123 dead_task->mm->context.size); 123 dead_task->mm->context.size);
124 BUG(); 124 BUG();
125 } 125 }
126 } 126 }
@@ -466,7 +466,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
466 task->thread.gs = addr; 466 task->thread.gs = addr;
467 if (doit) { 467 if (doit) {
468 load_gs_index(0); 468 load_gs_index(0);
469 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 469 ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
470 } 470 }
471 } 471 }
472 put_cpu(); 472 put_cpu();
@@ -494,7 +494,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
494 /* set the selector to 0 to not confuse 494 /* set the selector to 0 to not confuse
495 __switch_to */ 495 __switch_to */
496 loadsegment(fs, 0); 496 loadsegment(fs, 0);
497 ret = checking_wrmsrl(MSR_FS_BASE, addr); 497 ret = wrmsrl_safe(MSR_FS_BASE, addr);
498 } 498 }
499 } 499 }
500 put_cpu(); 500 put_cpu();
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 03920a15a632..1b27de563561 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
512 512
513#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) 513#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
514/* Set correct numa_node information for AMD NB functions */ 514/* Set correct numa_node information for AMD NB functions */
515static void __init quirk_amd_nb_node(struct pci_dev *dev) 515static void __devinit quirk_amd_nb_node(struct pci_dev *dev)
516{ 516{
517 struct pci_dev *nb_ht; 517 struct pci_dev *nb_ht;
518 unsigned int devfn; 518 unsigned int devfn;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 25b48edb847c..52190a938b4a 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/module.h> 3#include <linux/module.h>
2#include <linux/reboot.h> 4#include <linux/reboot.h>
3#include <linux/init.h> 5#include <linux/init.h>
@@ -20,14 +22,12 @@
20#include <asm/virtext.h> 22#include <asm/virtext.h>
21#include <asm/cpu.h> 23#include <asm/cpu.h>
22#include <asm/nmi.h> 24#include <asm/nmi.h>
25#include <asm/smp.h>
23 26
24#ifdef CONFIG_X86_32 27#include <linux/ctype.h>
25# include <linux/ctype.h> 28#include <linux/mc146818rtc.h>
26# include <linux/mc146818rtc.h> 29#include <asm/realmode.h>
27# include <asm/realmode.h> 30#include <asm/x86_init.h>
28#else
29# include <asm/x86_init.h>
30#endif
31 31
32/* 32/*
33 * Power off function, if any 33 * Power off function, if any
@@ -49,7 +49,7 @@ int reboot_force;
49 */ 49 */
50static int reboot_default = 1; 50static int reboot_default = 1;
51 51
52#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 52#ifdef CONFIG_SMP
53static int reboot_cpu = -1; 53static int reboot_cpu = -1;
54#endif 54#endif
55 55
@@ -67,8 +67,8 @@ bool port_cf9_safe = false;
67 * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] 67 * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
68 * warm Don't set the cold reboot flag 68 * warm Don't set the cold reboot flag
69 * cold Set the cold reboot flag 69 * cold Set the cold reboot flag
70 * bios Reboot by jumping through the BIOS (only for X86_32) 70 * bios Reboot by jumping through the BIOS
71 * smp Reboot by executing reset on BSP or other CPU (only for X86_32) 71 * smp Reboot by executing reset on BSP or other CPU
72 * triple Force a triple fault (init) 72 * triple Force a triple fault (init)
73 * kbd Use the keyboard controller. cold reset (default) 73 * kbd Use the keyboard controller. cold reset (default)
74 * acpi Use the RESET_REG in the FADT 74 * acpi Use the RESET_REG in the FADT
@@ -95,7 +95,6 @@ static int __init reboot_setup(char *str)
95 reboot_mode = 0; 95 reboot_mode = 0;
96 break; 96 break;
97 97
98#ifdef CONFIG_X86_32
99#ifdef CONFIG_SMP 98#ifdef CONFIG_SMP
100 case 's': 99 case 's':
101 if (isdigit(*(str+1))) { 100 if (isdigit(*(str+1))) {
@@ -112,7 +111,6 @@ static int __init reboot_setup(char *str)
112#endif /* CONFIG_SMP */ 111#endif /* CONFIG_SMP */
113 112
114 case 'b': 113 case 'b':
115#endif
116 case 'a': 114 case 'a':
117 case 'k': 115 case 'k':
118 case 't': 116 case 't':
@@ -138,7 +136,6 @@ static int __init reboot_setup(char *str)
138__setup("reboot=", reboot_setup); 136__setup("reboot=", reboot_setup);
139 137
140 138
141#ifdef CONFIG_X86_32
142/* 139/*
143 * Reboot options and system auto-detection code provided by 140 * Reboot options and system auto-detection code provided by
144 * Dell Inc. so their systems "just work". :-) 141 * Dell Inc. so their systems "just work". :-)
@@ -152,16 +149,14 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
152{ 149{
153 if (reboot_type != BOOT_BIOS) { 150 if (reboot_type != BOOT_BIOS) {
154 reboot_type = BOOT_BIOS; 151 reboot_type = BOOT_BIOS;
155 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); 152 pr_info("%s series board detected. Selecting %s-method for reboots.\n",
153 "BIOS", d->ident);
156 } 154 }
157 return 0; 155 return 0;
158} 156}
159 157
160void machine_real_restart(unsigned int type) 158void __noreturn machine_real_restart(unsigned int type)
161{ 159{
162 void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int))
163 real_mode_header->machine_real_restart_asm;
164
165 local_irq_disable(); 160 local_irq_disable();
166 161
167 /* 162 /*
@@ -181,25 +176,28 @@ void machine_real_restart(unsigned int type)
181 /* 176 /*
182 * Switch back to the initial page table. 177 * Switch back to the initial page table.
183 */ 178 */
179#ifdef CONFIG_X86_32
184 load_cr3(initial_page_table); 180 load_cr3(initial_page_table);
185 181#else
186 /* 182 write_cr3(real_mode_header->trampoline_pgd);
187 * Write 0x1234 to absolute memory location 0x472. The BIOS reads 183#endif
188 * this on booting to tell it to "Bypass memory test (also warm
189 * boot)". This seems like a fairly standard thing that gets set by
190 * REBOOT.COM programs, and the previous reset routine did this
191 * too. */
192 *((unsigned short *)0x472) = reboot_mode;
193 184
194 /* Jump to the identity-mapped low memory code */ 185 /* Jump to the identity-mapped low memory code */
195 restart_lowmem(type); 186#ifdef CONFIG_X86_32
187 asm volatile("jmpl *%0" : :
188 "rm" (real_mode_header->machine_real_restart_asm),
189 "a" (type));
190#else
191 asm volatile("ljmpl *%0" : :
192 "m" (real_mode_header->machine_real_restart_asm),
193 "D" (type));
194#endif
195 unreachable();
196} 196}
197#ifdef CONFIG_APM_MODULE 197#ifdef CONFIG_APM_MODULE
198EXPORT_SYMBOL(machine_real_restart); 198EXPORT_SYMBOL(machine_real_restart);
199#endif 199#endif
200 200
201#endif /* CONFIG_X86_32 */
202
203/* 201/*
204 * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot 202 * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
205 */ 203 */
@@ -207,8 +205,8 @@ static int __init set_pci_reboot(const struct dmi_system_id *d)
207{ 205{
208 if (reboot_type != BOOT_CF9) { 206 if (reboot_type != BOOT_CF9) {
209 reboot_type = BOOT_CF9; 207 reboot_type = BOOT_CF9;
210 printk(KERN_INFO "%s series board detected. " 208 pr_info("%s series board detected. Selecting %s-method for reboots.\n",
211 "Selecting PCI-method for reboots.\n", d->ident); 209 "PCI", d->ident);
212 } 210 }
213 return 0; 211 return 0;
214} 212}
@@ -217,17 +215,16 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
217{ 215{
218 if (reboot_type != BOOT_KBD) { 216 if (reboot_type != BOOT_KBD) {
219 reboot_type = BOOT_KBD; 217 reboot_type = BOOT_KBD;
220 printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident); 218 pr_info("%s series board detected. Selecting %s-method for reboot.\n",
219 "KBD", d->ident);
221 } 220 }
222 return 0; 221 return 0;
223} 222}
224 223
225/* 224/*
226 * This is a single dmi_table handling all reboot quirks. Note that 225 * This is a single dmi_table handling all reboot quirks.
227 * REBOOT_BIOS is only available for 32bit
228 */ 226 */
229static struct dmi_system_id __initdata reboot_dmi_table[] = { 227static struct dmi_system_id __initdata reboot_dmi_table[] = {
230#ifdef CONFIG_X86_32
231 { /* Handle problems with rebooting on Dell E520's */ 228 { /* Handle problems with rebooting on Dell E520's */
232 .callback = set_bios_reboot, 229 .callback = set_bios_reboot,
233 .ident = "Dell E520", 230 .ident = "Dell E520",
@@ -377,7 +374,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
377 DMI_MATCH(DMI_BOARD_NAME, "P4S800"), 374 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
378 }, 375 },
379 }, 376 },
380#endif /* CONFIG_X86_32 */
381 377
382 { /* Handle reboot issue on Acer Aspire one */ 378 { /* Handle reboot issue on Acer Aspire one */
383 .callback = set_kbd_reboot, 379 .callback = set_kbd_reboot,
@@ -451,6 +447,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
451 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), 447 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
452 }, 448 },
453 }, 449 },
450 { /* Handle problems with rebooting on the Precision M6600. */
451 .callback = set_pci_reboot,
452 .ident = "Dell OptiPlex 990",
453 .matches = {
454 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
455 DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
456 },
457 },
454 { } 458 { }
455}; 459};
456 460
@@ -576,13 +580,11 @@ static void native_machine_emergency_restart(void)
576 reboot_type = BOOT_KBD; 580 reboot_type = BOOT_KBD;
577 break; 581 break;
578 582
579#ifdef CONFIG_X86_32
580 case BOOT_BIOS: 583 case BOOT_BIOS:
581 machine_real_restart(MRR_BIOS); 584 machine_real_restart(MRR_BIOS);
582 585
583 reboot_type = BOOT_KBD; 586 reboot_type = BOOT_KBD;
584 break; 587 break;
585#endif
586 588
587 case BOOT_ACPI: 589 case BOOT_ACPI:
588 acpi_reboot(); 590 acpi_reboot();
@@ -624,12 +626,10 @@ void native_machine_shutdown(void)
624 /* The boot cpu is always logical cpu 0 */ 626 /* The boot cpu is always logical cpu 0 */
625 int reboot_cpu_id = 0; 627 int reboot_cpu_id = 0;
626 628
627#ifdef CONFIG_X86_32
628 /* See if there has been given a command line override */ 629 /* See if there has been given a command line override */
629 if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) && 630 if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
630 cpu_online(reboot_cpu)) 631 cpu_online(reboot_cpu))
631 reboot_cpu_id = reboot_cpu; 632 reboot_cpu_id = reboot_cpu;
632#endif
633 633
634 /* Make certain the cpu I'm about to reboot on is online */ 634 /* Make certain the cpu I'm about to reboot on is online */
635 if (!cpu_online(reboot_cpu_id)) 635 if (!cpu_online(reboot_cpu_id))
@@ -670,7 +670,7 @@ static void __machine_emergency_restart(int emergency)
670 670
671static void native_machine_restart(char *__unused) 671static void native_machine_restart(char *__unused)
672{ 672{
673 printk("machine restart\n"); 673 pr_notice("machine restart\n");
674 674
675 if (!reboot_force) 675 if (!reboot_force)
676 machine_shutdown(); 676 machine_shutdown();
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 16be6dc14db1..f4b9b80e1b95 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1031,8 +1031,6 @@ void __init setup_arch(char **cmdline_p)
1031 1031
1032 x86_init.timers.wallclock_init(); 1032 x86_init.timers.wallclock_init();
1033 1033
1034 x86_platform.wallclock_init();
1035
1036 mcheck_init(); 1034 mcheck_init();
1037 1035
1038 arch_init_ideal_nops(); 1036 arch_init_ideal_nops();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5a98aa272184..5cdff0357746 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -21,7 +21,7 @@
21#include <asm/cpu.h> 21#include <asm/cpu.h>
22#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
23 23
24DEFINE_PER_CPU(int, cpu_number); 24DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
25EXPORT_PER_CPU_SYMBOL(cpu_number); 25EXPORT_PER_CPU_SYMBOL(cpu_number);
26 26
27#ifdef CONFIG_X86_64 27#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 21af737053aa..b280908a376e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,6 +6,9 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
9#include <linux/sched.h> 12#include <linux/sched.h>
10#include <linux/mm.h> 13#include <linux/mm.h>
11#include <linux/smp.h> 14#include <linux/smp.h>
@@ -814,7 +817,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
814 me->comm, me->pid, where, frame, 817 me->comm, me->pid, where, frame,
815 regs->ip, regs->sp, regs->orig_ax); 818 regs->ip, regs->sp, regs->orig_ax);
816 print_vma_addr(" in ", regs->ip); 819 print_vma_addr(" in ", regs->ip);
817 printk(KERN_CONT "\n"); 820 pr_cont("\n");
818 } 821 }
819 822
820 force_sig(SIGSEGV, me); 823 force_sig(SIGSEGV, me);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7bd8a0823654..7c5a8c314c02 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1,4 +1,4 @@
1/* 1 /*
2 * x86 SMP booting functions 2 * x86 SMP booting functions
3 * 3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> 4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
@@ -39,6 +39,8 @@
39 * Glauber Costa : i386 and x86_64 integration 39 * Glauber Costa : i386 and x86_64 integration
40 */ 40 */
41 41
42#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
43
42#include <linux/init.h> 44#include <linux/init.h>
43#include <linux/smp.h> 45#include <linux/smp.h>
44#include <linux/module.h> 46#include <linux/module.h>
@@ -104,17 +106,17 @@ int smp_num_siblings = 1;
104EXPORT_SYMBOL(smp_num_siblings); 106EXPORT_SYMBOL(smp_num_siblings);
105 107
106/* Last level cache ID of each logical CPU */ 108/* Last level cache ID of each logical CPU */
107DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; 109DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
108 110
109/* representing HT siblings of each logical CPU */ 111/* representing HT siblings of each logical CPU */
110DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); 112DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
111EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); 113EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
112 114
113/* representing HT and core siblings of each logical CPU */ 115/* representing HT and core siblings of each logical CPU */
114DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); 116DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
115EXPORT_PER_CPU_SYMBOL(cpu_core_map); 117EXPORT_PER_CPU_SYMBOL(cpu_core_map);
116 118
117DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); 119DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
118 120
119/* Per CPU bogomips and other parameters */ 121/* Per CPU bogomips and other parameters */
120DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 122DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@ -184,7 +186,7 @@ static void __cpuinit smp_callin(void)
184 * boards) 186 * boards)
185 */ 187 */
186 188
187 pr_debug("CALLIN, before setup_local_APIC().\n"); 189 pr_debug("CALLIN, before setup_local_APIC()\n");
188 if (apic->smp_callin_clear_local_apic) 190 if (apic->smp_callin_clear_local_apic)
189 apic->smp_callin_clear_local_apic(); 191 apic->smp_callin_clear_local_apic();
190 setup_local_APIC(); 192 setup_local_APIC();
@@ -255,22 +257,13 @@ notrace static void __cpuinit start_secondary(void *unused)
255 check_tsc_sync_target(); 257 check_tsc_sync_target();
256 258
257 /* 259 /*
258 * We need to hold call_lock, so there is no inconsistency
259 * between the time smp_call_function() determines number of
260 * IPI recipients, and the time when the determination is made
261 * for which cpus receive the IPI. Holding this
262 * lock helps us to not include this cpu in a currently in progress
263 * smp_call_function().
264 *
265 * We need to hold vector_lock so there the set of online cpus 260 * We need to hold vector_lock so there the set of online cpus
266 * does not change while we are assigning vectors to cpus. Holding 261 * does not change while we are assigning vectors to cpus. Holding
267 * this lock ensures we don't half assign or remove an irq from a cpu. 262 * this lock ensures we don't half assign or remove an irq from a cpu.
268 */ 263 */
269 ipi_call_lock();
270 lock_vector_lock(); 264 lock_vector_lock();
271 set_cpu_online(smp_processor_id(), true); 265 set_cpu_online(smp_processor_id(), true);
272 unlock_vector_lock(); 266 unlock_vector_lock();
273 ipi_call_unlock();
274 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 267 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
275 x86_platform.nmi_init(); 268 x86_platform.nmi_init();
276 269
@@ -432,17 +425,16 @@ static void impress_friends(void)
432 /* 425 /*
433 * Allow the user to impress friends. 426 * Allow the user to impress friends.
434 */ 427 */
435 pr_debug("Before bogomips.\n"); 428 pr_debug("Before bogomips\n");
436 for_each_possible_cpu(cpu) 429 for_each_possible_cpu(cpu)
437 if (cpumask_test_cpu(cpu, cpu_callout_mask)) 430 if (cpumask_test_cpu(cpu, cpu_callout_mask))
438 bogosum += cpu_data(cpu).loops_per_jiffy; 431 bogosum += cpu_data(cpu).loops_per_jiffy;
439 printk(KERN_INFO 432 pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
440 "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
441 num_online_cpus(), 433 num_online_cpus(),
442 bogosum/(500000/HZ), 434 bogosum/(500000/HZ),
443 (bogosum/(5000/HZ))%100); 435 (bogosum/(5000/HZ))%100);
444 436
445 pr_debug("Before bogocount - setting activated=1.\n"); 437 pr_debug("Before bogocount - setting activated=1\n");
446} 438}
447 439
448void __inquire_remote_apic(int apicid) 440void __inquire_remote_apic(int apicid)
@@ -452,18 +444,17 @@ void __inquire_remote_apic(int apicid)
452 int timeout; 444 int timeout;
453 u32 status; 445 u32 status;
454 446
455 printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); 447 pr_info("Inquiring remote APIC 0x%x...\n", apicid);
456 448
457 for (i = 0; i < ARRAY_SIZE(regs); i++) { 449 for (i = 0; i < ARRAY_SIZE(regs); i++) {
458 printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); 450 pr_info("... APIC 0x%x %s: ", apicid, names[i]);
459 451
460 /* 452 /*
461 * Wait for idle. 453 * Wait for idle.
462 */ 454 */
463 status = safe_apic_wait_icr_idle(); 455 status = safe_apic_wait_icr_idle();
464 if (status) 456 if (status)
465 printk(KERN_CONT 457 pr_cont("a previous APIC delivery may have failed\n");
466 "a previous APIC delivery may have failed\n");
467 458
468 apic_icr_write(APIC_DM_REMRD | regs[i], apicid); 459 apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
469 460
@@ -476,10 +467,10 @@ void __inquire_remote_apic(int apicid)
476 switch (status) { 467 switch (status) {
477 case APIC_ICR_RR_VALID: 468 case APIC_ICR_RR_VALID:
478 status = apic_read(APIC_RRR); 469 status = apic_read(APIC_RRR);
479 printk(KERN_CONT "%08x\n", status); 470 pr_cont("%08x\n", status);
480 break; 471 break;
481 default: 472 default:
482 printk(KERN_CONT "failed\n"); 473 pr_cont("failed\n");
483 } 474 }
484 } 475 }
485} 476}
@@ -513,12 +504,12 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
513 apic_write(APIC_ESR, 0); 504 apic_write(APIC_ESR, 0);
514 accept_status = (apic_read(APIC_ESR) & 0xEF); 505 accept_status = (apic_read(APIC_ESR) & 0xEF);
515 } 506 }
516 pr_debug("NMI sent.\n"); 507 pr_debug("NMI sent\n");
517 508
518 if (send_status) 509 if (send_status)
519 printk(KERN_ERR "APIC never delivered???\n"); 510 pr_err("APIC never delivered???\n");
520 if (accept_status) 511 if (accept_status)
521 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); 512 pr_err("APIC delivery error (%lx)\n", accept_status);
522 513
523 return (send_status | accept_status); 514 return (send_status | accept_status);
524} 515}
@@ -540,7 +531,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
540 apic_read(APIC_ESR); 531 apic_read(APIC_ESR);
541 } 532 }
542 533
543 pr_debug("Asserting INIT.\n"); 534 pr_debug("Asserting INIT\n");
544 535
545 /* 536 /*
546 * Turn INIT on target chip 537 * Turn INIT on target chip
@@ -556,7 +547,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
556 547
557 mdelay(10); 548 mdelay(10);
558 549
559 pr_debug("Deasserting INIT.\n"); 550 pr_debug("Deasserting INIT\n");
560 551
561 /* Target chip */ 552 /* Target chip */
562 /* Send IPI */ 553 /* Send IPI */
@@ -589,14 +580,14 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
589 /* 580 /*
590 * Run STARTUP IPI loop. 581 * Run STARTUP IPI loop.
591 */ 582 */
592 pr_debug("#startup loops: %d.\n", num_starts); 583 pr_debug("#startup loops: %d\n", num_starts);
593 584
594 for (j = 1; j <= num_starts; j++) { 585 for (j = 1; j <= num_starts; j++) {
595 pr_debug("Sending STARTUP #%d.\n", j); 586 pr_debug("Sending STARTUP #%d\n", j);
596 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 587 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
597 apic_write(APIC_ESR, 0); 588 apic_write(APIC_ESR, 0);
598 apic_read(APIC_ESR); 589 apic_read(APIC_ESR);
599 pr_debug("After apic_write.\n"); 590 pr_debug("After apic_write\n");
600 591
601 /* 592 /*
602 * STARTUP IPI 593 * STARTUP IPI
@@ -613,7 +604,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
613 */ 604 */
614 udelay(300); 605 udelay(300);
615 606
616 pr_debug("Startup point 1.\n"); 607 pr_debug("Startup point 1\n");
617 608
618 pr_debug("Waiting for send to finish...\n"); 609 pr_debug("Waiting for send to finish...\n");
619 send_status = safe_apic_wait_icr_idle(); 610 send_status = safe_apic_wait_icr_idle();
@@ -628,12 +619,12 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
628 if (send_status || accept_status) 619 if (send_status || accept_status)
629 break; 620 break;
630 } 621 }
631 pr_debug("After Startup.\n"); 622 pr_debug("After Startup\n");
632 623
633 if (send_status) 624 if (send_status)
634 printk(KERN_ERR "APIC never delivered???\n"); 625 pr_err("APIC never delivered???\n");
635 if (accept_status) 626 if (accept_status)
636 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); 627 pr_err("APIC delivery error (%lx)\n", accept_status);
637 628
638 return (send_status | accept_status); 629 return (send_status | accept_status);
639} 630}
@@ -647,11 +638,11 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
647 if (system_state == SYSTEM_BOOTING) { 638 if (system_state == SYSTEM_BOOTING) {
648 if (node != current_node) { 639 if (node != current_node) {
649 if (current_node > (-1)) 640 if (current_node > (-1))
650 pr_cont(" Ok.\n"); 641 pr_cont(" OK\n");
651 current_node = node; 642 current_node = node;
652 pr_info("Booting Node %3d, Processors ", node); 643 pr_info("Booting Node %3d, Processors ", node);
653 } 644 }
654 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); 645 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : "");
655 return; 646 return;
656 } else 647 } else
657 pr_info("Booting Node %d Processor %d APIC 0x%x\n", 648 pr_info("Booting Node %d Processor %d APIC 0x%x\n",
@@ -731,9 +722,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
731 /* 722 /*
732 * allow APs to start initializing. 723 * allow APs to start initializing.
733 */ 724 */
734 pr_debug("Before Callout %d.\n", cpu); 725 pr_debug("Before Callout %d\n", cpu);
735 cpumask_set_cpu(cpu, cpu_callout_mask); 726 cpumask_set_cpu(cpu, cpu_callout_mask);
736 pr_debug("After Callout %d.\n", cpu); 727 pr_debug("After Callout %d\n", cpu);
737 728
738 /* 729 /*
739 * Wait 5s total for a response 730 * Wait 5s total for a response
@@ -761,7 +752,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
761 pr_err("CPU%d: Stuck ??\n", cpu); 752 pr_err("CPU%d: Stuck ??\n", cpu);
762 else 753 else
763 /* trampoline code not run */ 754 /* trampoline code not run */
764 pr_err("CPU%d: Not responding.\n", cpu); 755 pr_err("CPU%d: Not responding\n", cpu);
765 if (apic->inquire_remote_apic) 756 if (apic->inquire_remote_apic)
766 apic->inquire_remote_apic(apicid); 757 apic->inquire_remote_apic(apicid);
767 } 758 }
@@ -806,7 +797,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
806 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 797 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
807 !physid_isset(apicid, phys_cpu_present_map) || 798 !physid_isset(apicid, phys_cpu_present_map) ||
808 !apic->apic_id_valid(apicid)) { 799 !apic->apic_id_valid(apicid)) {
809 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); 800 pr_err("%s: bad cpu %d\n", __func__, cpu);
810 return -EINVAL; 801 return -EINVAL;
811 } 802 }
812 803
@@ -887,9 +878,8 @@ static int __init smp_sanity_check(unsigned max_cpus)
887 unsigned int cpu; 878 unsigned int cpu;
888 unsigned nr; 879 unsigned nr;
889 880
890 printk(KERN_WARNING 881 pr_warn("More than 8 CPUs detected - skipping them\n"
891 "More than 8 CPUs detected - skipping them.\n" 882 "Use CONFIG_X86_BIGSMP\n");
892 "Use CONFIG_X86_BIGSMP.\n");
893 883
894 nr = 0; 884 nr = 0;
895 for_each_present_cpu(cpu) { 885 for_each_present_cpu(cpu) {
@@ -910,8 +900,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
910#endif 900#endif
911 901
912 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 902 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
913 printk(KERN_WARNING 903 pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
914 "weird, boot CPU (#%d) not listed by the BIOS.\n",
915 hard_smp_processor_id()); 904 hard_smp_processor_id());
916 905
917 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 906 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
@@ -923,11 +912,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
923 */ 912 */
924 if (!smp_found_config && !acpi_lapic) { 913 if (!smp_found_config && !acpi_lapic) {
925 preempt_enable(); 914 preempt_enable();
926 printk(KERN_NOTICE "SMP motherboard not detected.\n"); 915 pr_notice("SMP motherboard not detected\n");
927 disable_smp(); 916 disable_smp();
928 if (APIC_init_uniprocessor()) 917 if (APIC_init_uniprocessor())
929 printk(KERN_NOTICE "Local APIC not detected." 918 pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
930 " Using dummy APIC emulation.\n");
931 return -1; 919 return -1;
932 } 920 }
933 921
@@ -936,9 +924,8 @@ static int __init smp_sanity_check(unsigned max_cpus)
936 * CPU too, but we do it for the sake of robustness anyway. 924 * CPU too, but we do it for the sake of robustness anyway.
937 */ 925 */
938 if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { 926 if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
939 printk(KERN_NOTICE 927 pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
940 "weird, boot CPU (#%d) not listed by the BIOS.\n", 928 boot_cpu_physical_apicid);
941 boot_cpu_physical_apicid);
942 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 929 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
943 } 930 }
944 preempt_enable(); 931 preempt_enable();
@@ -951,8 +938,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
951 if (!disable_apic) { 938 if (!disable_apic) {
952 pr_err("BIOS bug, local APIC #%d not detected!...\n", 939 pr_err("BIOS bug, local APIC #%d not detected!...\n",
953 boot_cpu_physical_apicid); 940 boot_cpu_physical_apicid);
954 pr_err("... forcing use of dummy APIC emulation." 941 pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
955 "(tell your hw vendor)\n");
956 } 942 }
957 smpboot_clear_io_apic(); 943 smpboot_clear_io_apic();
958 disable_ioapic_support(); 944 disable_ioapic_support();
@@ -965,7 +951,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
965 * If SMP should be disabled, then really disable it! 951 * If SMP should be disabled, then really disable it!
966 */ 952 */
967 if (!max_cpus) { 953 if (!max_cpus) {
968 printk(KERN_INFO "SMP mode deactivated.\n"); 954 pr_info("SMP mode deactivated\n");
969 smpboot_clear_io_apic(); 955 smpboot_clear_io_apic();
970 956
971 connect_bsp_APIC(); 957 connect_bsp_APIC();
@@ -1017,7 +1003,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1017 1003
1018 1004
1019 if (smp_sanity_check(max_cpus) < 0) { 1005 if (smp_sanity_check(max_cpus) < 0) {
1020 printk(KERN_INFO "SMP disabled\n"); 1006 pr_info("SMP disabled\n");
1021 disable_smp(); 1007 disable_smp();
1022 goto out; 1008 goto out;
1023 } 1009 }
@@ -1055,7 +1041,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1055 * Set up local APIC timer on boot CPU. 1041 * Set up local APIC timer on boot CPU.
1056 */ 1042 */
1057 1043
1058 printk(KERN_INFO "CPU%d: ", 0); 1044 pr_info("CPU%d: ", 0);
1059 print_cpu_info(&cpu_data(0)); 1045 print_cpu_info(&cpu_data(0));
1060 x86_init.timers.setup_percpu_clockev(); 1046 x86_init.timers.setup_percpu_clockev();
1061 1047
@@ -1105,7 +1091,7 @@ void __init native_smp_prepare_boot_cpu(void)
1105 1091
1106void __init native_smp_cpus_done(unsigned int max_cpus) 1092void __init native_smp_cpus_done(unsigned int max_cpus)
1107{ 1093{
1108 pr_debug("Boot done.\n"); 1094 pr_debug("Boot done\n");
1109 1095
1110 nmi_selftest(); 1096 nmi_selftest();
1111 impress_friends(); 1097 impress_friends();
@@ -1166,8 +1152,7 @@ __init void prefill_possible_map(void)
1166 1152
1167 /* nr_cpu_ids could be reduced via nr_cpus= */ 1153 /* nr_cpu_ids could be reduced via nr_cpus= */
1168 if (possible > nr_cpu_ids) { 1154 if (possible > nr_cpu_ids) {
1169 printk(KERN_WARNING 1155 pr_warn("%d Processors exceeds NR_CPUS limit of %d\n",
1170 "%d Processors exceeds NR_CPUS limit of %d\n",
1171 possible, nr_cpu_ids); 1156 possible, nr_cpu_ids);
1172 possible = nr_cpu_ids; 1157 possible = nr_cpu_ids;
1173 } 1158 }
@@ -1176,13 +1161,12 @@ __init void prefill_possible_map(void)
1176 if (!setup_max_cpus) 1161 if (!setup_max_cpus)
1177#endif 1162#endif
1178 if (possible > i) { 1163 if (possible > i) {
1179 printk(KERN_WARNING 1164 pr_warn("%d Processors exceeds max_cpus limit of %u\n",
1180 "%d Processors exceeds max_cpus limit of %u\n",
1181 possible, setup_max_cpus); 1165 possible, setup_max_cpus);
1182 possible = i; 1166 possible = i;
1183 } 1167 }
1184 1168
1185 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1169 pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
1186 possible, max_t(int, possible - num_processors, 0)); 1170 possible, max_t(int, possible - num_processors, 0));
1187 1171
1188 for (i = 0; i < possible; i++) 1172 for (i = 0; i < possible; i++)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 05b31d92f69c..b481341c9369 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -9,6 +9,9 @@
9/* 9/*
10 * Handle hardware traps and faults. 10 * Handle hardware traps and faults.
11 */ 11 */
12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
12#include <linux/interrupt.h> 15#include <linux/interrupt.h>
13#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
14#include <linux/spinlock.h> 17#include <linux/spinlock.h>
@@ -143,12 +146,11 @@ trap_signal:
143#ifdef CONFIG_X86_64 146#ifdef CONFIG_X86_64
144 if (show_unhandled_signals && unhandled_signal(tsk, signr) && 147 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
145 printk_ratelimit()) { 148 printk_ratelimit()) {
146 printk(KERN_INFO 149 pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
147 "%s[%d] trap %s ip:%lx sp:%lx error:%lx", 150 tsk->comm, tsk->pid, str,
148 tsk->comm, tsk->pid, str, 151 regs->ip, regs->sp, error_code);
149 regs->ip, regs->sp, error_code);
150 print_vma_addr(" in ", regs->ip); 152 print_vma_addr(" in ", regs->ip);
151 printk("\n"); 153 pr_cont("\n");
152 } 154 }
153#endif 155#endif
154 156
@@ -269,12 +271,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
269 271
270 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 272 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
271 printk_ratelimit()) { 273 printk_ratelimit()) {
272 printk(KERN_INFO 274 pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
273 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
274 tsk->comm, task_pid_nr(tsk), 275 tsk->comm, task_pid_nr(tsk),
275 regs->ip, regs->sp, error_code); 276 regs->ip, regs->sp, error_code);
276 print_vma_addr(" in ", regs->ip); 277 print_vma_addr(" in ", regs->ip);
277 printk("\n"); 278 pr_cont("\n");
278 } 279 }
279 280
280 force_sig(SIGSEGV, tsk); 281 force_sig(SIGSEGV, tsk);
@@ -570,7 +571,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
570 conditional_sti(regs); 571 conditional_sti(regs);
571#if 0 572#if 0
572 /* No need to warn about this any longer. */ 573 /* No need to warn about this any longer. */
573 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); 574 pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
574#endif 575#endif
575} 576}
576 577
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index fc0a147e3727..cfa5d4f7ca56 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/kernel.h> 3#include <linux/kernel.h>
2#include <linux/sched.h> 4#include <linux/sched.h>
3#include <linux/init.h> 5#include <linux/init.h>
@@ -84,8 +86,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
84#ifdef CONFIG_X86_TSC 86#ifdef CONFIG_X86_TSC
85int __init notsc_setup(char *str) 87int __init notsc_setup(char *str)
86{ 88{
87 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " 89 pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n");
88 "cannot disable TSC completely.\n");
89 tsc_disabled = 1; 90 tsc_disabled = 1;
90 return 1; 91 return 1;
91} 92}
@@ -373,7 +374,7 @@ static unsigned long quick_pit_calibrate(void)
373 goto success; 374 goto success;
374 } 375 }
375 } 376 }
376 printk("Fast TSC calibration failed\n"); 377 pr_err("Fast TSC calibration failed\n");
377 return 0; 378 return 0;
378 379
379success: 380success:
@@ -392,7 +393,7 @@ success:
392 */ 393 */
393 delta *= PIT_TICK_RATE; 394 delta *= PIT_TICK_RATE;
394 do_div(delta, i*256*1000); 395 do_div(delta, i*256*1000);
395 printk("Fast TSC calibration using PIT\n"); 396 pr_info("Fast TSC calibration using PIT\n");
396 return delta; 397 return delta;
397} 398}
398 399
@@ -487,9 +488,8 @@ unsigned long native_calibrate_tsc(void)
487 * use the reference value, as it is more precise. 488 * use the reference value, as it is more precise.
488 */ 489 */
489 if (delta >= 90 && delta <= 110) { 490 if (delta >= 90 && delta <= 110) {
490 printk(KERN_INFO 491 pr_info("PIT calibration matches %s. %d loops\n",
491 "TSC: PIT calibration matches %s. %d loops\n", 492 hpet ? "HPET" : "PMTIMER", i + 1);
492 hpet ? "HPET" : "PMTIMER", i + 1);
493 return tsc_ref_min; 493 return tsc_ref_min;
494 } 494 }
495 495
@@ -511,38 +511,36 @@ unsigned long native_calibrate_tsc(void)
511 */ 511 */
512 if (tsc_pit_min == ULONG_MAX) { 512 if (tsc_pit_min == ULONG_MAX) {
513 /* PIT gave no useful value */ 513 /* PIT gave no useful value */
514 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); 514 pr_warn("Unable to calibrate against PIT\n");
515 515
516 /* We don't have an alternative source, disable TSC */ 516 /* We don't have an alternative source, disable TSC */
517 if (!hpet && !ref1 && !ref2) { 517 if (!hpet && !ref1 && !ref2) {
518 printk("TSC: No reference (HPET/PMTIMER) available\n"); 518 pr_notice("No reference (HPET/PMTIMER) available\n");
519 return 0; 519 return 0;
520 } 520 }
521 521
522 /* The alternative source failed as well, disable TSC */ 522 /* The alternative source failed as well, disable TSC */
523 if (tsc_ref_min == ULONG_MAX) { 523 if (tsc_ref_min == ULONG_MAX) {
524 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " 524 pr_warn("HPET/PMTIMER calibration failed\n");
525 "failed.\n");
526 return 0; 525 return 0;
527 } 526 }
528 527
529 /* Use the alternative source */ 528 /* Use the alternative source */
530 printk(KERN_INFO "TSC: using %s reference calibration\n", 529 pr_info("using %s reference calibration\n",
531 hpet ? "HPET" : "PMTIMER"); 530 hpet ? "HPET" : "PMTIMER");
532 531
533 return tsc_ref_min; 532 return tsc_ref_min;
534 } 533 }
535 534
536 /* We don't have an alternative source, use the PIT calibration value */ 535 /* We don't have an alternative source, use the PIT calibration value */
537 if (!hpet && !ref1 && !ref2) { 536 if (!hpet && !ref1 && !ref2) {
538 printk(KERN_INFO "TSC: Using PIT calibration value\n"); 537 pr_info("Using PIT calibration value\n");
539 return tsc_pit_min; 538 return tsc_pit_min;
540 } 539 }
541 540
542 /* The alternative source failed, use the PIT calibration value */ 541 /* The alternative source failed, use the PIT calibration value */
543 if (tsc_ref_min == ULONG_MAX) { 542 if (tsc_ref_min == ULONG_MAX) {
544 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " 543 pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
545 "Using PIT calibration\n");
546 return tsc_pit_min; 544 return tsc_pit_min;
547 } 545 }
548 546
@@ -551,9 +549,9 @@ unsigned long native_calibrate_tsc(void)
551 * the PIT value as we know that there are PMTIMERs around 549 * the PIT value as we know that there are PMTIMERs around
552 * running at double speed. At least we let the user know: 550 * running at double speed. At least we let the user know:
553 */ 551 */
554 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", 552 pr_warn("PIT calibration deviates from %s: %lu %lu\n",
555 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); 553 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
556 printk(KERN_INFO "TSC: Using PIT calibration value\n"); 554 pr_info("Using PIT calibration value\n");
557 return tsc_pit_min; 555 return tsc_pit_min;
558} 556}
559 557
@@ -785,7 +783,7 @@ void mark_tsc_unstable(char *reason)
785 tsc_unstable = 1; 783 tsc_unstable = 1;
786 sched_clock_stable = 0; 784 sched_clock_stable = 0;
787 disable_sched_clock_irqtime(); 785 disable_sched_clock_irqtime();
788 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 786 pr_info("Marking TSC unstable due to %s\n", reason);
789 /* Change only the rating, when not registered */ 787 /* Change only the rating, when not registered */
790 if (clocksource_tsc.mult) 788 if (clocksource_tsc.mult)
791 clocksource_mark_unstable(&clocksource_tsc); 789 clocksource_mark_unstable(&clocksource_tsc);
@@ -912,9 +910,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)
912 goto out; 910 goto out;
913 911
914 tsc_khz = freq; 912 tsc_khz = freq;
915 printk(KERN_INFO "Refined TSC clocksource calibration: " 913 pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
916 "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, 914 (unsigned long)tsc_khz / 1000,
917 (unsigned long)tsc_khz % 1000); 915 (unsigned long)tsc_khz % 1000);
918 916
919out: 917out:
920 clocksource_register_khz(&clocksource_tsc, tsc_khz); 918 clocksource_register_khz(&clocksource_tsc, tsc_khz);
@@ -970,9 +968,9 @@ void __init tsc_init(void)
970 return; 968 return;
971 } 969 }
972 970
973 printk("Detected %lu.%03lu MHz processor.\n", 971 pr_info("Detected %lu.%03lu MHz processor\n",
974 (unsigned long)cpu_khz / 1000, 972 (unsigned long)cpu_khz / 1000,
975 (unsigned long)cpu_khz % 1000); 973 (unsigned long)cpu_khz % 1000);
976 974
977 /* 975 /*
978 * Secondary CPUs do not run through tsc_init(), so set up 976 * Secondary CPUs do not run through tsc_init(), so set up
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index dc4e910a7d96..36fd42091fa7 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -409,9 +409,10 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,
409 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. 409 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
410 * @mm: the probed address space. 410 * @mm: the probed address space.
411 * @arch_uprobe: the probepoint information. 411 * @arch_uprobe: the probepoint information.
412 * @addr: virtual address at which to install the probepoint
412 * Return 0 on success or a -ve number on error. 413 * Return 0 on success or a -ve number on error.
413 */ 414 */
414int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) 415int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
415{ 416{
416 int ret; 417 int ret;
417 struct insn insn; 418 struct insn insn;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 255f58ae71e8..54abcc0baf23 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -28,6 +28,8 @@
28 * 28 *
29 */ 29 */
30 30
31#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
32
31#include <linux/capability.h> 33#include <linux/capability.h>
32#include <linux/errno.h> 34#include <linux/errno.h>
33#include <linux/interrupt.h> 35#include <linux/interrupt.h>
@@ -137,14 +139,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
137 local_irq_enable(); 139 local_irq_enable();
138 140
139 if (!current->thread.vm86_info) { 141 if (!current->thread.vm86_info) {
140 printk("no vm86_info: BAD\n"); 142 pr_alert("no vm86_info: BAD\n");
141 do_exit(SIGSEGV); 143 do_exit(SIGSEGV);
142 } 144 }
143 set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); 145 set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
144 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs); 146 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
145 tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap); 147 tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
146 if (tmp) { 148 if (tmp) {
147 printk("vm86: could not access userspace vm86_info\n"); 149 pr_alert("could not access userspace vm86_info\n");
148 do_exit(SIGSEGV); 150 do_exit(SIGSEGV);
149 } 151 }
150 152
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 8eeb55a551b4..992f890283e9 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -16,6 +16,7 @@
16#include <linux/pci_ids.h> 16#include <linux/pci_ids.h>
17#include <linux/pci_regs.h> 17#include <linux/pci_regs.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/irq.h>
19 20
20#include <asm/apic.h> 21#include <asm/apic.h>
21#include <asm/pci-direct.h> 22#include <asm/pci-direct.h>
@@ -95,6 +96,18 @@ static void __init set_vsmp_pv_ops(void)
95 ctl = readl(address + 4); 96 ctl = readl(address + 4);
96 printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n", 97 printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n",
97 cap, ctl); 98 cap, ctl);
99
100 /* If possible, let the vSMP foundation route the interrupt optimally */
101#ifdef CONFIG_SMP
102 if (cap & ctl & BIT(8)) {
103 ctl &= ~BIT(8);
104#ifdef CONFIG_PROC_FS
105 /* Don't let users change irq affinity via procfs */
106 no_irq_affinity = 1;
107#endif
108 }
109#endif
110
98 if (cap & ctl & (1 << 4)) { 111 if (cap & ctl & (1 << 4)) {
99 /* Setup irq ops and turn on vSMP IRQ fastpath handling */ 112 /* Setup irq ops and turn on vSMP IRQ fastpath handling */
100 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); 113 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
@@ -102,12 +115,11 @@ static void __init set_vsmp_pv_ops(void)
102 pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); 115 pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
103 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); 116 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
104 pv_init_ops.patch = vsmp_patch; 117 pv_init_ops.patch = vsmp_patch;
105
106 ctl &= ~(1 << 4); 118 ctl &= ~(1 << 4);
107 writel(ctl, address + 4);
108 ctl = readl(address + 4);
109 printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
110 } 119 }
120 writel(ctl, address + 4);
121 ctl = readl(address + 4);
122 pr_info("vSMP CTL: control set to:0x%08x\n", ctl);
111 123
112 early_iounmap(address, 8); 124 early_iounmap(address, 8);
113} 125}
@@ -187,12 +199,36 @@ static void __init vsmp_cap_cpus(void)
187#endif 199#endif
188} 200}
189 201
202static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
203{
204 return hard_smp_processor_id() >> index_msb;
205}
206
207/*
208 * In vSMP, all cpus should be capable of handling interrupts, regardless of
209 * the APIC used.
210 */
211static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask,
212 const struct cpumask *mask)
213{
214 cpumask_setall(retmask);
215}
216
217static void vsmp_apic_post_init(void)
218{
219 /* need to update phys_pkg_id */
220 apic->phys_pkg_id = apicid_phys_pkg_id;
221 apic->vector_allocation_domain = fill_vector_allocation_domain;
222}
223
190void __init vsmp_init(void) 224void __init vsmp_init(void)
191{ 225{
192 detect_vsmp_box(); 226 detect_vsmp_box();
193 if (!is_vsmp_box()) 227 if (!is_vsmp_box())
194 return; 228 return;
195 229
230 x86_platform.apic_post_init = vsmp_apic_post_init;
231
196 vsmp_cap_cpus(); 232 vsmp_cap_cpus();
197 233
198 set_vsmp_pv_ops(); 234 set_vsmp_pv_ops();
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 7515cf0e1805..8d141b309046 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -18,6 +18,8 @@
18 * use the vDSO. 18 * use the vDSO.
19 */ 19 */
20 20
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22
21#include <linux/time.h> 23#include <linux/time.h>
22#include <linux/init.h> 24#include <linux/init.h>
23#include <linux/kernel.h> 25#include <linux/kernel.h>
@@ -111,18 +113,13 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
111static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, 113static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
112 const char *message) 114 const char *message)
113{ 115{
114 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); 116 if (!show_unhandled_signals)
115 struct task_struct *tsk;
116
117 if (!show_unhandled_signals || !__ratelimit(&rs))
118 return; 117 return;
119 118
120 tsk = current; 119 pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
121 120 level, current->comm, task_pid_nr(current),
122 printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", 121 message, regs->ip, regs->cs,
123 level, tsk->comm, task_pid_nr(tsk), 122 regs->sp, regs->ax, regs->si, regs->di);
124 message, regs->ip, regs->cs,
125 regs->sp, regs->ax, regs->si, regs->di);
126} 123}
127 124
128static int addr_to_vsyscall_nr(unsigned long addr) 125static int addr_to_vsyscall_nr(unsigned long addr)
@@ -139,6 +136,19 @@ static int addr_to_vsyscall_nr(unsigned long addr)
139 return nr; 136 return nr;
140} 137}
141 138
139#ifdef CONFIG_SECCOMP
140static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
141{
142 if (!seccomp_mode(&tsk->seccomp))
143 return 0;
144 task_pt_regs(tsk)->orig_ax = syscall_nr;
145 task_pt_regs(tsk)->ax = syscall_nr;
146 return __secure_computing(syscall_nr);
147}
148#else
149#define vsyscall_seccomp(_tsk, _nr) 0
150#endif
151
142static bool write_ok_or_segv(unsigned long ptr, size_t size) 152static bool write_ok_or_segv(unsigned long ptr, size_t size)
143{ 153{
144 /* 154 /*
@@ -174,6 +184,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
174 int vsyscall_nr; 184 int vsyscall_nr;
175 int prev_sig_on_uaccess_error; 185 int prev_sig_on_uaccess_error;
176 long ret; 186 long ret;
187 int skip;
177 188
178 /* 189 /*
179 * No point in checking CS -- the only way to get here is a user mode 190 * No point in checking CS -- the only way to get here is a user mode
@@ -205,9 +216,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
205 } 216 }
206 217
207 tsk = current; 218 tsk = current;
208 if (seccomp_mode(&tsk->seccomp))
209 do_exit(SIGKILL);
210
211 /* 219 /*
212 * With a real vsyscall, page faults cause SIGSEGV. We want to 220 * With a real vsyscall, page faults cause SIGSEGV. We want to
213 * preserve that behavior to make writing exploits harder. 221 * preserve that behavior to make writing exploits harder.
@@ -222,8 +230,13 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
222 * address 0". 230 * address 0".
223 */ 231 */
224 ret = -EFAULT; 232 ret = -EFAULT;
233 skip = 0;
225 switch (vsyscall_nr) { 234 switch (vsyscall_nr) {
226 case 0: 235 case 0:
236 skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
237 if (skip)
238 break;
239
227 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || 240 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
228 !write_ok_or_segv(regs->si, sizeof(struct timezone))) 241 !write_ok_or_segv(regs->si, sizeof(struct timezone)))
229 break; 242 break;
@@ -234,6 +247,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
234 break; 247 break;
235 248
236 case 1: 249 case 1:
250 skip = vsyscall_seccomp(tsk, __NR_time);
251 if (skip)
252 break;
253
237 if (!write_ok_or_segv(regs->di, sizeof(time_t))) 254 if (!write_ok_or_segv(regs->di, sizeof(time_t)))
238 break; 255 break;
239 256
@@ -241,6 +258,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
241 break; 258 break;
242 259
243 case 2: 260 case 2:
261 skip = vsyscall_seccomp(tsk, __NR_getcpu);
262 if (skip)
263 break;
264
244 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || 265 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
245 !write_ok_or_segv(regs->si, sizeof(unsigned))) 266 !write_ok_or_segv(regs->si, sizeof(unsigned)))
246 break; 267 break;
@@ -253,6 +274,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
253 274
254 current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; 275 current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
255 276
277 if (skip) {
278 if ((long)regs->ax <= 0L) /* seccomp errno emulation */
279 goto do_ret;
280 goto done; /* seccomp trace/trap */
281 }
282
256 if (ret == -EFAULT) { 283 if (ret == -EFAULT) {
257 /* Bad news -- userspace fed a bad pointer to a vsyscall. */ 284 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
258 warn_bad_vsyscall(KERN_INFO, regs, 285 warn_bad_vsyscall(KERN_INFO, regs,
@@ -271,10 +298,11 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
271 298
272 regs->ax = ret; 299 regs->ax = ret;
273 300
301do_ret:
274 /* Emulate a ret instruction. */ 302 /* Emulate a ret instruction. */
275 regs->ip = caller; 303 regs->ip = caller;
276 regs->sp += 8; 304 regs->sp += 8;
277 305done:
278 return true; 306 return true;
279 307
280sigsegv: 308sigsegv:
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 9796c2f3d074..6020f6f5927c 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -28,6 +28,7 @@ EXPORT_SYMBOL(__put_user_8);
28 28
29EXPORT_SYMBOL(copy_user_generic_string); 29EXPORT_SYMBOL(copy_user_generic_string);
30EXPORT_SYMBOL(copy_user_generic_unrolled); 30EXPORT_SYMBOL(copy_user_generic_unrolled);
31EXPORT_SYMBOL(copy_user_enhanced_fast_string);
31EXPORT_SYMBOL(__copy_user_nocache); 32EXPORT_SYMBOL(__copy_user_nocache);
32EXPORT_SYMBOL(_copy_from_user); 33EXPORT_SYMBOL(_copy_from_user);
33EXPORT_SYMBOL(_copy_to_user); 34EXPORT_SYMBOL(_copy_to_user);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 35c5e543f550..9f3167e891ef 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -29,7 +29,6 @@ void __init x86_init_uint_noop(unsigned int unused) { }
29void __init x86_init_pgd_noop(pgd_t *unused) { } 29void __init x86_init_pgd_noop(pgd_t *unused) { }
30int __init iommu_init_noop(void) { return 0; } 30int __init iommu_init_noop(void) { return 0; }
31void iommu_shutdown_noop(void) { } 31void iommu_shutdown_noop(void) { }
32void wallclock_init_noop(void) { }
33 32
34/* 33/*
35 * The platform setup functions are preset with the default functions 34 * The platform setup functions are preset with the default functions
@@ -101,7 +100,6 @@ static int default_i8042_detect(void) { return 1; };
101 100
102struct x86_platform_ops x86_platform = { 101struct x86_platform_ops x86_platform = {
103 .calibrate_tsc = native_calibrate_tsc, 102 .calibrate_tsc = native_calibrate_tsc,
104 .wallclock_init = wallclock_init_noop,
105 .get_wallclock = mach_get_cmos_time, 103 .get_wallclock = mach_get_cmos_time,
106 .set_wallclock = mach_set_rtc_mmss, 104 .set_wallclock = mach_set_rtc_mmss,
107 .iommu_shutdown = iommu_shutdown_noop, 105 .iommu_shutdown = iommu_shutdown_noop,
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index bd18149b2b0f..3d3e20709119 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -3,6 +3,9 @@
3 * 3 *
4 * Author: Suresh Siddha <suresh.b.siddha@intel.com> 4 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
5 */ 5 */
6
7#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8
6#include <linux/bootmem.h> 9#include <linux/bootmem.h>
7#include <linux/compat.h> 10#include <linux/compat.h>
8#include <asm/i387.h> 11#include <asm/i387.h>
@@ -162,7 +165,7 @@ int save_i387_xstate(void __user *buf)
162 BUG_ON(sig_xstate_size < xstate_size); 165 BUG_ON(sig_xstate_size < xstate_size);
163 166
164 if ((unsigned long)buf % 64) 167 if ((unsigned long)buf % 64)
165 printk("save_i387_xstate: bad fpstate %p\n", buf); 168 pr_err("%s: bad fpstate %p\n", __func__, buf);
166 169
167 if (!used_math()) 170 if (!used_math())
168 return 0; 171 return 0;
@@ -422,7 +425,7 @@ static void __init xstate_enable_boot_cpu(void)
422 pcntxt_mask = eax + ((u64)edx << 32); 425 pcntxt_mask = eax + ((u64)edx << 32);
423 426
424 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { 427 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
425 printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n", 428 pr_err("FP/SSE not shown under xsave features 0x%llx\n",
426 pcntxt_mask); 429 pcntxt_mask);
427 BUG(); 430 BUG();
428 } 431 }
@@ -445,9 +448,8 @@ static void __init xstate_enable_boot_cpu(void)
445 448
446 setup_xstate_init(); 449 setup_xstate_init();
447 450
448 printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, " 451 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
449 "cntxt size 0x%x\n", 452 pcntxt_mask, xstate_size);
450 pcntxt_mask, xstate_size);
451} 453}
452 454
453/* 455/*
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7df1c6d839fb..0595f1397b7c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
201 unsigned f_lm = 0; 201 unsigned f_lm = 0;
202#endif 202#endif
203 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; 203 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
204 unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
204 205
205 /* cpuid 1.edx */ 206 /* cpuid 1.edx */
206 const u32 kvm_supported_word0_x86_features = 207 const u32 kvm_supported_word0_x86_features =
@@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
228 0 /* DS-CPL, VMX, SMX, EST */ | 229 0 /* DS-CPL, VMX, SMX, EST */ |
229 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 230 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
230 F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | 231 F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
231 0 /* Reserved, DCA */ | F(XMM4_1) | 232 F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
232 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 233 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
233 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | 234 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
234 F(F16C) | F(RDRAND); 235 F(F16C) | F(RDRAND);
@@ -248,7 +249,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
248 /* cpuid 7.0.ebx */ 249 /* cpuid 7.0.ebx */
249 const u32 kvm_supported_word9_x86_features = 250 const u32 kvm_supported_word9_x86_features =
250 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | 251 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
251 F(BMI2) | F(ERMS) | F(RTM); 252 F(BMI2) | F(ERMS) | f_invpcid | F(RTM);
252 253
253 /* all calls to cpuid_count() should be made on the same cpu */ 254 /* all calls to cpuid_count() should be made on the same cpu */
254 get_cpu(); 255 get_cpu();
@@ -409,6 +410,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
409 (1 << KVM_FEATURE_NOP_IO_DELAY) | 410 (1 << KVM_FEATURE_NOP_IO_DELAY) |
410 (1 << KVM_FEATURE_CLOCKSOURCE2) | 411 (1 << KVM_FEATURE_CLOCKSOURCE2) |
411 (1 << KVM_FEATURE_ASYNC_PF) | 412 (1 << KVM_FEATURE_ASYNC_PF) |
413 (1 << KVM_FEATURE_PV_EOI) |
412 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 414 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
413 415
414 if (sched_info_on()) 416 if (sched_info_on())
@@ -639,33 +641,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
639 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); 641 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
640} 642}
641 643
642void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 644void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
643{ 645{
644 u32 function, index; 646 u32 function = *eax, index = *ecx;
645 struct kvm_cpuid_entry2 *best; 647 struct kvm_cpuid_entry2 *best;
646 648
647 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
648 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
649 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
650 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
651 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
652 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
653 best = kvm_find_cpuid_entry(vcpu, function, index); 649 best = kvm_find_cpuid_entry(vcpu, function, index);
654 650
655 if (!best) 651 if (!best)
656 best = check_cpuid_limit(vcpu, function, index); 652 best = check_cpuid_limit(vcpu, function, index);
657 653
658 if (best) { 654 if (best) {
659 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 655 *eax = best->eax;
660 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 656 *ebx = best->ebx;
661 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 657 *ecx = best->ecx;
662 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 658 *edx = best->edx;
663 } 659 } else
660 *eax = *ebx = *ecx = *edx = 0;
661}
662
663void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
664{
665 u32 function, eax, ebx, ecx, edx;
666
667 function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
668 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
669 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
670 kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
671 kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
672 kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
673 kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
664 kvm_x86_ops->skip_emulated_instruction(vcpu); 674 kvm_x86_ops->skip_emulated_instruction(vcpu);
665 trace_kvm_cpuid(function, 675 trace_kvm_cpuid(function, eax, ebx, ecx, edx);
666 kvm_register_read(vcpu, VCPU_REGS_RAX),
667 kvm_register_read(vcpu, VCPU_REGS_RBX),
668 kvm_register_read(vcpu, VCPU_REGS_RCX),
669 kvm_register_read(vcpu, VCPU_REGS_RDX));
670} 676}
671EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 677EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 26d1fb437eb5..a10e46016851 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
17int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 17int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
18 struct kvm_cpuid2 *cpuid, 18 struct kvm_cpuid2 *cpuid,
19 struct kvm_cpuid_entry2 __user *entries); 19 struct kvm_cpuid_entry2 __user *entries);
20void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
20 21
21 22
22static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) 23static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
@@ -51,4 +52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
51 return best && (best->ecx & bit(X86_FEATURE_OSVW)); 52 return best && (best->ecx & bit(X86_FEATURE_OSVW));
52} 53}
53 54
55static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu)
56{
57 struct kvm_cpuid_entry2 *best;
58
59 best = kvm_find_cpuid_entry(vcpu, 1, 0);
60 return best && (best->ecx & bit(X86_FEATURE_PCID));
61}
62
54#endif 63#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f95d242ee9f7..97d9a9914ba8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -433,11 +433,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
433 return ctxt->ops->intercept(ctxt, &info, stage); 433 return ctxt->ops->intercept(ctxt, &info, stage);
434} 434}
435 435
436static void assign_masked(ulong *dest, ulong src, ulong mask)
437{
438 *dest = (*dest & ~mask) | (src & mask);
439}
440
436static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) 441static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
437{ 442{
438 return (1UL << (ctxt->ad_bytes << 3)) - 1; 443 return (1UL << (ctxt->ad_bytes << 3)) - 1;
439} 444}
440 445
446static ulong stack_mask(struct x86_emulate_ctxt *ctxt)
447{
448 u16 sel;
449 struct desc_struct ss;
450
451 if (ctxt->mode == X86EMUL_MODE_PROT64)
452 return ~0UL;
453 ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS);
454 return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */
455}
456
457static int stack_size(struct x86_emulate_ctxt *ctxt)
458{
459 return (__fls(stack_mask(ctxt)) + 1) >> 3;
460}
461
441/* Access/update address held in a register, based on addressing mode. */ 462/* Access/update address held in a register, based on addressing mode. */
442static inline unsigned long 463static inline unsigned long
443address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) 464address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
@@ -958,6 +979,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
958 op->orig_val = op->val; 979 op->orig_val = op->val;
959} 980}
960 981
982static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
983{
984 if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP)
985 ctxt->modrm_seg = VCPU_SREG_SS;
986}
987
961static int decode_modrm(struct x86_emulate_ctxt *ctxt, 988static int decode_modrm(struct x86_emulate_ctxt *ctxt,
962 struct operand *op) 989 struct operand *op)
963{ 990{
@@ -1061,15 +1088,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
1061 1088
1062 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) 1089 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
1063 modrm_ea += insn_fetch(s32, ctxt); 1090 modrm_ea += insn_fetch(s32, ctxt);
1064 else 1091 else {
1065 modrm_ea += ctxt->regs[base_reg]; 1092 modrm_ea += ctxt->regs[base_reg];
1093 adjust_modrm_seg(ctxt, base_reg);
1094 }
1066 if (index_reg != 4) 1095 if (index_reg != 4)
1067 modrm_ea += ctxt->regs[index_reg] << scale; 1096 modrm_ea += ctxt->regs[index_reg] << scale;
1068 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { 1097 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
1069 if (ctxt->mode == X86EMUL_MODE_PROT64) 1098 if (ctxt->mode == X86EMUL_MODE_PROT64)
1070 ctxt->rip_relative = 1; 1099 ctxt->rip_relative = 1;
1071 } else 1100 } else {
1072 modrm_ea += ctxt->regs[ctxt->modrm_rm]; 1101 base_reg = ctxt->modrm_rm;
1102 modrm_ea += ctxt->regs[base_reg];
1103 adjust_modrm_seg(ctxt, base_reg);
1104 }
1073 switch (ctxt->modrm_mod) { 1105 switch (ctxt->modrm_mod) {
1074 case 0: 1106 case 0:
1075 if (ctxt->modrm_rm == 5) 1107 if (ctxt->modrm_rm == 5)
@@ -1264,7 +1296,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1264 1296
1265/* allowed just for 8 bytes segments */ 1297/* allowed just for 8 bytes segments */
1266static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1298static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1267 u16 selector, struct desc_struct *desc) 1299 u16 selector, struct desc_struct *desc,
1300 ulong *desc_addr_p)
1268{ 1301{
1269 struct desc_ptr dt; 1302 struct desc_ptr dt;
1270 u16 index = selector >> 3; 1303 u16 index = selector >> 3;
@@ -1275,7 +1308,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1275 if (dt.size < index * 8 + 7) 1308 if (dt.size < index * 8 + 7)
1276 return emulate_gp(ctxt, selector & 0xfffc); 1309 return emulate_gp(ctxt, selector & 0xfffc);
1277 1310
1278 addr = dt.address + index * 8; 1311 *desc_addr_p = addr = dt.address + index * 8;
1279 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, 1312 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
1280 &ctxt->exception); 1313 &ctxt->exception);
1281} 1314}
@@ -1302,11 +1335,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1302static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1335static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1303 u16 selector, int seg) 1336 u16 selector, int seg)
1304{ 1337{
1305 struct desc_struct seg_desc; 1338 struct desc_struct seg_desc, old_desc;
1306 u8 dpl, rpl, cpl; 1339 u8 dpl, rpl, cpl;
1307 unsigned err_vec = GP_VECTOR; 1340 unsigned err_vec = GP_VECTOR;
1308 u32 err_code = 0; 1341 u32 err_code = 0;
1309 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ 1342 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
1343 ulong desc_addr;
1310 int ret; 1344 int ret;
1311 1345
1312 memset(&seg_desc, 0, sizeof seg_desc); 1346 memset(&seg_desc, 0, sizeof seg_desc);
@@ -1324,8 +1358,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1324 goto load; 1358 goto load;
1325 } 1359 }
1326 1360
1327 /* NULL selector is not valid for TR, CS and SS */ 1361 rpl = selector & 3;
1328 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) 1362 cpl = ctxt->ops->cpl(ctxt);
1363
1364 /* NULL selector is not valid for TR, CS and SS (except for long mode) */
1365 if ((seg == VCPU_SREG_CS
1366 || (seg == VCPU_SREG_SS
1367 && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl))
1368 || seg == VCPU_SREG_TR)
1329 && null_selector) 1369 && null_selector)
1330 goto exception; 1370 goto exception;
1331 1371
@@ -1336,7 +1376,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1336 if (null_selector) /* for NULL selector skip all following checks */ 1376 if (null_selector) /* for NULL selector skip all following checks */
1337 goto load; 1377 goto load;
1338 1378
1339 ret = read_segment_descriptor(ctxt, selector, &seg_desc); 1379 ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
1340 if (ret != X86EMUL_CONTINUE) 1380 if (ret != X86EMUL_CONTINUE)
1341 return ret; 1381 return ret;
1342 1382
@@ -1352,9 +1392,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1352 goto exception; 1392 goto exception;
1353 } 1393 }
1354 1394
1355 rpl = selector & 3;
1356 dpl = seg_desc.dpl; 1395 dpl = seg_desc.dpl;
1357 cpl = ctxt->ops->cpl(ctxt);
1358 1396
1359 switch (seg) { 1397 switch (seg) {
1360 case VCPU_SREG_SS: 1398 case VCPU_SREG_SS:
@@ -1384,6 +1422,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1384 case VCPU_SREG_TR: 1422 case VCPU_SREG_TR:
1385 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) 1423 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
1386 goto exception; 1424 goto exception;
1425 old_desc = seg_desc;
1426 seg_desc.type |= 2; /* busy */
1427 ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
1428 sizeof(seg_desc), &ctxt->exception);
1429 if (ret != X86EMUL_CONTINUE)
1430 return ret;
1387 break; 1431 break;
1388 case VCPU_SREG_LDTR: 1432 case VCPU_SREG_LDTR:
1389 if (seg_desc.s || seg_desc.type != 2) 1433 if (seg_desc.s || seg_desc.type != 2)
@@ -1474,17 +1518,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1474 return X86EMUL_CONTINUE; 1518 return X86EMUL_CONTINUE;
1475} 1519}
1476 1520
1477static int em_push(struct x86_emulate_ctxt *ctxt) 1521static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
1478{ 1522{
1479 struct segmented_address addr; 1523 struct segmented_address addr;
1480 1524
1481 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); 1525 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes);
1482 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); 1526 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
1483 addr.seg = VCPU_SREG_SS; 1527 addr.seg = VCPU_SREG_SS;
1484 1528
1529 return segmented_write(ctxt, addr, data, bytes);
1530}
1531
1532static int em_push(struct x86_emulate_ctxt *ctxt)
1533{
1485 /* Disable writeback. */ 1534 /* Disable writeback. */
1486 ctxt->dst.type = OP_NONE; 1535 ctxt->dst.type = OP_NONE;
1487 return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); 1536 return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
1488} 1537}
1489 1538
1490static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1539static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1556,6 +1605,33 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)
1556 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); 1605 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
1557} 1606}
1558 1607
1608static int em_enter(struct x86_emulate_ctxt *ctxt)
1609{
1610 int rc;
1611 unsigned frame_size = ctxt->src.val;
1612 unsigned nesting_level = ctxt->src2.val & 31;
1613
1614 if (nesting_level)
1615 return X86EMUL_UNHANDLEABLE;
1616
1617 rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt));
1618 if (rc != X86EMUL_CONTINUE)
1619 return rc;
1620 assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP],
1621 stack_mask(ctxt));
1622 assign_masked(&ctxt->regs[VCPU_REGS_RSP],
1623 ctxt->regs[VCPU_REGS_RSP] - frame_size,
1624 stack_mask(ctxt));
1625 return X86EMUL_CONTINUE;
1626}
1627
1628static int em_leave(struct x86_emulate_ctxt *ctxt)
1629{
1630 assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP],
1631 stack_mask(ctxt));
1632 return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes);
1633}
1634
1559static int em_push_sreg(struct x86_emulate_ctxt *ctxt) 1635static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
1560{ 1636{
1561 int seg = ctxt->src2.val; 1637 int seg = ctxt->src2.val;
@@ -1993,8 +2069,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
1993 u32 eax, ebx, ecx, edx; 2069 u32 eax, ebx, ecx, edx;
1994 2070
1995 eax = ecx = 0; 2071 eax = ecx = 0;
1996 return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) 2072 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
1997 && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 2073 return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
1998 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 2074 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
1999 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; 2075 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
2000} 2076}
@@ -2013,32 +2089,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
2013 2089
2014 eax = 0x00000000; 2090 eax = 0x00000000;
2015 ecx = 0x00000000; 2091 ecx = 0x00000000;
2016 if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { 2092 ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
2017 /* 2093 /*
2018 * Intel ("GenuineIntel") 2094 * Intel ("GenuineIntel")
2019 * remark: Intel CPUs only support "syscall" in 64bit 2095 * remark: Intel CPUs only support "syscall" in 64bit
2020 * longmode. Also an 64bit guest with a 2096 * longmode. Also an 64bit guest with a
2021 * 32bit compat-app running will #UD !! While this 2097 * 32bit compat-app running will #UD !! While this
2022 * behaviour can be fixed (by emulating) into AMD 2098 * behaviour can be fixed (by emulating) into AMD
2023 * response - CPUs of AMD can't behave like Intel. 2099 * response - CPUs of AMD can't behave like Intel.
2024 */ 2100 */
2025 if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && 2101 if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
2026 ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && 2102 ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
2027 edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) 2103 edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
2028 return false; 2104 return false;
2029 2105
2030 /* AMD ("AuthenticAMD") */ 2106 /* AMD ("AuthenticAMD") */
2031 if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && 2107 if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
2032 ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && 2108 ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
2033 edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) 2109 edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
2034 return true; 2110 return true;
2035 2111
2036 /* AMD ("AMDisbetter!") */ 2112 /* AMD ("AMDisbetter!") */
2037 if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && 2113 if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
2038 ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && 2114 ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
2039 edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) 2115 edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
2040 return true; 2116 return true;
2041 }
2042 2117
2043 /* default: (not Intel, not AMD), apply Intel's stricter rules... */ 2118 /* default: (not Intel, not AMD), apply Intel's stricter rules... */
2044 return false; 2119 return false;
@@ -2547,13 +2622,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2547 ulong old_tss_base = 2622 ulong old_tss_base =
2548 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); 2623 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
2549 u32 desc_limit; 2624 u32 desc_limit;
2625 ulong desc_addr;
2550 2626
2551 /* FIXME: old_tss_base == ~0 ? */ 2627 /* FIXME: old_tss_base == ~0 ? */
2552 2628
2553 ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); 2629 ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);
2554 if (ret != X86EMUL_CONTINUE) 2630 if (ret != X86EMUL_CONTINUE)
2555 return ret; 2631 return ret;
2556 ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); 2632 ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);
2557 if (ret != X86EMUL_CONTINUE) 2633 if (ret != X86EMUL_CONTINUE)
2558 return ret; 2634 return ret;
2559 2635
@@ -2948,6 +3024,24 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
2948 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); 3024 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
2949} 3025}
2950 3026
3027static int em_lldt(struct x86_emulate_ctxt *ctxt)
3028{
3029 u16 sel = ctxt->src.val;
3030
3031 /* Disable writeback. */
3032 ctxt->dst.type = OP_NONE;
3033 return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
3034}
3035
3036static int em_ltr(struct x86_emulate_ctxt *ctxt)
3037{
3038 u16 sel = ctxt->src.val;
3039
3040 /* Disable writeback. */
3041 ctxt->dst.type = OP_NONE;
3042 return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR);
3043}
3044
2951static int em_invlpg(struct x86_emulate_ctxt *ctxt) 3045static int em_invlpg(struct x86_emulate_ctxt *ctxt)
2952{ 3046{
2953 int rc; 3047 int rc;
@@ -2989,11 +3083,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2989 return X86EMUL_CONTINUE; 3083 return X86EMUL_CONTINUE;
2990} 3084}
2991 3085
3086static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
3087 void (*get)(struct x86_emulate_ctxt *ctxt,
3088 struct desc_ptr *ptr))
3089{
3090 struct desc_ptr desc_ptr;
3091
3092 if (ctxt->mode == X86EMUL_MODE_PROT64)
3093 ctxt->op_bytes = 8;
3094 get(ctxt, &desc_ptr);
3095 if (ctxt->op_bytes == 2) {
3096 ctxt->op_bytes = 4;
3097 desc_ptr.address &= 0x00ffffff;
3098 }
3099 /* Disable writeback. */
3100 ctxt->dst.type = OP_NONE;
3101 return segmented_write(ctxt, ctxt->dst.addr.mem,
3102 &desc_ptr, 2 + ctxt->op_bytes);
3103}
3104
3105static int em_sgdt(struct x86_emulate_ctxt *ctxt)
3106{
3107 return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt);
3108}
3109
3110static int em_sidt(struct x86_emulate_ctxt *ctxt)
3111{
3112 return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
3113}
3114
2992static int em_lgdt(struct x86_emulate_ctxt *ctxt) 3115static int em_lgdt(struct x86_emulate_ctxt *ctxt)
2993{ 3116{
2994 struct desc_ptr desc_ptr; 3117 struct desc_ptr desc_ptr;
2995 int rc; 3118 int rc;
2996 3119
3120 if (ctxt->mode == X86EMUL_MODE_PROT64)
3121 ctxt->op_bytes = 8;
2997 rc = read_descriptor(ctxt, ctxt->src.addr.mem, 3122 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
2998 &desc_ptr.size, &desc_ptr.address, 3123 &desc_ptr.size, &desc_ptr.address,
2999 ctxt->op_bytes); 3124 ctxt->op_bytes);
@@ -3021,6 +3146,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
3021 struct desc_ptr desc_ptr; 3146 struct desc_ptr desc_ptr;
3022 int rc; 3147 int rc;
3023 3148
3149 if (ctxt->mode == X86EMUL_MODE_PROT64)
3150 ctxt->op_bytes = 8;
3024 rc = read_descriptor(ctxt, ctxt->src.addr.mem, 3151 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
3025 &desc_ptr.size, &desc_ptr.address, 3152 &desc_ptr.size, &desc_ptr.address,
3026 ctxt->op_bytes); 3153 ctxt->op_bytes);
@@ -3143,6 +3270,42 @@ static int em_bsr(struct x86_emulate_ctxt *ctxt)
3143 return X86EMUL_CONTINUE; 3270 return X86EMUL_CONTINUE;
3144} 3271}
3145 3272
3273static int em_cpuid(struct x86_emulate_ctxt *ctxt)
3274{
3275 u32 eax, ebx, ecx, edx;
3276
3277 eax = ctxt->regs[VCPU_REGS_RAX];
3278 ecx = ctxt->regs[VCPU_REGS_RCX];
3279 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
3280 ctxt->regs[VCPU_REGS_RAX] = eax;
3281 ctxt->regs[VCPU_REGS_RBX] = ebx;
3282 ctxt->regs[VCPU_REGS_RCX] = ecx;
3283 ctxt->regs[VCPU_REGS_RDX] = edx;
3284 return X86EMUL_CONTINUE;
3285}
3286
3287static int em_lahf(struct x86_emulate_ctxt *ctxt)
3288{
3289 ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL;
3290 ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8;
3291 return X86EMUL_CONTINUE;
3292}
3293
3294static int em_bswap(struct x86_emulate_ctxt *ctxt)
3295{
3296 switch (ctxt->op_bytes) {
3297#ifdef CONFIG_X86_64
3298 case 8:
3299 asm("bswap %0" : "+r"(ctxt->dst.val));
3300 break;
3301#endif
3302 default:
3303 asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val));
3304 break;
3305 }
3306 return X86EMUL_CONTINUE;
3307}
3308
3146static bool valid_cr(int nr) 3309static bool valid_cr(int nr)
3147{ 3310{
3148 switch (nr) { 3311 switch (nr) {
@@ -3424,14 +3587,14 @@ static struct opcode group5[] = {
3424static struct opcode group6[] = { 3587static struct opcode group6[] = {
3425 DI(Prot, sldt), 3588 DI(Prot, sldt),
3426 DI(Prot, str), 3589 DI(Prot, str),
3427 DI(Prot | Priv, lldt), 3590 II(Prot | Priv | SrcMem16, em_lldt, lldt),
3428 DI(Prot | Priv, ltr), 3591 II(Prot | Priv | SrcMem16, em_ltr, ltr),
3429 N, N, N, N, 3592 N, N, N, N,
3430}; 3593};
3431 3594
3432static struct group_dual group7 = { { 3595static struct group_dual group7 = { {
3433 DI(Mov | DstMem | Priv, sgdt), 3596 II(Mov | DstMem | Priv, em_sgdt, sgdt),
3434 DI(Mov | DstMem | Priv, sidt), 3597 II(Mov | DstMem | Priv, em_sidt, sidt),
3435 II(SrcMem | Priv, em_lgdt, lgdt), 3598 II(SrcMem | Priv, em_lgdt, lgdt),
3436 II(SrcMem | Priv, em_lidt, lidt), 3599 II(SrcMem | Priv, em_lidt, lidt),
3437 II(SrcNone | DstMem | Mov, em_smsw, smsw), N, 3600 II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
@@ -3538,7 +3701,7 @@ static struct opcode opcode_table[256] = {
3538 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), 3701 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
3539 I(SrcImmFAddr | No64, em_call_far), N, 3702 I(SrcImmFAddr | No64, em_call_far), N,
3540 II(ImplicitOps | Stack, em_pushf, pushf), 3703 II(ImplicitOps | Stack, em_pushf, pushf),
3541 II(ImplicitOps | Stack, em_popf, popf), N, N, 3704 II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf),
3542 /* 0xA0 - 0xA7 */ 3705 /* 0xA0 - 0xA7 */
3543 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 3706 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
3544 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), 3707 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
@@ -3561,7 +3724,8 @@ static struct opcode opcode_table[256] = {
3561 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), 3724 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
3562 G(ByteOp, group11), G(0, group11), 3725 G(ByteOp, group11), G(0, group11),
3563 /* 0xC8 - 0xCF */ 3726 /* 0xC8 - 0xCF */
3564 N, N, N, I(ImplicitOps | Stack, em_ret_far), 3727 I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
3728 N, I(ImplicitOps | Stack, em_ret_far),
3565 D(ImplicitOps), DI(SrcImmByte, intn), 3729 D(ImplicitOps), DI(SrcImmByte, intn),
3566 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), 3730 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
3567 /* 0xD0 - 0xD7 */ 3731 /* 0xD0 - 0xD7 */
@@ -3635,7 +3799,7 @@ static struct opcode twobyte_table[256] = {
3635 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3799 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
3636 /* 0xA0 - 0xA7 */ 3800 /* 0xA0 - 0xA7 */
3637 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), 3801 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
3638 DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), 3802 II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
3639 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3803 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3640 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 3804 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
3641 /* 0xA8 - 0xAF */ 3805 /* 0xA8 - 0xAF */
@@ -3658,11 +3822,12 @@ static struct opcode twobyte_table[256] = {
3658 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), 3822 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
3659 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), 3823 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
3660 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 3824 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3661 /* 0xC0 - 0xCF */ 3825 /* 0xC0 - 0xC7 */
3662 D2bv(DstMem | SrcReg | ModRM | Lock), 3826 D2bv(DstMem | SrcReg | ModRM | Lock),
3663 N, D(DstMem | SrcReg | ModRM | Mov), 3827 N, D(DstMem | SrcReg | ModRM | Mov),
3664 N, N, N, GD(0, &group9), 3828 N, N, N, GD(0, &group9),
3665 N, N, N, N, N, N, N, N, 3829 /* 0xC8 - 0xCF */
3830 X8(I(DstReg, em_bswap)),
3666 /* 0xD0 - 0xDF */ 3831 /* 0xD0 - 0xDF */
3667 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 3832 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
3668 /* 0xE0 - 0xEF */ 3833 /* 0xE0 - 0xEF */
@@ -4426,12 +4591,12 @@ twobyte_insn:
4426 break; 4591 break;
4427 case 0xb6 ... 0xb7: /* movzx */ 4592 case 0xb6 ... 0xb7: /* movzx */
4428 ctxt->dst.bytes = ctxt->op_bytes; 4593 ctxt->dst.bytes = ctxt->op_bytes;
4429 ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val 4594 ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
4430 : (u16) ctxt->src.val; 4595 : (u16) ctxt->src.val;
4431 break; 4596 break;
4432 case 0xbe ... 0xbf: /* movsx */ 4597 case 0xbe ... 0xbf: /* movsx */
4433 ctxt->dst.bytes = ctxt->op_bytes; 4598 ctxt->dst.bytes = ctxt->op_bytes;
4434 ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : 4599 ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
4435 (s16) ctxt->src.val; 4600 (s16) ctxt->src.val;
4436 break; 4601 break;
4437 case 0xc0 ... 0xc1: /* xadd */ 4602 case 0xc0 ... 0xc1: /* xadd */
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 81cf4fa4a2be..1df8fb9e1d5d 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s)
188 pic_unlock(s); 188 pic_unlock(s);
189} 189}
190 190
191int kvm_pic_set_irq(void *opaque, int irq, int level) 191int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
192{ 192{
193 struct kvm_pic *s = opaque;
194 int ret = -1; 193 int ret = -1;
195 194
196 pic_lock(s); 195 pic_lock(s);
197 if (irq >= 0 && irq < PIC_NUM_PINS) { 196 if (irq >= 0 && irq < PIC_NUM_PINS) {
198 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 197 int irq_level = __kvm_irq_line_state(&s->irq_states[irq],
198 irq_source_id, level);
199 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
199 pic_update_irq(s); 200 pic_update_irq(s);
200 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 201 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
201 s->pics[irq >> 3].imr, ret == 0); 202 s->pics[irq >> 3].imr, ret == 0);
@@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
205 return ret; 206 return ret;
206} 207}
207 208
209void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
210{
211 int i;
212
213 pic_lock(s);
214 for (i = 0; i < PIC_NUM_PINS; i++)
215 __clear_bit(irq_source_id, &s->irq_states[i]);
216 pic_unlock(s);
217}
218
208/* 219/*
209 * acknowledge interrupt 'irq' 220 * acknowledge interrupt 'irq'
210 */ 221 */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93c15743f1ee..ce878788a39f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -107,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap)
107 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 107 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
108} 108}
109 109
110static inline int __apic_test_and_set_vector(int vec, void *bitmap)
111{
112 return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
113}
114
115static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
116{
117 return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
118}
119
110static inline int apic_hw_enabled(struct kvm_lapic *apic) 120static inline int apic_hw_enabled(struct kvm_lapic *apic)
111{ 121{
112 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; 122 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
@@ -210,6 +220,16 @@ static int find_highest_vector(void *bitmap)
210 return fls(word[word_offset << 2]) - 1 + (word_offset << 5); 220 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
211} 221}
212 222
223static u8 count_vectors(void *bitmap)
224{
225 u32 *word = bitmap;
226 int word_offset;
227 u8 count = 0;
228 for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset)
229 count += hweight32(word[word_offset << 2]);
230 return count;
231}
232
213static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 233static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
214{ 234{
215 apic->irr_pending = true; 235 apic->irr_pending = true;
@@ -242,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
242 apic->irr_pending = true; 262 apic->irr_pending = true;
243} 263}
244 264
265static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
266{
267 if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
268 ++apic->isr_count;
269 BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
270 /*
271 * ISR (in service register) bit is set when injecting an interrupt.
272 * The highest vector is injected. Thus the latest bit set matches
273 * the highest bit in ISR.
274 */
275 apic->highest_isr_cache = vec;
276}
277
278static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
279{
280 if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
281 --apic->isr_count;
282 BUG_ON(apic->isr_count < 0);
283 apic->highest_isr_cache = -1;
284}
285
245int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 286int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
246{ 287{
247 struct kvm_lapic *apic = vcpu->arch.apic; 288 struct kvm_lapic *apic = vcpu->arch.apic;
@@ -270,9 +311,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
270 irq->level, irq->trig_mode); 311 irq->level, irq->trig_mode);
271} 312}
272 313
314static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
315{
316
317 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
318 sizeof(val));
319}
320
321static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
322{
323
324 return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
325 sizeof(*val));
326}
327
328static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
329{
330 return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
331}
332
333static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
334{
335 u8 val;
336 if (pv_eoi_get_user(vcpu, &val) < 0)
337 apic_debug("Can't read EOI MSR value: 0x%llx\n",
338 (unsigned long long)vcpi->arch.pv_eoi.msr_val);
339 return val & 0x1;
340}
341
342static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
343{
344 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
345 apic_debug("Can't set EOI MSR value: 0x%llx\n",
346 (unsigned long long)vcpi->arch.pv_eoi.msr_val);
347 return;
348 }
349 __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
350}
351
352static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
353{
354 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
355 apic_debug("Can't clear EOI MSR value: 0x%llx\n",
356 (unsigned long long)vcpi->arch.pv_eoi.msr_val);
357 return;
358 }
359 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
360}
361
273static inline int apic_find_highest_isr(struct kvm_lapic *apic) 362static inline int apic_find_highest_isr(struct kvm_lapic *apic)
274{ 363{
275 int result; 364 int result;
365 if (!apic->isr_count)
366 return -1;
367 if (likely(apic->highest_isr_cache != -1))
368 return apic->highest_isr_cache;
276 369
277 result = find_highest_vector(apic->regs + APIC_ISR); 370 result = find_highest_vector(apic->regs + APIC_ISR);
278 ASSERT(result == -1 || result >= 16); 371 ASSERT(result == -1 || result >= 16);
@@ -482,17 +575,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
482 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; 575 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
483} 576}
484 577
485static void apic_set_eoi(struct kvm_lapic *apic) 578static int apic_set_eoi(struct kvm_lapic *apic)
486{ 579{
487 int vector = apic_find_highest_isr(apic); 580 int vector = apic_find_highest_isr(apic);
581
582 trace_kvm_eoi(apic, vector);
583
488 /* 584 /*
489 * Not every write EOI will has corresponding ISR, 585 * Not every write EOI will has corresponding ISR,
490 * one example is when Kernel check timer on setup_IO_APIC 586 * one example is when Kernel check timer on setup_IO_APIC
491 */ 587 */
492 if (vector == -1) 588 if (vector == -1)
493 return; 589 return vector;
494 590
495 apic_clear_vector(vector, apic->regs + APIC_ISR); 591 apic_clear_isr(vector, apic);
496 apic_update_ppr(apic); 592 apic_update_ppr(apic);
497 593
498 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && 594 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
@@ -505,6 +601,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
505 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 601 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
506 } 602 }
507 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 603 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
604 return vector;
508} 605}
509 606
510static void apic_send_ipi(struct kvm_lapic *apic) 607static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1081,10 +1178,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1081 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 1178 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
1082 } 1179 }
1083 apic->irr_pending = false; 1180 apic->irr_pending = false;
1181 apic->isr_count = 0;
1182 apic->highest_isr_cache = -1;
1084 update_divide_count(apic); 1183 update_divide_count(apic);
1085 atomic_set(&apic->lapic_timer.pending, 0); 1184 atomic_set(&apic->lapic_timer.pending, 0);
1086 if (kvm_vcpu_is_bsp(vcpu)) 1185 if (kvm_vcpu_is_bsp(vcpu))
1087 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 1186 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
1187 vcpu->arch.pv_eoi.msr_val = 0;
1088 apic_update_ppr(apic); 1188 apic_update_ppr(apic);
1089 1189
1090 vcpu->arch.apic_arb_prio = 0; 1190 vcpu->arch.apic_arb_prio = 0;
@@ -1248,7 +1348,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1248 if (vector == -1) 1348 if (vector == -1)
1249 return -1; 1349 return -1;
1250 1350
1251 apic_set_vector(vector, apic->regs + APIC_ISR); 1351 apic_set_isr(vector, apic);
1252 apic_update_ppr(apic); 1352 apic_update_ppr(apic);
1253 apic_clear_irr(vector, apic); 1353 apic_clear_irr(vector, apic);
1254 return vector; 1354 return vector;
@@ -1267,6 +1367,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1267 update_divide_count(apic); 1367 update_divide_count(apic);
1268 start_apic_timer(apic); 1368 start_apic_timer(apic);
1269 apic->irr_pending = true; 1369 apic->irr_pending = true;
1370 apic->isr_count = count_vectors(apic->regs + APIC_ISR);
1371 apic->highest_isr_cache = -1;
1270 kvm_make_request(KVM_REQ_EVENT, vcpu); 1372 kvm_make_request(KVM_REQ_EVENT, vcpu);
1271} 1373}
1272 1374
@@ -1283,11 +1385,51 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1283 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 1385 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1284} 1386}
1285 1387
1388/*
1389 * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
1390 *
1391 * Detect whether guest triggered PV EOI since the
1392 * last entry. If yes, set EOI on guests's behalf.
1393 * Clear PV EOI in guest memory in any case.
1394 */
1395static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
1396 struct kvm_lapic *apic)
1397{
1398 bool pending;
1399 int vector;
1400 /*
1401 * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
1402 * and KVM_PV_EOI_ENABLED in guest memory as follows:
1403 *
1404 * KVM_APIC_PV_EOI_PENDING is unset:
1405 * -> host disabled PV EOI.
1406 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
1407 * -> host enabled PV EOI, guest did not execute EOI yet.
1408 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
1409 * -> host enabled PV EOI, guest executed EOI.
1410 */
1411 BUG_ON(!pv_eoi_enabled(vcpu));
1412 pending = pv_eoi_get_pending(vcpu);
1413 /*
1414 * Clear pending bit in any case: it will be set again on vmentry.
1415 * While this might not be ideal from performance point of view,
1416 * this makes sure pv eoi is only enabled when we know it's safe.
1417 */
1418 pv_eoi_clr_pending(vcpu);
1419 if (pending)
1420 return;
1421 vector = apic_set_eoi(apic);
1422 trace_kvm_pv_eoi(apic, vector);
1423}
1424
1286void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) 1425void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1287{ 1426{
1288 u32 data; 1427 u32 data;
1289 void *vapic; 1428 void *vapic;
1290 1429
1430 if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
1431 apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
1432
1291 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 1433 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1292 return; 1434 return;
1293 1435
@@ -1298,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1298 apic_set_tpr(vcpu->arch.apic, data & 0xff); 1440 apic_set_tpr(vcpu->arch.apic, data & 0xff);
1299} 1441}
1300 1442
1443/*
1444 * apic_sync_pv_eoi_to_guest - called before vmentry
1445 *
1446 * Detect whether it's safe to enable PV EOI and
1447 * if yes do so.
1448 */
1449static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
1450 struct kvm_lapic *apic)
1451{
1452 if (!pv_eoi_enabled(vcpu) ||
1453 /* IRR set or many bits in ISR: could be nested. */
1454 apic->irr_pending ||
1455 /* Cache not set: could be safe but we don't bother. */
1456 apic->highest_isr_cache == -1 ||
1457 /* Need EOI to update ioapic. */
1458 kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
1459 /*
1460 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
1461 * so we need not do anything here.
1462 */
1463 return;
1464 }
1465
1466 pv_eoi_set_pending(apic->vcpu);
1467}
1468
1301void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) 1469void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1302{ 1470{
1303 u32 data, tpr; 1471 u32 data, tpr;
1304 int max_irr, max_isr; 1472 int max_irr, max_isr;
1305 struct kvm_lapic *apic; 1473 struct kvm_lapic *apic = vcpu->arch.apic;
1306 void *vapic; 1474 void *vapic;
1307 1475
1476 apic_sync_pv_eoi_to_guest(vcpu, apic);
1477
1308 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 1478 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1309 return; 1479 return;
1310 1480
1311 apic = vcpu->arch.apic;
1312 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; 1481 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
1313 max_irr = apic_find_highest_irr(apic); 1482 max_irr = apic_find_highest_irr(apic);
1314 if (max_irr < 0) 1483 if (max_irr < 0)
@@ -1394,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
1394 1563
1395 return 0; 1564 return 0;
1396} 1565}
1566
1567int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
1568{
1569 u64 addr = data & ~KVM_MSR_ENABLED;
1570 if (!IS_ALIGNED(addr, 4))
1571 return 1;
1572
1573 vcpu->arch.pv_eoi.msr_val = data;
1574 if (!pv_eoi_enabled(vcpu))
1575 return 0;
1576 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
1577 addr);
1578}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 6f4ce2575d09..4af5405ae1e2 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,6 +13,15 @@ struct kvm_lapic {
13 u32 divide_count; 13 u32 divide_count;
14 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
15 bool irr_pending; 15 bool irr_pending;
16 /* Number of bits set in ISR. */
17 s16 isr_count;
18 /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
19 int highest_isr_cache;
20 /**
21 * APIC register page. The layout matches the register layout seen by
22 * the guest 1:1, because it is accessed by the vmx microcode.
23 * Note: Only one register, the TPR, is used by the microcode.
24 */
16 void *regs; 25 void *regs;
17 gpa_t vapic_addr; 26 gpa_t vapic_addr;
18 struct page *vapic_page; 27 struct page *vapic_page;
@@ -60,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
60{ 69{
61 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; 70 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
62} 71}
72
73int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
63#endif 74#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index be3cea4407ff..01ca00423938 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -90,7 +90,7 @@ module_param(dbg, bool, 0644);
90 90
91#define PTE_PREFETCH_NUM 8 91#define PTE_PREFETCH_NUM 8
92 92
93#define PT_FIRST_AVAIL_BITS_SHIFT 9 93#define PT_FIRST_AVAIL_BITS_SHIFT 10
94#define PT64_SECOND_AVAIL_BITS_SHIFT 52 94#define PT64_SECOND_AVAIL_BITS_SHIFT 52
95 95
96#define PT64_LEVEL_BITS 9 96#define PT64_LEVEL_BITS 9
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
145#define CREATE_TRACE_POINTS 145#define CREATE_TRACE_POINTS
146#include "mmutrace.h" 146#include "mmutrace.h"
147 147
148#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) 148#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
149#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
149 150
150#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 151#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
151 152
@@ -188,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask;
188static u64 __read_mostly shadow_mmio_mask; 189static u64 __read_mostly shadow_mmio_mask;
189 190
190static void mmu_spte_set(u64 *sptep, u64 spte); 191static void mmu_spte_set(u64 *sptep, u64 spte);
192static void mmu_free_roots(struct kvm_vcpu *vcpu);
191 193
192void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) 194void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
193{ 195{
@@ -444,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
444} 446}
445#endif 447#endif
446 448
449static bool spte_is_locklessly_modifiable(u64 spte)
450{
451 return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
452}
453
447static bool spte_has_volatile_bits(u64 spte) 454static bool spte_has_volatile_bits(u64 spte)
448{ 455{
456 /*
457 * Always atomically update the spte if it can be updated
458 * out of mmu-lock; this ensures the dirty bit is not lost
459 * and also gives us a stable is_writable_pte(), so that
460 * a tlb flush is not missed.
461 */
462 if (spte_is_locklessly_modifiable(spte))
463 return true;
464
449 if (!shadow_accessed_mask) 465 if (!shadow_accessed_mask)
450 return false; 466 return false;
451 467
@@ -478,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
478 494
479/* Rules for using mmu_spte_update: 495/* Rules for using mmu_spte_update:
480 * Update the state bits, it means the mapped pfn is not changed. 496 * Update the state bits, it means the mapped pfn is not changed.
497 *
498 * Whenever we overwrite a writable spte with a read-only one we
499 * should flush remote TLBs. Otherwise rmap_write_protect
500 * will find a read-only spte, even though the writable spte
501 * might be cached on a CPU's TLB, the return value indicates this
502 * case.
481 */ 503 */
482static void mmu_spte_update(u64 *sptep, u64 new_spte) 504static bool mmu_spte_update(u64 *sptep, u64 new_spte)
483{ 505{
484 u64 mask, old_spte = *sptep; 506 u64 old_spte = *sptep;
507 bool ret = false;
485 508
486 WARN_ON(!is_rmap_spte(new_spte)); 509 WARN_ON(!is_rmap_spte(new_spte));
487 510
488 if (!is_shadow_present_pte(old_spte)) 511 if (!is_shadow_present_pte(old_spte)) {
489 return mmu_spte_set(sptep, new_spte); 512 mmu_spte_set(sptep, new_spte);
490 513 return ret;
491 new_spte |= old_spte & shadow_dirty_mask; 514 }
492
493 mask = shadow_accessed_mask;
494 if (is_writable_pte(old_spte))
495 mask |= shadow_dirty_mask;
496 515
497 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) 516 if (!spte_has_volatile_bits(old_spte))
498 __update_clear_spte_fast(sptep, new_spte); 517 __update_clear_spte_fast(sptep, new_spte);
499 else 518 else
500 old_spte = __update_clear_spte_slow(sptep, new_spte); 519 old_spte = __update_clear_spte_slow(sptep, new_spte);
501 520
521 /*
522 * This is safe for an spte updated out of mmu-lock, since
523 * we always update it atomically; see the comments in
524 * spte_has_volatile_bits().
525 */
526 if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
527 ret = true;
528
502 if (!shadow_accessed_mask) 529 if (!shadow_accessed_mask)
503 return; 530 return ret;
504 531
505 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) 532 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
506 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 533 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
507 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) 534 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
508 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 535 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
536
537 return ret;
509} 538}
510 539
511/* 540/*
@@ -652,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
652 mmu_page_header_cache); 681 mmu_page_header_cache);
653} 682}
654 683
655static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, 684static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
656 size_t size)
657{ 685{
658 void *p; 686 void *p;
659 687
@@ -664,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
664 692
665static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) 693static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
666{ 694{
667 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, 695 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
668 sizeof(struct pte_list_desc));
669} 696}
670 697
671static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) 698static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
@@ -1051,35 +1078,82 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
1051 rmap_remove(kvm, sptep); 1078 rmap_remove(kvm, sptep);
1052} 1079}
1053 1080
1054static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) 1081
1082static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1083{
1084 if (is_large_pte(*sptep)) {
1085 WARN_ON(page_header(__pa(sptep))->role.level ==
1086 PT_PAGE_TABLE_LEVEL);
1087 drop_spte(kvm, sptep);
1088 --kvm->stat.lpages;
1089 return true;
1090 }
1091
1092 return false;
1093}
1094
1095static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1096{
1097 if (__drop_large_spte(vcpu->kvm, sptep))
1098 kvm_flush_remote_tlbs(vcpu->kvm);
1099}
1100
1101/*
1102 * Write-protect the specified @sptep. @pt_protect indicates whether
1103 * the spte write-protection is caused by protecting a shadow page table.
1104 * @flush indicates whether the tlb needs to be flushed.
1105 *
1106 * Note: write protection differs between dirty logging and spte
1107 * protection:
1108 * - for dirty logging, the spte can be set to writable at anytime if
1109 * its dirty bitmap is properly set.
1110 * - for spte protection, the spte can be writable only after unsync-ing
1111 * shadow page.
1112 *
1113 * Return true if the spte is dropped.
1114 */
1115static bool
1116spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
1117{
1118 u64 spte = *sptep;
1119
1120 if (!is_writable_pte(spte) &&
1121 !(pt_protect && spte_is_locklessly_modifiable(spte)))
1122 return false;
1123
1124 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1125
1126 if (__drop_large_spte(kvm, sptep)) {
1127 *flush |= true;
1128 return true;
1129 }
1130
1131 if (pt_protect)
1132 spte &= ~SPTE_MMU_WRITEABLE;
1133 spte = spte & ~PT_WRITABLE_MASK;
1134
1135 *flush |= mmu_spte_update(sptep, spte);
1136 return false;
1137}
1138
1139static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1140 int level, bool pt_protect)
1055{ 1141{
1056 u64 *sptep; 1142 u64 *sptep;
1057 struct rmap_iterator iter; 1143 struct rmap_iterator iter;
1058 int write_protected = 0; 1144 bool flush = false;
1059 1145
1060 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { 1146 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1061 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1147 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1062 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); 1148 if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
1063
1064 if (!is_writable_pte(*sptep)) {
1065 sptep = rmap_get_next(&iter);
1066 continue;
1067 }
1068
1069 if (level == PT_PAGE_TABLE_LEVEL) {
1070 mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
1071 sptep = rmap_get_next(&iter);
1072 } else {
1073 BUG_ON(!is_large_pte(*sptep));
1074 drop_spte(kvm, sptep);
1075 --kvm->stat.lpages;
1076 sptep = rmap_get_first(*rmapp, &iter); 1149 sptep = rmap_get_first(*rmapp, &iter);
1150 continue;
1077 } 1151 }
1078 1152
1079 write_protected = 1; 1153 sptep = rmap_get_next(&iter);
1080 } 1154 }
1081 1155
1082 return write_protected; 1156 return flush;
1083} 1157}
1084 1158
1085/** 1159/**
@@ -1100,26 +1174,26 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1100 1174
1101 while (mask) { 1175 while (mask) {
1102 rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; 1176 rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
1103 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); 1177 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
1104 1178
1105 /* clear the first set bit */ 1179 /* clear the first set bit */
1106 mask &= mask - 1; 1180 mask &= mask - 1;
1107 } 1181 }
1108} 1182}
1109 1183
1110static int rmap_write_protect(struct kvm *kvm, u64 gfn) 1184static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1111{ 1185{
1112 struct kvm_memory_slot *slot; 1186 struct kvm_memory_slot *slot;
1113 unsigned long *rmapp; 1187 unsigned long *rmapp;
1114 int i; 1188 int i;
1115 int write_protected = 0; 1189 bool write_protected = false;
1116 1190
1117 slot = gfn_to_memslot(kvm, gfn); 1191 slot = gfn_to_memslot(kvm, gfn);
1118 1192
1119 for (i = PT_PAGE_TABLE_LEVEL; 1193 for (i = PT_PAGE_TABLE_LEVEL;
1120 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 1194 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
1121 rmapp = __gfn_to_rmap(gfn, i, slot); 1195 rmapp = __gfn_to_rmap(gfn, i, slot);
1122 write_protected |= __rmap_write_protect(kvm, rmapp, i); 1196 write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
1123 } 1197 }
1124 1198
1125 return write_protected; 1199 return write_protected;
@@ -1238,11 +1312,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1238 unsigned long data) 1312 unsigned long data)
1239{ 1313{
1240 u64 *sptep; 1314 u64 *sptep;
1241 struct rmap_iterator iter; 1315 struct rmap_iterator uninitialized_var(iter);
1242 int young = 0; 1316 int young = 0;
1243 1317
1244 /* 1318 /*
1245 * Emulate the accessed bit for EPT, by checking if this page has 1319 * In case of absence of EPT Access and Dirty Bits supports,
1320 * emulate the accessed bit for EPT, by checking if this page has
1246 * an EPT mapping, and clearing it if it does. On the next access, 1321 * an EPT mapping, and clearing it if it does. On the next access,
1247 * a new EPT mapping will be established. 1322 * a new EPT mapping will be established.
1248 * This has some overhead, but not as much as the cost of swapping 1323 * This has some overhead, but not as much as the cost of swapping
@@ -1253,11 +1328,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1253 1328
1254 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1329 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1255 sptep = rmap_get_next(&iter)) { 1330 sptep = rmap_get_next(&iter)) {
1256 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1331 BUG_ON(!is_shadow_present_pte(*sptep));
1257 1332
1258 if (*sptep & PT_ACCESSED_MASK) { 1333 if (*sptep & shadow_accessed_mask) {
1259 young = 1; 1334 young = 1;
1260 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); 1335 clear_bit((ffs(shadow_accessed_mask) - 1),
1336 (unsigned long *)sptep);
1261 } 1337 }
1262 } 1338 }
1263 1339
@@ -1281,9 +1357,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1281 1357
1282 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1358 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1283 sptep = rmap_get_next(&iter)) { 1359 sptep = rmap_get_next(&iter)) {
1284 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1360 BUG_ON(!is_shadow_present_pte(*sptep));
1285 1361
1286 if (*sptep & PT_ACCESSED_MASK) { 1362 if (*sptep & shadow_accessed_mask) {
1287 young = 1; 1363 young = 1;
1288 break; 1364 break;
1289 } 1365 }
@@ -1401,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1401 u64 *parent_pte, int direct) 1477 u64 *parent_pte, int direct)
1402{ 1478{
1403 struct kvm_mmu_page *sp; 1479 struct kvm_mmu_page *sp;
1404 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, 1480 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
1405 sizeof *sp); 1481 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1406 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1407 if (!direct) 1482 if (!direct)
1408 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, 1483 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1409 PAGE_SIZE);
1410 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 1484 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1411 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 1485 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1412 bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); 1486 bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
@@ -1701,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1701 1775
1702 kvm_mmu_pages_init(parent, &parents, &pages); 1776 kvm_mmu_pages_init(parent, &parents, &pages);
1703 while (mmu_unsync_walk(parent, &pages)) { 1777 while (mmu_unsync_walk(parent, &pages)) {
1704 int protected = 0; 1778 bool protected = false;
1705 1779
1706 for_each_sp(pages, sp, parents, i) 1780 for_each_sp(pages, sp, parents, i)
1707 protected |= rmap_write_protect(vcpu->kvm, sp->gfn); 1781 protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
@@ -1866,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1866 mmu_spte_set(sptep, spte); 1940 mmu_spte_set(sptep, spte);
1867} 1941}
1868 1942
1869static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1870{
1871 if (is_large_pte(*sptep)) {
1872 drop_spte(vcpu->kvm, sptep);
1873 --vcpu->kvm->stat.lpages;
1874 kvm_flush_remote_tlbs(vcpu->kvm);
1875 }
1876}
1877
1878static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, 1943static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1879 unsigned direct_access) 1944 unsigned direct_access)
1880{ 1945{
@@ -2243,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2243 gfn_t gfn, pfn_t pfn, bool speculative, 2308 gfn_t gfn, pfn_t pfn, bool speculative,
2244 bool can_unsync, bool host_writable) 2309 bool can_unsync, bool host_writable)
2245{ 2310{
2246 u64 spte, entry = *sptep; 2311 u64 spte;
2247 int ret = 0; 2312 int ret = 0;
2248 2313
2249 if (set_mmio_spte(sptep, gfn, pfn, pte_access)) 2314 if (set_mmio_spte(sptep, gfn, pfn, pte_access))
@@ -2257,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2257 spte |= shadow_x_mask; 2322 spte |= shadow_x_mask;
2258 else 2323 else
2259 spte |= shadow_nx_mask; 2324 spte |= shadow_nx_mask;
2325
2260 if (pte_access & ACC_USER_MASK) 2326 if (pte_access & ACC_USER_MASK)
2261 spte |= shadow_user_mask; 2327 spte |= shadow_user_mask;
2328
2262 if (level > PT_PAGE_TABLE_LEVEL) 2329 if (level > PT_PAGE_TABLE_LEVEL)
2263 spte |= PT_PAGE_SIZE_MASK; 2330 spte |= PT_PAGE_SIZE_MASK;
2264 if (tdp_enabled) 2331 if (tdp_enabled)
@@ -2283,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2283 goto done; 2350 goto done;
2284 } 2351 }
2285 2352
2286 spte |= PT_WRITABLE_MASK; 2353 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2287 2354
2288 if (!vcpu->arch.mmu.direct_map 2355 if (!vcpu->arch.mmu.direct_map
2289 && !(pte_access & ACC_WRITE_MASK)) { 2356 && !(pte_access & ACC_WRITE_MASK)) {
@@ -2312,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2312 __func__, gfn); 2379 __func__, gfn);
2313 ret = 1; 2380 ret = 1;
2314 pte_access &= ~ACC_WRITE_MASK; 2381 pte_access &= ~ACC_WRITE_MASK;
2315 if (is_writable_pte(spte)) 2382 spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
2316 spte &= ~PT_WRITABLE_MASK;
2317 } 2383 }
2318 } 2384 }
2319 2385
@@ -2321,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2321 mark_page_dirty(vcpu->kvm, gfn); 2387 mark_page_dirty(vcpu->kvm, gfn);
2322 2388
2323set_pte: 2389set_pte:
2324 mmu_spte_update(sptep, spte); 2390 if (mmu_spte_update(sptep, spte))
2325 /*
2326 * If we overwrite a writable spte with a read-only one we
2327 * should flush remote TLBs. Otherwise rmap_write_protect
2328 * will find a read-only spte, even though the writable spte
2329 * might be cached on a CPU's TLB.
2330 */
2331 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2332 kvm_flush_remote_tlbs(vcpu->kvm); 2391 kvm_flush_remote_tlbs(vcpu->kvm);
2333done: 2392done:
2334 return ret; 2393 return ret;
@@ -2403,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2403 2462
2404static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 2463static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2405{ 2464{
2465 mmu_free_roots(vcpu);
2406} 2466}
2407 2467
2408static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2468static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2625,18 +2685,116 @@ exit:
2625 return ret; 2685 return ret;
2626} 2686}
2627 2687
2688static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
2689{
2690 /*
2691 * #PF can be fast only if the shadow page table is present and it
2692 * is caused by write-protect, which means we just need to change the
2693 * W bit of the spte which can be done out of mmu-lock.
2694 */
2695 if (!(error_code & PFERR_PRESENT_MASK) ||
2696 !(error_code & PFERR_WRITE_MASK))
2697 return false;
2698
2699 return true;
2700}
2701
2702static bool
2703fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
2704{
2705 struct kvm_mmu_page *sp = page_header(__pa(sptep));
2706 gfn_t gfn;
2707
2708 WARN_ON(!sp->role.direct);
2709
2710 /*
2711 * The gfn of direct spte is stable since it is calculated
2712 * by sp->gfn.
2713 */
2714 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
2715
2716 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
2717 mark_page_dirty(vcpu->kvm, gfn);
2718
2719 return true;
2720}
2721
2722/*
2723 * Return value:
2724 * - true: let the vcpu access the same address again.
2725 * - false: let the real page fault path fix it.
2726 */
2727static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2728 u32 error_code)
2729{
2730 struct kvm_shadow_walk_iterator iterator;
2731 bool ret = false;
2732 u64 spte = 0ull;
2733
2734 if (!page_fault_can_be_fast(vcpu, error_code))
2735 return false;
2736
2737 walk_shadow_page_lockless_begin(vcpu);
2738 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
2739 if (!is_shadow_present_pte(spte) || iterator.level < level)
2740 break;
2741
2742 /*
2743 * If the mapping has been changed, let the vcpu fault on the
2744 * same address again.
2745 */
2746 if (!is_rmap_spte(spte)) {
2747 ret = true;
2748 goto exit;
2749 }
2750
2751 if (!is_last_spte(spte, level))
2752 goto exit;
2753
2754 /*
2755 * Check if it is a spurious fault caused by a lazily flushed TLB.
2756 *
2757 * Need not check the access of upper level table entries since
2758 * they are always ACC_ALL.
2759 */
2760 if (is_writable_pte(spte)) {
2761 ret = true;
2762 goto exit;
2763 }
2764
2765 /*
2766 * Currently, to simplify the code, only the spte write-protected
2767 * by dirty-log can be fast fixed.
2768 */
2769 if (!spte_is_locklessly_modifiable(spte))
2770 goto exit;
2771
2772 /*
2773 * Currently, fast page fault only works for direct mapping since
2774 * the gfn is not stable for indirect shadow page.
2775 * See Documentation/virtual/kvm/locking.txt to get more detail.
2776 */
2777 ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
2778exit:
2779 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
2780 spte, ret);
2781 walk_shadow_page_lockless_end(vcpu);
2782
2783 return ret;
2784}
2785
2628static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2786static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2629 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2787 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2630 2788
2631static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, 2789static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2632 bool prefault) 2790 gfn_t gfn, bool prefault)
2633{ 2791{
2634 int r; 2792 int r;
2635 int level; 2793 int level;
2636 int force_pt_level; 2794 int force_pt_level;
2637 pfn_t pfn; 2795 pfn_t pfn;
2638 unsigned long mmu_seq; 2796 unsigned long mmu_seq;
2639 bool map_writable; 2797 bool map_writable, write = error_code & PFERR_WRITE_MASK;
2640 2798
2641 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); 2799 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2642 if (likely(!force_pt_level)) { 2800 if (likely(!force_pt_level)) {
@@ -2653,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2653 } else 2811 } else
2654 level = PT_PAGE_TABLE_LEVEL; 2812 level = PT_PAGE_TABLE_LEVEL;
2655 2813
2814 if (fast_page_fault(vcpu, v, level, error_code))
2815 return 0;
2816
2656 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2817 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2657 smp_rmb(); 2818 smp_rmb();
2658 2819
@@ -3041,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3041 gfn = gva >> PAGE_SHIFT; 3202 gfn = gva >> PAGE_SHIFT;
3042 3203
3043 return nonpaging_map(vcpu, gva & PAGE_MASK, 3204 return nonpaging_map(vcpu, gva & PAGE_MASK,
3044 error_code & PFERR_WRITE_MASK, gfn, prefault); 3205 error_code, gfn, prefault);
3045} 3206}
3046 3207
3047static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) 3208static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3121,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3121 } else 3282 } else
3122 level = PT_PAGE_TABLE_LEVEL; 3283 level = PT_PAGE_TABLE_LEVEL;
3123 3284
3285 if (fast_page_fault(vcpu, gpa, level, error_code))
3286 return 0;
3287
3124 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3288 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3125 smp_rmb(); 3289 smp_rmb();
3126 3290
@@ -3885,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3885void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 4049void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3886{ 4050{
3887 struct kvm_mmu_page *sp; 4051 struct kvm_mmu_page *sp;
4052 bool flush = false;
3888 4053
3889 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 4054 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
3890 int i; 4055 int i;
@@ -3899,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3899 !is_last_spte(pt[i], sp->role.level)) 4064 !is_last_spte(pt[i], sp->role.level))
3900 continue; 4065 continue;
3901 4066
3902 if (is_large_pte(pt[i])) { 4067 spte_write_protect(kvm, &pt[i], &flush, false);
3903 drop_spte(kvm, &pt[i]);
3904 --kvm->stat.lpages;
3905 continue;
3906 }
3907
3908 /* avoid RMW */
3909 if (is_writable_pte(pt[i]))
3910 mmu_spte_update(&pt[i],
3911 pt[i] & ~PT_WRITABLE_MASK);
3912 } 4068 }
3913 } 4069 }
3914 kvm_flush_remote_tlbs(kvm); 4070 kvm_flush_remote_tlbs(kvm);
@@ -3934,6 +4090,9 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3934{ 4090{
3935 struct kvm_mmu_page *page; 4091 struct kvm_mmu_page *page;
3936 4092
4093 if (list_empty(&kvm->arch.active_mmu_pages))
4094 return;
4095
3937 page = container_of(kvm->arch.active_mmu_pages.prev, 4096 page = container_of(kvm->arch.active_mmu_pages.prev,
3938 struct kvm_mmu_page, link); 4097 struct kvm_mmu_page, link);
3939 kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 4098 kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
@@ -3942,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3942static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) 4101static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3943{ 4102{
3944 struct kvm *kvm; 4103 struct kvm *kvm;
3945 struct kvm *kvm_freed = NULL;
3946 int nr_to_scan = sc->nr_to_scan; 4104 int nr_to_scan = sc->nr_to_scan;
3947 4105
3948 if (nr_to_scan == 0) 4106 if (nr_to_scan == 0)
@@ -3954,22 +4112,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3954 int idx; 4112 int idx;
3955 LIST_HEAD(invalid_list); 4113 LIST_HEAD(invalid_list);
3956 4114
4115 /*
4116 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
4117 * here. We may skip a VM instance erroneously, but we do not
4118 * want to shrink a VM that only started to populate its MMU
4119 * anyway.
4120 */
4121 if (kvm->arch.n_used_mmu_pages > 0) {
4122 if (!nr_to_scan--)
4123 break;
4124 continue;
4125 }
4126
3957 idx = srcu_read_lock(&kvm->srcu); 4127 idx = srcu_read_lock(&kvm->srcu);
3958 spin_lock(&kvm->mmu_lock); 4128 spin_lock(&kvm->mmu_lock);
3959 if (!kvm_freed && nr_to_scan > 0 &&
3960 kvm->arch.n_used_mmu_pages > 0) {
3961 kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3962 &invalid_list);
3963 kvm_freed = kvm;
3964 }
3965 nr_to_scan--;
3966 4129
4130 kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
3967 kvm_mmu_commit_zap_page(kvm, &invalid_list); 4131 kvm_mmu_commit_zap_page(kvm, &invalid_list);
4132
3968 spin_unlock(&kvm->mmu_lock); 4133 spin_unlock(&kvm->mmu_lock);
3969 srcu_read_unlock(&kvm->srcu, idx); 4134 srcu_read_unlock(&kvm->srcu, idx);
4135
4136 list_move_tail(&kvm->vm_list, &vm_list);
4137 break;
3970 } 4138 }
3971 if (kvm_freed)
3972 list_move_tail(&kvm_freed->vm_list, &vm_list);
3973 4139
3974 raw_spin_unlock(&kvm_lock); 4140 raw_spin_unlock(&kvm_lock);
3975 4141
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 89fb0e81322a..cd6e98333ba3 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -54,8 +54,8 @@
54 */ 54 */
55TRACE_EVENT( 55TRACE_EVENT(
56 kvm_mmu_pagetable_walk, 56 kvm_mmu_pagetable_walk,
57 TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), 57 TP_PROTO(u64 addr, u32 pferr),
58 TP_ARGS(addr, write_fault, user_fault, fetch_fault), 58 TP_ARGS(addr, pferr),
59 59
60 TP_STRUCT__entry( 60 TP_STRUCT__entry(
61 __field(__u64, addr) 61 __field(__u64, addr)
@@ -64,8 +64,7 @@ TRACE_EVENT(
64 64
65 TP_fast_assign( 65 TP_fast_assign(
66 __entry->addr = addr; 66 __entry->addr = addr;
67 __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) 67 __entry->pferr = pferr;
68 | (!!fetch_fault << 4);
69 ), 68 ),
70 69
71 TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, 70 TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
@@ -243,6 +242,44 @@ TRACE_EVENT(
243 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, 242 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
244 __entry->access) 243 __entry->access)
245); 244);
245
246#define __spte_satisfied(__spte) \
247 (__entry->retry && is_writable_pte(__entry->__spte))
248
249TRACE_EVENT(
250 fast_page_fault,
251 TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
252 u64 *sptep, u64 old_spte, bool retry),
253 TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
254
255 TP_STRUCT__entry(
256 __field(int, vcpu_id)
257 __field(gva_t, gva)
258 __field(u32, error_code)
259 __field(u64 *, sptep)
260 __field(u64, old_spte)
261 __field(u64, new_spte)
262 __field(bool, retry)
263 ),
264
265 TP_fast_assign(
266 __entry->vcpu_id = vcpu->vcpu_id;
267 __entry->gva = gva;
268 __entry->error_code = error_code;
269 __entry->sptep = sptep;
270 __entry->old_spte = old_spte;
271 __entry->new_spte = *sptep;
272 __entry->retry = retry;
273 ),
274
275 TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
276 " new %llx spurious %d fixed %d", __entry->vcpu_id,
277 __entry->gva, __print_flags(__entry->error_code, "|",
278 kvm_mmu_trace_pferr_flags), __entry->sptep,
279 __entry->old_spte, __entry->new_spte,
280 __spte_satisfied(old_spte), __spte_satisfied(new_spte)
281 )
282);
246#endif /* _TRACE_KVMMMU_H */ 283#endif /* _TRACE_KVMMMU_H */
247 284
248#undef TRACE_INCLUDE_PATH 285#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 34f970937ef1..bb7cf01cae76 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
154 const int fetch_fault = access & PFERR_FETCH_MASK; 154 const int fetch_fault = access & PFERR_FETCH_MASK;
155 u16 errcode = 0; 155 u16 errcode = 0;
156 156
157 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 157 trace_kvm_mmu_pagetable_walk(addr, access);
158 fetch_fault);
159retry_walk: 158retry_walk:
160 eperm = false; 159 eperm = false;
161 walker->level = mmu->root_level; 160 walker->level = mmu->root_level;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 2e88438ffd83..9b7ec1150ab0 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -80,10 +80,10 @@ static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
80 80
81static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) 81static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
82{ 82{
83 if (idx < X86_PMC_IDX_FIXED) 83 if (idx < INTEL_PMC_IDX_FIXED)
84 return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); 84 return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
85 else 85 else
86 return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); 86 return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED);
87} 87}
88 88
89void kvm_deliver_pmi(struct kvm_vcpu *vcpu) 89void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
@@ -291,7 +291,7 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx)
291 if (pmc_is_gp(pmc)) 291 if (pmc_is_gp(pmc))
292 reprogram_gp_counter(pmc, pmc->eventsel); 292 reprogram_gp_counter(pmc, pmc->eventsel);
293 else { 293 else {
294 int fidx = idx - X86_PMC_IDX_FIXED; 294 int fidx = idx - INTEL_PMC_IDX_FIXED;
295 reprogram_fixed_counter(pmc, 295 reprogram_fixed_counter(pmc,
296 fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); 296 fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
297 } 297 }
@@ -452,7 +452,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
452 return; 452 return;
453 453
454 pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, 454 pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
455 X86_PMC_MAX_GENERIC); 455 INTEL_PMC_MAX_GENERIC);
456 pmu->counter_bitmask[KVM_PMC_GP] = 456 pmu->counter_bitmask[KVM_PMC_GP] =
457 ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; 457 ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
458 bitmap_len = (entry->eax >> 24) & 0xff; 458 bitmap_len = (entry->eax >> 24) & 0xff;
@@ -462,13 +462,13 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
462 pmu->nr_arch_fixed_counters = 0; 462 pmu->nr_arch_fixed_counters = 0;
463 } else { 463 } else {
464 pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), 464 pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
465 X86_PMC_MAX_FIXED); 465 INTEL_PMC_MAX_FIXED);
466 pmu->counter_bitmask[KVM_PMC_FIXED] = 466 pmu->counter_bitmask[KVM_PMC_FIXED] =
467 ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; 467 ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
468 } 468 }
469 469
470 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | 470 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
471 (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); 471 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
472 pmu->global_ctrl_mask = ~pmu->global_ctrl; 472 pmu->global_ctrl_mask = ~pmu->global_ctrl;
473} 473}
474 474
@@ -478,15 +478,15 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
478 struct kvm_pmu *pmu = &vcpu->arch.pmu; 478 struct kvm_pmu *pmu = &vcpu->arch.pmu;
479 479
480 memset(pmu, 0, sizeof(*pmu)); 480 memset(pmu, 0, sizeof(*pmu));
481 for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { 481 for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
482 pmu->gp_counters[i].type = KVM_PMC_GP; 482 pmu->gp_counters[i].type = KVM_PMC_GP;
483 pmu->gp_counters[i].vcpu = vcpu; 483 pmu->gp_counters[i].vcpu = vcpu;
484 pmu->gp_counters[i].idx = i; 484 pmu->gp_counters[i].idx = i;
485 } 485 }
486 for (i = 0; i < X86_PMC_MAX_FIXED; i++) { 486 for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
487 pmu->fixed_counters[i].type = KVM_PMC_FIXED; 487 pmu->fixed_counters[i].type = KVM_PMC_FIXED;
488 pmu->fixed_counters[i].vcpu = vcpu; 488 pmu->fixed_counters[i].vcpu = vcpu;
489 pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; 489 pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
490 } 490 }
491 init_irq_work(&pmu->irq_work, trigger_pmi); 491 init_irq_work(&pmu->irq_work, trigger_pmi);
492 kvm_pmu_cpuid_update(vcpu); 492 kvm_pmu_cpuid_update(vcpu);
@@ -498,13 +498,13 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)
498 int i; 498 int i;
499 499
500 irq_work_sync(&pmu->irq_work); 500 irq_work_sync(&pmu->irq_work);
501 for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { 501 for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
502 struct kvm_pmc *pmc = &pmu->gp_counters[i]; 502 struct kvm_pmc *pmc = &pmu->gp_counters[i];
503 stop_counter(pmc); 503 stop_counter(pmc);
504 pmc->counter = pmc->eventsel = 0; 504 pmc->counter = pmc->eventsel = 0;
505 } 505 }
506 506
507 for (i = 0; i < X86_PMC_MAX_FIXED; i++) 507 for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
508 stop_counter(&pmu->fixed_counters[i]); 508 stop_counter(&pmu->fixed_counters[i]);
509 509
510 pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 510 pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f75af406b268..baead950d6c8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3185,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3185 break; 3185 break;
3186 case MSR_IA32_DEBUGCTLMSR: 3186 case MSR_IA32_DEBUGCTLMSR:
3187 if (!boot_cpu_has(X86_FEATURE_LBRV)) { 3187 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3188 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 3188 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3189 __func__, data); 3189 __func__, data);
3190 break; 3190 break;
3191 } 3191 }
3192 if (data & DEBUGCTL_RESERVED_BITS) 3192 if (data & DEBUGCTL_RESERVED_BITS)
@@ -3205,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3205 case MSR_VM_CR: 3205 case MSR_VM_CR:
3206 return svm_set_vm_cr(vcpu, data); 3206 return svm_set_vm_cr(vcpu, data);
3207 case MSR_VM_IGNNE: 3207 case MSR_VM_IGNNE:
3208 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3208 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3209 break; 3209 break;
3210 default: 3210 default:
3211 return kvm_set_msr_common(vcpu, ecx, data); 3211 return kvm_set_msr_common(vcpu, ecx, data);
@@ -4044,6 +4044,11 @@ static bool svm_rdtscp_supported(void)
4044 return false; 4044 return false;
4045} 4045}
4046 4046
4047static bool svm_invpcid_supported(void)
4048{
4049 return false;
4050}
4051
4047static bool svm_has_wbinvd_exit(void) 4052static bool svm_has_wbinvd_exit(void)
4048{ 4053{
4049 return true; 4054 return true;
@@ -4312,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4312 .cpuid_update = svm_cpuid_update, 4317 .cpuid_update = svm_cpuid_update,
4313 4318
4314 .rdtscp_supported = svm_rdtscp_supported, 4319 .rdtscp_supported = svm_rdtscp_supported,
4320 .invpcid_supported = svm_invpcid_supported,
4315 4321
4316 .set_supported_cpuid = svm_set_supported_cpuid, 4322 .set_supported_cpuid = svm_set_supported_cpuid,
4317 4323
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 911d2641f14c..a71faf727ff3 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq,
517 __entry->coalesced ? " (coalesced)" : "") 517 __entry->coalesced ? " (coalesced)" : "")
518); 518);
519 519
520TRACE_EVENT(kvm_eoi,
521 TP_PROTO(struct kvm_lapic *apic, int vector),
522 TP_ARGS(apic, vector),
523
524 TP_STRUCT__entry(
525 __field( __u32, apicid )
526 __field( int, vector )
527 ),
528
529 TP_fast_assign(
530 __entry->apicid = apic->vcpu->vcpu_id;
531 __entry->vector = vector;
532 ),
533
534 TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
535);
536
537TRACE_EVENT(kvm_pv_eoi,
538 TP_PROTO(struct kvm_lapic *apic, int vector),
539 TP_ARGS(apic, vector),
540
541 TP_STRUCT__entry(
542 __field( __u32, apicid )
543 __field( int, vector )
544 ),
545
546 TP_fast_assign(
547 __entry->apicid = apic->vcpu->vcpu_id;
548 __entry->vector = vector;
549 ),
550
551 TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
552);
553
520/* 554/*
521 * Tracepoint for nested VMRUN 555 * Tracepoint for nested VMRUN
522 */ 556 */
@@ -710,16 +744,6 @@ TRACE_EVENT(kvm_skinit,
710 __entry->rip, __entry->slb) 744 __entry->rip, __entry->slb)
711); 745);
712 746
713#define __print_insn(insn, ilen) ({ \
714 int i; \
715 const char *ret = p->buffer + p->len; \
716 \
717 for (i = 0; i < ilen; ++i) \
718 trace_seq_printf(p, " %02x", insn[i]); \
719 trace_seq_printf(p, "%c", 0); \
720 ret; \
721 })
722
723#define KVM_EMUL_INSN_F_CR0_PE (1 << 0) 747#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
724#define KVM_EMUL_INSN_F_EFL_VM (1 << 1) 748#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
725#define KVM_EMUL_INSN_F_CS_D (1 << 2) 749#define KVM_EMUL_INSN_F_CS_D (1 << 2)
@@ -786,7 +810,7 @@ TRACE_EVENT(kvm_emulate_insn,
786 810
787 TP_printk("%x:%llx:%s (%s)%s", 811 TP_printk("%x:%llx:%s (%s)%s",
788 __entry->csbase, __entry->rip, 812 __entry->csbase, __entry->rip,
789 __print_insn(__entry->insn, __entry->len), 813 __print_hex(__entry->insn, __entry->len),
790 __print_symbolic(__entry->flags, 814 __print_symbolic(__entry->flags,
791 kvm_trace_symbol_emul_flags), 815 kvm_trace_symbol_emul_flags),
792 __entry->failed ? " failed" : "" 816 __entry->failed ? " failed" : ""
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 32eb58866292..c39b60707e02 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,7 +71,10 @@ static bool __read_mostly enable_unrestricted_guest = 1;
71module_param_named(unrestricted_guest, 71module_param_named(unrestricted_guest,
72 enable_unrestricted_guest, bool, S_IRUGO); 72 enable_unrestricted_guest, bool, S_IRUGO);
73 73
74static bool __read_mostly emulate_invalid_guest_state = 0; 74static bool __read_mostly enable_ept_ad_bits = 1;
75module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
76
77static bool __read_mostly emulate_invalid_guest_state = true;
75module_param(emulate_invalid_guest_state, bool, S_IRUGO); 78module_param(emulate_invalid_guest_state, bool, S_IRUGO);
76 79
77static bool __read_mostly vmm_exclusive = 1; 80static bool __read_mostly vmm_exclusive = 1;
@@ -615,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr);
615static void kvm_cpu_vmxoff(void); 618static void kvm_cpu_vmxoff(void);
616static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 619static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
617static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 620static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
621static void vmx_set_segment(struct kvm_vcpu *vcpu,
622 struct kvm_segment *var, int seg);
623static void vmx_get_segment(struct kvm_vcpu *vcpu,
624 struct kvm_segment *var, int seg);
618 625
619static DEFINE_PER_CPU(struct vmcs *, vmxarea); 626static DEFINE_PER_CPU(struct vmcs *, vmxarea);
620static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 627static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -789,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)
789 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; 796 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
790} 797}
791 798
799static inline bool cpu_has_vmx_ept_ad_bits(void)
800{
801 return vmx_capability.ept & VMX_EPT_AD_BIT;
802}
803
792static inline bool cpu_has_vmx_invept_individual_addr(void) 804static inline bool cpu_has_vmx_invept_individual_addr(void)
793{ 805{
794 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; 806 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -849,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void)
849 SECONDARY_EXEC_RDTSCP; 861 SECONDARY_EXEC_RDTSCP;
850} 862}
851 863
864static inline bool cpu_has_vmx_invpcid(void)
865{
866 return vmcs_config.cpu_based_2nd_exec_ctrl &
867 SECONDARY_EXEC_ENABLE_INVPCID;
868}
869
852static inline bool cpu_has_virtual_nmis(void) 870static inline bool cpu_has_virtual_nmis(void)
853{ 871{
854 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 872 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -1739,6 +1757,11 @@ static bool vmx_rdtscp_supported(void)
1739 return cpu_has_vmx_rdtscp(); 1757 return cpu_has_vmx_rdtscp();
1740} 1758}
1741 1759
1760static bool vmx_invpcid_supported(void)
1761{
1762 return cpu_has_vmx_invpcid() && enable_ept;
1763}
1764
1742/* 1765/*
1743 * Swap MSR entry in host/guest MSR entry array. 1766 * Swap MSR entry in host/guest MSR entry array.
1744 */ 1767 */
@@ -2458,7 +2481,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2458 SECONDARY_EXEC_ENABLE_EPT | 2481 SECONDARY_EXEC_ENABLE_EPT |
2459 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2482 SECONDARY_EXEC_UNRESTRICTED_GUEST |
2460 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 2483 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2461 SECONDARY_EXEC_RDTSCP; 2484 SECONDARY_EXEC_RDTSCP |
2485 SECONDARY_EXEC_ENABLE_INVPCID;
2462 if (adjust_vmx_controls(min2, opt2, 2486 if (adjust_vmx_controls(min2, opt2,
2463 MSR_IA32_VMX_PROCBASED_CTLS2, 2487 MSR_IA32_VMX_PROCBASED_CTLS2,
2464 &_cpu_based_2nd_exec_control) < 0) 2488 &_cpu_based_2nd_exec_control) < 0)
@@ -2645,8 +2669,12 @@ static __init int hardware_setup(void)
2645 !cpu_has_vmx_ept_4levels()) { 2669 !cpu_has_vmx_ept_4levels()) {
2646 enable_ept = 0; 2670 enable_ept = 0;
2647 enable_unrestricted_guest = 0; 2671 enable_unrestricted_guest = 0;
2672 enable_ept_ad_bits = 0;
2648 } 2673 }
2649 2674
2675 if (!cpu_has_vmx_ept_ad_bits())
2676 enable_ept_ad_bits = 0;
2677
2650 if (!cpu_has_vmx_unrestricted_guest()) 2678 if (!cpu_has_vmx_unrestricted_guest())
2651 enable_unrestricted_guest = 0; 2679 enable_unrestricted_guest = 0;
2652 2680
@@ -2770,6 +2798,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2770{ 2798{
2771 unsigned long flags; 2799 unsigned long flags;
2772 struct vcpu_vmx *vmx = to_vmx(vcpu); 2800 struct vcpu_vmx *vmx = to_vmx(vcpu);
2801 struct kvm_segment var;
2773 2802
2774 if (enable_unrestricted_guest) 2803 if (enable_unrestricted_guest)
2775 return; 2804 return;
@@ -2813,20 +2842,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2813 if (emulate_invalid_guest_state) 2842 if (emulate_invalid_guest_state)
2814 goto continue_rmode; 2843 goto continue_rmode;
2815 2844
2816 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); 2845 vmx_get_segment(vcpu, &var, VCPU_SREG_SS);
2817 vmcs_write32(GUEST_SS_LIMIT, 0xffff); 2846 vmx_set_segment(vcpu, &var, VCPU_SREG_SS);
2818 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); 2847
2848 vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
2849 vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
2850
2851 vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
2852 vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
2853
2854 vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
2855 vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
2819 2856
2820 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); 2857 vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
2821 vmcs_write32(GUEST_CS_LIMIT, 0xffff); 2858 vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
2822 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
2823 vmcs_writel(GUEST_CS_BASE, 0xf0000);
2824 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
2825 2859
2826 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); 2860 vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
2827 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); 2861 vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
2828 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
2829 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
2830 2862
2831continue_rmode: 2863continue_rmode:
2832 kvm_mmu_reset_context(vcpu); 2864 kvm_mmu_reset_context(vcpu);
@@ -3027,6 +3059,8 @@ static u64 construct_eptp(unsigned long root_hpa)
3027 /* TODO write the value reading from MSR */ 3059 /* TODO write the value reading from MSR */
3028 eptp = VMX_EPT_DEFAULT_MT | 3060 eptp = VMX_EPT_DEFAULT_MT |
3029 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; 3061 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
3062 if (enable_ept_ad_bits)
3063 eptp |= VMX_EPT_AD_ENABLE_BIT;
3030 eptp |= (root_hpa & PAGE_MASK); 3064 eptp |= (root_hpa & PAGE_MASK);
3031 3065
3032 return eptp; 3066 return eptp;
@@ -3153,11 +3187,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
3153 3187
3154static int vmx_get_cpl(struct kvm_vcpu *vcpu) 3188static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3155{ 3189{
3190 struct vcpu_vmx *vmx = to_vmx(vcpu);
3191
3192 /*
3193 * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
3194 * fail; use the cache instead.
3195 */
3196 if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
3197 return vmx->cpl;
3198 }
3199
3156 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { 3200 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
3157 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3201 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3158 to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); 3202 vmx->cpl = __vmx_get_cpl(vcpu);
3159 } 3203 }
3160 return to_vmx(vcpu)->cpl; 3204
3205 return vmx->cpl;
3161} 3206}
3162 3207
3163 3208
@@ -3165,7 +3210,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
3165{ 3210{
3166 u32 ar; 3211 u32 ar;
3167 3212
3168 if (var->unusable) 3213 if (var->unusable || !var->present)
3169 ar = 1 << 16; 3214 ar = 1 << 16;
3170 else { 3215 else {
3171 ar = var->type & 15; 3216 ar = var->type & 15;
@@ -3177,8 +3222,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
3177 ar |= (var->db & 1) << 14; 3222 ar |= (var->db & 1) << 14;
3178 ar |= (var->g & 1) << 15; 3223 ar |= (var->g & 1) << 15;
3179 } 3224 }
3180 if (ar == 0) /* a 0 value means unusable */
3181 ar = AR_UNUSABLE_MASK;
3182 3225
3183 return ar; 3226 return ar;
3184} 3227}
@@ -3229,6 +3272,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3229 3272
3230 vmcs_write32(sf->ar_bytes, ar); 3273 vmcs_write32(sf->ar_bytes, ar);
3231 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3274 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3275
3276 /*
3277 * Fix segments for real mode guest in hosts that don't have
3278 * "unrestricted_mode" or it was disabled.
3279 * This is done to allow migration of the guests from hosts with
3280 * unrestricted guest like Westmere to older host that don't have
3281 * unrestricted guest like Nehelem.
3282 */
3283 if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
3284 switch (seg) {
3285 case VCPU_SREG_CS:
3286 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
3287 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
3288 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
3289 vmcs_writel(GUEST_CS_BASE, 0xf0000);
3290 vmcs_write16(GUEST_CS_SELECTOR,
3291 vmcs_readl(GUEST_CS_BASE) >> 4);
3292 break;
3293 case VCPU_SREG_ES:
3294 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
3295 break;
3296 case VCPU_SREG_DS:
3297 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
3298 break;
3299 case VCPU_SREG_GS:
3300 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
3301 break;
3302 case VCPU_SREG_FS:
3303 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
3304 break;
3305 case VCPU_SREG_SS:
3306 vmcs_write16(GUEST_SS_SELECTOR,
3307 vmcs_readl(GUEST_SS_BASE) >> 4);
3308 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
3309 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
3310 break;
3311 }
3312 }
3232} 3313}
3233 3314
3234static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3315static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3731,6 +3812,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3731 if (!enable_ept) { 3812 if (!enable_ept) {
3732 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 3813 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
3733 enable_unrestricted_guest = 0; 3814 enable_unrestricted_guest = 0;
3815 /* Enable INVPCID for non-ept guests may cause performance regression. */
3816 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
3734 } 3817 }
3735 if (!enable_unrestricted_guest) 3818 if (!enable_unrestricted_guest)
3736 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 3819 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -4489,7 +4572,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
4489 break; 4572 break;
4490 } 4573 }
4491 vcpu->run->exit_reason = 0; 4574 vcpu->run->exit_reason = 0;
4492 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 4575 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
4493 (int)(exit_qualification >> 4) & 3, cr); 4576 (int)(exit_qualification >> 4) & 3, cr);
4494 return 0; 4577 return 0;
4495} 4578}
@@ -4769,6 +4852,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
4769{ 4852{
4770 unsigned long exit_qualification; 4853 unsigned long exit_qualification;
4771 gpa_t gpa; 4854 gpa_t gpa;
4855 u32 error_code;
4772 int gla_validity; 4856 int gla_validity;
4773 4857
4774 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4858 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4793,7 +4877,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
4793 4877
4794 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 4878 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
4795 trace_kvm_page_fault(gpa, exit_qualification); 4879 trace_kvm_page_fault(gpa, exit_qualification);
4796 return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); 4880
4881 /* It is a write fault? */
4882 error_code = exit_qualification & (1U << 1);
4883 /* ept page table is present? */
4884 error_code |= (exit_qualification >> 3) & 0x1;
4885
4886 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
4797} 4887}
4798 4888
4799static u64 ept_rsvd_mask(u64 spte, int level) 4889static u64 ept_rsvd_mask(u64 spte, int level)
@@ -4908,15 +4998,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4908 int ret = 1; 4998 int ret = 1;
4909 u32 cpu_exec_ctrl; 4999 u32 cpu_exec_ctrl;
4910 bool intr_window_requested; 5000 bool intr_window_requested;
5001 unsigned count = 130;
4911 5002
4912 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5003 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4913 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; 5004 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
4914 5005
4915 while (!guest_state_valid(vcpu)) { 5006 while (!guest_state_valid(vcpu) && count-- != 0) {
4916 if (intr_window_requested 5007 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
4917 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
4918 return handle_interrupt_window(&vmx->vcpu); 5008 return handle_interrupt_window(&vmx->vcpu);
4919 5009
5010 if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
5011 return 1;
5012
4920 err = emulate_instruction(vcpu, 0); 5013 err = emulate_instruction(vcpu, 0);
4921 5014
4922 if (err == EMULATE_DO_MMIO) { 5015 if (err == EMULATE_DO_MMIO) {
@@ -4924,8 +5017,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4924 goto out; 5017 goto out;
4925 } 5018 }
4926 5019
4927 if (err != EMULATE_DONE) 5020 if (err != EMULATE_DONE) {
5021 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5022 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5023 vcpu->run->internal.ndata = 0;
4928 return 0; 5024 return 0;
5025 }
4929 5026
4930 if (signal_pending(current)) 5027 if (signal_pending(current))
4931 goto out; 5028 goto out;
@@ -4933,7 +5030,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4933 schedule(); 5030 schedule();
4934 } 5031 }
4935 5032
4936 vmx->emulation_required = 0; 5033 vmx->emulation_required = !guest_state_valid(vcpu);
4937out: 5034out:
4938 return ret; 5035 return ret;
4939} 5036}
@@ -6467,6 +6564,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
6467 } 6564 }
6468 } 6565 }
6469 } 6566 }
6567
6568 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6569 /* Exposing INVPCID only when PCID is exposed */
6570 best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
6571 if (vmx_invpcid_supported() &&
6572 best && (best->ecx & bit(X86_FEATURE_INVPCID)) &&
6573 guest_cpuid_has_pcid(vcpu)) {
6574 exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
6575 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6576 exec_control);
6577 } else {
6578 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
6579 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6580 exec_control);
6581 if (best)
6582 best->ecx &= ~bit(X86_FEATURE_INVPCID);
6583 }
6470} 6584}
6471 6585
6472static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 6586static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -7201,6 +7315,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7201 .cpuid_update = vmx_cpuid_update, 7315 .cpuid_update = vmx_cpuid_update,
7202 7316
7203 .rdtscp_supported = vmx_rdtscp_supported, 7317 .rdtscp_supported = vmx_rdtscp_supported,
7318 .invpcid_supported = vmx_invpcid_supported,
7204 7319
7205 .set_supported_cpuid = vmx_set_supported_cpuid, 7320 .set_supported_cpuid = vmx_set_supported_cpuid,
7206 7321
@@ -7230,23 +7345,21 @@ static int __init vmx_init(void)
7230 if (!vmx_io_bitmap_a) 7345 if (!vmx_io_bitmap_a)
7231 return -ENOMEM; 7346 return -ENOMEM;
7232 7347
7348 r = -ENOMEM;
7349
7233 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); 7350 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
7234 if (!vmx_io_bitmap_b) { 7351 if (!vmx_io_bitmap_b)
7235 r = -ENOMEM;
7236 goto out; 7352 goto out;
7237 }
7238 7353
7239 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); 7354 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
7240 if (!vmx_msr_bitmap_legacy) { 7355 if (!vmx_msr_bitmap_legacy)
7241 r = -ENOMEM;
7242 goto out1; 7356 goto out1;
7243 } 7357
7244 7358
7245 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 7359 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
7246 if (!vmx_msr_bitmap_longmode) { 7360 if (!vmx_msr_bitmap_longmode)
7247 r = -ENOMEM;
7248 goto out2; 7361 goto out2;
7249 } 7362
7250 7363
7251 /* 7364 /*
7252 * Allow direct access to the PC debug port (it is often used for I/O 7365 * Allow direct access to the PC debug port (it is often used for I/O
@@ -7275,8 +7388,10 @@ static int __init vmx_init(void)
7275 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7388 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
7276 7389
7277 if (enable_ept) { 7390 if (enable_ept) {
7278 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 7391 kvm_mmu_set_mask_ptes(0ull,
7279 VMX_EPT_EXECUTABLE_MASK); 7392 (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
7393 (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
7394 0ull, VMX_EPT_EXECUTABLE_MASK);
7280 ept_set_mmio_spte_mask(); 7395 ept_set_mmio_spte_mask();
7281 kvm_enable_tdp(); 7396 kvm_enable_tdp();
7282 } else 7397 } else
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index be6d54929fa7..59b59508ff07 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
528 return 1; 528 return 1;
529 } 529 }
530 530
531 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
532 return 1;
533
531 kvm_x86_ops->set_cr0(vcpu, cr0); 534 kvm_x86_ops->set_cr0(vcpu, cr0);
532 535
533 if ((cr0 ^ old_cr0) & X86_CR0_PG) { 536 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
@@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
604 kvm_read_cr3(vcpu))) 607 kvm_read_cr3(vcpu)))
605 return 1; 608 return 1;
606 609
610 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
611 if (!guest_cpuid_has_pcid(vcpu))
612 return 1;
613
614 /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
615 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
616 return 1;
617 }
618
607 if (kvm_x86_ops->set_cr4(vcpu, cr4)) 619 if (kvm_x86_ops->set_cr4(vcpu, cr4))
608 return 1; 620 return 1;
609 621
610 if ((cr4 ^ old_cr4) & pdptr_bits) 622 if (((cr4 ^ old_cr4) & pdptr_bits) ||
623 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
611 kvm_mmu_reset_context(vcpu); 624 kvm_mmu_reset_context(vcpu);
612 625
613 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) 626 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
@@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
626 } 639 }
627 640
628 if (is_long_mode(vcpu)) { 641 if (is_long_mode(vcpu)) {
629 if (cr3 & CR3_L_MODE_RESERVED_BITS) 642 if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
630 return 1; 643 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
644 return 1;
645 } else
646 if (cr3 & CR3_L_MODE_RESERVED_BITS)
647 return 1;
631 } else { 648 } else {
632 if (is_pae(vcpu)) { 649 if (is_pae(vcpu)) {
633 if (cr3 & CR3_PAE_RESERVED_BITS) 650 if (cr3 & CR3_PAE_RESERVED_BITS)
@@ -795,6 +812,7 @@ static u32 msrs_to_save[] = {
795 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 812 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
796 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 813 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
797 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 814 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
815 MSR_KVM_PV_EOI_EN,
798 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 816 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
799 MSR_STAR, 817 MSR_STAR,
800#ifdef CONFIG_X86_64 818#ifdef CONFIG_X86_64
@@ -1437,8 +1455,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1437 break; 1455 break;
1438 } 1456 }
1439 default: 1457 default:
1440 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1458 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1441 "data 0x%llx\n", msr, data); 1459 "data 0x%llx\n", msr, data);
1442 return 1; 1460 return 1;
1443 } 1461 }
1444 return 0; 1462 return 0;
@@ -1470,8 +1488,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1470 case HV_X64_MSR_TPR: 1488 case HV_X64_MSR_TPR:
1471 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); 1489 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1472 default: 1490 default:
1473 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1491 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1474 "data 0x%llx\n", msr, data); 1492 "data 0x%llx\n", msr, data);
1475 return 1; 1493 return 1;
1476 } 1494 }
1477 1495
@@ -1551,15 +1569,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1551 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 1569 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1552 data &= ~(u64)0x8; /* ignore TLB cache disable */ 1570 data &= ~(u64)0x8; /* ignore TLB cache disable */
1553 if (data != 0) { 1571 if (data != 0) {
1554 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1572 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1555 data); 1573 data);
1556 return 1; 1574 return 1;
1557 } 1575 }
1558 break; 1576 break;
1559 case MSR_FAM10H_MMIO_CONF_BASE: 1577 case MSR_FAM10H_MMIO_CONF_BASE:
1560 if (data != 0) { 1578 if (data != 0) {
1561 pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 1579 vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
1562 "0x%llx\n", data); 1580 "0x%llx\n", data);
1563 return 1; 1581 return 1;
1564 } 1582 }
1565 break; 1583 break;
@@ -1574,8 +1592,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1574 thus reserved and should throw a #GP */ 1592 thus reserved and should throw a #GP */
1575 return 1; 1593 return 1;
1576 } 1594 }
1577 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 1595 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
1578 __func__, data); 1596 __func__, data);
1579 break; 1597 break;
1580 case MSR_IA32_UCODE_REV: 1598 case MSR_IA32_UCODE_REV:
1581 case MSR_IA32_UCODE_WRITE: 1599 case MSR_IA32_UCODE_WRITE:
@@ -1653,6 +1671,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1653 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 1671 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
1654 1672
1655 break; 1673 break;
1674 case MSR_KVM_PV_EOI_EN:
1675 if (kvm_lapic_enable_pv_eoi(vcpu, data))
1676 return 1;
1677 break;
1656 1678
1657 case MSR_IA32_MCG_CTL: 1679 case MSR_IA32_MCG_CTL:
1658 case MSR_IA32_MCG_STATUS: 1680 case MSR_IA32_MCG_STATUS:
@@ -1671,8 +1693,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1671 case MSR_K7_EVNTSEL2: 1693 case MSR_K7_EVNTSEL2:
1672 case MSR_K7_EVNTSEL3: 1694 case MSR_K7_EVNTSEL3:
1673 if (data != 0) 1695 if (data != 0)
1674 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1696 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1675 "0x%x data 0x%llx\n", msr, data); 1697 "0x%x data 0x%llx\n", msr, data);
1676 break; 1698 break;
1677 /* at least RHEL 4 unconditionally writes to the perfctr registers, 1699 /* at least RHEL 4 unconditionally writes to the perfctr registers,
1678 * so we ignore writes to make it happy. 1700 * so we ignore writes to make it happy.
@@ -1681,8 +1703,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1681 case MSR_K7_PERFCTR1: 1703 case MSR_K7_PERFCTR1:
1682 case MSR_K7_PERFCTR2: 1704 case MSR_K7_PERFCTR2:
1683 case MSR_K7_PERFCTR3: 1705 case MSR_K7_PERFCTR3:
1684 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1706 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1685 "0x%x data 0x%llx\n", msr, data); 1707 "0x%x data 0x%llx\n", msr, data);
1686 break; 1708 break;
1687 case MSR_P6_PERFCTR0: 1709 case MSR_P6_PERFCTR0:
1688 case MSR_P6_PERFCTR1: 1710 case MSR_P6_PERFCTR1:
@@ -1693,8 +1715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1693 return kvm_pmu_set_msr(vcpu, msr, data); 1715 return kvm_pmu_set_msr(vcpu, msr, data);
1694 1716
1695 if (pr || data != 0) 1717 if (pr || data != 0)
1696 pr_unimpl(vcpu, "disabled perfctr wrmsr: " 1718 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
1697 "0x%x data 0x%llx\n", msr, data); 1719 "0x%x data 0x%llx\n", msr, data);
1698 break; 1720 break;
1699 case MSR_K7_CLK_CTL: 1721 case MSR_K7_CLK_CTL:
1700 /* 1722 /*
@@ -1720,7 +1742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1720 /* Drop writes to this legacy MSR -- see rdmsr 1742 /* Drop writes to this legacy MSR -- see rdmsr
1721 * counterpart for further detail. 1743 * counterpart for further detail.
1722 */ 1744 */
1723 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); 1745 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1724 break; 1746 break;
1725 case MSR_AMD64_OSVW_ID_LENGTH: 1747 case MSR_AMD64_OSVW_ID_LENGTH:
1726 if (!guest_cpuid_has_osvw(vcpu)) 1748 if (!guest_cpuid_has_osvw(vcpu))
@@ -1738,12 +1760,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1738 if (kvm_pmu_msr(vcpu, msr)) 1760 if (kvm_pmu_msr(vcpu, msr))
1739 return kvm_pmu_set_msr(vcpu, msr, data); 1761 return kvm_pmu_set_msr(vcpu, msr, data);
1740 if (!ignore_msrs) { 1762 if (!ignore_msrs) {
1741 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1763 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
1742 msr, data); 1764 msr, data);
1743 return 1; 1765 return 1;
1744 } else { 1766 } else {
1745 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", 1767 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
1746 msr, data); 1768 msr, data);
1747 break; 1769 break;
1748 } 1770 }
1749 } 1771 }
@@ -1846,7 +1868,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1846 data = kvm->arch.hv_hypercall; 1868 data = kvm->arch.hv_hypercall;
1847 break; 1869 break;
1848 default: 1870 default:
1849 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1871 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1850 return 1; 1872 return 1;
1851 } 1873 }
1852 1874
@@ -1877,7 +1899,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1877 data = vcpu->arch.hv_vapic; 1899 data = vcpu->arch.hv_vapic;
1878 break; 1900 break;
1879 default: 1901 default:
1880 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1902 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1881 return 1; 1903 return 1;
1882 } 1904 }
1883 *pdata = data; 1905 *pdata = data;
@@ -2030,10 +2052,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2030 if (kvm_pmu_msr(vcpu, msr)) 2052 if (kvm_pmu_msr(vcpu, msr))
2031 return kvm_pmu_get_msr(vcpu, msr, pdata); 2053 return kvm_pmu_get_msr(vcpu, msr, pdata);
2032 if (!ignore_msrs) { 2054 if (!ignore_msrs) {
2033 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 2055 vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
2034 return 1; 2056 return 1;
2035 } else { 2057 } else {
2036 pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 2058 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
2037 data = 0; 2059 data = 0;
2038 } 2060 }
2039 break; 2061 break;
@@ -4116,7 +4138,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
4116 value = kvm_get_cr8(vcpu); 4138 value = kvm_get_cr8(vcpu);
4117 break; 4139 break;
4118 default: 4140 default:
4119 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4141 kvm_err("%s: unexpected cr %u\n", __func__, cr);
4120 return 0; 4142 return 0;
4121 } 4143 }
4122 4144
@@ -4145,7 +4167,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4145 res = kvm_set_cr8(vcpu, val); 4167 res = kvm_set_cr8(vcpu, val);
4146 break; 4168 break;
4147 default: 4169 default:
4148 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4170 kvm_err("%s: unexpected cr %u\n", __func__, cr);
4149 res = -1; 4171 res = -1;
4150 } 4172 }
4151 4173
@@ -4297,26 +4319,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
4297 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); 4319 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
4298} 4320}
4299 4321
4300static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, 4322static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
4301 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) 4323 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
4302{ 4324{
4303 struct kvm_cpuid_entry2 *cpuid = NULL; 4325 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
4304
4305 if (eax && ecx)
4306 cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
4307 *eax, *ecx);
4308
4309 if (cpuid) {
4310 *eax = cpuid->eax;
4311 *ecx = cpuid->ecx;
4312 if (ebx)
4313 *ebx = cpuid->ebx;
4314 if (edx)
4315 *edx = cpuid->edx;
4316 return true;
4317 }
4318
4319 return false;
4320} 4326}
4321 4327
4322static struct x86_emulate_ops emulate_ops = { 4328static struct x86_emulate_ops emulate_ops = {
@@ -5296,8 +5302,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5296 5302
5297 r = kvm_mmu_reload(vcpu); 5303 r = kvm_mmu_reload(vcpu);
5298 if (unlikely(r)) { 5304 if (unlikely(r)) {
5299 kvm_x86_ops->cancel_injection(vcpu); 5305 goto cancel_injection;
5300 goto out;
5301 } 5306 }
5302 5307
5303 preempt_disable(); 5308 preempt_disable();
@@ -5322,9 +5327,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5322 smp_wmb(); 5327 smp_wmb();
5323 local_irq_enable(); 5328 local_irq_enable();
5324 preempt_enable(); 5329 preempt_enable();
5325 kvm_x86_ops->cancel_injection(vcpu);
5326 r = 1; 5330 r = 1;
5327 goto out; 5331 goto cancel_injection;
5328 } 5332 }
5329 5333
5330 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5334 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -5388,9 +5392,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5388 if (unlikely(vcpu->arch.tsc_always_catchup)) 5392 if (unlikely(vcpu->arch.tsc_always_catchup))
5389 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 5393 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5390 5394
5391 kvm_lapic_sync_from_vapic(vcpu); 5395 if (vcpu->arch.apic_attention)
5396 kvm_lapic_sync_from_vapic(vcpu);
5392 5397
5393 r = kvm_x86_ops->handle_exit(vcpu); 5398 r = kvm_x86_ops->handle_exit(vcpu);
5399 return r;
5400
5401cancel_injection:
5402 kvm_x86_ops->cancel_injection(vcpu);
5403 if (unlikely(vcpu->arch.apic_attention))
5404 kvm_lapic_sync_from_vapic(vcpu);
5394out: 5405out:
5395 return r; 5406 return r;
5396} 5407}
@@ -6304,7 +6315,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
6304 6315
6305 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6316 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6306 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { 6317 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
6307 vfree(free->arch.lpage_info[i]); 6318 kvm_kvfree(free->arch.lpage_info[i]);
6308 free->arch.lpage_info[i] = NULL; 6319 free->arch.lpage_info[i] = NULL;
6309 } 6320 }
6310 } 6321 }
@@ -6323,7 +6334,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6323 slot->base_gfn, level) + 1; 6334 slot->base_gfn, level) + 1;
6324 6335
6325 slot->arch.lpage_info[i] = 6336 slot->arch.lpage_info[i] =
6326 vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); 6337 kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
6327 if (!slot->arch.lpage_info[i]) 6338 if (!slot->arch.lpage_info[i])
6328 goto out_free; 6339 goto out_free;
6329 6340
@@ -6350,7 +6361,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6350 6361
6351out_free: 6362out_free:
6352 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6363 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6353 vfree(slot->arch.lpage_info[i]); 6364 kvm_kvfree(slot->arch.lpage_info[i]);
6354 slot->arch.lpage_info[i] = NULL; 6365 slot->arch.lpage_info[i] = NULL;
6355 } 6366 }
6356 return -ENOMEM; 6367 return -ENOMEM;
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 459b58a8a15c..25b7ae8d058a 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -115,7 +115,7 @@ EXPORT_SYMBOL(csum_partial_copy_to_user);
115 * @src: source address 115 * @src: source address
116 * @dst: destination address 116 * @dst: destination address
117 * @len: number of bytes to be copied. 117 * @len: number of bytes to be copied.
118 * @isum: initial sum that is added into the result (32bit unfolded) 118 * @sum: initial sum that is added into the result (32bit unfolded)
119 * 119 *
120 * Returns an 32bit unfolded checksum of the buffer. 120 * Returns an 32bit unfolded checksum of the buffer.
121 */ 121 */
diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c
index a311cc59b65d..8d6ef78b5d01 100644
--- a/arch/x86/lib/msr-reg-export.c
+++ b/arch/x86/lib/msr-reg-export.c
@@ -1,5 +1,5 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <asm/msr.h> 2#include <asm/msr.h>
3 3
4EXPORT_SYMBOL(native_rdmsr_safe_regs); 4EXPORT_SYMBOL(rdmsr_safe_regs);
5EXPORT_SYMBOL(native_wrmsr_safe_regs); 5EXPORT_SYMBOL(wrmsr_safe_regs);
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index 69fa10623f21..f6d13eefad10 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -6,13 +6,13 @@
6 6
7#ifdef CONFIG_X86_64 7#ifdef CONFIG_X86_64
8/* 8/*
9 * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]); 9 * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]);
10 * 10 *
11 * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] 11 * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi]
12 * 12 *
13 */ 13 */
14.macro op_safe_regs op 14.macro op_safe_regs op
15ENTRY(native_\op\()_safe_regs) 15ENTRY(\op\()_safe_regs)
16 CFI_STARTPROC 16 CFI_STARTPROC
17 pushq_cfi %rbx 17 pushq_cfi %rbx
18 pushq_cfi %rbp 18 pushq_cfi %rbp
@@ -45,13 +45,13 @@ ENTRY(native_\op\()_safe_regs)
45 45
46 _ASM_EXTABLE(1b, 3b) 46 _ASM_EXTABLE(1b, 3b)
47 CFI_ENDPROC 47 CFI_ENDPROC
48ENDPROC(native_\op\()_safe_regs) 48ENDPROC(\op\()_safe_regs)
49.endm 49.endm
50 50
51#else /* X86_32 */ 51#else /* X86_32 */
52 52
53.macro op_safe_regs op 53.macro op_safe_regs op
54ENTRY(native_\op\()_safe_regs) 54ENTRY(\op\()_safe_regs)
55 CFI_STARTPROC 55 CFI_STARTPROC
56 pushl_cfi %ebx 56 pushl_cfi %ebx
57 pushl_cfi %ebp 57 pushl_cfi %ebp
@@ -92,7 +92,7 @@ ENTRY(native_\op\()_safe_regs)
92 92
93 _ASM_EXTABLE(1b, 3b) 93 _ASM_EXTABLE(1b, 3b)
94 CFI_ENDPROC 94 CFI_ENDPROC
95ENDPROC(native_\op\()_safe_regs) 95ENDPROC(\op\()_safe_regs)
96.endm 96.endm
97 97
98#endif 98#endif
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index bc4e9d84157f..e0e6990723e9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -385,7 +385,7 @@ void free_initmem(void)
385} 385}
386 386
387#ifdef CONFIG_BLK_DEV_INITRD 387#ifdef CONFIG_BLK_DEV_INITRD
388void free_initrd_mem(unsigned long start, unsigned long end) 388void __init free_initrd_mem(unsigned long start, unsigned long end)
389{ 389{
390 /* 390 /*
391 * end could be not aligned, and We can not align that, 391 * end could be not aligned, and We can not align that,
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a718e0d23503..931930a96160 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -919,11 +919,13 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
919 919
920 /* 920 /*
921 * On success we use clflush, when the CPU supports it to 921 * On success we use clflush, when the CPU supports it to
922 * avoid the wbindv. If the CPU does not support it and in the 922 * avoid the wbindv. If the CPU does not support it, in the
923 * error case we fall back to cpa_flush_all (which uses 923 * error case, and during early boot (for EFI) we fall back
924 * wbindv): 924 * to cpa_flush_all (which uses wbinvd):
925 */ 925 */
926 if (!ret && cpu_has_clflush) { 926 if (early_boot_irqs_disabled)
927 __cpa_flush_all((void *)(long)cache);
928 else if (!ret && cpu_has_clflush) {
927 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { 929 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
928 cpa_flush_array(addr, numpages, cache, 930 cpa_flush_array(addr, numpages, cache,
929 cpa.flags, pages); 931 cpa.flags, pages);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5e57e113b72c..613cd83e8c0c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
12#include <asm/cache.h> 12#include <asm/cache.h>
13#include <asm/apic.h> 13#include <asm/apic.h>
14#include <asm/uv/uv.h> 14#include <asm/uv/uv.h>
15#include <linux/debugfs.h>
15 16
16DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) 17DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
17 = { &init_mm, 0, }; 18 = { &init_mm, 0, };
@@ -27,33 +28,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
27 * 28 *
28 * More scalable flush, from Andi Kleen 29 * More scalable flush, from Andi Kleen
29 * 30 *
30 * To avoid global state use 8 different call vectors. 31 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
31 * Each CPU uses a specific vector to trigger flushes on other
32 * CPUs. Depending on the received vector the target CPUs look into
33 * the right array slot for the flush data.
34 *
35 * With more than 8 CPUs they are hashed to the 8 available
36 * vectors. The limited global vector space forces us to this right now.
37 * In future when interrupts are split into per CPU domains this could be
38 * fixed, at the cost of triggering multiple IPIs in some cases.
39 */ 32 */
40 33
41union smp_flush_state { 34struct flush_tlb_info {
42 struct { 35 struct mm_struct *flush_mm;
43 struct mm_struct *flush_mm; 36 unsigned long flush_start;
44 unsigned long flush_va; 37 unsigned long flush_end;
45 raw_spinlock_t tlbstate_lock; 38};
46 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
47 };
48 char pad[INTERNODE_CACHE_BYTES];
49} ____cacheline_internodealigned_in_smp;
50
51/* State is put into the per CPU data section, but padded
52 to a full cache line because other CPUs can access it and we don't
53 want false sharing in the per cpu data segment. */
54static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
55
56static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
57 39
58/* 40/*
59 * We cannot call mmdrop() because we are in interrupt context, 41 * We cannot call mmdrop() because we are in interrupt context,
@@ -72,28 +54,25 @@ void leave_mm(int cpu)
72EXPORT_SYMBOL_GPL(leave_mm); 54EXPORT_SYMBOL_GPL(leave_mm);
73 55
74/* 56/*
75 *
76 * The flush IPI assumes that a thread switch happens in this order: 57 * The flush IPI assumes that a thread switch happens in this order:
77 * [cpu0: the cpu that switches] 58 * [cpu0: the cpu that switches]
78 * 1) switch_mm() either 1a) or 1b) 59 * 1) switch_mm() either 1a) or 1b)
79 * 1a) thread switch to a different mm 60 * 1a) thread switch to a different mm
80 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); 61 * 1a1) set cpu_tlbstate to TLBSTATE_OK
81 * Stop ipi delivery for the old mm. This is not synchronized with 62 * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
82 * the other cpus, but smp_invalidate_interrupt ignore flush ipis 63 * if cpu0 was in lazy tlb mode.
83 * for the wrong mm, and in the worst case we perform a superfluous 64 * 1a2) update cpu active_mm
84 * tlb flush.
85 * 1a2) set cpu mmu_state to TLBSTATE_OK
86 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
87 * was in lazy tlb mode.
88 * 1a3) update cpu active_mm
89 * Now cpu0 accepts tlb flushes for the new mm. 65 * Now cpu0 accepts tlb flushes for the new mm.
90 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); 66 * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
91 * Now the other cpus will send tlb flush ipis. 67 * Now the other cpus will send tlb flush ipis.
92 * 1a4) change cr3. 68 * 1a4) change cr3.
69 * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
70 * Stop ipi delivery for the old mm. This is not synchronized with
71 * the other cpus, but flush_tlb_func ignore flush ipis for the wrong
72 * mm, and in the worst case we perform a superfluous tlb flush.
93 * 1b) thread switch without mm change 73 * 1b) thread switch without mm change
94 * cpu active_mm is correct, cpu0 already handles 74 * cpu active_mm is correct, cpu0 already handles flush ipis.
95 * flush ipis. 75 * 1b1) set cpu_tlbstate to TLBSTATE_OK
96 * 1b1) set cpu mmu_state to TLBSTATE_OK
97 * 1b2) test_and_set the cpu bit in cpu_vm_mask. 76 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
98 * Atomically set the bit [other cpus will start sending flush ipis], 77 * Atomically set the bit [other cpus will start sending flush ipis],
99 * and test the bit. 78 * and test the bit.
@@ -106,174 +85,62 @@ EXPORT_SYMBOL_GPL(leave_mm);
106 * runs in kernel space, the cpu could load tlb entries for user space 85 * runs in kernel space, the cpu could load tlb entries for user space
107 * pages. 86 * pages.
108 * 87 *
109 * The good news is that cpu mmu_state is local to each cpu, no 88 * The good news is that cpu_tlbstate is local to each cpu, no
110 * write/read ordering problems. 89 * write/read ordering problems.
111 */ 90 */
112 91
113/* 92/*
114 * TLB flush IPI: 93 * TLB flush funcation:
115 *
116 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. 94 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
117 * 2) Leave the mm if we are in the lazy tlb mode. 95 * 2) Leave the mm if we are in the lazy tlb mode.
118 *
119 * Interrupts are disabled.
120 */
121
122/*
123 * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop
124 * but still used for documentation purpose but the usage is slightly
125 * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
126 * entry calls in with the first parameter in %eax. Maybe define
127 * intrlinkage?
128 */ 96 */
129#ifdef CONFIG_X86_64 97static void flush_tlb_func(void *info)
130asmlinkage
131#endif
132void smp_invalidate_interrupt(struct pt_regs *regs)
133{ 98{
134 unsigned int cpu; 99 struct flush_tlb_info *f = info;
135 unsigned int sender;
136 union smp_flush_state *f;
137
138 cpu = smp_processor_id();
139 /*
140 * orig_rax contains the negated interrupt vector.
141 * Use that to determine where the sender put the data.
142 */
143 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
144 f = &flush_state[sender];
145
146 if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
147 goto out;
148 /*
149 * This was a BUG() but until someone can quote me the
150 * line from the intel manual that guarantees an IPI to
151 * multiple CPUs is retried _only_ on the erroring CPUs
152 * its staying as a return
153 *
154 * BUG();
155 */
156
157 if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) {
158 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
159 if (f->flush_va == TLB_FLUSH_ALL)
160 local_flush_tlb();
161 else
162 __flush_tlb_one(f->flush_va);
163 } else
164 leave_mm(cpu);
165 }
166out:
167 ack_APIC_irq();
168 smp_mb__before_clear_bit();
169 cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
170 smp_mb__after_clear_bit();
171 inc_irq_stat(irq_tlb_count);
172}
173 100
174static void flush_tlb_others_ipi(const struct cpumask *cpumask, 101 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
175 struct mm_struct *mm, unsigned long va) 102 return;
176{ 103
177 unsigned int sender; 104 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
178 union smp_flush_state *f; 105 if (f->flush_end == TLB_FLUSH_ALL || !cpu_has_invlpg)
179 106 local_flush_tlb();
180 /* Caller has disabled preemption */ 107 else if (!f->flush_end)
181 sender = this_cpu_read(tlb_vector_offset); 108 __flush_tlb_single(f->flush_start);
182 f = &flush_state[sender]; 109 else {
183 110 unsigned long addr;
184 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) 111 addr = f->flush_start;
185 raw_spin_lock(&f->tlbstate_lock); 112 while (addr < f->flush_end) {
186 113 __flush_tlb_single(addr);
187 f->flush_mm = mm; 114 addr += PAGE_SIZE;
188 f->flush_va = va; 115 }
189 if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { 116 }
190 /* 117 } else
191 * We have to send the IPI only to 118 leave_mm(smp_processor_id());
192 * CPUs affected.
193 */
194 apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
195 INVALIDATE_TLB_VECTOR_START + sender);
196
197 while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
198 cpu_relax();
199 }
200 119
201 f->flush_mm = NULL;
202 f->flush_va = 0;
203 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
204 raw_spin_unlock(&f->tlbstate_lock);
205} 120}
206 121
207void native_flush_tlb_others(const struct cpumask *cpumask, 122void native_flush_tlb_others(const struct cpumask *cpumask,
208 struct mm_struct *mm, unsigned long va) 123 struct mm_struct *mm, unsigned long start,
124 unsigned long end)
209{ 125{
126 struct flush_tlb_info info;
127 info.flush_mm = mm;
128 info.flush_start = start;
129 info.flush_end = end;
130
210 if (is_uv_system()) { 131 if (is_uv_system()) {
211 unsigned int cpu; 132 unsigned int cpu;
212 133
213 cpu = smp_processor_id(); 134 cpu = smp_processor_id();
214 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); 135 cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
215 if (cpumask) 136 if (cpumask)
216 flush_tlb_others_ipi(cpumask, mm, va); 137 smp_call_function_many(cpumask, flush_tlb_func,
138 &info, 1);
217 return; 139 return;
218 } 140 }
219 flush_tlb_others_ipi(cpumask, mm, va); 141 smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
220} 142}
221 143
222static void __cpuinit calculate_tlb_offset(void)
223{
224 int cpu, node, nr_node_vecs, idx = 0;
225 /*
226 * we are changing tlb_vector_offset for each CPU in runtime, but this
227 * will not cause inconsistency, as the write is atomic under X86. we
228 * might see more lock contentions in a short time, but after all CPU's
229 * tlb_vector_offset are changed, everything should go normal
230 *
231 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
232 * waste some vectors.
233 **/
234 if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
235 nr_node_vecs = 1;
236 else
237 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
238
239 for_each_online_node(node) {
240 int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
241 nr_node_vecs;
242 int cpu_offset = 0;
243 for_each_cpu(cpu, cpumask_of_node(node)) {
244 per_cpu(tlb_vector_offset, cpu) = node_offset +
245 cpu_offset;
246 cpu_offset++;
247 cpu_offset = cpu_offset % nr_node_vecs;
248 }
249 idx++;
250 }
251}
252
253static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
254 unsigned long action, void *hcpu)
255{
256 switch (action & 0xf) {
257 case CPU_ONLINE:
258 case CPU_DEAD:
259 calculate_tlb_offset();
260 }
261 return NOTIFY_OK;
262}
263
264static int __cpuinit init_smp_flush(void)
265{
266 int i;
267
268 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
269 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
270
271 calculate_tlb_offset();
272 hotcpu_notifier(tlb_cpuhp_notify, 0);
273 return 0;
274}
275core_initcall(init_smp_flush);
276
277void flush_tlb_current_task(void) 144void flush_tlb_current_task(void)
278{ 145{
279 struct mm_struct *mm = current->mm; 146 struct mm_struct *mm = current->mm;
@@ -282,27 +149,91 @@ void flush_tlb_current_task(void)
282 149
283 local_flush_tlb(); 150 local_flush_tlb();
284 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 151 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
285 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); 152 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
286 preempt_enable(); 153 preempt_enable();
287} 154}
288 155
289void flush_tlb_mm(struct mm_struct *mm) 156/*
157 * It can find out the THP large page, or
158 * HUGETLB page in tlb_flush when THP disabled
159 */
160static inline unsigned long has_large_page(struct mm_struct *mm,
161 unsigned long start, unsigned long end)
162{
163 pgd_t *pgd;
164 pud_t *pud;
165 pmd_t *pmd;
166 unsigned long addr = ALIGN(start, HPAGE_SIZE);
167 for (; addr < end; addr += HPAGE_SIZE) {
168 pgd = pgd_offset(mm, addr);
169 if (likely(!pgd_none(*pgd))) {
170 pud = pud_offset(pgd, addr);
171 if (likely(!pud_none(*pud))) {
172 pmd = pmd_offset(pud, addr);
173 if (likely(!pmd_none(*pmd)))
174 if (pmd_large(*pmd))
175 return addr;
176 }
177 }
178 }
179 return 0;
180}
181
182void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
183 unsigned long end, unsigned long vmflag)
290{ 184{
185 unsigned long addr;
186 unsigned act_entries, tlb_entries = 0;
187
291 preempt_disable(); 188 preempt_disable();
189 if (current->active_mm != mm)
190 goto flush_all;
292 191
293 if (current->active_mm == mm) { 192 if (!current->mm) {
294 if (current->mm) 193 leave_mm(smp_processor_id());
194 goto flush_all;
195 }
196
197 if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
198 || vmflag == VM_HUGETLB) {
199 local_flush_tlb();
200 goto flush_all;
201 }
202
203 /* In modern CPU, last level tlb used for both data/ins */
204 if (vmflag & VM_EXEC)
205 tlb_entries = tlb_lli_4k[ENTRIES];
206 else
207 tlb_entries = tlb_lld_4k[ENTRIES];
208 /* Assume all of TLB entries was occupied by this task */
209 act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
210
211 /* tlb_flushall_shift is on balance point, details in commit log */
212 if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
213 local_flush_tlb();
214 else {
215 if (has_large_page(mm, start, end)) {
295 local_flush_tlb(); 216 local_flush_tlb();
296 else 217 goto flush_all;
297 leave_mm(smp_processor_id()); 218 }
219 /* flush range by one by one 'invlpg' */
220 for (addr = start; addr < end; addr += PAGE_SIZE)
221 __flush_tlb_single(addr);
222
223 if (cpumask_any_but(mm_cpumask(mm),
224 smp_processor_id()) < nr_cpu_ids)
225 flush_tlb_others(mm_cpumask(mm), mm, start, end);
226 preempt_enable();
227 return;
298 } 228 }
299 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
300 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
301 229
230flush_all:
231 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
232 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
302 preempt_enable(); 233 preempt_enable();
303} 234}
304 235
305void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) 236void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
306{ 237{
307 struct mm_struct *mm = vma->vm_mm; 238 struct mm_struct *mm = vma->vm_mm;
308 239
@@ -310,13 +241,13 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
310 241
311 if (current->active_mm == mm) { 242 if (current->active_mm == mm) {
312 if (current->mm) 243 if (current->mm)
313 __flush_tlb_one(va); 244 __flush_tlb_one(start);
314 else 245 else
315 leave_mm(smp_processor_id()); 246 leave_mm(smp_processor_id());
316 } 247 }
317 248
318 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 249 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
319 flush_tlb_others(mm_cpumask(mm), mm, va); 250 flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
320 251
321 preempt_enable(); 252 preempt_enable();
322} 253}
@@ -332,3 +263,83 @@ void flush_tlb_all(void)
332{ 263{
333 on_each_cpu(do_flush_tlb_all, NULL, 1); 264 on_each_cpu(do_flush_tlb_all, NULL, 1);
334} 265}
266
267static void do_kernel_range_flush(void *info)
268{
269 struct flush_tlb_info *f = info;
270 unsigned long addr;
271
272 /* flush range by one by one 'invlpg' */
273 for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
274 __flush_tlb_single(addr);
275}
276
277void flush_tlb_kernel_range(unsigned long start, unsigned long end)
278{
279 unsigned act_entries;
280 struct flush_tlb_info info;
281
282 /* In modern CPU, last level tlb used for both data/ins */
283 act_entries = tlb_lld_4k[ENTRIES];
284
285 /* Balance as user space task's flush, a bit conservative */
286 if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
287 (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
288
289 on_each_cpu(do_flush_tlb_all, NULL, 1);
290 else {
291 info.flush_start = start;
292 info.flush_end = end;
293 on_each_cpu(do_kernel_range_flush, &info, 1);
294 }
295}
296
297#ifdef CONFIG_DEBUG_TLBFLUSH
298static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
299 size_t count, loff_t *ppos)
300{
301 char buf[32];
302 unsigned int len;
303
304 len = sprintf(buf, "%hd\n", tlb_flushall_shift);
305 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
306}
307
308static ssize_t tlbflush_write_file(struct file *file,
309 const char __user *user_buf, size_t count, loff_t *ppos)
310{
311 char buf[32];
312 ssize_t len;
313 s8 shift;
314
315 len = min(count, sizeof(buf) - 1);
316 if (copy_from_user(buf, user_buf, len))
317 return -EFAULT;
318
319 buf[len] = '\0';
320 if (kstrtos8(buf, 0, &shift))
321 return -EINVAL;
322
323 if (shift > 64)
324 return -EINVAL;
325
326 tlb_flushall_shift = shift;
327 return count;
328}
329
330static const struct file_operations fops_tlbflush = {
331 .read = tlbflush_read_file,
332 .write = tlbflush_write_file,
333 .llseek = default_llseek,
334};
335
336static int __cpuinit create_tlb_flushall_shift(void)
337{
338 if (cpu_has_invlpg) {
339 debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
340 arch_debugfs_dir, NULL, &fops_tlbflush);
341 }
342 return 0;
343}
344late_initcall(create_tlb_flushall_shift);
345#endif
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 0597f95b6da6..33643a8bcbbb 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -309,6 +309,10 @@ void bpf_jit_compile(struct sk_filter *fp)
309 else 309 else
310 EMIT1_off32(0x0d, K); /* or imm32,%eax */ 310 EMIT1_off32(0x0d, K); /* or imm32,%eax */
311 break; 311 break;
312 case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */
313 seen |= SEEN_XREG;
314 EMIT2(0x31, 0xd8); /* xor %ebx,%eax */
315 break;
312 case BPF_S_ALU_LSH_X: /* A <<= X; */ 316 case BPF_S_ALU_LSH_X: /* A <<= X; */
313 seen |= SEEN_XREG; 317 seen |= SEEN_XREG;
314 EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */ 318 EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 303f08637826..b2b94438ff05 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -312,7 +312,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
312 goto fail; 312 goto fail;
313 } 313 }
314 /* both registers must be reserved */ 314 /* both registers must be reserved */
315 if (num_counters == AMD64_NUM_COUNTERS_F15H) { 315 if (num_counters == AMD64_NUM_COUNTERS_CORE) {
316 msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); 316 msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
317 msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); 317 msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
318 } else { 318 } else {
@@ -514,7 +514,7 @@ static int op_amd_init(struct oprofile_operations *ops)
514 ops->create_files = setup_ibs_files; 514 ops->create_files = setup_ibs_files;
515 515
516 if (boot_cpu_data.x86 == 0x15) { 516 if (boot_cpu_data.x86 == 0x15) {
517 num_counters = AMD64_NUM_COUNTERS_F15H; 517 num_counters = AMD64_NUM_COUNTERS_CORE;
518 } else { 518 } else {
519 num_counters = AMD64_NUM_COUNTERS; 519 num_counters = AMD64_NUM_COUNTERS;
520 } 520 }
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index fc09c2754e08..505acdd6d600 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -12,8 +12,13 @@ struct pci_root_info {
12 char name[16]; 12 char name[16];
13 unsigned int res_num; 13 unsigned int res_num;
14 struct resource *res; 14 struct resource *res;
15 int busnum;
16 struct pci_sysdata sd; 15 struct pci_sysdata sd;
16#ifdef CONFIG_PCI_MMCONFIG
17 bool mcfg_added;
18 u16 segment;
19 u8 start_bus;
20 u8 end_bus;
21#endif
17}; 22};
18 23
19static bool pci_use_crs = true; 24static bool pci_use_crs = true;
@@ -120,6 +125,81 @@ void __init pci_acpi_crs_quirks(void)
120 pci_use_crs ? "nocrs" : "use_crs"); 125 pci_use_crs ? "nocrs" : "use_crs");
121} 126}
122 127
128#ifdef CONFIG_PCI_MMCONFIG
129static int __devinit check_segment(u16 seg, struct device *dev, char *estr)
130{
131 if (seg) {
132 dev_err(dev,
133 "%s can't access PCI configuration "
134 "space under this host bridge.\n",
135 estr);
136 return -EIO;
137 }
138
139 /*
140 * Failure in adding MMCFG information is not fatal,
141 * just can't access extended configuration space of
142 * devices under this host bridge.
143 */
144 dev_warn(dev,
145 "%s can't access extended PCI configuration "
146 "space under this bridge.\n",
147 estr);
148
149 return 0;
150}
151
152static int __devinit setup_mcfg_map(struct pci_root_info *info,
153 u16 seg, u8 start, u8 end,
154 phys_addr_t addr)
155{
156 int result;
157 struct device *dev = &info->bridge->dev;
158
159 info->start_bus = start;
160 info->end_bus = end;
161 info->mcfg_added = false;
162
163 /* return success if MMCFG is not in use */
164 if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg)
165 return 0;
166
167 if (!(pci_probe & PCI_PROBE_MMCONF))
168 return check_segment(seg, dev, "MMCONFIG is disabled,");
169
170 result = pci_mmconfig_insert(dev, seg, start, end, addr);
171 if (result == 0) {
172 /* enable MMCFG if it hasn't been enabled yet */
173 if (raw_pci_ext_ops == NULL)
174 raw_pci_ext_ops = &pci_mmcfg;
175 info->mcfg_added = true;
176 } else if (result != -EEXIST)
177 return check_segment(seg, dev,
178 "fail to add MMCONFIG information,");
179
180 return 0;
181}
182
183static void teardown_mcfg_map(struct pci_root_info *info)
184{
185 if (info->mcfg_added) {
186 pci_mmconfig_delete(info->segment, info->start_bus,
187 info->end_bus);
188 info->mcfg_added = false;
189 }
190}
191#else
192static int __devinit setup_mcfg_map(struct pci_root_info *info,
193 u16 seg, u8 start, u8 end,
194 phys_addr_t addr)
195{
196 return 0;
197}
/* Without CONFIG_PCI_MMCONFIG nothing was registered, so nothing to undo. */
static void teardown_mcfg_map(struct pci_root_info *info)
{
}
201#endif
202
123static acpi_status 203static acpi_status
124resource_to_addr(struct acpi_resource *resource, 204resource_to_addr(struct acpi_resource *resource,
125 struct acpi_resource_address64 *addr) 205 struct acpi_resource_address64 *addr)
@@ -234,13 +314,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
234 } 314 }
235 315
236 info->res_num++; 316 info->res_num++;
237 if (addr.translation_offset)
238 dev_info(&info->bridge->dev, "host bridge window %pR "
239 "(PCI address [%#llx-%#llx])\n",
240 res, res->start - addr.translation_offset,
241 res->end - addr.translation_offset);
242 else
243 dev_info(&info->bridge->dev, "host bridge window %pR\n", res);
244 317
245 return AE_OK; 318 return AE_OK;
246} 319}
@@ -332,8 +405,11 @@ static void __release_pci_root_info(struct pci_root_info *info)
332 405
333 free_pci_root_info_res(info); 406 free_pci_root_info_res(info);
334 407
408 teardown_mcfg_map(info);
409
335 kfree(info); 410 kfree(info);
336} 411}
412
337static void release_pci_root_info(struct pci_host_bridge *bridge) 413static void release_pci_root_info(struct pci_host_bridge *bridge)
338{ 414{
339 struct pci_root_info *info = bridge->release_data; 415 struct pci_root_info *info = bridge->release_data;
@@ -347,7 +423,9 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,
347{ 423{
348 size_t size; 424 size_t size;
349 425
426 sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
350 info->bridge = device; 427 info->bridge = device;
428
351 info->res_num = 0; 429 info->res_num = 0;
352 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, 430 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
353 info); 431 info);
@@ -360,8 +438,6 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,
360 if (!info->res) 438 if (!info->res)
361 return; 439 return;
362 440
363 sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
364
365 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, 441 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
366 info); 442 info);
367} 443}
@@ -373,7 +449,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
373 int domain = root->segment; 449 int domain = root->segment;
374 int busnum = root->secondary.start; 450 int busnum = root->secondary.start;
375 LIST_HEAD(resources); 451 LIST_HEAD(resources);
376 struct pci_bus *bus; 452 struct pci_bus *bus = NULL;
377 struct pci_sysdata *sd; 453 struct pci_sysdata *sd;
378 int node; 454 int node;
379#ifdef CONFIG_ACPI_NUMA 455#ifdef CONFIG_ACPI_NUMA
@@ -426,6 +502,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
426 } else { 502 } else {
427 probe_pci_root_info(info, device, busnum, domain); 503 probe_pci_root_info(info, device, busnum, domain);
428 504
505 /* insert busn res at first */
506 pci_add_resource(&resources, &root->secondary);
429 /* 507 /*
430 * _CRS with no apertures is normal, so only fall back to 508 * _CRS with no apertures is normal, so only fall back to
431 * defaults or native bridge info if we're ignoring _CRS. 509 * defaults or native bridge info if we're ignoring _CRS.
@@ -437,10 +515,13 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
437 x86_pci_root_bus_resources(busnum, &resources); 515 x86_pci_root_bus_resources(busnum, &resources);
438 } 516 }
439 517
440 bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, 518 if (!setup_mcfg_map(info, domain, (u8)root->secondary.start,
441 &resources); 519 (u8)root->secondary.end, root->mcfg_addr))
520 bus = pci_create_root_bus(NULL, busnum, &pci_root_ops,
521 sd, &resources);
522
442 if (bus) { 523 if (bus) {
443 bus->subordinate = pci_scan_child_bus(bus); 524 pci_scan_child_bus(bus);
444 pci_set_host_bridge_release( 525 pci_set_host_bridge_release(
445 to_pci_host_bridge(bus->bridge), 526 to_pci_host_bridge(bus->bridge),
446 release_pci_root_info, info); 527 release_pci_root_info, info);
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 5aed49bff058..e9e6ed5cdf94 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -121,7 +121,6 @@ static int __init early_fill_mp_bus_info(void)
121 link = (reg >> 8) & 0x03; 121 link = (reg >> 8) & 0x03;
122 122
123 info = alloc_pci_root_info(min_bus, max_bus, node, link); 123 info = alloc_pci_root_info(min_bus, max_bus, node, link);
124 sprintf(info->name, "PCI Bus #%02x", min_bus);
125 } 124 }
126 125
127 /* get the default node and link for left over res */ 126 /* get the default node and link for left over res */
@@ -300,9 +299,9 @@ static int __init early_fill_mp_bus_info(void)
300 int busnum; 299 int busnum;
301 struct pci_root_res *root_res; 300 struct pci_root_res *root_res;
302 301
303 busnum = info->bus_min; 302 busnum = info->busn.start;
304 printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n", 303 printk(KERN_DEBUG "bus: %pR on node %x link %x\n",
305 info->bus_min, info->bus_max, info->node, info->link); 304 &info->busn, info->node, info->link);
306 list_for_each_entry(root_res, &info->resources, list) 305 list_for_each_entry(root_res, &info->resources, list)
307 printk(KERN_DEBUG "bus: %02x %pR\n", 306 printk(KERN_DEBUG "bus: %02x %pR\n",
308 busnum, &root_res->res); 307 busnum, &root_res->res);
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 306579f7d0fd..d37e2fec97e5 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -14,7 +14,7 @@ static struct pci_root_info *x86_find_pci_root_info(int bus)
14 return NULL; 14 return NULL;
15 15
16 list_for_each_entry(info, &pci_root_infos, list) 16 list_for_each_entry(info, &pci_root_infos, list)
17 if (info->bus_min == bus) 17 if (info->busn.start == bus)
18 return info; 18 return info;
19 19
20 return NULL; 20 return NULL;
@@ -24,6 +24,8 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
24{ 24{
25 struct pci_root_info *info = x86_find_pci_root_info(bus); 25 struct pci_root_info *info = x86_find_pci_root_info(bus);
26 struct pci_root_res *root_res; 26 struct pci_root_res *root_res;
27 struct pci_host_bridge_window *window;
28 bool found = false;
27 29
28 if (!info) 30 if (!info)
29 goto default_resources; 31 goto default_resources;
@@ -31,6 +33,16 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
31 printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n", 33 printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",
32 bus); 34 bus);
33 35
36 /* already added by acpi ? */
37 list_for_each_entry(window, resources, list)
38 if (window->res->flags & IORESOURCE_BUS) {
39 found = true;
40 break;
41 }
42
43 if (!found)
44 pci_add_resource(resources, &info->busn);
45
34 list_for_each_entry(root_res, &info->resources, list) { 46 list_for_each_entry(root_res, &info->resources, list) {
35 struct resource *res; 47 struct resource *res;
36 struct resource *root; 48 struct resource *root;
@@ -66,9 +78,13 @@ struct pci_root_info __init *alloc_pci_root_info(int bus_min, int bus_max,
66 if (!info) 78 if (!info)
67 return info; 79 return info;
68 80
81 sprintf(info->name, "PCI Bus #%02x", bus_min);
82
69 INIT_LIST_HEAD(&info->resources); 83 INIT_LIST_HEAD(&info->resources);
70 info->bus_min = bus_min; 84 info->busn.name = info->name;
71 info->bus_max = bus_max; 85 info->busn.start = bus_min;
86 info->busn.end = bus_max;
87 info->busn.flags = IORESOURCE_BUS;
72 info->node = node; 88 info->node = node;
73 info->link = link; 89 info->link = link;
74 90
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
index 226a466b2b2b..ff8f65b04574 100644
--- a/arch/x86/pci/bus_numa.h
+++ b/arch/x86/pci/bus_numa.h
@@ -13,8 +13,7 @@ struct pci_root_info {
13 struct list_head list; 13 struct list_head list;
14 char name[12]; 14 char name[12];
15 struct list_head resources; 15 struct list_head resources;
16 int bus_min; 16 struct resource busn;
17 int bus_max;
18 int node; 17 int node;
19 int link; 18 int link;
20}; 19};
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 0ad990a20d4a..720e973fc34a 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -494,7 +494,7 @@ int __init pcibios_init(void)
494 return 0; 494 return 0;
495} 495}
496 496
497char * __devinit pcibios_setup(char *str) 497char * __init pcibios_setup(char *str)
498{ 498{
499 if (!strcmp(str, "off")) { 499 if (!strcmp(str, "off")) {
500 pci_probe = 0; 500 pci_probe = 0;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 301e325992f6..937bcece7006 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -17,6 +17,8 @@
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/dmi.h> 18#include <linux/dmi.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/mutex.h>
21#include <linux/rculist.h>
20#include <asm/e820.h> 22#include <asm/e820.h>
21#include <asm/pci_x86.h> 23#include <asm/pci_x86.h>
22#include <asm/acpi.h> 24#include <asm/acpi.h>
@@ -24,7 +26,9 @@
24#define PREFIX "PCI: " 26#define PREFIX "PCI: "
25 27
26/* Indicate if the mmcfg resources have been placed into the resource table. */ 28/* Indicate if the mmcfg resources have been placed into the resource table. */
27static int __initdata pci_mmcfg_resources_inserted; 29static bool pci_mmcfg_running_state;
30static bool pci_mmcfg_arch_init_failed;
31static DEFINE_MUTEX(pci_mmcfg_lock);
28 32
29LIST_HEAD(pci_mmcfg_list); 33LIST_HEAD(pci_mmcfg_list);
30 34
@@ -45,24 +49,25 @@ static __init void free_all_mmcfg(void)
45 pci_mmconfig_remove(cfg); 49 pci_mmconfig_remove(cfg);
46} 50}
47 51
48static __init void list_add_sorted(struct pci_mmcfg_region *new) 52static __devinit void list_add_sorted(struct pci_mmcfg_region *new)
49{ 53{
50 struct pci_mmcfg_region *cfg; 54 struct pci_mmcfg_region *cfg;
51 55
52 /* keep list sorted by segment and starting bus number */ 56 /* keep list sorted by segment and starting bus number */
53 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 57 list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {
54 if (cfg->segment > new->segment || 58 if (cfg->segment > new->segment ||
55 (cfg->segment == new->segment && 59 (cfg->segment == new->segment &&
56 cfg->start_bus >= new->start_bus)) { 60 cfg->start_bus >= new->start_bus)) {
57 list_add_tail(&new->list, &cfg->list); 61 list_add_tail_rcu(&new->list, &cfg->list);
58 return; 62 return;
59 } 63 }
60 } 64 }
61 list_add_tail(&new->list, &pci_mmcfg_list); 65 list_add_tail_rcu(&new->list, &pci_mmcfg_list);
62} 66}
63 67
64static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, 68static __devinit struct pci_mmcfg_region *pci_mmconfig_alloc(int segment,
65 int end, u64 addr) 69 int start,
70 int end, u64 addr)
66{ 71{
67 struct pci_mmcfg_region *new; 72 struct pci_mmcfg_region *new;
68 struct resource *res; 73 struct resource *res;
@@ -79,8 +84,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
79 new->start_bus = start; 84 new->start_bus = start;
80 new->end_bus = end; 85 new->end_bus = end;
81 86
82 list_add_sorted(new);
83
84 res = &new->res; 87 res = &new->res;
85 res->start = addr + PCI_MMCFG_BUS_OFFSET(start); 88 res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
86 res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1; 89 res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
@@ -89,9 +92,25 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
89 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); 92 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
90 res->name = new->name; 93 res->name = new->name;
91 94
92 printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at " 95 return new;
93 "%pR (base %#lx)\n", segment, start, end, &new->res, 96}
94 (unsigned long) addr); 97
98static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
99 int end, u64 addr)
100{
101 struct pci_mmcfg_region *new;
102
103 new = pci_mmconfig_alloc(segment, start, end, addr);
104 if (new) {
105 mutex_lock(&pci_mmcfg_lock);
106 list_add_sorted(new);
107 mutex_unlock(&pci_mmcfg_lock);
108
109 pr_info(PREFIX
110 "MMCONFIG for domain %04x [bus %02x-%02x] at %pR "
111 "(base %#lx)\n",
112 segment, start, end, &new->res, (unsigned long)addr);
113 }
95 114
96 return new; 115 return new;
97} 116}
@@ -100,7 +119,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
100{ 119{
101 struct pci_mmcfg_region *cfg; 120 struct pci_mmcfg_region *cfg;
102 121
103 list_for_each_entry(cfg, &pci_mmcfg_list, list) 122 list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
104 if (cfg->segment == segment && 123 if (cfg->segment == segment &&
105 cfg->start_bus <= bus && bus <= cfg->end_bus) 124 cfg->start_bus <= bus && bus <= cfg->end_bus)
106 return cfg; 125 return cfg;
@@ -343,8 +362,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
343 name = pci_mmcfg_probes[i].probe(); 362 name = pci_mmcfg_probes[i].probe();
344 363
345 if (name) 364 if (name)
346 printk(KERN_INFO PREFIX "%s with MMCONFIG support\n", 365 pr_info(PREFIX "%s with MMCONFIG support\n", name);
347 name);
348 } 366 }
349 367
350 /* some end_bus_number is crazy, fix it */ 368 /* some end_bus_number is crazy, fix it */
@@ -353,19 +371,8 @@ static int __init pci_mmcfg_check_hostbridge(void)
353 return !list_empty(&pci_mmcfg_list); 371 return !list_empty(&pci_mmcfg_list);
354} 372}
355 373
356static void __init pci_mmcfg_insert_resources(void) 374static acpi_status __devinit check_mcfg_resource(struct acpi_resource *res,
357{ 375 void *data)
358 struct pci_mmcfg_region *cfg;
359
360 list_for_each_entry(cfg, &pci_mmcfg_list, list)
361 insert_resource(&iomem_resource, &cfg->res);
362
363 /* Mark that the resources have been inserted. */
364 pci_mmcfg_resources_inserted = 1;
365}
366
367static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
368 void *data)
369{ 376{
370 struct resource *mcfg_res = data; 377 struct resource *mcfg_res = data;
371 struct acpi_resource_address64 address; 378 struct acpi_resource_address64 address;
@@ -401,8 +408,8 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
401 return AE_OK; 408 return AE_OK;
402} 409}
403 410
404static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl, 411static acpi_status __devinit find_mboard_resource(acpi_handle handle, u32 lvl,
405 void *context, void **rv) 412 void *context, void **rv)
406{ 413{
407 struct resource *mcfg_res = context; 414 struct resource *mcfg_res = context;
408 415
@@ -415,7 +422,7 @@ static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl,
415 return AE_OK; 422 return AE_OK;
416} 423}
417 424
418static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) 425static int __devinit is_acpi_reserved(u64 start, u64 end, unsigned not_used)
419{ 426{
420 struct resource mcfg_res; 427 struct resource mcfg_res;
421 428
@@ -434,13 +441,15 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
434 441
435typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); 442typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
436 443
437static int __init is_mmconf_reserved(check_reserved_t is_reserved, 444static int __ref is_mmconf_reserved(check_reserved_t is_reserved,
438 struct pci_mmcfg_region *cfg, int with_e820) 445 struct pci_mmcfg_region *cfg,
446 struct device *dev, int with_e820)
439{ 447{
440 u64 addr = cfg->res.start; 448 u64 addr = cfg->res.start;
441 u64 size = resource_size(&cfg->res); 449 u64 size = resource_size(&cfg->res);
442 u64 old_size = size; 450 u64 old_size = size;
443 int valid = 0, num_buses; 451 int num_buses;
452 char *method = with_e820 ? "E820" : "ACPI motherboard resources";
444 453
445 while (!is_reserved(addr, addr + size, E820_RESERVED)) { 454 while (!is_reserved(addr, addr + size, E820_RESERVED)) {
446 size >>= 1; 455 size >>= 1;
@@ -448,30 +457,76 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
448 break; 457 break;
449 } 458 }
450 459
451 if (size >= (16UL<<20) || size == old_size) { 460 if (size < (16UL<<20) && size != old_size)
452 printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n", 461 return 0;
453 &cfg->res, 462
454 with_e820 ? "E820" : "ACPI motherboard resources"); 463 if (dev)
455 valid = 1; 464 dev_info(dev, "MMCONFIG at %pR reserved in %s\n",
456 465 &cfg->res, method);
457 if (old_size != size) { 466 else
458 /* update end_bus */ 467 pr_info(PREFIX "MMCONFIG at %pR reserved in %s\n",
459 cfg->end_bus = cfg->start_bus + ((size>>20) - 1); 468 &cfg->res, method);
460 num_buses = cfg->end_bus - cfg->start_bus + 1; 469
461 cfg->res.end = cfg->res.start + 470 if (old_size != size) {
462 PCI_MMCFG_BUS_OFFSET(num_buses) - 1; 471 /* update end_bus */
463 snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN, 472 cfg->end_bus = cfg->start_bus + ((size>>20) - 1);
464 "PCI MMCONFIG %04x [bus %02x-%02x]", 473 num_buses = cfg->end_bus - cfg->start_bus + 1;
465 cfg->segment, cfg->start_bus, cfg->end_bus); 474 cfg->res.end = cfg->res.start +
466 printk(KERN_INFO PREFIX 475 PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
467 "MMCONFIG for %04x [bus%02x-%02x] " 476 snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN,
468 "at %pR (base %#lx) (size reduced!)\n", 477 "PCI MMCONFIG %04x [bus %02x-%02x]",
469 cfg->segment, cfg->start_bus, cfg->end_bus, 478 cfg->segment, cfg->start_bus, cfg->end_bus);
470 &cfg->res, (unsigned long) cfg->address); 479
471 } 480 if (dev)
481 dev_info(dev,
482 "MMCONFIG "
483 "at %pR (base %#lx) (size reduced!)\n",
484 &cfg->res, (unsigned long) cfg->address);
485 else
486 pr_info(PREFIX
487 "MMCONFIG for %04x [bus%02x-%02x] "
488 "at %pR (base %#lx) (size reduced!)\n",
489 cfg->segment, cfg->start_bus, cfg->end_bus,
490 &cfg->res, (unsigned long) cfg->address);
472 } 491 }
473 492
474 return valid; 493 return 1;
494}
495
496static int __ref pci_mmcfg_check_reserved(struct device *dev,
497 struct pci_mmcfg_region *cfg, int early)
498{
499 if (!early && !acpi_disabled) {
500 if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0))
501 return 1;
502
503 if (dev)
504 dev_info(dev, FW_INFO
505 "MMCONFIG at %pR not reserved in "
506 "ACPI motherboard resources\n",
507 &cfg->res);
508 else
509 pr_info(FW_INFO PREFIX
510 "MMCONFIG at %pR not reserved in "
511 "ACPI motherboard resources\n",
512 &cfg->res);
513 }
514
515 /*
516 * e820_all_mapped() is marked as __init.
517 * All entries from ACPI MCFG table have been checked at boot time.
518 * For MCFG information constructed from hotpluggable host bridge's
519 * _CBA method, just assume it's reserved.
520 */
521 if (pci_mmcfg_running_state)
522 return 1;
523
524 /* Don't try to do this check unless configuration
525 type 1 is available. how about type 2 ?*/
526 if (raw_pci_ops)
527 return is_mmconf_reserved(e820_all_mapped, cfg, dev, 1);
528
529 return 0;
475} 530}
476 531
477static void __init pci_mmcfg_reject_broken(int early) 532static void __init pci_mmcfg_reject_broken(int early)
@@ -479,38 +534,14 @@ static void __init pci_mmcfg_reject_broken(int early)
479 struct pci_mmcfg_region *cfg; 534 struct pci_mmcfg_region *cfg;
480 535
481 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 536 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
482 int valid = 0; 537 if (pci_mmcfg_check_reserved(NULL, cfg, early) == 0) {
483 538 pr_info(PREFIX "not using MMCONFIG\n");
484 if (!early && !acpi_disabled) { 539 free_all_mmcfg();
485 valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0); 540 return;
486
487 if (valid)
488 continue;
489 else
490 printk(KERN_ERR FW_BUG PREFIX
491 "MMCONFIG at %pR not reserved in "
492 "ACPI motherboard resources\n",
493 &cfg->res);
494 } 541 }
495
496 /* Don't try to do this check unless configuration
497 type 1 is available. how about type 2 ?*/
498 if (raw_pci_ops)
499 valid = is_mmconf_reserved(e820_all_mapped, cfg, 1);
500
501 if (!valid)
502 goto reject;
503 } 542 }
504
505 return;
506
507reject:
508 printk(KERN_INFO PREFIX "not using MMCONFIG\n");
509 free_all_mmcfg();
510} 543}
511 544
512static int __initdata known_bridge;
513
514static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, 545static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
515 struct acpi_mcfg_allocation *cfg) 546 struct acpi_mcfg_allocation *cfg)
516{ 547{
@@ -529,7 +560,7 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
529 return 0; 560 return 0;
530 } 561 }
531 562
532 printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx " 563 pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
533 "is above 4GB, ignored\n", cfg->pci_segment, 564 "is above 4GB, ignored\n", cfg->pci_segment,
534 cfg->start_bus_number, cfg->end_bus_number, cfg->address); 565 cfg->start_bus_number, cfg->end_bus_number, cfg->address);
535 return -EINVAL; 566 return -EINVAL;
@@ -556,7 +587,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
556 i -= sizeof(struct acpi_mcfg_allocation); 587 i -= sizeof(struct acpi_mcfg_allocation);
557 }; 588 };
558 if (entries == 0) { 589 if (entries == 0) {
559 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); 590 pr_err(PREFIX "MMCONFIG has no entries\n");
560 return -ENODEV; 591 return -ENODEV;
561 } 592 }
562 593
@@ -570,8 +601,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
570 601
571 if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number, 602 if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
572 cfg->end_bus_number, cfg->address) == NULL) { 603 cfg->end_bus_number, cfg->address) == NULL) {
573 printk(KERN_WARNING PREFIX 604 pr_warn(PREFIX "no memory for MCFG entries\n");
574 "no memory for MCFG entries\n");
575 free_all_mmcfg(); 605 free_all_mmcfg();
576 return -ENOMEM; 606 return -ENOMEM;
577 } 607 }
@@ -582,28 +612,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
582 612
583static void __init __pci_mmcfg_init(int early) 613static void __init __pci_mmcfg_init(int early)
584{ 614{
585 /* MMCONFIG disabled */
586 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
587 return;
588
589 /* MMCONFIG already enabled */
590 if (!early && !(pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF))
591 return;
592
593 /* for late to exit */
594 if (known_bridge)
595 return;
596
597 if (early) {
598 if (pci_mmcfg_check_hostbridge())
599 known_bridge = 1;
600 }
601
602 if (!known_bridge)
603 acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
604
605 pci_mmcfg_reject_broken(early); 615 pci_mmcfg_reject_broken(early);
606
607 if (list_empty(&pci_mmcfg_list)) 616 if (list_empty(&pci_mmcfg_list))
608 return; 617 return;
609 618
@@ -620,33 +629,48 @@ static void __init __pci_mmcfg_init(int early)
620 if (pci_mmcfg_arch_init()) 629 if (pci_mmcfg_arch_init())
621 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 630 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
622 else { 631 else {
623 /* 632 free_all_mmcfg();
624 * Signal not to attempt to insert mmcfg resources because 633 pci_mmcfg_arch_init_failed = true;
625 * the architecture mmcfg setup could not initialize.
626 */
627 pci_mmcfg_resources_inserted = 1;
628 } 634 }
629} 635}
630 636
637static int __initdata known_bridge;
638
631void __init pci_mmcfg_early_init(void) 639void __init pci_mmcfg_early_init(void)
632{ 640{
633 __pci_mmcfg_init(1); 641 if (pci_probe & PCI_PROBE_MMCONF) {
642 if (pci_mmcfg_check_hostbridge())
643 known_bridge = 1;
644 else
645 acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
646 __pci_mmcfg_init(1);
647 }
634} 648}
635 649
636void __init pci_mmcfg_late_init(void) 650void __init pci_mmcfg_late_init(void)
637{ 651{
638 __pci_mmcfg_init(0); 652 /* MMCONFIG disabled */
653 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
654 return;
655
656 if (known_bridge)
657 return;
658
659 /* MMCONFIG hasn't been enabled yet, try again */
660 if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) {
661 acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
662 __pci_mmcfg_init(0);
663 }
639} 664}
640 665
641static int __init pci_mmcfg_late_insert_resources(void) 666static int __init pci_mmcfg_late_insert_resources(void)
642{ 667{
643 /* 668 struct pci_mmcfg_region *cfg;
644 * If resources are already inserted or we are not using MMCONFIG, 669
645 * don't insert the resources. 670 pci_mmcfg_running_state = true;
646 */ 671
647 if ((pci_mmcfg_resources_inserted == 1) || 672 /* If we are not using MMCONFIG, don't insert the resources. */
648 (pci_probe & PCI_PROBE_MMCONF) == 0 || 673 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
649 list_empty(&pci_mmcfg_list))
650 return 1; 674 return 1;
651 675
652 /* 676 /*
@@ -654,7 +678,9 @@ static int __init pci_mmcfg_late_insert_resources(void)
654 * marked so it won't cause request errors when __request_region is 678 * marked so it won't cause request errors when __request_region is
655 * called. 679 * called.
656 */ 680 */
657 pci_mmcfg_insert_resources(); 681 list_for_each_entry(cfg, &pci_mmcfg_list, list)
682 if (!cfg->res.parent)
683 insert_resource(&iomem_resource, &cfg->res);
658 684
659 return 0; 685 return 0;
660} 686}
@@ -665,3 +691,101 @@ static int __init pci_mmcfg_late_insert_resources(void)
665 * with other system resources. 691 * with other system resources.
666 */ 692 */
667late_initcall(pci_mmcfg_late_insert_resources); 693late_initcall(pci_mmcfg_late_insert_resources);
694
695/* Add MMCFG information for host bridges */
696int __devinit pci_mmconfig_insert(struct device *dev,
697 u16 seg, u8 start, u8 end,
698 phys_addr_t addr)
699{
700 int rc;
701 struct resource *tmp = NULL;
702 struct pci_mmcfg_region *cfg;
703
704 if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed)
705 return -ENODEV;
706
707 if (start > end)
708 return -EINVAL;
709
710 mutex_lock(&pci_mmcfg_lock);
711 cfg = pci_mmconfig_lookup(seg, start);
712 if (cfg) {
713 if (cfg->end_bus < end)
714 dev_info(dev, FW_INFO
715 "MMCONFIG for "
716 "domain %04x [bus %02x-%02x] "
717 "only partially covers this bridge\n",
718 cfg->segment, cfg->start_bus, cfg->end_bus);
719 mutex_unlock(&pci_mmcfg_lock);
720 return -EEXIST;
721 }
722
723 if (!addr) {
724 mutex_unlock(&pci_mmcfg_lock);
725 return -EINVAL;
726 }
727
728 rc = -EBUSY;
729 cfg = pci_mmconfig_alloc(seg, start, end, addr);
730 if (cfg == NULL) {
731 dev_warn(dev, "fail to add MMCONFIG (out of memory)\n");
732 rc = -ENOMEM;
733 } else if (!pci_mmcfg_check_reserved(dev, cfg, 0)) {
734 dev_warn(dev, FW_BUG "MMCONFIG %pR isn't reserved\n",
735 &cfg->res);
736 } else {
737 /* Insert resource if it's not in boot stage */
738 if (pci_mmcfg_running_state)
739 tmp = insert_resource_conflict(&iomem_resource,
740 &cfg->res);
741
742 if (tmp) {
743 dev_warn(dev,
744 "MMCONFIG %pR conflicts with "
745 "%s %pR\n",
746 &cfg->res, tmp->name, tmp);
747 } else if (pci_mmcfg_arch_map(cfg)) {
748 dev_warn(dev, "fail to map MMCONFIG %pR.\n",
749 &cfg->res);
750 } else {
751 list_add_sorted(cfg);
752 dev_info(dev, "MMCONFIG at %pR (base %#lx)\n",
753 &cfg->res, (unsigned long)addr);
754 cfg = NULL;
755 rc = 0;
756 }
757 }
758
759 if (cfg) {
760 if (cfg->res.parent)
761 release_resource(&cfg->res);
762 kfree(cfg);
763 }
764
765 mutex_unlock(&pci_mmcfg_lock);
766
767 return rc;
768}
769
770/* Delete MMCFG information for host bridges */
771int pci_mmconfig_delete(u16 seg, u8 start, u8 end)
772{
773 struct pci_mmcfg_region *cfg;
774
775 mutex_lock(&pci_mmcfg_lock);
776 list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
777 if (cfg->segment == seg && cfg->start_bus == start &&
778 cfg->end_bus == end) {
779 list_del_rcu(&cfg->list);
780 synchronize_rcu();
781 pci_mmcfg_arch_unmap(cfg);
782 if (cfg->res.parent)
783 release_resource(&cfg->res);
784 mutex_unlock(&pci_mmcfg_lock);
785 kfree(cfg);
786 return 0;
787 }
788 mutex_unlock(&pci_mmcfg_lock);
789
790 return -ENOENT;
791}
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 5372e86834c0..db63ac23e3d9 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/pci.h> 12#include <linux/pci.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/rcupdate.h>
14#include <asm/e820.h> 15#include <asm/e820.h>
15#include <asm/pci_x86.h> 16#include <asm/pci_x86.h>
16#include <acpi/acpi.h> 17#include <acpi/acpi.h>
@@ -60,9 +61,12 @@ err: *value = -1;
60 return -EINVAL; 61 return -EINVAL;
61 } 62 }
62 63
64 rcu_read_lock();
63 base = get_base_addr(seg, bus, devfn); 65 base = get_base_addr(seg, bus, devfn);
64 if (!base) 66 if (!base) {
67 rcu_read_unlock();
65 goto err; 68 goto err;
69 }
66 70
67 raw_spin_lock_irqsave(&pci_config_lock, flags); 71 raw_spin_lock_irqsave(&pci_config_lock, flags);
68 72
@@ -80,6 +84,7 @@ err: *value = -1;
80 break; 84 break;
81 } 85 }
82 raw_spin_unlock_irqrestore(&pci_config_lock, flags); 86 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
87 rcu_read_unlock();
83 88
84 return 0; 89 return 0;
85} 90}
@@ -93,9 +98,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
93 if ((bus > 255) || (devfn > 255) || (reg > 4095)) 98 if ((bus > 255) || (devfn > 255) || (reg > 4095))
94 return -EINVAL; 99 return -EINVAL;
95 100
101 rcu_read_lock();
96 base = get_base_addr(seg, bus, devfn); 102 base = get_base_addr(seg, bus, devfn);
97 if (!base) 103 if (!base) {
104 rcu_read_unlock();
98 return -EINVAL; 105 return -EINVAL;
106 }
99 107
100 raw_spin_lock_irqsave(&pci_config_lock, flags); 108 raw_spin_lock_irqsave(&pci_config_lock, flags);
101 109
@@ -113,11 +121,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
113 break; 121 break;
114 } 122 }
115 raw_spin_unlock_irqrestore(&pci_config_lock, flags); 123 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
124 rcu_read_unlock();
116 125
117 return 0; 126 return 0;
118} 127}
119 128
120static const struct pci_raw_ops pci_mmcfg = { 129const struct pci_raw_ops pci_mmcfg = {
121 .read = pci_mmcfg_read, 130 .read = pci_mmcfg_read,
122 .write = pci_mmcfg_write, 131 .write = pci_mmcfg_write,
123}; 132};
@@ -132,3 +141,18 @@ int __init pci_mmcfg_arch_init(void)
132void __init pci_mmcfg_arch_free(void) 141void __init pci_mmcfg_arch_free(void)
133{ 142{
134} 143}
144
145int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg)
146{
147 return 0;
148}
149
150void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg)
151{
152 unsigned long flags;
153
154 /* Invalidate the cached mmcfg map entry. */
155 raw_spin_lock_irqsave(&pci_config_lock, flags);
156 mmcfg_last_accessed_device = 0;
157 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
158}
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 915a493502cb..d4ebd07c306d 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -9,6 +9,7 @@
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/acpi.h> 10#include <linux/acpi.h>
11#include <linux/bitmap.h> 11#include <linux/bitmap.h>
12#include <linux/rcupdate.h>
12#include <asm/e820.h> 13#include <asm/e820.h>
13#include <asm/pci_x86.h> 14#include <asm/pci_x86.h>
14 15
@@ -34,9 +35,12 @@ err: *value = -1;
34 return -EINVAL; 35 return -EINVAL;
35 } 36 }
36 37
38 rcu_read_lock();
37 addr = pci_dev_base(seg, bus, devfn); 39 addr = pci_dev_base(seg, bus, devfn);
38 if (!addr) 40 if (!addr) {
41 rcu_read_unlock();
39 goto err; 42 goto err;
43 }
40 44
41 switch (len) { 45 switch (len) {
42 case 1: 46 case 1:
@@ -49,6 +53,7 @@ err: *value = -1;
49 *value = mmio_config_readl(addr + reg); 53 *value = mmio_config_readl(addr + reg);
50 break; 54 break;
51 } 55 }
56 rcu_read_unlock();
52 57
53 return 0; 58 return 0;
54} 59}
@@ -62,9 +67,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
62 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) 67 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
63 return -EINVAL; 68 return -EINVAL;
64 69
70 rcu_read_lock();
65 addr = pci_dev_base(seg, bus, devfn); 71 addr = pci_dev_base(seg, bus, devfn);
66 if (!addr) 72 if (!addr) {
73 rcu_read_unlock();
67 return -EINVAL; 74 return -EINVAL;
75 }
68 76
69 switch (len) { 77 switch (len) {
70 case 1: 78 case 1:
@@ -77,16 +85,17 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
77 mmio_config_writel(addr + reg, value); 85 mmio_config_writel(addr + reg, value);
78 break; 86 break;
79 } 87 }
88 rcu_read_unlock();
80 89
81 return 0; 90 return 0;
82} 91}
83 92
84static const struct pci_raw_ops pci_mmcfg = { 93const struct pci_raw_ops pci_mmcfg = {
85 .read = pci_mmcfg_read, 94 .read = pci_mmcfg_read,
86 .write = pci_mmcfg_write, 95 .write = pci_mmcfg_write,
87}; 96};
88 97
89static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg) 98static void __iomem * __devinit mcfg_ioremap(struct pci_mmcfg_region *cfg)
90{ 99{
91 void __iomem *addr; 100 void __iomem *addr;
92 u64 start, size; 101 u64 start, size;
@@ -105,16 +114,14 @@ int __init pci_mmcfg_arch_init(void)
105{ 114{
106 struct pci_mmcfg_region *cfg; 115 struct pci_mmcfg_region *cfg;
107 116
108 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 117 list_for_each_entry(cfg, &pci_mmcfg_list, list)
109 cfg->virt = mcfg_ioremap(cfg); 118 if (pci_mmcfg_arch_map(cfg)) {
110 if (!cfg->virt) {
111 printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n",
112 &cfg->res);
113 pci_mmcfg_arch_free(); 119 pci_mmcfg_arch_free();
114 return 0; 120 return 0;
115 } 121 }
116 } 122
117 raw_pci_ext_ops = &pci_mmcfg; 123 raw_pci_ext_ops = &pci_mmcfg;
124
118 return 1; 125 return 1;
119} 126}
120 127
@@ -122,10 +129,25 @@ void __init pci_mmcfg_arch_free(void)
122{ 129{
123 struct pci_mmcfg_region *cfg; 130 struct pci_mmcfg_region *cfg;
124 131
125 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 132 list_for_each_entry(cfg, &pci_mmcfg_list, list)
126 if (cfg->virt) { 133 pci_mmcfg_arch_unmap(cfg);
127 iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus)); 134}
128 cfg->virt = NULL; 135
129 } 136int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg)
137{
138 cfg->virt = mcfg_ioremap(cfg);
139 if (!cfg->virt) {
140 pr_err(PREFIX "can't map MMCONFIG at %pR\n", &cfg->res);
141 return -ENOMEM;
142 }
143
144 return 0;
145}
146
147void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg)
148{
149 if (cfg && cfg->virt) {
150 iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
151 cfg->virt = NULL;
130 } 152 }
131} 153}
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 140942f66b31..e14a2ff708b5 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -264,7 +264,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup);
264 264
265static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev) 265static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev)
266{ 266{
267 pci_set_power_state(dev, PCI_D3cold); 267 pci_set_power_state(dev, PCI_D3hot);
268} 268}
269DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev); 269DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev);
270DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev); 270DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 92660edaa1e7..2dc29f51e75a 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -234,22 +234,7 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
234 return status; 234 return status;
235} 235}
236 236
237static efi_status_t __init phys_efi_get_time(efi_time_t *tm, 237static int efi_set_rtc_mmss(unsigned long nowtime)
238 efi_time_cap_t *tc)
239{
240 unsigned long flags;
241 efi_status_t status;
242
243 spin_lock_irqsave(&rtc_lock, flags);
244 efi_call_phys_prelog();
245 status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm),
246 virt_to_phys(tc));
247 efi_call_phys_epilog();
248 spin_unlock_irqrestore(&rtc_lock, flags);
249 return status;
250}
251
252int efi_set_rtc_mmss(unsigned long nowtime)
253{ 238{
254 int real_seconds, real_minutes; 239 int real_seconds, real_minutes;
255 efi_status_t status; 240 efi_status_t status;
@@ -278,7 +263,7 @@ int efi_set_rtc_mmss(unsigned long nowtime)
278 return 0; 263 return 0;
279} 264}
280 265
281unsigned long efi_get_time(void) 266static unsigned long efi_get_time(void)
282{ 267{
283 efi_status_t status; 268 efi_status_t status;
284 efi_time_t eft; 269 efi_time_t eft;
@@ -621,18 +606,13 @@ static int __init efi_runtime_init(void)
621 } 606 }
622 /* 607 /*
623 * We will only need *early* access to the following 608 * We will only need *early* access to the following
624 * two EFI runtime services before set_virtual_address_map 609 * EFI runtime service before set_virtual_address_map
625 * is invoked. 610 * is invoked.
626 */ 611 */
627 efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
628 efi_phys.set_virtual_address_map = 612 efi_phys.set_virtual_address_map =
629 (efi_set_virtual_address_map_t *) 613 (efi_set_virtual_address_map_t *)
630 runtime->set_virtual_address_map; 614 runtime->set_virtual_address_map;
631 /* 615
632 * Make efi_get_time can be called before entering
633 * virtual mode.
634 */
635 efi.get_time = phys_efi_get_time;
636 early_iounmap(runtime, sizeof(efi_runtime_services_t)); 616 early_iounmap(runtime, sizeof(efi_runtime_services_t));
637 617
638 return 0; 618 return 0;
@@ -720,12 +700,10 @@ void __init efi_init(void)
720 efi_enabled = 0; 700 efi_enabled = 0;
721 return; 701 return;
722 } 702 }
723#ifdef CONFIG_X86_32
724 if (efi_native) { 703 if (efi_native) {
725 x86_platform.get_wallclock = efi_get_time; 704 x86_platform.get_wallclock = efi_get_time;
726 x86_platform.set_wallclock = efi_set_rtc_mmss; 705 x86_platform.set_wallclock = efi_set_rtc_mmss;
727 } 706 }
728#endif
729 707
730#if EFI_DEBUG 708#if EFI_DEBUG
731 print_efi_memmap(); 709 print_efi_memmap();
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 3c6e328483c7..028454f0c3a5 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -110,19 +110,16 @@ static struct kmsg_dumper dw_dumper;
110static int dumper_registered; 110static int dumper_registered;
111 111
112static void dw_kmsg_dump(struct kmsg_dumper *dumper, 112static void dw_kmsg_dump(struct kmsg_dumper *dumper,
113 enum kmsg_dump_reason reason, 113 enum kmsg_dump_reason reason)
114 const char *s1, unsigned long l1,
115 const char *s2, unsigned long l2)
116{ 114{
117 int i; 115 static char line[1024];
116 size_t len;
118 117
119 /* When run to this, we'd better re-init the HW */ 118 /* When run to this, we'd better re-init the HW */
120 mrst_early_console_init(); 119 mrst_early_console_init();
121 120
122 for (i = 0; i < l1; i++) 121 while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
123 early_mrst_console.write(&early_mrst_console, s1 + i, 1); 122 early_mrst_console.write(&early_mrst_console, line, len);
124 for (i = 0; i < l2; i++)
125 early_mrst_console.write(&early_mrst_console, s2 + i, 1);
126} 123}
127 124
128/* Set the ratio rate to 115200, 8n1, IRQ disabled */ 125/* Set the ratio rate to 115200, 8n1, IRQ disabled */
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 23e5b9d7977b..599be499fdf7 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -203,7 +203,7 @@ static int xo15_sci_remove(struct acpi_device *device, int type)
203 return 0; 203 return 0;
204} 204}
205 205
206static int xo15_sci_resume(struct acpi_device *device) 206static int xo15_sci_resume(struct device *dev)
207{ 207{
208 /* Enable all EC events */ 208 /* Enable all EC events */
209 olpc_ec_mask_write(EC_SCI_SRC_ALL); 209 olpc_ec_mask_write(EC_SCI_SRC_ALL);
@@ -215,6 +215,8 @@ static int xo15_sci_resume(struct acpi_device *device)
215 return 0; 215 return 0;
216} 216}
217 217
218static SIMPLE_DEV_PM_OPS(xo15_sci_pm, NULL, xo15_sci_resume);
219
218static const struct acpi_device_id xo15_sci_device_ids[] = { 220static const struct acpi_device_id xo15_sci_device_ids[] = {
219 {"XO15EC", 0}, 221 {"XO15EC", 0},
220 {"", 0}, 222 {"", 0},
@@ -227,8 +229,8 @@ static struct acpi_driver xo15_sci_drv = {
227 .ops = { 229 .ops = {
228 .add = xo15_sci_add, 230 .add = xo15_sci_add,
229 .remove = xo15_sci_remove, 231 .remove = xo15_sci_remove,
230 .resume = xo15_sci_resume,
231 }, 232 },
233 .drv.pm = &xo15_sci_pm,
232}; 234};
233 235
234static int __init xo15_sci_init(void) 236static int __init xo15_sci_init(void)
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 59880afa851f..b8b3a37c80cd 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * SGI UltraViolet TLB flush routines. 2 * SGI UltraViolet TLB flush routines.
3 * 3 *
4 * (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI. 4 * (c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.
5 * 5 *
6 * This code is released under the GNU General Public License version 2 or 6 * This code is released under the GNU General Public License version 2 or
7 * later. 7 * later.
@@ -38,8 +38,7 @@ static int timeout_base_ns[] = {
38 38
39static int timeout_us; 39static int timeout_us;
40static int nobau; 40static int nobau;
41static int baudisabled; 41static int nobau_perm;
42static spinlock_t disable_lock;
43static cycles_t congested_cycles; 42static cycles_t congested_cycles;
44 43
45/* tunables: */ 44/* tunables: */
@@ -47,12 +46,13 @@ static int max_concurr = MAX_BAU_CONCURRENT;
47static int max_concurr_const = MAX_BAU_CONCURRENT; 46static int max_concurr_const = MAX_BAU_CONCURRENT;
48static int plugged_delay = PLUGGED_DELAY; 47static int plugged_delay = PLUGGED_DELAY;
49static int plugsb4reset = PLUGSB4RESET; 48static int plugsb4reset = PLUGSB4RESET;
49static int giveup_limit = GIVEUP_LIMIT;
50static int timeoutsb4reset = TIMEOUTSB4RESET; 50static int timeoutsb4reset = TIMEOUTSB4RESET;
51static int ipi_reset_limit = IPI_RESET_LIMIT; 51static int ipi_reset_limit = IPI_RESET_LIMIT;
52static int complete_threshold = COMPLETE_THRESHOLD; 52static int complete_threshold = COMPLETE_THRESHOLD;
53static int congested_respns_us = CONGESTED_RESPONSE_US; 53static int congested_respns_us = CONGESTED_RESPONSE_US;
54static int congested_reps = CONGESTED_REPS; 54static int congested_reps = CONGESTED_REPS;
55static int congested_period = CONGESTED_PERIOD; 55static int disabled_period = DISABLED_PERIOD;
56 56
57static struct tunables tunables[] = { 57static struct tunables tunables[] = {
58 {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ 58 {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
@@ -63,7 +63,8 @@ static struct tunables tunables[] = {
63 {&complete_threshold, COMPLETE_THRESHOLD}, 63 {&complete_threshold, COMPLETE_THRESHOLD},
64 {&congested_respns_us, CONGESTED_RESPONSE_US}, 64 {&congested_respns_us, CONGESTED_RESPONSE_US},
65 {&congested_reps, CONGESTED_REPS}, 65 {&congested_reps, CONGESTED_REPS},
66 {&congested_period, CONGESTED_PERIOD} 66 {&disabled_period, DISABLED_PERIOD},
67 {&giveup_limit, GIVEUP_LIMIT}
67}; 68};
68 69
69static struct dentry *tunables_dir; 70static struct dentry *tunables_dir;
@@ -120,6 +121,40 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
120static DEFINE_PER_CPU(struct bau_control, bau_control); 121static DEFINE_PER_CPU(struct bau_control, bau_control);
121static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); 122static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
122 123
124static void
125set_bau_on(void)
126{
127 int cpu;
128 struct bau_control *bcp;
129
130 if (nobau_perm) {
131 pr_info("BAU not initialized; cannot be turned on\n");
132 return;
133 }
134 nobau = 0;
135 for_each_present_cpu(cpu) {
136 bcp = &per_cpu(bau_control, cpu);
137 bcp->nobau = 0;
138 }
139 pr_info("BAU turned on\n");
140 return;
141}
142
143static void
144set_bau_off(void)
145{
146 int cpu;
147 struct bau_control *bcp;
148
149 nobau = 1;
150 for_each_present_cpu(cpu) {
151 bcp = &per_cpu(bau_control, cpu);
152 bcp->nobau = 1;
153 }
154 pr_info("BAU turned off\n");
155 return;
156}
157
123/* 158/*
124 * Determine the first node on a uvhub. 'Nodes' are used for kernel 159 * Determine the first node on a uvhub. 'Nodes' are used for kernel
125 * memory allocation. 160 * memory allocation.
@@ -278,7 +313,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
278 * Both sockets dump their completed count total into 313 * Both sockets dump their completed count total into
279 * the message's count. 314 * the message's count.
280 */ 315 */
281 smaster->socket_acknowledge_count[mdp->msg_slot] = 0; 316 *sp = 0;
282 asp = (struct atomic_short *)&msg->acknowledge_count; 317 asp = (struct atomic_short *)&msg->acknowledge_count;
283 msg_ack_count = atom_asr(socket_ack_count, asp); 318 msg_ack_count = atom_asr(socket_ack_count, asp);
284 319
@@ -491,16 +526,15 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
491} 526}
492 527
493/* 528/*
494 * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. 529 * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
530 * But not currently used.
495 */ 531 */
496static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) 532static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
497{ 533{
498 unsigned long descriptor_status; 534 unsigned long descriptor_status;
499 unsigned long descriptor_status2;
500 535
501 descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); 536 descriptor_status =
502 descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; 537 ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
503 descriptor_status = (descriptor_status << 1) | descriptor_status2;
504 return descriptor_status; 538 return descriptor_status;
505} 539}
506 540
@@ -531,87 +565,11 @@ int normal_busy(struct bau_control *bcp)
531 */ 565 */
532int handle_uv2_busy(struct bau_control *bcp) 566int handle_uv2_busy(struct bau_control *bcp)
533{ 567{
534 int busy_one = bcp->using_desc;
535 int normal = bcp->uvhub_cpu;
536 int selected = -1;
537 int i;
538 unsigned long descriptor_status;
539 unsigned long status;
540 int mmr_offset;
541 struct bau_desc *bau_desc_old;
542 struct bau_desc *bau_desc_new;
543 struct bau_control *hmaster = bcp->uvhub_master;
544 struct ptc_stats *stat = bcp->statp; 568 struct ptc_stats *stat = bcp->statp;
545 cycles_t ttm;
546 569
547 stat->s_uv2_wars++; 570 stat->s_uv2_wars++;
548 spin_lock(&hmaster->uvhub_lock); 571 bcp->busy = 1;
549 /* try for the original first */ 572 return FLUSH_GIVEUP;
550 if (busy_one != normal) {
551 if (!normal_busy(bcp))
552 selected = normal;
553 }
554 if (selected < 0) {
555 /* can't use the normal, select an alternate */
556 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
557 descriptor_status = read_lmmr(mmr_offset);
558
559 /* scan available descriptors 32-63 */
560 for (i = 0; i < UV_CPUS_PER_AS; i++) {
561 if ((hmaster->inuse_map & (1 << i)) == 0) {
562 status = ((descriptor_status >>
563 (i * UV_ACT_STATUS_SIZE)) &
564 UV_ACT_STATUS_MASK) << 1;
565 if (status != UV2H_DESC_BUSY) {
566 selected = i + UV_CPUS_PER_AS;
567 break;
568 }
569 }
570 }
571 }
572
573 if (busy_one != normal)
574 /* mark the busy alternate as not in-use */
575 hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS));
576
577 if (selected >= 0) {
578 /* switch to the selected descriptor */
579 if (selected != normal) {
580 /* set the selected alternate as in-use */
581 hmaster->inuse_map |=
582 (1 << (selected - UV_CPUS_PER_AS));
583 if (selected > stat->s_uv2_wars_hw)
584 stat->s_uv2_wars_hw = selected;
585 }
586 bau_desc_old = bcp->descriptor_base;
587 bau_desc_old += (ITEMS_PER_DESC * busy_one);
588 bcp->using_desc = selected;
589 bau_desc_new = bcp->descriptor_base;
590 bau_desc_new += (ITEMS_PER_DESC * selected);
591 *bau_desc_new = *bau_desc_old;
592 } else {
593 /*
594 * All are busy. Wait for the normal one for this cpu to
595 * free up.
596 */
597 stat->s_uv2_war_waits++;
598 spin_unlock(&hmaster->uvhub_lock);
599 ttm = get_cycles();
600 do {
601 cpu_relax();
602 } while (normal_busy(bcp));
603 spin_lock(&hmaster->uvhub_lock);
604 /* switch to the original descriptor */
605 bcp->using_desc = normal;
606 bau_desc_old = bcp->descriptor_base;
607 bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc);
608 bcp->using_desc = (ITEMS_PER_DESC * normal);
609 bau_desc_new = bcp->descriptor_base;
610 bau_desc_new += (ITEMS_PER_DESC * normal);
611 *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */
612 }
613 spin_unlock(&hmaster->uvhub_lock);
614 return FLUSH_RETRY_BUSYBUG;
615} 573}
616 574
617static int uv2_wait_completion(struct bau_desc *bau_desc, 575static int uv2_wait_completion(struct bau_desc *bau_desc,
@@ -620,7 +578,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
620{ 578{
621 unsigned long descriptor_stat; 579 unsigned long descriptor_stat;
622 cycles_t ttm; 580 cycles_t ttm;
623 int desc = bcp->using_desc; 581 int desc = bcp->uvhub_cpu;
624 long busy_reps = 0; 582 long busy_reps = 0;
625 struct ptc_stats *stat = bcp->statp; 583 struct ptc_stats *stat = bcp->statp;
626 584
@@ -628,24 +586,38 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
628 586
629 /* spin on the status MMR, waiting for it to go idle */ 587 /* spin on the status MMR, waiting for it to go idle */
630 while (descriptor_stat != UV2H_DESC_IDLE) { 588 while (descriptor_stat != UV2H_DESC_IDLE) {
631 /* 589 if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) {
632 * Our software ack messages may be blocked because 590 /*
633 * there are no swack resources available. As long 591 * A h/w bug on the destination side may
634 * as none of them has timed out hardware will NACK 592 * have prevented the message being marked
635 * our message and its state will stay IDLE. 593 * pending, thus it doesn't get replied to
636 */ 594 * and gets continually nacked until it times
637 if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || 595 * out with a SOURCE_TIMEOUT.
638 (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { 596 */
639 stat->s_stimeout++; 597 stat->s_stimeout++;
640 return FLUSH_GIVEUP; 598 return FLUSH_GIVEUP;
641 } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) {
642 stat->s_strongnacks++;
643 bcp->conseccompletes = 0;
644 return FLUSH_GIVEUP;
645 } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { 599 } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
600 ttm = get_cycles();
601
602 /*
603 * Our retries may be blocked by all destination
604 * swack resources being consumed, and a timeout
605 * pending. In that case hardware returns the
606 * ERROR that looks like a destination timeout.
607 * Without using the extended status we have to
608 * deduce from the short time that this was a
609 * strong nack.
610 */
611 if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
612 bcp->conseccompletes = 0;
613 stat->s_plugged++;
614 /* FLUSH_RETRY_PLUGGED causes hang on boot */
615 return FLUSH_GIVEUP;
616 }
646 stat->s_dtimeout++; 617 stat->s_dtimeout++;
647 bcp->conseccompletes = 0; 618 bcp->conseccompletes = 0;
648 return FLUSH_RETRY_TIMEOUT; 619 /* FLUSH_RETRY_TIMEOUT causes hang on boot */
620 return FLUSH_GIVEUP;
649 } else { 621 } else {
650 busy_reps++; 622 busy_reps++;
651 if (busy_reps > 1000000) { 623 if (busy_reps > 1000000) {
@@ -653,9 +625,8 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
653 busy_reps = 0; 625 busy_reps = 0;
654 ttm = get_cycles(); 626 ttm = get_cycles();
655 if ((ttm - bcp->send_message) > 627 if ((ttm - bcp->send_message) >
656 (bcp->clocks_per_100_usec)) { 628 bcp->timeout_interval)
657 return handle_uv2_busy(bcp); 629 return handle_uv2_busy(bcp);
658 }
659 } 630 }
660 /* 631 /*
661 * descriptor_stat is still BUSY 632 * descriptor_stat is still BUSY
@@ -679,7 +650,7 @@ static int wait_completion(struct bau_desc *bau_desc,
679{ 650{
680 int right_shift; 651 int right_shift;
681 unsigned long mmr_offset; 652 unsigned long mmr_offset;
682 int desc = bcp->using_desc; 653 int desc = bcp->uvhub_cpu;
683 654
684 if (desc < UV_CPUS_PER_AS) { 655 if (desc < UV_CPUS_PER_AS) {
685 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; 656 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -758,33 +729,31 @@ static void destination_timeout(struct bau_desc *bau_desc,
758} 729}
759 730
760/* 731/*
761 * Completions are taking a very long time due to a congested numalink 732 * Stop all cpus on a uvhub from using the BAU for a period of time.
762 * network. 733 * This is reversed by check_enable.
763 */ 734 */
764static void disable_for_congestion(struct bau_control *bcp, 735static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
765 struct ptc_stats *stat)
766{ 736{
767 /* let only one cpu do this disabling */ 737 int tcpu;
768 spin_lock(&disable_lock); 738 struct bau_control *tbcp;
769 739 struct bau_control *hmaster;
770 if (!baudisabled && bcp->period_requests && 740 cycles_t tm1;
771 ((bcp->period_time / bcp->period_requests) > congested_cycles)) { 741
772 int tcpu; 742 hmaster = bcp->uvhub_master;
773 struct bau_control *tbcp; 743 spin_lock(&hmaster->disable_lock);
774 /* it becomes this cpu's job to turn on the use of the 744 if (!bcp->baudisabled) {
775 BAU again */
776 baudisabled = 1;
777 bcp->set_bau_off = 1;
778 bcp->set_bau_on_time = get_cycles();
779 bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
780 stat->s_bau_disabled++; 745 stat->s_bau_disabled++;
746 tm1 = get_cycles();
781 for_each_present_cpu(tcpu) { 747 for_each_present_cpu(tcpu) {
782 tbcp = &per_cpu(bau_control, tcpu); 748 tbcp = &per_cpu(bau_control, tcpu);
783 tbcp->baudisabled = 1; 749 if (tbcp->uvhub_master == hmaster) {
750 tbcp->baudisabled = 1;
751 tbcp->set_bau_on_time =
752 tm1 + bcp->disabled_period;
753 }
784 } 754 }
785 } 755 }
786 756 spin_unlock(&hmaster->disable_lock);
787 spin_unlock(&disable_lock);
788} 757}
789 758
790static void count_max_concurr(int stat, struct bau_control *bcp, 759static void count_max_concurr(int stat, struct bau_control *bcp,
@@ -815,16 +784,30 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
815 bcp->period_requests++; 784 bcp->period_requests++;
816 bcp->period_time += elapsed; 785 bcp->period_time += elapsed;
817 if ((elapsed > congested_cycles) && 786 if ((elapsed > congested_cycles) &&
818 (bcp->period_requests > bcp->cong_reps)) 787 (bcp->period_requests > bcp->cong_reps) &&
819 disable_for_congestion(bcp, stat); 788 ((bcp->period_time / bcp->period_requests) >
789 congested_cycles)) {
790 stat->s_congested++;
791 disable_for_period(bcp, stat);
792 }
820 } 793 }
821 } else 794 } else
822 stat->s_requestor--; 795 stat->s_requestor--;
823 796
824 if (completion_status == FLUSH_COMPLETE && try > 1) 797 if (completion_status == FLUSH_COMPLETE && try > 1)
825 stat->s_retriesok++; 798 stat->s_retriesok++;
826 else if (completion_status == FLUSH_GIVEUP) 799 else if (completion_status == FLUSH_GIVEUP) {
827 stat->s_giveup++; 800 stat->s_giveup++;
801 if (get_cycles() > bcp->period_end)
802 bcp->period_giveups = 0;
803 bcp->period_giveups++;
804 if (bcp->period_giveups == 1)
805 bcp->period_end = get_cycles() + bcp->disabled_period;
806 if (bcp->period_giveups > bcp->giveup_limit) {
807 disable_for_period(bcp, stat);
808 stat->s_giveuplimit++;
809 }
810 }
828} 811}
829 812
830/* 813/*
@@ -868,7 +851,8 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
868 * Returns 1 if it gives up entirely and the original cpu mask is to be 851 * Returns 1 if it gives up entirely and the original cpu mask is to be
869 * returned to the kernel. 852 * returned to the kernel.
870 */ 853 */
871int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) 854int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
855 struct bau_desc *bau_desc)
872{ 856{
873 int seq_number = 0; 857 int seq_number = 0;
874 int completion_stat = 0; 858 int completion_stat = 0;
@@ -881,24 +865,23 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
881 struct bau_control *hmaster = bcp->uvhub_master; 865 struct bau_control *hmaster = bcp->uvhub_master;
882 struct uv1_bau_msg_header *uv1_hdr = NULL; 866 struct uv1_bau_msg_header *uv1_hdr = NULL;
883 struct uv2_bau_msg_header *uv2_hdr = NULL; 867 struct uv2_bau_msg_header *uv2_hdr = NULL;
884 struct bau_desc *bau_desc;
885 868
886 if (bcp->uvhub_version == 1) 869 if (bcp->uvhub_version == 1) {
870 uv1 = 1;
887 uv1_throttle(hmaster, stat); 871 uv1_throttle(hmaster, stat);
872 }
888 873
889 while (hmaster->uvhub_quiesce) 874 while (hmaster->uvhub_quiesce)
890 cpu_relax(); 875 cpu_relax();
891 876
892 time1 = get_cycles(); 877 time1 = get_cycles();
878 if (uv1)
879 uv1_hdr = &bau_desc->header.uv1_hdr;
880 else
881 uv2_hdr = &bau_desc->header.uv2_hdr;
882
893 do { 883 do {
894 bau_desc = bcp->descriptor_base; 884 if (try == 0) {
895 bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
896 if (bcp->uvhub_version == 1) {
897 uv1 = 1;
898 uv1_hdr = &bau_desc->header.uv1_hdr;
899 } else
900 uv2_hdr = &bau_desc->header.uv2_hdr;
901 if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) {
902 if (uv1) 885 if (uv1)
903 uv1_hdr->msg_type = MSG_REGULAR; 886 uv1_hdr->msg_type = MSG_REGULAR;
904 else 887 else
@@ -916,25 +899,24 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
916 uv1_hdr->sequence = seq_number; 899 uv1_hdr->sequence = seq_number;
917 else 900 else
918 uv2_hdr->sequence = seq_number; 901 uv2_hdr->sequence = seq_number;
919 index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; 902 index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
920 bcp->send_message = get_cycles(); 903 bcp->send_message = get_cycles();
921 904
922 write_mmr_activation(index); 905 write_mmr_activation(index);
923 906
924 try++; 907 try++;
925 completion_stat = wait_completion(bau_desc, bcp, try); 908 completion_stat = wait_completion(bau_desc, bcp, try);
926 /* UV2: wait_completion() may change the bcp->using_desc */
927 909
928 handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); 910 handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
929 911
930 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { 912 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
931 bcp->ipi_attempts = 0; 913 bcp->ipi_attempts = 0;
914 stat->s_overipilimit++;
932 completion_stat = FLUSH_GIVEUP; 915 completion_stat = FLUSH_GIVEUP;
933 break; 916 break;
934 } 917 }
935 cpu_relax(); 918 cpu_relax();
936 } while ((completion_stat == FLUSH_RETRY_PLUGGED) || 919 } while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
937 (completion_stat == FLUSH_RETRY_BUSYBUG) ||
938 (completion_stat == FLUSH_RETRY_TIMEOUT)); 920 (completion_stat == FLUSH_RETRY_TIMEOUT));
939 921
940 time2 = get_cycles(); 922 time2 = get_cycles();
@@ -955,28 +937,33 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
955} 937}
956 938
957/* 939/*
958 * The BAU is disabled. When the disabled time period has expired, the cpu 940 * The BAU is disabled for this uvhub. When the disabled time period has
959 * that disabled it must re-enable it. 941 * expired re-enable it.
960 * Return 0 if it is re-enabled for all cpus. 942 * Return 0 if it is re-enabled for all cpus on this uvhub.
961 */ 943 */
962static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) 944static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
963{ 945{
964 int tcpu; 946 int tcpu;
965 struct bau_control *tbcp; 947 struct bau_control *tbcp;
948 struct bau_control *hmaster;
966 949
967 if (bcp->set_bau_off) { 950 hmaster = bcp->uvhub_master;
968 if (get_cycles() >= bcp->set_bau_on_time) { 951 spin_lock(&hmaster->disable_lock);
969 stat->s_bau_reenabled++; 952 if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
970 baudisabled = 0; 953 stat->s_bau_reenabled++;
971 for_each_present_cpu(tcpu) { 954 for_each_present_cpu(tcpu) {
972 tbcp = &per_cpu(bau_control, tcpu); 955 tbcp = &per_cpu(bau_control, tcpu);
956 if (tbcp->uvhub_master == hmaster) {
973 tbcp->baudisabled = 0; 957 tbcp->baudisabled = 0;
974 tbcp->period_requests = 0; 958 tbcp->period_requests = 0;
975 tbcp->period_time = 0; 959 tbcp->period_time = 0;
960 tbcp->period_giveups = 0;
976 } 961 }
977 return 0;
978 } 962 }
963 spin_unlock(&hmaster->disable_lock);
964 return 0;
979 } 965 }
966 spin_unlock(&hmaster->disable_lock);
980 return -1; 967 return -1;
981} 968}
982 969
@@ -1068,8 +1055,8 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
1068 * done. The returned pointer is valid till preemption is re-enabled. 1055 * done. The returned pointer is valid till preemption is re-enabled.
1069 */ 1056 */
1070const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 1057const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1071 struct mm_struct *mm, unsigned long va, 1058 struct mm_struct *mm, unsigned long start,
1072 unsigned int cpu) 1059 unsigned end, unsigned int cpu)
1073{ 1060{
1074 int locals = 0; 1061 int locals = 0;
1075 int remotes = 0; 1062 int remotes = 0;
@@ -1078,18 +1065,32 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1078 struct cpumask *flush_mask; 1065 struct cpumask *flush_mask;
1079 struct ptc_stats *stat; 1066 struct ptc_stats *stat;
1080 struct bau_control *bcp; 1067 struct bau_control *bcp;
1081 1068 unsigned long descriptor_status;
1082 /* kernel was booted 'nobau' */ 1069 unsigned long status;
1083 if (nobau)
1084 return cpumask;
1085 1070
1086 bcp = &per_cpu(bau_control, cpu); 1071 bcp = &per_cpu(bau_control, cpu);
1087 stat = bcp->statp; 1072 stat = bcp->statp;
1073 stat->s_enters++;
1074
1075 if (bcp->nobau)
1076 return cpumask;
1077
1078 if (bcp->busy) {
1079 descriptor_status =
1080 read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0);
1081 status = ((descriptor_status >> (bcp->uvhub_cpu *
1082 UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1;
1083 if (status == UV2H_DESC_BUSY)
1084 return cpumask;
1085 bcp->busy = 0;
1086 }
1088 1087
1089 /* bau was disabled due to slow response */ 1088 /* bau was disabled due to slow response */
1090 if (bcp->baudisabled) { 1089 if (bcp->baudisabled) {
1091 if (check_enable(bcp, stat)) 1090 if (check_enable(bcp, stat)) {
1091 stat->s_ipifordisabled++;
1092 return cpumask; 1092 return cpumask;
1093 }
1093 } 1094 }
1094 1095
1095 /* 1096 /*
@@ -1105,38 +1106,40 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1105 stat->s_ntargself++; 1106 stat->s_ntargself++;
1106 1107
1107 bau_desc = bcp->descriptor_base; 1108 bau_desc = bcp->descriptor_base;
1108 bau_desc += (ITEMS_PER_DESC * bcp->using_desc); 1109 bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
1109 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 1110 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
1110 if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) 1111 if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
1111 return NULL; 1112 return NULL;
1112 1113
1113 record_send_statistics(stat, locals, hubs, remotes, bau_desc); 1114 record_send_statistics(stat, locals, hubs, remotes, bau_desc);
1114 1115
1115 bau_desc->payload.address = va; 1116 bau_desc->payload.address = start;
1116 bau_desc->payload.sending_cpu = cpu; 1117 bau_desc->payload.sending_cpu = cpu;
1117 /* 1118 /*
1118 * uv_flush_send_and_wait returns 0 if all cpu's were messaged, 1119 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
1119 * or 1 if it gave up and the original cpumask should be returned. 1120 * or 1 if it gave up and the original cpumask should be returned.
1120 */ 1121 */
1121 if (!uv_flush_send_and_wait(flush_mask, bcp)) 1122 if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc))
1122 return NULL; 1123 return NULL;
1123 else 1124 else
1124 return cpumask; 1125 return cpumask;
1125} 1126}
1126 1127
1127/* 1128/*
1128 * Search the message queue for any 'other' message with the same software 1129 * Search the message queue for any 'other' unprocessed message with the
1129 * acknowledge resource bit vector. 1130 * same software acknowledge resource bit vector as the 'msg' message.
1130 */ 1131 */
1131struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, 1132struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
1132 struct bau_control *bcp, unsigned char swack_vec) 1133 struct bau_control *bcp)
1133{ 1134{
1134 struct bau_pq_entry *msg_next = msg + 1; 1135 struct bau_pq_entry *msg_next = msg + 1;
1136 unsigned char swack_vec = msg->swack_vec;
1135 1137
1136 if (msg_next > bcp->queue_last) 1138 if (msg_next > bcp->queue_last)
1137 msg_next = bcp->queue_first; 1139 msg_next = bcp->queue_first;
1138 while ((msg_next->swack_vec != 0) && (msg_next != msg)) { 1140 while (msg_next != msg) {
1139 if (msg_next->swack_vec == swack_vec) 1141 if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) &&
1142 (msg_next->swack_vec == swack_vec))
1140 return msg_next; 1143 return msg_next;
1141 msg_next++; 1144 msg_next++;
1142 if (msg_next > bcp->queue_last) 1145 if (msg_next > bcp->queue_last)
@@ -1165,32 +1168,30 @@ void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
1165 * This message was assigned a swack resource, but no 1168 * This message was assigned a swack resource, but no
1166 * reserved acknowlegment is pending. 1169 * reserved acknowlegment is pending.
1167 * The bug has prevented this message from setting the MMR. 1170 * The bug has prevented this message from setting the MMR.
1168 * And no other message has used the same sw_ack resource.
1169 * Do the requested shootdown but do not reply to the msg.
1170 * (the 0 means make no acknowledge)
1171 */ 1171 */
1172 bau_process_message(mdp, bcp, 0);
1173 return;
1174 }
1175
1176 /*
1177 * Some message has set the MMR 'pending' bit; it might have been
1178 * another message. Look for that message.
1179 */
1180 other_msg = find_another_by_swack(msg, bcp, msg->swack_vec);
1181 if (other_msg) {
1182 /* There is another. Do not ack the current one. */
1183 bau_process_message(mdp, bcp, 0);
1184 /* 1172 /*
1185 * Let the natural processing of that message acknowledge 1173 * Some message has set the MMR 'pending' bit; it might have
1186 * it. Don't get the processing of sw_ack's out of order. 1174 * been another message. Look for that message.
1187 */ 1175 */
1188 return; 1176 other_msg = find_another_by_swack(msg, bcp);
1177 if (other_msg) {
1178 /*
1179 * There is another. Process this one but do not
1180 * ack it.
1181 */
1182 bau_process_message(mdp, bcp, 0);
1183 /*
1184 * Let the natural processing of that other message
1185 * acknowledge it. Don't get the processing of sw_ack's
1186 * out of order.
1187 */
1188 return;
1189 }
1189 } 1190 }
1190 1191
1191 /* 1192 /*
1192 * There is no other message using this sw_ack, so it is safe to 1193 * Either the MMR shows this one pending a reply or there is no
1193 * acknowledge it. 1194 * other message using this sw_ack, so it is safe to acknowledge it.
1194 */ 1195 */
1195 bau_process_message(mdp, bcp, 1); 1196 bau_process_message(mdp, bcp, 1);
1196 1197
@@ -1295,7 +1296,8 @@ static void __init enable_timeouts(void)
1295 */ 1296 */
1296 mmr_image |= (1L << SOFTACK_MSHIFT); 1297 mmr_image |= (1L << SOFTACK_MSHIFT);
1297 if (is_uv2_hub()) { 1298 if (is_uv2_hub()) {
1298 mmr_image |= (1L << UV2_EXT_SHFT); 1299 /* hw bug workaround; do not use extended status */
1300 mmr_image &= ~(1L << UV2_EXT_SHFT);
1299 } 1301 }
1300 write_mmr_misc_control(pnode, mmr_image); 1302 write_mmr_misc_control(pnode, mmr_image);
1301 } 1303 }
@@ -1338,29 +1340,34 @@ static inline unsigned long long usec_2_cycles(unsigned long microsec)
1338static int ptc_seq_show(struct seq_file *file, void *data) 1340static int ptc_seq_show(struct seq_file *file, void *data)
1339{ 1341{
1340 struct ptc_stats *stat; 1342 struct ptc_stats *stat;
1343 struct bau_control *bcp;
1341 int cpu; 1344 int cpu;
1342 1345
1343 cpu = *(loff_t *)data; 1346 cpu = *(loff_t *)data;
1344 if (!cpu) { 1347 if (!cpu) {
1345 seq_printf(file, 1348 seq_printf(file,
1346 "# cpu sent stime self locals remotes ncpus localhub "); 1349 "# cpu bauoff sent stime self locals remotes ncpus localhub ");
1347 seq_printf(file, 1350 seq_printf(file,
1348 "remotehub numuvhubs numuvhubs16 numuvhubs8 "); 1351 "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
1349 seq_printf(file, 1352 seq_printf(file,
1350 "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); 1353 "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries ");
1354 seq_printf(file,
1355 "rok resetp resett giveup sto bz throt disable ");
1351 seq_printf(file, 1356 seq_printf(file,
1352 "resetp resett giveup sto bz throt swack recv rtime "); 1357 "enable wars warshw warwaits enters ipidis plugged ");
1353 seq_printf(file, 1358 seq_printf(file,
1354 "all one mult none retry canc nocan reset rcan "); 1359 "ipiover glim cong swack recv rtime all one mult ");
1355 seq_printf(file, 1360 seq_printf(file,
1356 "disable enable wars warshw warwaits\n"); 1361 "none retry canc nocan reset rcan\n");
1357 } 1362 }
1358 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 1363 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
1359 stat = &per_cpu(ptcstats, cpu); 1364 bcp = &per_cpu(bau_control, cpu);
1365 stat = bcp->statp;
1360 /* source side statistics */ 1366 /* source side statistics */
1361 seq_printf(file, 1367 seq_printf(file,
1362 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 1368 "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
1363 cpu, stat->s_requestor, cycles_2_us(stat->s_time), 1369 cpu, bcp->nobau, stat->s_requestor,
1370 cycles_2_us(stat->s_time),
1364 stat->s_ntargself, stat->s_ntarglocals, 1371 stat->s_ntargself, stat->s_ntarglocals,
1365 stat->s_ntargremotes, stat->s_ntargcpu, 1372 stat->s_ntargremotes, stat->s_ntargcpu,
1366 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, 1373 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
@@ -1374,20 +1381,23 @@ static int ptc_seq_show(struct seq_file *file, void *data)
1374 stat->s_resets_plug, stat->s_resets_timeout, 1381 stat->s_resets_plug, stat->s_resets_timeout,
1375 stat->s_giveup, stat->s_stimeout, 1382 stat->s_giveup, stat->s_stimeout,
1376 stat->s_busy, stat->s_throttles); 1383 stat->s_busy, stat->s_throttles);
1384 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
1385 stat->s_bau_disabled, stat->s_bau_reenabled,
1386 stat->s_uv2_wars, stat->s_uv2_wars_hw,
1387 stat->s_uv2_war_waits, stat->s_enters,
1388 stat->s_ipifordisabled, stat->s_plugged,
1389 stat->s_overipilimit, stat->s_giveuplimit,
1390 stat->s_congested);
1377 1391
1378 /* destination side statistics */ 1392 /* destination side statistics */
1379 seq_printf(file, 1393 seq_printf(file,
1380 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 1394 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
1381 read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)), 1395 read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
1382 stat->d_requestee, cycles_2_us(stat->d_time), 1396 stat->d_requestee, cycles_2_us(stat->d_time),
1383 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, 1397 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
1384 stat->d_nomsg, stat->d_retries, stat->d_canceled, 1398 stat->d_nomsg, stat->d_retries, stat->d_canceled,
1385 stat->d_nocanceled, stat->d_resets, 1399 stat->d_nocanceled, stat->d_resets,
1386 stat->d_rcanceled); 1400 stat->d_rcanceled);
1387 seq_printf(file, "%ld %ld %ld %ld %ld\n",
1388 stat->s_bau_disabled, stat->s_bau_reenabled,
1389 stat->s_uv2_wars, stat->s_uv2_wars_hw,
1390 stat->s_uv2_war_waits);
1391 } 1401 }
1392 return 0; 1402 return 0;
1393} 1403}
@@ -1401,13 +1411,14 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf,
1401 char *buf; 1411 char *buf;
1402 int ret; 1412 int ret;
1403 1413
1404 buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", 1414 buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n",
1405 "max_concur plugged_delay plugsb4reset", 1415 "max_concur plugged_delay plugsb4reset timeoutsb4reset",
1406 "timeoutsb4reset ipi_reset_limit complete_threshold", 1416 "ipi_reset_limit complete_threshold congested_response_us",
1407 "congested_response_us congested_reps congested_period", 1417 "congested_reps disabled_period giveup_limit",
1408 max_concurr, plugged_delay, plugsb4reset, 1418 max_concurr, plugged_delay, plugsb4reset,
1409 timeoutsb4reset, ipi_reset_limit, complete_threshold, 1419 timeoutsb4reset, ipi_reset_limit, complete_threshold,
1410 congested_respns_us, congested_reps, congested_period); 1420 congested_respns_us, congested_reps, disabled_period,
1421 giveup_limit);
1411 1422
1412 if (!buf) 1423 if (!buf)
1413 return -ENOMEM; 1424 return -ENOMEM;
@@ -1438,6 +1449,14 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user,
1438 return -EFAULT; 1449 return -EFAULT;
1439 optstr[count - 1] = '\0'; 1450 optstr[count - 1] = '\0';
1440 1451
1452 if (!strcmp(optstr, "on")) {
1453 set_bau_on();
1454 return count;
1455 } else if (!strcmp(optstr, "off")) {
1456 set_bau_off();
1457 return count;
1458 }
1459
1441 if (strict_strtol(optstr, 10, &input_arg) < 0) { 1460 if (strict_strtol(optstr, 10, &input_arg) < 0) {
1442 printk(KERN_DEBUG "%s is invalid\n", optstr); 1461 printk(KERN_DEBUG "%s is invalid\n", optstr);
1443 return -EINVAL; 1462 return -EINVAL;
@@ -1570,7 +1589,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
1570 bcp->complete_threshold = complete_threshold; 1589 bcp->complete_threshold = complete_threshold;
1571 bcp->cong_response_us = congested_respns_us; 1590 bcp->cong_response_us = congested_respns_us;
1572 bcp->cong_reps = congested_reps; 1591 bcp->cong_reps = congested_reps;
1573 bcp->cong_period = congested_period; 1592 bcp->disabled_period = sec_2_cycles(disabled_period);
1593 bcp->giveup_limit = giveup_limit;
1574 } 1594 }
1575 return count; 1595 return count;
1576} 1596}
@@ -1699,6 +1719,10 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
1699 * fairness chaining multilevel count replied_to 1719 * fairness chaining multilevel count replied_to
1700 */ 1720 */
1701 } else { 1721 } else {
1722 /*
1723 * BIOS uses legacy mode, but UV2 hardware always
1724 * uses native mode for selective broadcasts.
1725 */
1702 uv2_hdr = &bd2->header.uv2_hdr; 1726 uv2_hdr = &bd2->header.uv2_hdr;
1703 uv2_hdr->swack_flag = 1; 1727 uv2_hdr->swack_flag = 1;
1704 uv2_hdr->base_dest_nasid = 1728 uv2_hdr->base_dest_nasid =
@@ -1811,8 +1835,8 @@ static int calculate_destination_timeout(void)
1811 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; 1835 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1812 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); 1836 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1813 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; 1837 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1814 base = timeout_base_ns[index]; 1838 ts_ns = timeout_base_ns[index];
1815 ts_ns = base * mult1 * mult2; 1839 ts_ns *= (mult1 * mult2);
1816 ret = ts_ns / 1000; 1840 ret = ts_ns / 1000;
1817 } else { 1841 } else {
1818 /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ 1842 /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */
@@ -1836,6 +1860,8 @@ static void __init init_per_cpu_tunables(void)
1836 for_each_present_cpu(cpu) { 1860 for_each_present_cpu(cpu) {
1837 bcp = &per_cpu(bau_control, cpu); 1861 bcp = &per_cpu(bau_control, cpu);
1838 bcp->baudisabled = 0; 1862 bcp->baudisabled = 0;
1863 if (nobau)
1864 bcp->nobau = 1;
1839 bcp->statp = &per_cpu(ptcstats, cpu); 1865 bcp->statp = &per_cpu(ptcstats, cpu);
1840 /* time interval to catch a hardware stay-busy bug */ 1866 /* time interval to catch a hardware stay-busy bug */
1841 bcp->timeout_interval = usec_2_cycles(2*timeout_us); 1867 bcp->timeout_interval = usec_2_cycles(2*timeout_us);
@@ -1848,10 +1874,11 @@ static void __init init_per_cpu_tunables(void)
1848 bcp->complete_threshold = complete_threshold; 1874 bcp->complete_threshold = complete_threshold;
1849 bcp->cong_response_us = congested_respns_us; 1875 bcp->cong_response_us = congested_respns_us;
1850 bcp->cong_reps = congested_reps; 1876 bcp->cong_reps = congested_reps;
1851 bcp->cong_period = congested_period; 1877 bcp->disabled_period = sec_2_cycles(disabled_period);
1852 bcp->clocks_per_100_usec = usec_2_cycles(100); 1878 bcp->giveup_limit = giveup_limit;
1853 spin_lock_init(&bcp->queue_lock); 1879 spin_lock_init(&bcp->queue_lock);
1854 spin_lock_init(&bcp->uvhub_lock); 1880 spin_lock_init(&bcp->uvhub_lock);
1881 spin_lock_init(&bcp->disable_lock);
1855 } 1882 }
1856} 1883}
1857 1884
@@ -1972,7 +1999,6 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
1972 } 1999 }
1973 bcp->uvhub_master = *hmasterp; 2000 bcp->uvhub_master = *hmasterp;
1974 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; 2001 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
1975 bcp->using_desc = bcp->uvhub_cpu;
1976 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { 2002 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
1977 printk(KERN_EMERG "%d cpus per uvhub invalid\n", 2003 printk(KERN_EMERG "%d cpus per uvhub invalid\n",
1978 bcp->uvhub_cpu); 2004 bcp->uvhub_cpu);
@@ -2069,16 +2095,12 @@ static int __init uv_bau_init(void)
2069 if (!is_uv_system()) 2095 if (!is_uv_system())
2070 return 0; 2096 return 0;
2071 2097
2072 if (nobau)
2073 return 0;
2074
2075 for_each_possible_cpu(cur_cpu) { 2098 for_each_possible_cpu(cur_cpu) {
2076 mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); 2099 mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
2077 zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); 2100 zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
2078 } 2101 }
2079 2102
2080 nuvhubs = uv_num_possible_blades(); 2103 nuvhubs = uv_num_possible_blades();
2081 spin_lock_init(&disable_lock);
2082 congested_cycles = usec_2_cycles(congested_respns_us); 2104 congested_cycles = usec_2_cycles(congested_respns_us);
2083 2105
2084 uv_base_pnode = 0x7fffffff; 2106 uv_base_pnode = 0x7fffffff;
@@ -2091,7 +2113,8 @@ static int __init uv_bau_init(void)
2091 enable_timeouts(); 2113 enable_timeouts();
2092 2114
2093 if (init_per_cpu(nuvhubs, uv_base_pnode)) { 2115 if (init_per_cpu(nuvhubs, uv_base_pnode)) {
2094 nobau = 1; 2116 set_bau_off();
2117 nobau_perm = 1;
2095 return 0; 2118 return 0;
2096 } 2119 }
2097 2120
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index f25c2765a5c9..acf7752da952 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -135,6 +135,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
135 unsigned long mmr_value; 135 unsigned long mmr_value;
136 struct uv_IO_APIC_route_entry *entry; 136 struct uv_IO_APIC_route_entry *entry;
137 int mmr_pnode, err; 137 int mmr_pnode, err;
138 unsigned int dest;
138 139
139 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != 140 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
140 sizeof(unsigned long)); 141 sizeof(unsigned long));
@@ -143,6 +144,10 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
143 if (err != 0) 144 if (err != 0)
144 return err; 145 return err;
145 146
147 err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest);
148 if (err != 0)
149 return err;
150
146 if (limit == UV_AFFINITY_CPU) 151 if (limit == UV_AFFINITY_CPU)
147 irq_set_status_flags(irq, IRQ_NO_BALANCING); 152 irq_set_status_flags(irq, IRQ_NO_BALANCING);
148 else 153 else
@@ -159,7 +164,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
159 entry->polarity = 0; 164 entry->polarity = 0;
160 entry->trigger = 0; 165 entry->trigger = 0;
161 entry->mask = 0; 166 entry->mask = 0;
162 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); 167 entry->dest = dest;
163 168
164 mmr_pnode = uv_blade_to_pnode(mmr_blade); 169 mmr_pnode = uv_blade_to_pnode(mmr_blade);
165 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 170 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -222,7 +227,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
222 if (cfg->move_in_progress) 227 if (cfg->move_in_progress)
223 send_cleanup_vector(cfg); 228 send_cleanup_vector(cfg);
224 229
225 return 0; 230 return IRQ_SET_MASK_OK_NOCOPY;
226} 231}
227 232
228/* 233/*
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 5b84a2d30888..b2d534cab25f 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -22,7 +22,7 @@ wakeup-objs += video-bios.o
22realmode-y += header.o 22realmode-y += header.o
23realmode-y += trampoline_$(BITS).o 23realmode-y += trampoline_$(BITS).o
24realmode-y += stack.o 24realmode-y += stack.o
25realmode-$(CONFIG_X86_32) += reboot_32.o 25realmode-y += reboot.o
26realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs) 26realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs)
27 27
28targets += $(realmode-y) 28targets += $(realmode-y)
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index fadf48378ada..a28221d94e69 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -6,6 +6,7 @@
6 6
7#include <linux/linkage.h> 7#include <linux/linkage.h>
8#include <asm/page_types.h> 8#include <asm/page_types.h>
9#include <asm/segment.h>
9 10
10#include "realmode.h" 11#include "realmode.h"
11 12
@@ -28,8 +29,9 @@ GLOBAL(real_mode_header)
28 .long pa_wakeup_header 29 .long pa_wakeup_header
29#endif 30#endif
30 /* APM/BIOS reboot */ 31 /* APM/BIOS reboot */
31#ifdef CONFIG_X86_32
32 .long pa_machine_real_restart_asm 32 .long pa_machine_real_restart_asm
33#ifdef CONFIG_X86_64
34 .long __KERNEL32_CS
33#endif 35#endif
34END(real_mode_header) 36END(real_mode_header)
35 37
diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot.S
index 114044876b3d..f932ea61d1c8 100644
--- a/arch/x86/realmode/rm/reboot_32.S
+++ b/arch/x86/realmode/rm/reboot.S
@@ -2,6 +2,8 @@
2#include <linux/init.h> 2#include <linux/init.h>
3#include <asm/segment.h> 3#include <asm/segment.h>
4#include <asm/page_types.h> 4#include <asm/page_types.h>
5#include <asm/processor-flags.h>
6#include <asm/msr-index.h>
5#include "realmode.h" 7#include "realmode.h"
6 8
7/* 9/*
@@ -12,13 +14,35 @@
12 * doesn't work with at least one type of 486 motherboard. It is easy 14 * doesn't work with at least one type of 486 motherboard. It is easy
13 * to stop this code working; hence the copious comments. 15 * to stop this code working; hence the copious comments.
14 * 16 *
15 * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. 17 * This code is called with the restart type (0 = BIOS, 1 = APM) in
18 * the primary argument register (%eax for 32 bit, %edi for 64 bit).
16 */ 19 */
17 .section ".text32", "ax" 20 .section ".text32", "ax"
18 .code32 21 .code32
19
20 .balign 16
21ENTRY(machine_real_restart_asm) 22ENTRY(machine_real_restart_asm)
23
24#ifdef CONFIG_X86_64
25 /* Switch to trampoline GDT as it is guaranteed < 4 GiB */
26 movl $__KERNEL_DS, %eax
27 movl %eax, %ds
28 lgdtl pa_tr_gdt
29
30 /* Disable paging to drop us out of long mode */
31 movl %cr0, %eax
32 andl $~X86_CR0_PG, %eax
33 movl %eax, %cr0
34 ljmpl $__KERNEL32_CS, $pa_machine_real_restart_paging_off
35
36GLOBAL(machine_real_restart_paging_off)
37 xorl %eax, %eax
38 xorl %edx, %edx
39 movl $MSR_EFER, %ecx
40 wrmsr
41
42 movl %edi, %eax
43
44#endif /* CONFIG_X86_64 */
45
22 /* Set up the IDT for real mode. */ 46 /* Set up the IDT for real mode. */
23 lidtl pa_machine_real_restart_idt 47 lidtl pa_machine_real_restart_idt
24 48
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 66e6d9359826..0faad646f5fd 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -205,9 +205,9 @@ void syscall32_cpu_init(void)
205{ 205{
206 /* Load these always in case some future AMD CPU supports 206 /* Load these always in case some future AMD CPU supports
207 SYSENTER from compat mode too. */ 207 SYSENTER from compat mode too. */
208 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); 208 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
209 checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); 209 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
210 checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); 210 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
211 211
212 wrmsrl(MSR_CSTAR, ia32_cstar_target); 212 wrmsrl(MSR_CSTAR, ia32_cstar_target);
213} 213}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f1814fc2cb77..9642d4a38602 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1156,9 +1156,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
1156 .wbinvd = native_wbinvd, 1156 .wbinvd = native_wbinvd,
1157 1157
1158 .read_msr = native_read_msr_safe, 1158 .read_msr = native_read_msr_safe,
1159 .rdmsr_regs = native_rdmsr_safe_regs,
1160 .write_msr = xen_write_msr_safe, 1159 .write_msr = xen_write_msr_safe,
1161 .wrmsr_regs = native_wrmsr_safe_regs,
1162 1160
1163 .read_tsc = native_read_tsc, 1161 .read_tsc = native_read_tsc,
1164 .read_pmc = native_read_pmc, 1162 .read_pmc = native_read_pmc,
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 27336dfcda8e..b65a76133f4f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1256,7 +1256,8 @@ static void xen_flush_tlb_single(unsigned long addr)
1256} 1256}
1257 1257
1258static void xen_flush_tlb_others(const struct cpumask *cpus, 1258static void xen_flush_tlb_others(const struct cpumask *cpus,
1259 struct mm_struct *mm, unsigned long va) 1259 struct mm_struct *mm, unsigned long start,
1260 unsigned long end)
1260{ 1261{
1261 struct { 1262 struct {
1262 struct mmuext_op op; 1263 struct mmuext_op op;
@@ -1268,7 +1269,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1268 } *args; 1269 } *args;
1269 struct multicall_space mcs; 1270 struct multicall_space mcs;
1270 1271
1271 trace_xen_mmu_flush_tlb_others(cpus, mm, va); 1272 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1272 1273
1273 if (cpumask_empty(cpus)) 1274 if (cpumask_empty(cpus))
1274 return; /* nothing to do */ 1275 return; /* nothing to do */
@@ -1281,11 +1282,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1281 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); 1282 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1282 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1283 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1283 1284
1284 if (va == TLB_FLUSH_ALL) { 1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1286 if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1286 } else {
1287 args->op.cmd = MMUEXT_INVLPG_MULTI; 1287 args->op.cmd = MMUEXT_INVLPG_MULTI;
1288 args->op.arg1.linear_addr = va; 1288 args->op.arg1.linear_addr = start;
1289 } 1289 }
1290 1290
1291 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 1291 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index afb250d22a6b..f58dca7a6e52 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -80,9 +80,7 @@ static void __cpuinit cpu_bringup(void)
80 80
81 notify_cpu_starting(cpu); 81 notify_cpu_starting(cpu);
82 82
83 ipi_call_lock();
84 set_cpu_online(cpu, true); 83 set_cpu_online(cpu, true);
85 ipi_call_unlock();
86 84
87 this_cpu_write(cpu_state, CPU_ONLINE); 85 this_cpu_write(cpu_state, CPU_ONLINE);
88 86