Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 2
-rw-r--r--  arch/x86/Kconfig.debug | 19
-rw-r--r--  arch/x86/Makefile | 3
-rw-r--r--  arch/x86/boot/compressed/cmdline.c | 4
-rw-r--r--  arch/x86/boot/compressed/early_serial_console.c | 4
-rw-r--r--  arch/x86/boot/compressed/eboot.c | 198
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 10
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 10
-rw-r--r--  arch/x86/boot/compressed/misc.c | 31
-rw-r--r--  arch/x86/boot/compressed/misc.h | 27
-rw-r--r--  arch/x86/boot/header.S | 11
-rw-r--r--  arch/x86/crypto/Makefile | 14
-rw-r--r--  arch/x86/crypto/ablk_helper.c | 149
-rw-r--r--  arch/x86/crypto/aes_glue.c | 2
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 110
-rw-r--r--  arch/x86/crypto/camellia_glue.c | 355
-rw-r--r--  arch/x86/crypto/glue_helper.c | 307
-rw-r--r--  arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 704
-rw-r--r--  arch/x86/crypto/serpent_avx_glue.c | 636
-rw-r--r--  arch/x86/crypto/serpent_sse2_glue.c | 513
-rw-r--r--  arch/x86/crypto/sha1_ssse3_asm.S | 2
-rw-r--r--  arch/x86/crypto/sha1_ssse3_glue.c | 6
-rw-r--r--  arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 300
-rw-r--r--  arch/x86/crypto/twofish_avx_glue.c | 624
-rw-r--r--  arch/x86/crypto/twofish_glue_3way.c | 409
-rw-r--r--  arch/x86/include/asm/alternative.h | 74
-rw-r--r--  arch/x86/include/asm/amd_nb.h | 21
-rw-r--r--  arch/x86/include/asm/apic.h | 66
-rw-r--r--  arch/x86/include/asm/bitops.h | 7
-rw-r--r--  arch/x86/include/asm/bootparam.h | 1
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 2
-rw-r--r--  arch/x86/include/asm/crypto/ablk_helper.h | 31
-rw-r--r--  arch/x86/include/asm/crypto/aes.h (renamed from arch/x86/include/asm/aes.h) | 0
-rw-r--r--  arch/x86/include/asm/crypto/glue_helper.h | 115
-rw-r--r--  arch/x86/include/asm/crypto/serpent-avx.h | 32
-rw-r--r--  arch/x86/include/asm/crypto/serpent-sse2.h (renamed from arch/x86/include/asm/serpent.h) | 4
-rw-r--r--  arch/x86/include/asm/crypto/twofish.h | 46
-rw-r--r--  arch/x86/include/asm/emergency-restart.h | 2
-rw-r--r--  arch/x86/include/asm/entry_arch.h | 9
-rw-r--r--  arch/x86/include/asm/floppy.h | 2
-rw-r--r--  arch/x86/include/asm/hypervisor.h | 1
-rw-r--r--  arch/x86/include/asm/iommu.h | 1
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 11
-rw-r--r--  arch/x86/include/asm/kvm.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 6
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 35
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 7
-rw-r--r--  arch/x86/include/asm/msr.h | 46
-rw-r--r--  arch/x86/include/asm/nmi.h | 20
-rw-r--r--  arch/x86/include/asm/paravirt.h | 46
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 5
-rw-r--r--  arch/x86/include/asm/pci_x86.h | 15
-rw-r--r--  arch/x86/include/asm/percpu.h | 17
-rw-r--r--  arch/x86/include/asm/perf_event.h | 24
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 4
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 6
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 8
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 2
-rw-r--r--  arch/x86/include/asm/processor.h | 13
-rw-r--r--  arch/x86/include/asm/realmode.h | 3
-rw-r--r--  arch/x86/include/asm/reboot.h | 4
-rw-r--r--  arch/x86/include/asm/smp.h | 21
-rw-r--r--  arch/x86/include/asm/tlb.h | 9
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 49
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 11
-rw-r--r--  arch/x86/include/asm/unistd.h | 1
-rw-r--r--  arch/x86/include/asm/uprobes.h | 2
-rw-r--r--  arch/x86/include/asm/uv/uv.h | 5
-rw-r--r--  arch/x86/include/asm/uv/uv_bau.h | 28
-rw-r--r--  arch/x86/include/asm/vmx.h | 6
-rw-r--r--  arch/x86/include/asm/x2apic.h | 18
-rw-r--r--  arch/x86/include/asm/x86_init.h | 4
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h | 8
-rw-r--r--  arch/x86/kernel/alternative.c | 19
-rw-r--r--  arch/x86/kernel/amd_nb.c | 11
-rw-r--r--  arch/x86/kernel/apic/apic.c | 42
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 76
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 9
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 50
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 48
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 51
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 350
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 30
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 23
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 11
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 68
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 82
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 39
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 45
-rw-r--r--  arch/x86/kernel/apm_32.c | 29
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 6
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 39
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 20
-rw-r--r--  arch/x86/kernel/cpu/common.c | 33
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 9
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 3
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 176
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 32
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 286
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 6
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 111
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h | 26
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 103
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 226
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 12
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2900
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.h | 621
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 16
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 4
-rw-r--r--  arch/x86/kernel/cpu/sched.c | 55
-rw-r--r--  arch/x86/kernel/dumpstack.c | 5
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 25
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 21
-rw-r--r--  arch/x86/kernel/e820.c | 2
-rw-r--r--  arch/x86/kernel/entry_64.S | 38
-rw-r--r--  arch/x86/kernel/irq.c | 4
-rw-r--r--  arch/x86/kernel/irqinit.c | 73
-rw-r--r--  arch/x86/kernel/kvm.c | 64
-rw-r--r--  arch/x86/kernel/microcode_core.c | 66
-rw-r--r--  arch/x86/kernel/module.c | 34
-rw-r--r--  arch/x86/kernel/nmi.c | 47
-rw-r--r--  arch/x86/kernel/nmi_selftest.c | 7
-rw-r--r--  arch/x86/kernel/paravirt.c | 2
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 34
-rw-r--r--  arch/x86/kernel/pci-dma.c | 11
-rw-r--r--  arch/x86/kernel/process.c | 34
-rw-r--r--  arch/x86/kernel/process_64.c | 12
-rw-r--r--  arch/x86/kernel/quirks.c | 2
-rw-r--r--  arch/x86/kernel/reboot.c | 74
-rw-r--r--  arch/x86/kernel/setup.c | 2
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 2
-rw-r--r--  arch/x86/kernel/signal.c | 5
-rw-r--r--  arch/x86/kernel/smpboot.c | 114
-rw-r--r--  arch/x86/kernel/traps.c | 19
-rw-r--r--  arch/x86/kernel/tsc.c | 50
-rw-r--r--  arch/x86/kernel/uprobes.c | 3
-rw-r--r--  arch/x86/kernel/vm86_32.c | 6
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 44
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 17
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 1
-rw-r--r--  arch/x86/kernel/x86_init.c | 2
-rw-r--r--  arch/x86/kernel/xsave.c | 12
-rw-r--r--  arch/x86/kvm/cpuid.c | 46
-rw-r--r--  arch/x86/kvm/cpuid.h | 9
-rw-r--r--  arch/x86/kvm/emulate.c | 273
-rw-r--r--  arch/x86/kvm/i8259.c | 17
-rw-r--r--  arch/x86/kvm/lapic.c | 194
-rw-r--r--  arch/x86/kvm/lapic.h | 11
-rw-r--r--  arch/x86/kvm/mmu.c | 359
-rw-r--r--  arch/x86/kvm/mmutrace.h | 45
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 3
-rw-r--r--  arch/x86/kvm/pmu.c | 22
-rw-r--r--  arch/x86/kvm/svm.c | 12
-rw-r--r--  arch/x86/kvm/trace.h | 46
-rw-r--r--  arch/x86/kvm/vmx.c | 189
-rw-r--r--  arch/x86/kvm/x86.c | 123
-rw-r--r--  arch/x86/lib/msr-reg-export.c | 4
-rw-r--r--  arch/x86/lib/msr-reg.S | 10
-rw-r--r--  arch/x86/mm/init.c | 2
-rw-r--r--  arch/x86/mm/pageattr.c | 10
-rw-r--r--  arch/x86/mm/tlb.c | 401
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 4
-rw-r--r--  arch/x86/oprofile/op_model_amd.c | 4
-rw-r--r--  arch/x86/pci/acpi.c | 109
-rw-r--r--  arch/x86/pci/amd_bus.c | 7
-rw-r--r--  arch/x86/pci/bus_numa.c | 22
-rw-r--r--  arch/x86/pci/bus_numa.h | 3
-rw-r--r--  arch/x86/pci/common.c | 2
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 372
-rw-r--r--  arch/x86/pci/mmconfig_32.c | 30
-rw-r--r--  arch/x86/pci/mmconfig_64.c | 52
-rw-r--r--  arch/x86/pci/mrst.c | 2
-rw-r--r--  arch/x86/platform/efi/efi.c | 30
-rw-r--r--  arch/x86/platform/olpc/olpc-xo15-sci.c | 6
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c | 459
-rw-r--r--  arch/x86/platform/uv/uv_irq.c | 9
-rw-r--r--  arch/x86/realmode/rm/Makefile | 2
-rw-r--r--  arch/x86/realmode/rm/header.S | 4
-rw-r--r--  arch/x86/realmode/rm/reboot.S (renamed from arch/x86/realmode/rm/reboot_32.S) | 30
-rw-r--r--  arch/x86/vdso/vdso32-setup.c | 6
-rw-r--r--  arch/x86/xen/enlighten.c | 226
-rw-r--r--  arch/x86/xen/mmu.c | 51
-rw-r--r--  arch/x86/xen/setup.c | 23
-rw-r--r--  arch/x86/xen/smp.c | 2
-rw-r--r--  arch/x86/xen/suspend.c | 2
-rw-r--r--  arch/x86/xen/xen-ops.h | 2
187 files changed, 11171 insertions, 3868 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c70684f859e1..ba2657c49217 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -70,6 +70,7 @@ config X86
70 select HAVE_ARCH_JUMP_LABEL 70 select HAVE_ARCH_JUMP_LABEL
71 select HAVE_TEXT_POKE_SMP 71 select HAVE_TEXT_POKE_SMP
72 select HAVE_GENERIC_HARDIRQS 72 select HAVE_GENERIC_HARDIRQS
73 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
73 select SPARSE_IRQ 74 select SPARSE_IRQ
74 select GENERIC_FIND_FIRST_BIT 75 select GENERIC_FIND_FIRST_BIT
75 select GENERIC_IRQ_PROBE 76 select GENERIC_IRQ_PROBE
@@ -84,6 +85,7 @@ config X86
84 select GENERIC_IOMAP 85 select GENERIC_IOMAP
85 select DCACHE_WORD_ACCESS 86 select DCACHE_WORD_ACCESS
86 select GENERIC_SMP_IDLE_THREAD 87 select GENERIC_SMP_IDLE_THREAD
88 select ARCH_WANT_IPC_PARSE_VERSION if X86_32
87 select HAVE_ARCH_SECCOMP_FILTER 89 select HAVE_ARCH_SECCOMP_FILTER
88 select BUILDTIME_EXTABLE_SORT 90 select BUILDTIME_EXTABLE_SORT
89 select GENERIC_CMOS_UPDATE 91 select GENERIC_CMOS_UPDATE
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index e46c2147397f..b322f124ee3c 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -129,6 +129,25 @@ config DOUBLEFAULT
129 option saves about 4k and might cause you much additional grey 129 option saves about 4k and might cause you much additional grey
130 hair. 130 hair.
131 131
132config DEBUG_TLBFLUSH
133 bool "Set upper limit of TLB entries to flush one-by-one"
134 depends on DEBUG_KERNEL && (X86_64 || X86_INVLPG)
135 ---help---
136
137 X86-only for now.
138
139 This option allows the user to tune the amount of TLB entries the
140 kernel flushes one-by-one instead of doing a full TLB flush. In
141 certain situations, the former is cheaper. This is controlled by the
142 tlb_flushall_shift knob under /sys/kernel/debug/x86. If you set it
143 to -1, the code flushes the whole TLB unconditionally. Otherwise,
144 for positive values of it, the kernel will use single TLB entry
145 invalidating instructions according to the following formula:
146
147 flush_entries <= active_tlb_entries / 2^tlb_flushall_shift
148
149 If in doubt, say "N".
150
132config IOMMU_DEBUG 151config IOMMU_DEBUG
133 bool "Enable IOMMU debugging" 152 bool "Enable IOMMU debugging"
134 depends on GART_IOMMU && DEBUG_KERNEL 153 depends on GART_IOMMU && DEBUG_KERNEL
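The DEBUG_TLBFLUSH help text above boils down to a single threshold test on the tlb_flushall_shift knob. A minimal, self-contained C sketch of that heuristic follows; the function and parameter names are illustrative, not the kernel's:

#include <stdbool.h>

/*
 * Illustration of the formula quoted in the help text:
 * flush entries one-by-one only while
 *   flush_entries <= active_tlb_entries / 2^tlb_flushall_shift,
 * and always do a full TLB flush when the shift is set to -1.
 */
static bool use_single_entry_flush(unsigned long flush_entries,
				   unsigned long active_tlb_entries,
				   int tlb_flushall_shift)
{
	if (tlb_flushall_shift < 0)
		return false;	/* flush the whole TLB unconditionally */

	return flush_entries <= (active_tlb_entries >> tlb_flushall_shift);
}

A larger shift shrinks the threshold, so the full flush is chosen sooner; a shift of 0 allows one-by-one invalidation up to the full number of active TLB entries.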
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1f2521434554..b0c5276861ec 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -49,6 +49,9 @@ else
49 KBUILD_AFLAGS += -m64 49 KBUILD_AFLAGS += -m64
50 KBUILD_CFLAGS += -m64 50 KBUILD_CFLAGS += -m64
51 51
52 # Use -mpreferred-stack-boundary=3 if supported.
53 KBUILD_CFLAGS += $(call cc-option,-mno-sse -mpreferred-stack-boundary=3)
54
52 # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) 55 # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
53 cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) 56 cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
54 cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) 57 cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index cb62f786990d..10f6b1178c68 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,5 +1,7 @@
1#include "misc.h" 1#include "misc.h"
2 2
3#ifdef CONFIG_EARLY_PRINTK
4
3static unsigned long fs; 5static unsigned long fs;
4static inline void set_fs(unsigned long seg) 6static inline void set_fs(unsigned long seg)
5{ 7{
@@ -19,3 +21,5 @@ int cmdline_find_option_bool(const char *option)
19{ 21{
20 return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option); 22 return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
21} 23}
24
25#endif
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
index 261e81fb9582..d3d003cb5481 100644
--- a/arch/x86/boot/compressed/early_serial_console.c
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -1,5 +1,9 @@
1#include "misc.h" 1#include "misc.h"
2 2
3#ifdef CONFIG_EARLY_PRINTK
4
3int early_serial_base; 5int early_serial_base;
4 6
5#include "../early_serial_console.c" 7#include "../early_serial_console.c"
8
9#endif
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 4e85f5f85837..b3e0227df2c9 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -729,32 +729,68 @@ fail:
729 * need to create one ourselves (usually the bootloader would create 729 * need to create one ourselves (usually the bootloader would create
730 * one for us). 730 * one for us).
731 */ 731 */
732static efi_status_t make_boot_params(struct boot_params *boot_params, 732struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
733 efi_loaded_image_t *image,
734 void *handle)
735{ 733{
736 struct efi_info *efi = &boot_params->efi_info; 734 struct boot_params *boot_params;
737 struct apm_bios_info *bi = &boot_params->apm_bios_info; 735 struct sys_desc_table *sdt;
738 struct sys_desc_table *sdt = &boot_params->sys_desc_table; 736 struct apm_bios_info *bi;
739 struct e820entry *e820_map = &boot_params->e820_map[0]; 737 struct setup_header *hdr;
740 struct e820entry *prev = NULL; 738 struct efi_info *efi;
741 struct setup_header *hdr = &boot_params->hdr; 739 efi_loaded_image_t *image;
742 unsigned long size, key, desc_size, _size; 740 void *options;
743 efi_memory_desc_t *mem_map; 741 u32 load_options_size;
744 void *options = image->load_options; 742 efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
745 u32 load_options_size = image->load_options_size / 2; /* ASCII */
746 int options_size = 0; 743 int options_size = 0;
747 efi_status_t status; 744 efi_status_t status;
748 __u32 desc_version;
749 unsigned long cmdline; 745 unsigned long cmdline;
750 u8 nr_entries;
751 u16 *s2; 746 u16 *s2;
752 u8 *s1; 747 u8 *s1;
753 int i; 748 int i;
754 749
750 sys_table = _table;
751
752 /* Check if we were booted by the EFI firmware */
753 if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
754 return NULL;
755
756 status = efi_call_phys3(sys_table->boottime->handle_protocol,
757 handle, &proto, (void *)&image);
758 if (status != EFI_SUCCESS) {
759 efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
760 return NULL;
761 }
762
763 status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
764 if (status != EFI_SUCCESS) {
765 efi_printk("Failed to alloc lowmem for boot params\n");
766 return NULL;
767 }
768
769 memset(boot_params, 0x0, 0x4000);
770
771 hdr = &boot_params->hdr;
772 efi = &boot_params->efi_info;
773 bi = &boot_params->apm_bios_info;
774 sdt = &boot_params->sys_desc_table;
775
776 /* Copy the second sector to boot_params */
777 memcpy(&hdr->jump, image->image_base + 512, 512);
778
779 /*
780 * Fill out some of the header fields ourselves because the
781 * EFI firmware loader doesn't load the first sector.
782 */
783 hdr->root_flags = 1;
784 hdr->vid_mode = 0xffff;
785 hdr->boot_flag = 0xAA55;
786
787 hdr->code32_start = (__u64)(unsigned long)image->image_base;
788
755 hdr->type_of_loader = 0x21; 789 hdr->type_of_loader = 0x21;
756 790
757 /* Convert unicode cmdline to ascii */ 791 /* Convert unicode cmdline to ascii */
792 options = image->load_options;
793 load_options_size = image->load_options_size / 2; /* ASCII */
758 cmdline = 0; 794 cmdline = 0;
759 s2 = (u16 *)options; 795 s2 = (u16 *)options;
760 796
@@ -791,18 +827,36 @@ static efi_status_t make_boot_params(struct boot_params *boot_params,
791 hdr->ramdisk_image = 0; 827 hdr->ramdisk_image = 0;
792 hdr->ramdisk_size = 0; 828 hdr->ramdisk_size = 0;
793 829
794 status = handle_ramdisks(image, hdr);
795 if (status != EFI_SUCCESS)
796 goto free_cmdline;
797
798 setup_graphics(boot_params);
799
800 /* Clear APM BIOS info */ 830 /* Clear APM BIOS info */
801 memset(bi, 0, sizeof(*bi)); 831 memset(bi, 0, sizeof(*bi));
802 832
803 memset(sdt, 0, sizeof(*sdt)); 833 memset(sdt, 0, sizeof(*sdt));
804 834
805 memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32)); 835 status = handle_ramdisks(image, hdr);
836 if (status != EFI_SUCCESS)
837 goto fail2;
838
839 return boot_params;
840fail2:
841 if (options_size)
842 low_free(options_size, hdr->cmd_line_ptr);
843fail:
844 low_free(0x4000, (unsigned long)boot_params);
845 return NULL;
846}
847
848static efi_status_t exit_boot(struct boot_params *boot_params,
849 void *handle)
850{
851 struct efi_info *efi = &boot_params->efi_info;
852 struct e820entry *e820_map = &boot_params->e820_map[0];
853 struct e820entry *prev = NULL;
854 unsigned long size, key, desc_size, _size;
855 efi_memory_desc_t *mem_map;
856 efi_status_t status;
857 __u32 desc_version;
858 u8 nr_entries;
859 int i;
806 860
807 size = sizeof(*mem_map) * 32; 861 size = sizeof(*mem_map) * 32;
808 862
@@ -811,7 +865,7 @@ again:
811 _size = size; 865 _size = size;
812 status = low_alloc(size, 1, (unsigned long *)&mem_map); 866 status = low_alloc(size, 1, (unsigned long *)&mem_map);
813 if (status != EFI_SUCCESS) 867 if (status != EFI_SUCCESS)
814 goto free_cmdline; 868 return status;
815 869
816 status = efi_call_phys5(sys_table->boottime->get_memory_map, &size, 870 status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,
817 mem_map, &key, &desc_size, &desc_version); 871 mem_map, &key, &desc_size, &desc_version);
@@ -823,6 +877,7 @@ again:
823 if (status != EFI_SUCCESS) 877 if (status != EFI_SUCCESS)
824 goto free_mem_map; 878 goto free_mem_map;
825 879
880 memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
826 efi->efi_systab = (unsigned long)sys_table; 881 efi->efi_systab = (unsigned long)sys_table;
827 efi->efi_memdesc_size = desc_size; 882 efi->efi_memdesc_size = desc_size;
828 efi->efi_memdesc_version = desc_version; 883 efi->efi_memdesc_version = desc_version;
@@ -906,61 +961,13 @@ again:
906 961
907free_mem_map: 962free_mem_map:
908 low_free(_size, (unsigned long)mem_map); 963 low_free(_size, (unsigned long)mem_map);
909free_cmdline:
910 if (options_size)
911 low_free(options_size, hdr->cmd_line_ptr);
912fail:
913 return status; 964 return status;
914} 965}
915 966
916/* 967static efi_status_t relocate_kernel(struct setup_header *hdr)
917 * On success we return a pointer to a boot_params structure, and NULL
918 * on failure.
919 */
920struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
921{ 968{
922 struct boot_params *boot_params;
923 unsigned long start, nr_pages; 969 unsigned long start, nr_pages;
924 struct desc_ptr *gdt, *idt;
925 efi_loaded_image_t *image;
926 struct setup_header *hdr;
927 efi_status_t status; 970 efi_status_t status;
928 efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
929 struct desc_struct *desc;
930
931 sys_table = _table;
932
933 /* Check if we were booted by the EFI firmware */
934 if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
935 goto fail;
936
937 status = efi_call_phys3(sys_table->boottime->handle_protocol,
938 handle, &proto, (void *)&image);
939 if (status != EFI_SUCCESS) {
940 efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
941 goto fail;
942 }
943
944 status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
945 if (status != EFI_SUCCESS) {
946 efi_printk("Failed to alloc lowmem for boot params\n");
947 goto fail;
948 }
949
950 memset(boot_params, 0x0, 0x4000);
951
952 hdr = &boot_params->hdr;
953
954 /* Copy the second sector to boot_params */
955 memcpy(&hdr->jump, image->image_base + 512, 512);
956
957 /*
958 * Fill out some of the header fields ourselves because the
959 * EFI firmware loader doesn't load the first sector.
960 */
961 hdr->root_flags = 1;
962 hdr->vid_mode = 0xffff;
963 hdr->boot_flag = 0xAA55;
964 971
965 /* 972 /*
966 * The EFI firmware loader could have placed the kernel image 973 * The EFI firmware loader could have placed the kernel image
@@ -978,16 +985,40 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
978 if (status != EFI_SUCCESS) { 985 if (status != EFI_SUCCESS) {
979 status = low_alloc(hdr->init_size, hdr->kernel_alignment, 986 status = low_alloc(hdr->init_size, hdr->kernel_alignment,
980 &start); 987 &start);
981 if (status != EFI_SUCCESS) { 988 if (status != EFI_SUCCESS)
982 efi_printk("Failed to alloc mem for kernel\n"); 989 efi_printk("Failed to alloc mem for kernel\n");
983 goto fail;
984 }
985 } 990 }
986 991
992 if (status == EFI_SUCCESS)
993 memcpy((void *)start, (void *)(unsigned long)hdr->code32_start,
994 hdr->init_size);
995
996 hdr->pref_address = hdr->code32_start;
987 hdr->code32_start = (__u32)start; 997 hdr->code32_start = (__u32)start;
988 hdr->pref_address = (__u64)(unsigned long)image->image_base;
989 998
990 memcpy((void *)start, image->image_base, image->image_size); 999 return status;
1000}
1001
1002/*
1003 * On success we return a pointer to a boot_params structure, and NULL
1004 * on failure.
1005 */
1006struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
1007 struct boot_params *boot_params)
1008{
1009 struct desc_ptr *gdt, *idt;
1010 efi_loaded_image_t *image;
1011 struct setup_header *hdr = &boot_params->hdr;
1012 efi_status_t status;
1013 struct desc_struct *desc;
1014
1015 sys_table = _table;
1016
1017 /* Check if we were booted by the EFI firmware */
1018 if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
1019 goto fail;
1020
1021 setup_graphics(boot_params);
991 1022
992 status = efi_call_phys3(sys_table->boottime->allocate_pool, 1023 status = efi_call_phys3(sys_table->boottime->allocate_pool,
993 EFI_LOADER_DATA, sizeof(*gdt), 1024 EFI_LOADER_DATA, sizeof(*gdt),
@@ -1015,7 +1046,18 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
1015 idt->size = 0; 1046 idt->size = 0;
1016 idt->address = 0; 1047 idt->address = 0;
1017 1048
1018 status = make_boot_params(boot_params, image, handle); 1049 /*
1050 * If the kernel isn't already loaded at the preferred load
1051 * address, relocate it.
1052 */
1053 if (hdr->pref_address != hdr->code32_start) {
1054 status = relocate_kernel(hdr);
1055
1056 if (status != EFI_SUCCESS)
1057 goto fail;
1058 }
1059
1060 status = exit_boot(boot_params, handle);
1019 if (status != EFI_SUCCESS) 1061 if (status != EFI_SUCCESS)
1020 goto fail; 1062 goto fail;
1021 1063
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index c85e3ac99bba..aa4aaf1b2380 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -42,6 +42,16 @@ ENTRY(startup_32)
42 */ 42 */
43 add $0x4, %esp 43 add $0x4, %esp
44 44
45 call make_boot_params
46 cmpl $0, %eax
47 je 1f
48 movl 0x4(%esp), %esi
49 movl (%esp), %ecx
50 pushl %eax
51 pushl %esi
52 pushl %ecx
53
54 .org 0x30,0x90
45 call efi_main 55 call efi_main
46 cmpl $0, %eax 56 cmpl $0, %eax
47 movl %eax, %esi 57 movl %eax, %esi
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 87e03a13d8e3..2c4b171eec33 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -209,6 +209,16 @@ ENTRY(startup_64)
209 .org 0x210 209 .org 0x210
210 mov %rcx, %rdi 210 mov %rcx, %rdi
211 mov %rdx, %rsi 211 mov %rdx, %rsi
212 pushq %rdi
213 pushq %rsi
214 call make_boot_params
215 cmpq $0,%rax
216 je 1f
217 mov %rax, %rdx
218 popq %rsi
219 popq %rdi
220
221 .org 0x230,0x90
212 call efi_main 222 call efi_main
213 movq %rax,%rsi 223 movq %rax,%rsi
214 cmpq $0,%rax 224 cmpq $0,%rax
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 7116dcba0c9e..88f7ff6da404 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -108,8 +108,6 @@ static void error(char *m);
108 * This is set up by the setup-routine at boot-time 108 * This is set up by the setup-routine at boot-time
109 */ 109 */
110struct boot_params *real_mode; /* Pointer to real-mode data */ 110struct boot_params *real_mode; /* Pointer to real-mode data */
111static int quiet;
112static int debug;
113 111
114void *memset(void *s, int c, size_t n); 112void *memset(void *s, int c, size_t n);
115void *memcpy(void *dest, const void *src, size_t n); 113void *memcpy(void *dest, const void *src, size_t n);
@@ -170,15 +168,11 @@ static void serial_putchar(int ch)
170 outb(ch, early_serial_base + TXR); 168 outb(ch, early_serial_base + TXR);
171} 169}
172 170
173void __putstr(int error, const char *s) 171void __putstr(const char *s)
174{ 172{
175 int x, y, pos; 173 int x, y, pos;
176 char c; 174 char c;
177 175
178#ifndef CONFIG_X86_VERBOSE_BOOTUP
179 if (!error)
180 return;
181#endif
182 if (early_serial_base) { 176 if (early_serial_base) {
183 const char *str = s; 177 const char *str = s;
184 while (*str) { 178 while (*str) {
@@ -265,9 +259,9 @@ void *memcpy(void *dest, const void *src, size_t n)
265 259
266static void error(char *x) 260static void error(char *x)
267{ 261{
268 __putstr(1, "\n\n"); 262 error_putstr("\n\n");
269 __putstr(1, x); 263 error_putstr(x);
270 __putstr(1, "\n\n -- System halted"); 264 error_putstr("\n\n -- System halted");
271 265
272 while (1) 266 while (1)
273 asm("hlt"); 267 asm("hlt");
@@ -294,8 +288,7 @@ static void parse_elf(void *output)
294 return; 288 return;
295 } 289 }
296 290
297 if (!quiet) 291 debug_putstr("Parsing ELF... ");
298 putstr("Parsing ELF... ");
299 292
300 phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum); 293 phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);
301 if (!phdrs) 294 if (!phdrs)
@@ -332,11 +325,6 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
332{ 325{
333 real_mode = rmode; 326 real_mode = rmode;
334 327
335 if (cmdline_find_option_bool("quiet"))
336 quiet = 1;
337 if (cmdline_find_option_bool("debug"))
338 debug = 1;
339
340 if (real_mode->screen_info.orig_video_mode == 7) { 328 if (real_mode->screen_info.orig_video_mode == 7) {
341 vidmem = (char *) 0xb0000; 329 vidmem = (char *) 0xb0000;
342 vidport = 0x3b4; 330 vidport = 0x3b4;
@@ -349,8 +337,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
349 cols = real_mode->screen_info.orig_video_cols; 337 cols = real_mode->screen_info.orig_video_cols;
350 338
351 console_init(); 339 console_init();
352 if (debug) 340 debug_putstr("early console in decompress_kernel\n");
353 putstr("early console in decompress_kernel\n");
354 341
355 free_mem_ptr = heap; /* Heap */ 342 free_mem_ptr = heap; /* Heap */
356 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 343 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
@@ -369,11 +356,9 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
369 error("Wrong destination address"); 356 error("Wrong destination address");
370#endif 357#endif
371 358
372 if (!quiet) 359 debug_putstr("\nDecompressing Linux... ");
373 putstr("\nDecompressing Linux... ");
374 decompress(input_data, input_len, NULL, NULL, output, NULL, error); 360 decompress(input_data, input_len, NULL, NULL, output, NULL, error);
375 parse_elf(output); 361 parse_elf(output);
376 if (!quiet) 362 debug_putstr("done.\nBooting the kernel.\n");
377 putstr("done.\nBooting the kernel.\n");
378 return; 363 return;
379} 364}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 3f19c81a6203..0e6dc0ee0eea 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -24,9 +24,21 @@
24 24
25/* misc.c */ 25/* misc.c */
26extern struct boot_params *real_mode; /* Pointer to real-mode data */ 26extern struct boot_params *real_mode; /* Pointer to real-mode data */
27void __putstr(int error, const char *s); 27void __putstr(const char *s);
28#define putstr(__x) __putstr(0, __x) 28#define error_putstr(__x) __putstr(__x)
29#define puts(__x) __putstr(0, __x) 29
30#ifdef CONFIG_X86_VERBOSE_BOOTUP
31
32#define debug_putstr(__x) __putstr(__x)
33
34#else
35
36static inline void debug_putstr(const char *s)
37{ }
38
39#endif
40
41#ifdef CONFIG_EARLY_PRINTK
30 42
31/* cmdline.c */ 43/* cmdline.c */
32int cmdline_find_option(const char *option, char *buffer, int bufsize); 44int cmdline_find_option(const char *option, char *buffer, int bufsize);
@@ -36,4 +48,13 @@ int cmdline_find_option_bool(const char *option);
36extern int early_serial_base; 48extern int early_serial_base;
37void console_init(void); 49void console_init(void);
38 50
51#else
52
53/* early_serial_console.c */
54static const int early_serial_base;
55static inline void console_init(void)
56{ }
57
58#endif
59
39#endif 60#endif
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index efe5acfc79c3..b4e15dd6786a 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -283,7 +283,7 @@ _start:
283 # Part 2 of the header, from the old setup.S 283 # Part 2 of the header, from the old setup.S
284 284
285 .ascii "HdrS" # header signature 285 .ascii "HdrS" # header signature
286 .word 0x020a # header version number (>= 0x0105) 286 .word 0x020b # header version number (>= 0x0105)
287 # or else old loadlin-1.5 will fail) 287 # or else old loadlin-1.5 will fail)
288 .globl realmode_swtch 288 .globl realmode_swtch
289realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 289realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -401,18 +401,13 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
401#define INIT_SIZE VO_INIT_SIZE 401#define INIT_SIZE VO_INIT_SIZE
402#endif 402#endif
403init_size: .long INIT_SIZE # kernel initialization size 403init_size: .long INIT_SIZE # kernel initialization size
404handover_offset: .long 0x30 # offset to the handover
405 # protocol entry point
404 406
405# End of setup header ##################################################### 407# End of setup header #####################################################
406 408
407 .section ".entrytext", "ax" 409 .section ".entrytext", "ax"
408start_of_setup: 410start_of_setup:
409#ifdef SAFE_RESET_DISK_CONTROLLER
410# Reset the disk controller.
411 movw $0x0000, %ax # Reset disk controller
412 movb $0x80, %dl # All disks
413 int $0x13
414#endif
415
416# Force %es = %ds 411# Force %es = %ds
417 movw %ds, %ax 412 movw %ds, %ax
418 movw %ax, %es 413 movw %ax, %es
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e191ac048b59..e908e5de82d3 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,9 @@
2# Arch-specific CryptoAPI modules. 2# Arch-specific CryptoAPI modules.
3# 3#
4 4
5obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
6obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
7
5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 8obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 9obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o 10obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
@@ -12,8 +15,10 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
12obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o 15obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
13obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 16obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
14obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o 17obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
18obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
15obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 19obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
16obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o 20obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
21obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
17obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o 22obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
18obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o 23obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
19 24
@@ -30,16 +35,11 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
30blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o 35blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
31twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o 36twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
32twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o 37twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
38twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
33salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 39salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
34serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o 40serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
41serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
35 42
36aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o 43aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
37
38ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o 44ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
39
40# enable AVX support only when $(AS) can actually assemble the instructions
41ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes)
42AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT
43CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT
44endif
45sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o 45sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/ablk_helper.c b/arch/x86/crypto/ablk_helper.c
new file mode 100644
index 000000000000..43282fe04a8b
--- /dev/null
+++ b/arch/x86/crypto/ablk_helper.c
@@ -0,0 +1,149 @@
1/*
2 * Shared async block cipher helpers
3 *
4 * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Based on aesni-intel_glue.c by:
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27#include <linux/kernel.h>
28#include <linux/crypto.h>
29#include <linux/init.h>
30#include <linux/module.h>
31#include <crypto/algapi.h>
32#include <crypto/cryptd.h>
33#include <asm/i387.h>
34#include <asm/crypto/ablk_helper.h>
35
36int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
37 unsigned int key_len)
38{
39 struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
40 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
41 int err;
42
43 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
44 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
45 & CRYPTO_TFM_REQ_MASK);
46 err = crypto_ablkcipher_setkey(child, key, key_len);
47 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
48 & CRYPTO_TFM_RES_MASK);
49 return err;
50}
51EXPORT_SYMBOL_GPL(ablk_set_key);
52
53int __ablk_encrypt(struct ablkcipher_request *req)
54{
55 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
56 struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
57 struct blkcipher_desc desc;
58
59 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
60 desc.info = req->info;
61 desc.flags = 0;
62
63 return crypto_blkcipher_crt(desc.tfm)->encrypt(
64 &desc, req->dst, req->src, req->nbytes);
65}
66EXPORT_SYMBOL_GPL(__ablk_encrypt);
67
68int ablk_encrypt(struct ablkcipher_request *req)
69{
70 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
71 struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
72
73 if (!irq_fpu_usable()) {
74 struct ablkcipher_request *cryptd_req =
75 ablkcipher_request_ctx(req);
76
77 memcpy(cryptd_req, req, sizeof(*req));
78 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
79
80 return crypto_ablkcipher_encrypt(cryptd_req);
81 } else {
82 return __ablk_encrypt(req);
83 }
84}
85EXPORT_SYMBOL_GPL(ablk_encrypt);
86
87int ablk_decrypt(struct ablkcipher_request *req)
88{
89 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
90 struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
91
92 if (!irq_fpu_usable()) {
93 struct ablkcipher_request *cryptd_req =
94 ablkcipher_request_ctx(req);
95
96 memcpy(cryptd_req, req, sizeof(*req));
97 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
98
99 return crypto_ablkcipher_decrypt(cryptd_req);
100 } else {
101 struct blkcipher_desc desc;
102
103 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
104 desc.info = req->info;
105 desc.flags = 0;
106
107 return crypto_blkcipher_crt(desc.tfm)->decrypt(
108 &desc, req->dst, req->src, req->nbytes);
109 }
110}
111EXPORT_SYMBOL_GPL(ablk_decrypt);
112
113void ablk_exit(struct crypto_tfm *tfm)
114{
115 struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
116
117 cryptd_free_ablkcipher(ctx->cryptd_tfm);
118}
119EXPORT_SYMBOL_GPL(ablk_exit);
120
121int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
122{
123 struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
124 struct cryptd_ablkcipher *cryptd_tfm;
125
126 cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
127 if (IS_ERR(cryptd_tfm))
128 return PTR_ERR(cryptd_tfm);
129
130 ctx->cryptd_tfm = cryptd_tfm;
131 tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
132 crypto_ablkcipher_reqsize(&cryptd_tfm->base);
133
134 return 0;
135}
136EXPORT_SYMBOL_GPL(ablk_init_common);
137
138int ablk_init(struct crypto_tfm *tfm)
139{
140 char drv_name[CRYPTO_MAX_ALG_NAME];
141
142 snprintf(drv_name, sizeof(drv_name), "__driver-%s",
143 crypto_tfm_alg_driver_name(tfm));
144
145 return ablk_init_common(tfm, drv_name);
146}
147EXPORT_SYMBOL_GPL(ablk_init);
148
149MODULE_LICENSE("GPL");
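ablk_helper.c factors the "fall back to cryptd when the FPU is not usable" boilerplate out of the individual SIMD cipher glue modules, so each of them only has to wire ablk_set_key/ablk_encrypt/ablk_decrypt into its crypto_alg. The fragment below is a hedged sketch of such a registration, modeled on the aesni conversion later in this diff; the "foo" algorithm name, key sizes, and driver-name strings are placeholders, and the crypto_alg union layout is assumed from the contemporaneous ablkcipher API rather than shown in this commit:

#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/module.h>
#include <crypto/algapi.h>
#include <asm/crypto/ablk_helper.h>

/* Bind this tfm to the (hypothetical) internal "__driver-ecb-foo-simd" instance via cryptd. */
static int ablk_ecb_foo_init(struct crypto_tfm *tfm)
{
	return ablk_init_common(tfm, "__driver-ecb-foo-simd");
}

static struct crypto_alg ecb_foo_async_alg = {
	.cra_name		= "ecb(foo)",
	.cra_driver_name	= "ecb-foo-simd",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize		= 16,
	.cra_ctxsize		= sizeof(struct async_helper_ctx),
	.cra_type		= &crypto_ablkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_init		= ablk_ecb_foo_init,
	.cra_exit		= ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize	= 16,
			.max_keysize	= 32,
			.setkey		= ablk_set_key,	/* forwards to the cryptd child */
			.encrypt	= ablk_encrypt,	/* synchronous if FPU usable, else queued */
			.decrypt	= ablk_decrypt,
		},
	},
};

static int __init ecb_foo_simd_init(void)
{
	return crypto_register_alg(&ecb_foo_async_alg);
}

static void __exit ecb_foo_simd_exit(void)
{
	crypto_unregister_alg(&ecb_foo_async_alg);
}

module_init(ecb_foo_simd_init);
module_exit(ecb_foo_simd_exit);
MODULE_LICENSE("GPL");

The key design point is that the per-algorithm context is just struct async_helper_ctx (the cryptd handle), which is why the aesni hunks below can drop their private struct async_aes_ctx entirely.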
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
index 8efcf42a9d7e..59b37deb8c8d 100644
--- a/arch/x86/crypto/aes_glue.c
+++ b/arch/x86/crypto/aes_glue.c
@@ -5,7 +5,7 @@
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <crypto/aes.h> 7#include <crypto/aes.h>
8#include <asm/aes.h> 8#include <asm/crypto/aes.h>
9 9
10asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); 10asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
11asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); 11asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index ac7f5cd019e8..34fdcff4d2c8 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -30,7 +30,8 @@
30#include <crypto/ctr.h> 30#include <crypto/ctr.h>
31#include <asm/cpu_device_id.h> 31#include <asm/cpu_device_id.h>
32#include <asm/i387.h> 32#include <asm/i387.h>
33#include <asm/aes.h> 33#include <asm/crypto/aes.h>
34#include <asm/crypto/ablk_helper.h>
34#include <crypto/scatterwalk.h> 35#include <crypto/scatterwalk.h>
35#include <crypto/internal/aead.h> 36#include <crypto/internal/aead.h>
36#include <linux/workqueue.h> 37#include <linux/workqueue.h>
@@ -52,10 +53,6 @@
52#define HAS_XTS 53#define HAS_XTS
53#endif 54#endif
54 55
55struct async_aes_ctx {
56 struct cryptd_ablkcipher *cryptd_tfm;
57};
58
59/* This data is stored at the end of the crypto_tfm struct. 56/* This data is stored at the end of the crypto_tfm struct.
60 * It's a type of per "session" data storage location. 57 * It's a type of per "session" data storage location.
61 * This needs to be 16 byte aligned. 58 * This needs to be 16 byte aligned.
@@ -377,87 +374,6 @@ static int ctr_crypt(struct blkcipher_desc *desc,
377} 374}
378#endif 375#endif
379 376
380static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
381 unsigned int key_len)
382{
383 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
384 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
385 int err;
386
387 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
388 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
389 & CRYPTO_TFM_REQ_MASK);
390 err = crypto_ablkcipher_setkey(child, key, key_len);
391 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
392 & CRYPTO_TFM_RES_MASK);
393 return err;
394}
395
396static int ablk_encrypt(struct ablkcipher_request *req)
397{
398 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
399 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
400
401 if (!irq_fpu_usable()) {
402 struct ablkcipher_request *cryptd_req =
403 ablkcipher_request_ctx(req);
404 memcpy(cryptd_req, req, sizeof(*req));
405 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
406 return crypto_ablkcipher_encrypt(cryptd_req);
407 } else {
408 struct blkcipher_desc desc;
409 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
410 desc.info = req->info;
411 desc.flags = 0;
412 return crypto_blkcipher_crt(desc.tfm)->encrypt(
413 &desc, req->dst, req->src, req->nbytes);
414 }
415}
416
417static int ablk_decrypt(struct ablkcipher_request *req)
418{
419 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
420 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
421
422 if (!irq_fpu_usable()) {
423 struct ablkcipher_request *cryptd_req =
424 ablkcipher_request_ctx(req);
425 memcpy(cryptd_req, req, sizeof(*req));
426 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
427 return crypto_ablkcipher_decrypt(cryptd_req);
428 } else {
429 struct blkcipher_desc desc;
430 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
431 desc.info = req->info;
432 desc.flags = 0;
433 return crypto_blkcipher_crt(desc.tfm)->decrypt(
434 &desc, req->dst, req->src, req->nbytes);
435 }
436}
437
438static void ablk_exit(struct crypto_tfm *tfm)
439{
440 struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
441
442 cryptd_free_ablkcipher(ctx->cryptd_tfm);
443}
444
445static int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
446{
447 struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
448 struct cryptd_ablkcipher *cryptd_tfm;
449
450 cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
451 if (IS_ERR(cryptd_tfm))
452 return PTR_ERR(cryptd_tfm);
453
454 ctx->cryptd_tfm = cryptd_tfm;
455 tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
456 crypto_ablkcipher_reqsize(&cryptd_tfm->base);
457
458 return 0;
459}
460
461static int ablk_ecb_init(struct crypto_tfm *tfm) 377static int ablk_ecb_init(struct crypto_tfm *tfm)
462{ 378{
463 return ablk_init_common(tfm, "__driver-ecb-aes-aesni"); 379 return ablk_init_common(tfm, "__driver-ecb-aes-aesni");
@@ -613,7 +529,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
613 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); 529 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
614 struct aesni_rfc4106_gcm_ctx *child_ctx = 530 struct aesni_rfc4106_gcm_ctx *child_ctx =
615 aesni_rfc4106_gcm_ctx_get(cryptd_child); 531 aesni_rfc4106_gcm_ctx_get(cryptd_child);
616 u8 *new_key_mem = NULL; 532 u8 *new_key_align, *new_key_mem = NULL;
617 533
618 if (key_len < 4) { 534 if (key_len < 4) {
619 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); 535 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
@@ -637,9 +553,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
637 if (!new_key_mem) 553 if (!new_key_mem)
638 return -ENOMEM; 554 return -ENOMEM;
639 555
640 new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN); 556 new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
641 memcpy(new_key_mem, key, key_len); 557 memcpy(new_key_align, key, key_len);
642 key = new_key_mem; 558 key = new_key_align;
643 } 559 }
644 560
645 if (!irq_fpu_usable()) 561 if (!irq_fpu_usable())
@@ -968,7 +884,7 @@ static struct crypto_alg aesni_algs[] = { {
968 .cra_priority = 400, 884 .cra_priority = 400,
969 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 885 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
970 .cra_blocksize = AES_BLOCK_SIZE, 886 .cra_blocksize = AES_BLOCK_SIZE,
971 .cra_ctxsize = sizeof(struct async_aes_ctx), 887 .cra_ctxsize = sizeof(struct async_helper_ctx),
972 .cra_alignmask = 0, 888 .cra_alignmask = 0,
973 .cra_type = &crypto_ablkcipher_type, 889 .cra_type = &crypto_ablkcipher_type,
974 .cra_module = THIS_MODULE, 890 .cra_module = THIS_MODULE,
@@ -989,7 +905,7 @@ static struct crypto_alg aesni_algs[] = { {
989 .cra_priority = 400, 905 .cra_priority = 400,
990 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 906 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
991 .cra_blocksize = AES_BLOCK_SIZE, 907 .cra_blocksize = AES_BLOCK_SIZE,
992 .cra_ctxsize = sizeof(struct async_aes_ctx), 908 .cra_ctxsize = sizeof(struct async_helper_ctx),
993 .cra_alignmask = 0, 909 .cra_alignmask = 0,
994 .cra_type = &crypto_ablkcipher_type, 910 .cra_type = &crypto_ablkcipher_type,
995 .cra_module = THIS_MODULE, 911 .cra_module = THIS_MODULE,
@@ -1033,7 +949,7 @@ static struct crypto_alg aesni_algs[] = { {
1033 .cra_priority = 400, 949 .cra_priority = 400,
1034 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 950 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1035 .cra_blocksize = 1, 951 .cra_blocksize = 1,
1036 .cra_ctxsize = sizeof(struct async_aes_ctx), 952 .cra_ctxsize = sizeof(struct async_helper_ctx),
1037 .cra_alignmask = 0, 953 .cra_alignmask = 0,
1038 .cra_type = &crypto_ablkcipher_type, 954 .cra_type = &crypto_ablkcipher_type,
1039 .cra_module = THIS_MODULE, 955 .cra_module = THIS_MODULE,
@@ -1098,7 +1014,7 @@ static struct crypto_alg aesni_algs[] = { {
1098 .cra_priority = 400, 1014 .cra_priority = 400,
1099 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1015 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1100 .cra_blocksize = 1, 1016 .cra_blocksize = 1,
1101 .cra_ctxsize = sizeof(struct async_aes_ctx), 1017 .cra_ctxsize = sizeof(struct async_helper_ctx),
1102 .cra_alignmask = 0, 1018 .cra_alignmask = 0,
1103 .cra_type = &crypto_ablkcipher_type, 1019 .cra_type = &crypto_ablkcipher_type,
1104 .cra_module = THIS_MODULE, 1020 .cra_module = THIS_MODULE,
@@ -1126,7 +1042,7 @@ static struct crypto_alg aesni_algs[] = { {
1126 .cra_priority = 400, 1042 .cra_priority = 400,
1127 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1043 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1128 .cra_blocksize = AES_BLOCK_SIZE, 1044 .cra_blocksize = AES_BLOCK_SIZE,
1129 .cra_ctxsize = sizeof(struct async_aes_ctx), 1045 .cra_ctxsize = sizeof(struct async_helper_ctx),
1130 .cra_alignmask = 0, 1046 .cra_alignmask = 0,
1131 .cra_type = &crypto_ablkcipher_type, 1047 .cra_type = &crypto_ablkcipher_type,
1132 .cra_module = THIS_MODULE, 1048 .cra_module = THIS_MODULE,
@@ -1150,7 +1066,7 @@ static struct crypto_alg aesni_algs[] = { {
1150 .cra_priority = 400, 1066 .cra_priority = 400,
1151 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1067 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1152 .cra_blocksize = AES_BLOCK_SIZE, 1068 .cra_blocksize = AES_BLOCK_SIZE,
1153 .cra_ctxsize = sizeof(struct async_aes_ctx), 1069 .cra_ctxsize = sizeof(struct async_helper_ctx),
1154 .cra_alignmask = 0, 1070 .cra_alignmask = 0,
1155 .cra_type = &crypto_ablkcipher_type, 1071 .cra_type = &crypto_ablkcipher_type,
1156 .cra_module = THIS_MODULE, 1072 .cra_module = THIS_MODULE,
@@ -1174,7 +1090,7 @@ static struct crypto_alg aesni_algs[] = { {
1174 .cra_priority = 400, 1090 .cra_priority = 400,
1175 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1091 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1176 .cra_blocksize = AES_BLOCK_SIZE, 1092 .cra_blocksize = AES_BLOCK_SIZE,
1177 .cra_ctxsize = sizeof(struct async_aes_ctx), 1093 .cra_ctxsize = sizeof(struct async_helper_ctx),
1178 .cra_alignmask = 0, 1094 .cra_alignmask = 0,
1179 .cra_type = &crypto_ablkcipher_type, 1095 .cra_type = &crypto_ablkcipher_type,
1180 .cra_module = THIS_MODULE, 1096 .cra_module = THIS_MODULE,
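The rfc4106_set_key() hunk above is a small but easy-to-miss fix: the aligned pointer must not overwrite the pointer returned by the allocator, since the original allocation is presumably what gets released with kfree() later in the function. A user-space sketch of the same pattern, with PTR_ALIGN reimplemented here purely for illustration:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for the kernel's PTR_ALIGN(): round p up to an 'a'-byte boundary. */
#define PTR_ALIGN(p, a) \
	((void *)(((uintptr_t)(p) + ((a) - 1)) & ~(uintptr_t)((a) - 1)))

/*
 * Copy 'key' into freshly allocated, 16-byte-aligned storage.
 * The caller frees *to_free, never the returned aligned pointer --
 * the distinction the new_key_align/new_key_mem split preserves.
 */
static void *copy_key_aligned(const void *key, size_t key_len, void **to_free)
{
	unsigned char *mem = malloc(key_len + 16);
	unsigned char *aligned;

	if (!mem)
		return NULL;

	aligned = PTR_ALIGN(mem, 16);
	memcpy(aligned, key, key_len);

	*to_free = mem;		/* free(mem) is valid; free(aligned) is not */
	return aligned;
}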
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 3306dc0b139e..eeb2b3b743e9 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -5,10 +5,6 @@
5 * 5 *
6 * Camellia parts based on code by: 6 * Camellia parts based on code by:
7 * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation) 7 * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation)
8 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
9 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
10 * CTR part based on code (crypto/ctr.c) by:
11 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
12 * 8 *
13 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by 10 * it under the terms of the GNU General Public License as published by
@@ -34,9 +30,9 @@
34#include <linux/module.h> 30#include <linux/module.h>
35#include <linux/types.h> 31#include <linux/types.h>
36#include <crypto/algapi.h> 32#include <crypto/algapi.h>
37#include <crypto/b128ops.h>
38#include <crypto/lrw.h> 33#include <crypto/lrw.h>
39#include <crypto/xts.h> 34#include <crypto/xts.h>
35#include <asm/crypto/glue_helper.h>
40 36
41#define CAMELLIA_MIN_KEY_SIZE 16 37#define CAMELLIA_MIN_KEY_SIZE 16
42#define CAMELLIA_MAX_KEY_SIZE 32 38#define CAMELLIA_MAX_KEY_SIZE 32
@@ -1312,307 +1308,128 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
1312 &tfm->crt_flags); 1308 &tfm->crt_flags);
1313} 1309}
1314 1310
1315static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, 1311static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
1316 void (*fn)(struct camellia_ctx *, u8 *, const u8 *),
1317 void (*fn_2way)(struct camellia_ctx *, u8 *, const u8 *))
1318{ 1312{
1319 struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 1313 u128 iv = *src;
1320 unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1321 unsigned int nbytes;
1322 int err;
1323
1324 err = blkcipher_walk_virt(desc, walk);
1325
1326 while ((nbytes = walk->nbytes)) {
1327 u8 *wsrc = walk->src.virt.addr;
1328 u8 *wdst = walk->dst.virt.addr;
1329
1330 /* Process two block batch */
1331 if (nbytes >= bsize * 2) {
1332 do {
1333 fn_2way(ctx, wdst, wsrc);
1334
1335 wsrc += bsize * 2;
1336 wdst += bsize * 2;
1337 nbytes -= bsize * 2;
1338 } while (nbytes >= bsize * 2);
1339
1340 if (nbytes < bsize)
1341 goto done;
1342 }
1343
1344 /* Handle leftovers */
1345 do {
1346 fn(ctx, wdst, wsrc);
1347
1348 wsrc += bsize;
1349 wdst += bsize;
1350 nbytes -= bsize;
1351 } while (nbytes >= bsize);
1352
1353done:
1354 err = blkcipher_walk_done(desc, walk, nbytes);
1355 }
1356
1357 return err;
1358}
1359
1360static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1361 struct scatterlist *src, unsigned int nbytes)
1362{
1363 struct blkcipher_walk walk;
1364
1365 blkcipher_walk_init(&walk, dst, src, nbytes);
1366 return ecb_crypt(desc, &walk, camellia_enc_blk, camellia_enc_blk_2way);
1367}
1368 1314
1369static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1315 camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
1370 struct scatterlist *src, unsigned int nbytes)
1371{
1372 struct blkcipher_walk walk;
1373
1374 blkcipher_walk_init(&walk, dst, src, nbytes);
1375 return ecb_crypt(desc, &walk, camellia_dec_blk, camellia_dec_blk_2way);
1376}
1377 1316
1378static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, 1317 u128_xor(&dst[1], &dst[1], &iv);
1379 struct blkcipher_walk *walk)
1380{
1381 struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1382 unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1383 unsigned int nbytes = walk->nbytes;
1384 u128 *src = (u128 *)walk->src.virt.addr;
1385 u128 *dst = (u128 *)walk->dst.virt.addr;
1386 u128 *iv = (u128 *)walk->iv;
1387
1388 do {
1389 u128_xor(dst, src, iv);
1390 camellia_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
1391 iv = dst;
1392
1393 src += 1;
1394 dst += 1;
1395 nbytes -= bsize;
1396 } while (nbytes >= bsize);
1397
1398 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
1399 return nbytes;
1400} 1318}
1401 1319
1402static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1320static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
1403 struct scatterlist *src, unsigned int nbytes)
1404{ 1321{
1405 struct blkcipher_walk walk; 1322 be128 ctrblk;
1406 int err;
1407 1323
1408 blkcipher_walk_init(&walk, dst, src, nbytes); 1324 if (dst != src)
Removed from camellia_glue.c (old lines 1409-1617):

	err = blkcipher_walk_virt(desc, &walk);

	while ((nbytes = walk.nbytes)) {
		nbytes = __cbc_encrypt(desc, &walk);
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}

	return err;
}

static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
				  struct blkcipher_walk *walk)
{
	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	unsigned int bsize = CAMELLIA_BLOCK_SIZE;
	unsigned int nbytes = walk->nbytes;
	u128 *src = (u128 *)walk->src.virt.addr;
	u128 *dst = (u128 *)walk->dst.virt.addr;
	u128 ivs[2 - 1];
	u128 last_iv;

	/* Start of the last block. */
	src += nbytes / bsize - 1;
	dst += nbytes / bsize - 1;

	last_iv = *src;

	/* Process two block batch */
	if (nbytes >= bsize * 2) {
		do {
			nbytes -= bsize * (2 - 1);
			src -= 2 - 1;
			dst -= 2 - 1;

			ivs[0] = src[0];

			camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);

			u128_xor(dst + 1, dst + 1, ivs + 0);

			nbytes -= bsize;
			if (nbytes < bsize)
				goto done;

			u128_xor(dst, dst, src - 1);
			src -= 1;
			dst -= 1;
		} while (nbytes >= bsize * 2);

		if (nbytes < bsize)
			goto done;
	}

	/* Handle leftovers */
	for (;;) {
		camellia_dec_blk(ctx, (u8 *)dst, (u8 *)src);

		nbytes -= bsize;
		if (nbytes < bsize)
			break;

		u128_xor(dst, dst, src - 1);
		src -= 1;
		dst -= 1;
	}

done:
	u128_xor(dst, dst, (u128 *)walk->iv);
	*(u128 *)walk->iv = last_iv;

	return nbytes;
}

static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	struct blkcipher_walk walk;
	int err;

	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt(desc, &walk);

	while ((nbytes = walk.nbytes)) {
		nbytes = __cbc_decrypt(desc, &walk);
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}

	return err;
}

static inline void u128_to_be128(be128 *dst, const u128 *src)
{
	dst->a = cpu_to_be64(src->a);
	dst->b = cpu_to_be64(src->b);
}

static inline void be128_to_u128(u128 *dst, const be128 *src)
{
	dst->a = be64_to_cpu(src->a);
	dst->b = be64_to_cpu(src->b);
}

static inline void u128_inc(u128 *i)
{
	i->b++;
	if (!i->b)
		i->a++;
}

static void ctr_crypt_final(struct blkcipher_desc *desc,
			    struct blkcipher_walk *walk)
{
	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	u8 keystream[CAMELLIA_BLOCK_SIZE];
	u8 *src = walk->src.virt.addr;
	u8 *dst = walk->dst.virt.addr;
	unsigned int nbytes = walk->nbytes;
	u128 ctrblk;

	memcpy(keystream, src, nbytes);
	camellia_enc_blk_xor(ctx, keystream, walk->iv);
	memcpy(dst, keystream, nbytes);

	be128_to_u128(&ctrblk, (be128 *)walk->iv);
	u128_inc(&ctrblk);
	u128_to_be128((be128 *)walk->iv, &ctrblk);
}

static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
				struct blkcipher_walk *walk)
{
	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	unsigned int bsize = CAMELLIA_BLOCK_SIZE;
	unsigned int nbytes = walk->nbytes;
	u128 *src = (u128 *)walk->src.virt.addr;
	u128 *dst = (u128 *)walk->dst.virt.addr;
	u128 ctrblk;
	be128 ctrblocks[2];

	be128_to_u128(&ctrblk, (be128 *)walk->iv);

	/* Process two block batch */
	if (nbytes >= bsize * 2) {
		do {
			if (dst != src) {
				dst[0] = src[0];
				dst[1] = src[1];
			}

			/* create ctrblks for parallel encrypt */
			u128_to_be128(&ctrblocks[0], &ctrblk);
			u128_inc(&ctrblk);
			u128_to_be128(&ctrblocks[1], &ctrblk);
			u128_inc(&ctrblk);

			camellia_enc_blk_xor_2way(ctx, (u8 *)dst,
						  (u8 *)ctrblocks);

			src += 2;
			dst += 2;
			nbytes -= bsize * 2;
		} while (nbytes >= bsize * 2);

		if (nbytes < bsize)
			goto done;
	}

	/* Handle leftovers */
	do {
		if (dst != src)
			*dst = *src;

		u128_to_be128(&ctrblocks[0], &ctrblk);
		u128_inc(&ctrblk);

		camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks);

		src += 1;
		dst += 1;
		nbytes -= bsize;
	} while (nbytes >= bsize);

done:
	u128_to_be128((be128 *)walk->iv, &ctrblk);
	return nbytes;
}

static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		     struct scatterlist *src, unsigned int nbytes)
{
	struct blkcipher_walk walk;
	int err;

	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt_block(desc, &walk, CAMELLIA_BLOCK_SIZE);

	while ((nbytes = walk.nbytes) >= CAMELLIA_BLOCK_SIZE) {
		nbytes = __ctr_crypt(desc, &walk);
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}

	if (walk.nbytes) {
		ctr_crypt_final(desc, &walk);
		err = blkcipher_walk_done(desc, &walk, 0);
	}

	return err;
}

Added to camellia_glue.c (new lines 1325-1434):

		*dst = *src;

	u128_to_be128(&ctrblk, iv);
	u128_inc(iv);

	camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
}

static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
				    u128 *iv)
{
	be128 ctrblks[2];

	if (dst != src) {
		dst[0] = src[0];
		dst[1] = src[1];
	}

	u128_to_be128(&ctrblks[0], iv);
	u128_inc(iv);
	u128_to_be128(&ctrblks[1], iv);
	u128_inc(iv);

	camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
}

static const struct common_glue_ctx camellia_enc = {
	.num_funcs = 2,
	.fpu_blocks_limit = -1,

	.funcs = { {
		.num_blocks = 2,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
	} }
};

static const struct common_glue_ctx camellia_ctr = {
	.num_funcs = 2,
	.fpu_blocks_limit = -1,

	.funcs = { {
		.num_blocks = 2,
		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
	} }
};

static const struct common_glue_ctx camellia_dec = {
	.num_funcs = 2,
	.fpu_blocks_limit = -1,

	.funcs = { {
		.num_blocks = 2,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
	} }
};

static const struct common_glue_ctx camellia_dec_cbc = {
	.num_funcs = 2,
	.fpu_blocks_limit = -1,

	.funcs = { {
		.num_blocks = 2,
		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
	} }
};

static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
}

static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
}

static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
				       dst, src, nbytes);
}

static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
				       nbytes);
}

static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		     struct scatterlist *src, unsigned int nbytes)
{
	return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
}

Context (unchanged, old line 1618 / new line 1435):
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
new file mode 100644
index 000000000000..4854f0f31e4f
--- /dev/null
+++ b/arch/x86/crypto/glue_helper.c
@@ -0,0 +1,307 @@
1/*
2 * Shared glue code for 128bit block ciphers
3 *
4 * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
7 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
8 * CTR part based on code (crypto/ctr.c) by:
9 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
24 * USA
25 *
26 */
27
28#include <linux/module.h>
29#include <crypto/b128ops.h>
30#include <crypto/lrw.h>
31#include <crypto/xts.h>
32#include <asm/crypto/glue_helper.h>
33#include <crypto/scatterwalk.h>
34
35static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
36 struct blkcipher_desc *desc,
37 struct blkcipher_walk *walk)
38{
39 void *ctx = crypto_blkcipher_ctx(desc->tfm);
40 const unsigned int bsize = 128 / 8;
41 unsigned int nbytes, i, func_bytes;
42 bool fpu_enabled = false;
43 int err;
44
45 err = blkcipher_walk_virt(desc, walk);
46
47 while ((nbytes = walk->nbytes)) {
48 u8 *wsrc = walk->src.virt.addr;
49 u8 *wdst = walk->dst.virt.addr;
50
51 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
52 desc, fpu_enabled, nbytes);
53
54 for (i = 0; i < gctx->num_funcs; i++) {
55 func_bytes = bsize * gctx->funcs[i].num_blocks;
56
57 /* Process multi-block batch */
58 if (nbytes >= func_bytes) {
59 do {
60 gctx->funcs[i].fn_u.ecb(ctx, wdst,
61 wsrc);
62
63 wsrc += func_bytes;
64 wdst += func_bytes;
65 nbytes -= func_bytes;
66 } while (nbytes >= func_bytes);
67
68 if (nbytes < bsize)
69 goto done;
70 }
71 }
72
73done:
74 err = blkcipher_walk_done(desc, walk, nbytes);
75 }
76
77 glue_fpu_end(fpu_enabled);
78 return err;
79}
80
81int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
82 struct blkcipher_desc *desc, struct scatterlist *dst,
83 struct scatterlist *src, unsigned int nbytes)
84{
85 struct blkcipher_walk walk;
86
87 blkcipher_walk_init(&walk, dst, src, nbytes);
88 return __glue_ecb_crypt_128bit(gctx, desc, &walk);
89}
90EXPORT_SYMBOL_GPL(glue_ecb_crypt_128bit);
91
92static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn,
93 struct blkcipher_desc *desc,
94 struct blkcipher_walk *walk)
95{
96 void *ctx = crypto_blkcipher_ctx(desc->tfm);
97 const unsigned int bsize = 128 / 8;
98 unsigned int nbytes = walk->nbytes;
99 u128 *src = (u128 *)walk->src.virt.addr;
100 u128 *dst = (u128 *)walk->dst.virt.addr;
101 u128 *iv = (u128 *)walk->iv;
102
103 do {
104 u128_xor(dst, src, iv);
105 fn(ctx, (u8 *)dst, (u8 *)dst);
106 iv = dst;
107
108 src += 1;
109 dst += 1;
110 nbytes -= bsize;
111 } while (nbytes >= bsize);
112
113 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
114 return nbytes;
115}
116
117int glue_cbc_encrypt_128bit(const common_glue_func_t fn,
118 struct blkcipher_desc *desc,
119 struct scatterlist *dst,
120 struct scatterlist *src, unsigned int nbytes)
121{
122 struct blkcipher_walk walk;
123 int err;
124
125 blkcipher_walk_init(&walk, dst, src, nbytes);
126 err = blkcipher_walk_virt(desc, &walk);
127
128 while ((nbytes = walk.nbytes)) {
129 nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk);
130 err = blkcipher_walk_done(desc, &walk, nbytes);
131 }
132
133 return err;
134}
135EXPORT_SYMBOL_GPL(glue_cbc_encrypt_128bit);
136
137static unsigned int
138__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
139 struct blkcipher_desc *desc,
140 struct blkcipher_walk *walk)
141{
142 void *ctx = crypto_blkcipher_ctx(desc->tfm);
143 const unsigned int bsize = 128 / 8;
144 unsigned int nbytes = walk->nbytes;
145 u128 *src = (u128 *)walk->src.virt.addr;
146 u128 *dst = (u128 *)walk->dst.virt.addr;
147 u128 last_iv;
148 unsigned int num_blocks, func_bytes;
149 unsigned int i;
150
151 /* Start of the last block. */
152 src += nbytes / bsize - 1;
153 dst += nbytes / bsize - 1;
154
155 last_iv = *src;
156
157 for (i = 0; i < gctx->num_funcs; i++) {
158 num_blocks = gctx->funcs[i].num_blocks;
159 func_bytes = bsize * num_blocks;
160
161 /* Process multi-block batch */
162 if (nbytes >= func_bytes) {
163 do {
164 nbytes -= func_bytes - bsize;
165 src -= num_blocks - 1;
166 dst -= num_blocks - 1;
167
168 gctx->funcs[i].fn_u.cbc(ctx, dst, src);
169
170 nbytes -= bsize;
171 if (nbytes < bsize)
172 goto done;
173
174 u128_xor(dst, dst, src - 1);
175 src -= 1;
176 dst -= 1;
177 } while (nbytes >= func_bytes);
178
179 if (nbytes < bsize)
180 goto done;
181 }
182 }
183
184done:
185 u128_xor(dst, dst, (u128 *)walk->iv);
186 *(u128 *)walk->iv = last_iv;
187
188 return nbytes;
189}
190
191int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
192 struct blkcipher_desc *desc,
193 struct scatterlist *dst,
194 struct scatterlist *src, unsigned int nbytes)
195{
196 const unsigned int bsize = 128 / 8;
197 bool fpu_enabled = false;
198 struct blkcipher_walk walk;
199 int err;
200
201 blkcipher_walk_init(&walk, dst, src, nbytes);
202 err = blkcipher_walk_virt(desc, &walk);
203
204 while ((nbytes = walk.nbytes)) {
205 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
206 desc, fpu_enabled, nbytes);
207 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
208 err = blkcipher_walk_done(desc, &walk, nbytes);
209 }
210
211 glue_fpu_end(fpu_enabled);
212 return err;
213}
214EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
215
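Note that __glue_cbc_decrypt_128bit() above deliberately starts at the last block and walks backwards, saving only the final ciphertext block in last_iv. That way, when the operation is in place (dst == src), each ciphertext block is still intact at the moment it is needed as the chaining value for the block after it, and no per-block IV copies are needed beyond the small ivs[] scratch kept inside the multi-block CBC functions themselves.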
216static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
217 struct blkcipher_desc *desc,
218 struct blkcipher_walk *walk)
219{
220 void *ctx = crypto_blkcipher_ctx(desc->tfm);
221 u8 *src = (u8 *)walk->src.virt.addr;
222 u8 *dst = (u8 *)walk->dst.virt.addr;
223 unsigned int nbytes = walk->nbytes;
224 u128 ctrblk;
225 u128 tmp;
226
227 be128_to_u128(&ctrblk, (be128 *)walk->iv);
228
229 memcpy(&tmp, src, nbytes);
230 fn_ctr(ctx, &tmp, &tmp, &ctrblk);
231 memcpy(dst, &tmp, nbytes);
232
233 u128_to_be128((be128 *)walk->iv, &ctrblk);
234}
235EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
236
237static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
238 struct blkcipher_desc *desc,
239 struct blkcipher_walk *walk)
240{
241 const unsigned int bsize = 128 / 8;
242 void *ctx = crypto_blkcipher_ctx(desc->tfm);
243 unsigned int nbytes = walk->nbytes;
244 u128 *src = (u128 *)walk->src.virt.addr;
245 u128 *dst = (u128 *)walk->dst.virt.addr;
246 u128 ctrblk;
247 unsigned int num_blocks, func_bytes;
248 unsigned int i;
249
250 be128_to_u128(&ctrblk, (be128 *)walk->iv);
251
252 /* Process multi-block batch */
253 for (i = 0; i < gctx->num_funcs; i++) {
254 num_blocks = gctx->funcs[i].num_blocks;
255 func_bytes = bsize * num_blocks;
256
257 if (nbytes >= func_bytes) {
258 do {
259 gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk);
260
261 src += num_blocks;
262 dst += num_blocks;
263 nbytes -= func_bytes;
264 } while (nbytes >= func_bytes);
265
266 if (nbytes < bsize)
267 goto done;
268 }
269 }
270
271done:
272 u128_to_be128((be128 *)walk->iv, &ctrblk);
273 return nbytes;
274}
275
276int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
277 struct blkcipher_desc *desc, struct scatterlist *dst,
278 struct scatterlist *src, unsigned int nbytes)
279{
280 const unsigned int bsize = 128 / 8;
281 bool fpu_enabled = false;
282 struct blkcipher_walk walk;
283 int err;
284
285 blkcipher_walk_init(&walk, dst, src, nbytes);
286 err = blkcipher_walk_virt_block(desc, &walk, bsize);
287
288 while ((nbytes = walk.nbytes) >= bsize) {
289 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
290 desc, fpu_enabled, nbytes);
291 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
292 err = blkcipher_walk_done(desc, &walk, nbytes);
293 }
294
295 glue_fpu_end(fpu_enabled);
296
297 if (walk.nbytes) {
298 glue_ctr_crypt_final_128bit(
299 gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
300 err = blkcipher_walk_done(desc, &walk, 0);
301 }
302
303 return err;
304}
305EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit);
306
307MODULE_LICENSE("GPL");
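The camellia and serpent glue code elsewhere in this patch shows the intended usage pattern: a driver fills in a common_glue_ctx whose funcs[] entries run from the widest batch down to a mandatory 1-block fallback, then forwards its blkcipher handlers to the glue_*_crypt_128bit routines. A minimal sketch along those lines follows; the mycipher_* names are placeholders, not part of this patch.

#include <linux/crypto.h>
#include <asm/crypto/glue_helper.h>

/* Hypothetical low-level primitives: an 8-block batch and a 1-block version,
 * both matching the function shape expected by GLUE_FUNC_CAST(). */
void mycipher_enc_blk(void *ctx, u8 *dst, const u8 *src);
void mycipher_enc_blk_8way(void *ctx, u8 *dst, const u8 *src);

static const struct common_glue_ctx mycipher_enc = {
	.num_funcs = 2,
	/* enable the FPU only for chunks of at least 8 blocks; camellia
	 * uses -1 here because it needs no FPU state at all */
	.fpu_blocks_limit = 8,

	.funcs = { {
		.num_blocks = 8,
		.fn_u = { .ecb = GLUE_FUNC_CAST(mycipher_enc_blk_8way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .ecb = GLUE_FUNC_CAST(mycipher_enc_blk) }
	} }
};

static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_ecb_crypt_128bit(&mycipher_enc, desc, dst, src, nbytes);
}

CTR requests run through the same table for whole blocks; any tail shorter than 16 bytes is handed to glue_ctr_crypt_final_128bit(), which generates one more keystream block via the 1-block .ctr function and copies back only the remaining bytes.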
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..504106bf04a2
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -0,0 +1,704 @@
1/*
2 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
8 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27.file "serpent-avx-x86_64-asm_64.S"
28.text
29
30#define CTX %rdi
31
32/**********************************************************************
33 8-way AVX serpent
34 **********************************************************************/
35#define RA1 %xmm0
36#define RB1 %xmm1
37#define RC1 %xmm2
38#define RD1 %xmm3
39#define RE1 %xmm4
40
41#define tp %xmm5
42
43#define RA2 %xmm6
44#define RB2 %xmm7
45#define RC2 %xmm8
46#define RD2 %xmm9
47#define RE2 %xmm10
48
49#define RNOT %xmm11
50
51#define RK0 %xmm12
52#define RK1 %xmm13
53#define RK2 %xmm14
54#define RK3 %xmm15
55
56
57#define S0_1(x0, x1, x2, x3, x4) \
58 vpor x0, x3, tp; \
59 vpxor x3, x0, x0; \
60 vpxor x2, x3, x4; \
61 vpxor RNOT, x4, x4; \
62 vpxor x1, tp, x3; \
63 vpand x0, x1, x1; \
64 vpxor x4, x1, x1; \
65 vpxor x0, x2, x2;
66#define S0_2(x0, x1, x2, x3, x4) \
67 vpxor x3, x0, x0; \
68 vpor x0, x4, x4; \
69 vpxor x2, x0, x0; \
70 vpand x1, x2, x2; \
71 vpxor x2, x3, x3; \
72 vpxor RNOT, x1, x1; \
73 vpxor x4, x2, x2; \
74 vpxor x2, x1, x1;
75
76#define S1_1(x0, x1, x2, x3, x4) \
77 vpxor x0, x1, tp; \
78 vpxor x3, x0, x0; \
79 vpxor RNOT, x3, x3; \
80 vpand tp, x1, x4; \
81 vpor tp, x0, x0; \
82 vpxor x2, x3, x3; \
83 vpxor x3, x0, x0; \
84 vpxor x3, tp, x1;
85#define S1_2(x0, x1, x2, x3, x4) \
86 vpxor x4, x3, x3; \
87 vpor x4, x1, x1; \
88 vpxor x2, x4, x4; \
89 vpand x0, x2, x2; \
90 vpxor x1, x2, x2; \
91 vpor x0, x1, x1; \
92 vpxor RNOT, x0, x0; \
93 vpxor x2, x0, x0; \
94 vpxor x1, x4, x4;
95
96#define S2_1(x0, x1, x2, x3, x4) \
97 vpxor RNOT, x3, x3; \
98 vpxor x0, x1, x1; \
99 vpand x2, x0, tp; \
100 vpxor x3, tp, tp; \
101 vpor x0, x3, x3; \
102 vpxor x1, x2, x2; \
103 vpxor x1, x3, x3; \
104 vpand tp, x1, x1;
105#define S2_2(x0, x1, x2, x3, x4) \
106 vpxor x2, tp, tp; \
107 vpand x3, x2, x2; \
108 vpor x1, x3, x3; \
109 vpxor RNOT, tp, tp; \
110 vpxor tp, x3, x3; \
111 vpxor tp, x0, x4; \
112 vpxor x2, tp, x0; \
113 vpor x2, x1, x1;
114
115#define S3_1(x0, x1, x2, x3, x4) \
116 vpxor x3, x1, tp; \
117 vpor x0, x3, x3; \
118 vpand x0, x1, x4; \
119 vpxor x2, x0, x0; \
120 vpxor tp, x2, x2; \
121 vpand x3, tp, x1; \
122 vpxor x3, x2, x2; \
123 vpor x4, x0, x0; \
124 vpxor x3, x4, x4;
125#define S3_2(x0, x1, x2, x3, x4) \
126 vpxor x0, x1, x1; \
127 vpand x3, x0, x0; \
128 vpand x4, x3, x3; \
129 vpxor x2, x3, x3; \
130 vpor x1, x4, x4; \
131 vpand x1, x2, x2; \
132 vpxor x3, x4, x4; \
133 vpxor x3, x0, x0; \
134 vpxor x2, x3, x3;
135
136#define S4_1(x0, x1, x2, x3, x4) \
137 vpand x0, x3, tp; \
138 vpxor x3, x0, x0; \
139 vpxor x2, tp, tp; \
140 vpor x3, x2, x2; \
141 vpxor x1, x0, x0; \
142 vpxor tp, x3, x4; \
143 vpor x0, x2, x2; \
144 vpxor x1, x2, x2;
145#define S4_2(x0, x1, x2, x3, x4) \
146 vpand x0, x1, x1; \
147 vpxor x4, x1, x1; \
148 vpand x2, x4, x4; \
149 vpxor tp, x2, x2; \
150 vpxor x0, x4, x4; \
151 vpor x1, tp, x3; \
152 vpxor RNOT, x1, x1; \
153 vpxor x0, x3, x3;
154
155#define S5_1(x0, x1, x2, x3, x4) \
156 vpor x0, x1, tp; \
157 vpxor tp, x2, x2; \
158 vpxor RNOT, x3, x3; \
159 vpxor x0, x1, x4; \
160 vpxor x2, x0, x0; \
161 vpand x4, tp, x1; \
162 vpor x3, x4, x4; \
163 vpxor x0, x4, x4;
164#define S5_2(x0, x1, x2, x3, x4) \
165 vpand x3, x0, x0; \
166 vpxor x3, x1, x1; \
167 vpxor x2, x3, x3; \
168 vpxor x1, x0, x0; \
169 vpand x4, x2, x2; \
170 vpxor x2, x1, x1; \
171 vpand x0, x2, x2; \
172 vpxor x2, x3, x3;
173
174#define S6_1(x0, x1, x2, x3, x4) \
175 vpxor x0, x3, x3; \
176 vpxor x2, x1, tp; \
177 vpxor x0, x2, x2; \
178 vpand x3, x0, x0; \
179 vpor x3, tp, tp; \
180 vpxor RNOT, x1, x4; \
181 vpxor tp, x0, x0; \
182 vpxor x2, tp, x1;
183#define S6_2(x0, x1, x2, x3, x4) \
184 vpxor x4, x3, x3; \
185 vpxor x0, x4, x4; \
186 vpand x0, x2, x2; \
187 vpxor x1, x4, x4; \
188 vpxor x3, x2, x2; \
189 vpand x1, x3, x3; \
190 vpxor x0, x3, x3; \
191 vpxor x2, x1, x1;
192
193#define S7_1(x0, x1, x2, x3, x4) \
194 vpxor RNOT, x1, tp; \
195 vpxor RNOT, x0, x0; \
196 vpand x2, tp, x1; \
197 vpxor x3, x1, x1; \
198 vpor tp, x3, x3; \
199 vpxor x2, tp, x4; \
200 vpxor x3, x2, x2; \
201 vpxor x0, x3, x3; \
202 vpor x1, x0, x0;
203#define S7_2(x0, x1, x2, x3, x4) \
204 vpand x0, x2, x2; \
205 vpxor x4, x0, x0; \
206 vpxor x3, x4, x4; \
207 vpand x0, x3, x3; \
208 vpxor x1, x4, x4; \
209 vpxor x4, x2, x2; \
210 vpxor x1, x3, x3; \
211 vpor x0, x4, x4; \
212 vpxor x1, x4, x4;
213
214#define SI0_1(x0, x1, x2, x3, x4) \
215 vpxor x0, x1, x1; \
216 vpor x1, x3, tp; \
217 vpxor x1, x3, x4; \
218 vpxor RNOT, x0, x0; \
219 vpxor tp, x2, x2; \
220 vpxor x0, tp, x3; \
221 vpand x1, x0, x0; \
222 vpxor x2, x0, x0;
223#define SI0_2(x0, x1, x2, x3, x4) \
224 vpand x3, x2, x2; \
225 vpxor x4, x3, x3; \
226 vpxor x3, x2, x2; \
227 vpxor x3, x1, x1; \
228 vpand x0, x3, x3; \
229 vpxor x0, x1, x1; \
230 vpxor x2, x0, x0; \
231 vpxor x3, x4, x4;
232
233#define SI1_1(x0, x1, x2, x3, x4) \
234 vpxor x3, x1, x1; \
235 vpxor x2, x0, tp; \
236 vpxor RNOT, x2, x2; \
237 vpor x1, x0, x4; \
238 vpxor x3, x4, x4; \
239 vpand x1, x3, x3; \
240 vpxor x2, x1, x1; \
241 vpand x4, x2, x2;
242#define SI1_2(x0, x1, x2, x3, x4) \
243 vpxor x1, x4, x4; \
244 vpor x3, x1, x1; \
245 vpxor tp, x3, x3; \
246 vpxor tp, x2, x2; \
247 vpor x4, tp, x0; \
248 vpxor x4, x2, x2; \
249 vpxor x0, x1, x1; \
250 vpxor x1, x4, x4;
251
252#define SI2_1(x0, x1, x2, x3, x4) \
253 vpxor x1, x2, x2; \
254 vpxor RNOT, x3, tp; \
255 vpor x2, tp, tp; \
256 vpxor x3, x2, x2; \
257 vpxor x0, x3, x4; \
258 vpxor x1, tp, x3; \
259 vpor x2, x1, x1; \
260 vpxor x0, x2, x2;
261#define SI2_2(x0, x1, x2, x3, x4) \
262 vpxor x4, x1, x1; \
263 vpor x3, x4, x4; \
264 vpxor x3, x2, x2; \
265 vpxor x2, x4, x4; \
266 vpand x1, x2, x2; \
267 vpxor x3, x2, x2; \
268 vpxor x4, x3, x3; \
269 vpxor x0, x4, x4;
270
271#define SI3_1(x0, x1, x2, x3, x4) \
272 vpxor x1, x2, x2; \
273 vpand x2, x1, tp; \
274 vpxor x0, tp, tp; \
275 vpor x1, x0, x0; \
276 vpxor x3, x1, x4; \
277 vpxor x3, x0, x0; \
278 vpor tp, x3, x3; \
279 vpxor x2, tp, x1;
280#define SI3_2(x0, x1, x2, x3, x4) \
281 vpxor x3, x1, x1; \
282 vpxor x2, x0, x0; \
283 vpxor x3, x2, x2; \
284 vpand x1, x3, x3; \
285 vpxor x0, x1, x1; \
286 vpand x2, x0, x0; \
287 vpxor x3, x4, x4; \
288 vpxor x0, x3, x3; \
289 vpxor x1, x0, x0;
290
291#define SI4_1(x0, x1, x2, x3, x4) \
292 vpxor x3, x2, x2; \
293 vpand x1, x0, tp; \
294 vpxor x2, tp, tp; \
295 vpor x3, x2, x2; \
296 vpxor RNOT, x0, x4; \
297 vpxor tp, x1, x1; \
298 vpxor x2, tp, x0; \
299 vpand x4, x2, x2;
300#define SI4_2(x0, x1, x2, x3, x4) \
301 vpxor x0, x2, x2; \
302 vpor x4, x0, x0; \
303 vpxor x3, x0, x0; \
304 vpand x2, x3, x3; \
305 vpxor x3, x4, x4; \
306 vpxor x1, x3, x3; \
307 vpand x0, x1, x1; \
308 vpxor x1, x4, x4; \
309 vpxor x3, x0, x0;
310
311#define SI5_1(x0, x1, x2, x3, x4) \
312 vpor x2, x1, tp; \
313 vpxor x1, x2, x2; \
314 vpxor x3, tp, tp; \
315 vpand x1, x3, x3; \
316 vpxor x3, x2, x2; \
317 vpor x0, x3, x3; \
318 vpxor RNOT, x0, x0; \
319 vpxor x2, x3, x3; \
320 vpor x0, x2, x2;
321#define SI5_2(x0, x1, x2, x3, x4) \
322 vpxor tp, x1, x4; \
323 vpxor x4, x2, x2; \
324 vpand x0, x4, x4; \
325 vpxor tp, x0, x0; \
326 vpxor x3, tp, x1; \
327 vpand x2, x0, x0; \
328 vpxor x3, x2, x2; \
329 vpxor x2, x0, x0; \
330 vpxor x4, x2, x2; \
331 vpxor x3, x4, x4;
332
333#define SI6_1(x0, x1, x2, x3, x4) \
334 vpxor x2, x0, x0; \
335 vpand x3, x0, tp; \
336 vpxor x3, x2, x2; \
337 vpxor x2, tp, tp; \
338 vpxor x1, x3, x3; \
339 vpor x0, x2, x2; \
340 vpxor x3, x2, x2; \
341 vpand tp, x3, x3;
342#define SI6_2(x0, x1, x2, x3, x4) \
343 vpxor RNOT, tp, tp; \
344 vpxor x1, x3, x3; \
345 vpand x2, x1, x1; \
346 vpxor tp, x0, x4; \
347 vpxor x4, x3, x3; \
348 vpxor x2, x4, x4; \
349 vpxor x1, tp, x0; \
350 vpxor x0, x2, x2;
351
352#define SI7_1(x0, x1, x2, x3, x4) \
353 vpand x0, x3, tp; \
354 vpxor x2, x0, x0; \
355 vpor x3, x2, x2; \
356 vpxor x1, x3, x4; \
357 vpxor RNOT, x0, x0; \
358 vpor tp, x1, x1; \
359 vpxor x0, x4, x4; \
360 vpand x2, x0, x0; \
361 vpxor x1, x0, x0;
362#define SI7_2(x0, x1, x2, x3, x4) \
363 vpand x2, x1, x1; \
364 vpxor x2, tp, x3; \
365 vpxor x3, x4, x4; \
366 vpand x3, x2, x2; \
367 vpor x0, x3, x3; \
368 vpxor x4, x1, x1; \
369 vpxor x4, x3, x3; \
370 vpand x0, x4, x4; \
371 vpxor x2, x4, x4;
372
373#define get_key(i, j, t) \
374 vbroadcastss (4*(i)+(j))*4(CTX), t;
375
376#define K2(x0, x1, x2, x3, x4, i) \
377 get_key(i, 0, RK0); \
378 get_key(i, 1, RK1); \
379 get_key(i, 2, RK2); \
380 get_key(i, 3, RK3); \
381 vpxor RK0, x0 ## 1, x0 ## 1; \
382 vpxor RK1, x1 ## 1, x1 ## 1; \
383 vpxor RK2, x2 ## 1, x2 ## 1; \
384 vpxor RK3, x3 ## 1, x3 ## 1; \
385 vpxor RK0, x0 ## 2, x0 ## 2; \
386 vpxor RK1, x1 ## 2, x1 ## 2; \
387 vpxor RK2, x2 ## 2, x2 ## 2; \
388 vpxor RK3, x3 ## 2, x3 ## 2;
389
390#define LK2(x0, x1, x2, x3, x4, i) \
391 vpslld $13, x0 ## 1, x4 ## 1; \
392 vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
393 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
394 vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
395 vpslld $3, x2 ## 1, x4 ## 1; \
396 vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
397 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
398 vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
399 vpslld $13, x0 ## 2, x4 ## 2; \
400 vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
401 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
402 vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
403 vpslld $3, x2 ## 2, x4 ## 2; \
404 vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
405 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
406 vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
407 vpslld $1, x1 ## 1, x4 ## 1; \
408 vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
409 vpor x4 ## 1, x1 ## 1, x1 ## 1; \
410 vpslld $3, x0 ## 1, x4 ## 1; \
411 vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
412 vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
413 get_key(i, 1, RK1); \
414 vpslld $1, x1 ## 2, x4 ## 2; \
415 vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
416 vpor x4 ## 2, x1 ## 2, x1 ## 2; \
417 vpslld $3, x0 ## 2, x4 ## 2; \
418 vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
419 vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
420 get_key(i, 3, RK3); \
421 vpslld $7, x3 ## 1, x4 ## 1; \
422 vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
423 vpor x4 ## 1, x3 ## 1, x3 ## 1; \
424 vpslld $7, x1 ## 1, x4 ## 1; \
425 vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
426 vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
427 vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
428 vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
429 get_key(i, 0, RK0); \
430 vpslld $7, x3 ## 2, x4 ## 2; \
431 vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
432 vpor x4 ## 2, x3 ## 2, x3 ## 2; \
433 vpslld $7, x1 ## 2, x4 ## 2; \
434 vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
435 vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
436 vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
437 vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
438 get_key(i, 2, RK2); \
439 vpxor RK1, x1 ## 1, x1 ## 1; \
440 vpxor RK3, x3 ## 1, x3 ## 1; \
441 vpslld $5, x0 ## 1, x4 ## 1; \
442 vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
443 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
444 vpslld $22, x2 ## 1, x4 ## 1; \
445 vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
446 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
447 vpxor RK0, x0 ## 1, x0 ## 1; \
448 vpxor RK2, x2 ## 1, x2 ## 1; \
449 vpxor RK1, x1 ## 2, x1 ## 2; \
450 vpxor RK3, x3 ## 2, x3 ## 2; \
451 vpslld $5, x0 ## 2, x4 ## 2; \
452 vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
453 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
454 vpslld $22, x2 ## 2, x4 ## 2; \
455 vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
456 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
457 vpxor RK0, x0 ## 2, x0 ## 2; \
458 vpxor RK2, x2 ## 2, x2 ## 2;
459
460#define KL2(x0, x1, x2, x3, x4, i) \
461 vpxor RK0, x0 ## 1, x0 ## 1; \
462 vpxor RK2, x2 ## 1, x2 ## 1; \
463 vpsrld $5, x0 ## 1, x4 ## 1; \
464 vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
465 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
466 vpxor RK3, x3 ## 1, x3 ## 1; \
467 vpxor RK1, x1 ## 1, x1 ## 1; \
468 vpsrld $22, x2 ## 1, x4 ## 1; \
469 vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
470 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
471 vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
472 vpxor RK0, x0 ## 2, x0 ## 2; \
473 vpxor RK2, x2 ## 2, x2 ## 2; \
474 vpsrld $5, x0 ## 2, x4 ## 2; \
475 vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
476 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
477 vpxor RK3, x3 ## 2, x3 ## 2; \
478 vpxor RK1, x1 ## 2, x1 ## 2; \
479 vpsrld $22, x2 ## 2, x4 ## 2; \
480 vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
481 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
482 vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
483 vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
484 vpslld $7, x1 ## 1, x4 ## 1; \
485 vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
486 vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
487 vpsrld $1, x1 ## 1, x4 ## 1; \
488 vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
489 vpor x4 ## 1, x1 ## 1, x1 ## 1; \
490 vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
491 vpslld $7, x1 ## 2, x4 ## 2; \
492 vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
493 vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
494 vpsrld $1, x1 ## 2, x4 ## 2; \
495 vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
496 vpor x4 ## 2, x1 ## 2, x1 ## 2; \
497 vpsrld $7, x3 ## 1, x4 ## 1; \
498 vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
499 vpor x4 ## 1, x3 ## 1, x3 ## 1; \
500 vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
501 vpslld $3, x0 ## 1, x4 ## 1; \
502 vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
503 vpsrld $7, x3 ## 2, x4 ## 2; \
504 vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
505 vpor x4 ## 2, x3 ## 2, x3 ## 2; \
506 vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
507 vpslld $3, x0 ## 2, x4 ## 2; \
508 vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
509 vpsrld $13, x0 ## 1, x4 ## 1; \
510 vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
511 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
512 vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
513 vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
514 vpsrld $3, x2 ## 1, x4 ## 1; \
515 vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
516 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
517 vpsrld $13, x0 ## 2, x4 ## 2; \
518 vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
519 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
520 vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
521 vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
522 vpsrld $3, x2 ## 2, x4 ## 2; \
523 vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
524 vpor x4 ## 2, x2 ## 2, x2 ## 2;
525
526#define S(SBOX, x0, x1, x2, x3, x4) \
527 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
528 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
529 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
530 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
531
532#define SP(SBOX, x0, x1, x2, x3, x4, i) \
533 get_key(i, 0, RK0); \
534 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
535 get_key(i, 2, RK2); \
536 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
537 get_key(i, 3, RK3); \
538 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
539 get_key(i, 1, RK1); \
540 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
541
542#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
543 vpunpckldq x1, x0, t0; \
544 vpunpckhdq x1, x0, t2; \
545 vpunpckldq x3, x2, t1; \
546 vpunpckhdq x3, x2, x3; \
547 \
548 vpunpcklqdq t1, t0, x0; \
549 vpunpckhqdq t1, t0, x1; \
550 vpunpcklqdq x3, t2, x2; \
551 vpunpckhqdq x3, t2, x3;
552
553#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
554 vmovdqu (0*4*4)(in), x0; \
555 vmovdqu (1*4*4)(in), x1; \
556 vmovdqu (2*4*4)(in), x2; \
557 vmovdqu (3*4*4)(in), x3; \
558 \
559 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
560
561#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
562 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
563 \
564 vmovdqu x0, (0*4*4)(out); \
565 vmovdqu x1, (1*4*4)(out); \
566 vmovdqu x2, (2*4*4)(out); \
567 vmovdqu x3, (3*4*4)(out);
568
569#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
570 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
571 \
572 vpxor (0*4*4)(out), x0, x0; \
573 vmovdqu x0, (0*4*4)(out); \
574 vpxor (1*4*4)(out), x1, x1; \
575 vmovdqu x1, (1*4*4)(out); \
576 vpxor (2*4*4)(out), x2, x2; \
577 vmovdqu x2, (2*4*4)(out); \
578 vpxor (3*4*4)(out), x3, x3; \
579 vmovdqu x3, (3*4*4)(out);
580
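read_blocks()/write_blocks() load four 16-byte blocks and run transpose_4x4() on them, so that afterwards each xmm register holds the same 32-bit word position from all four blocks; the S-box macros then process four blocks per register set (eight in total with the "1" and "2" suffixed sets) in parallel across the SIMD lanes. A plain-C sketch of the 4x4 word transpose that the vpunpck* sequence performs, for illustration only and not part of the patch:

#include <linux/types.h>

/* out[i][j] = in[j][i]: rows are the blocks as laid out in memory, columns
 * are the word lanes the parallel S-box code operates on. */
static void transpose_4x4_words(u32 out[4][4], const u32 in[4][4])
{
	int i, j;

	for (i = 0; i < 4; i++)
		for (j = 0; j < 4; j++)
			out[i][j] = in[j][i];
}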
581.align 8
582.global __serpent_enc_blk_8way_avx
583.type __serpent_enc_blk_8way_avx,@function;
584
585__serpent_enc_blk_8way_avx:
586 /* input:
587 * %rdi: ctx, CTX
588 * %rsi: dst
589 * %rdx: src
590 * %rcx: bool, if true: xor output
591 */
592
593 vpcmpeqd RNOT, RNOT, RNOT;
594
595 leaq (4*4*4)(%rdx), %rax;
596 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
597 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
598
599 K2(RA, RB, RC, RD, RE, 0);
600 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
601 S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
602 S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
603 S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
604 S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
605 S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
606 S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
607 S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
608 S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
609 S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
610 S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
611 S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
612 S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
613 S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
614 S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
615 S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
616 S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
617 S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
618 S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
619 S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
620 S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
621 S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
622 S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
623 S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
624 S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
625 S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
626 S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
627 S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
628 S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
629 S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
630 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
631 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
632
633 leaq (4*4*4)(%rsi), %rax;
634
635 testb %cl, %cl;
636 jnz __enc_xor8;
637
638 write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
639 write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
640
641 ret;
642
643__enc_xor8:
644 xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
645 xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
646
647 ret;
648
649.align 8
650.global serpent_dec_blk_8way_avx
651.type serpent_dec_blk_8way_avx,@function;
652
653serpent_dec_blk_8way_avx:
654 /* input:
655 * %rdi: ctx, CTX
656 * %rsi: dst
657 * %rdx: src
658 */
659
660 vpcmpeqd RNOT, RNOT, RNOT;
661
662 leaq (4*4*4)(%rdx), %rax;
663 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
664 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
665
666 K2(RA, RB, RC, RD, RE, 32);
667 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
668 SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
669 SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
670 SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
671 SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
672 SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
673 SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
674 SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
675 SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
676 SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
677 SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
678 SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
679 SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
680 SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
681 SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
682 SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
683 SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
684 SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
685 SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
686 SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
687 SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
688 SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
689 SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
690 SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
691 SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
692 SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
693 SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
694 SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
695 SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
696 SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
697 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
698 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
699
700 leaq (4*4*4)(%rsi), %rax;
701 write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
702 write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
703
704 ret;
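From the C side these two entry points are reached through prototypes in arch/x86/include/asm/crypto/serpent-avx.h (listed in the diffstat but not reproduced here). Judging from the register comments above they take roughly the following shape; treat this as an assumption rather than a quote of that header:

#include <linux/linkage.h>
#include <linux/types.h>
#include <crypto/serpent.h>

/* %rdi = ctx, %rsi = dst, %rdx = src, %rcx = xor flag (encrypt path only) */
asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
					   const u8 *src, bool xor);
asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
					 const u8 *src);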
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
new file mode 100644
index 000000000000..b36bdac237eb
--- /dev/null
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -0,0 +1,636 @@
1/*
2 * Glue Code for AVX assembler versions of Serpent Cipher
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Glue code based on serpent_sse2_glue.c by:
8 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/hardirq.h>
29#include <linux/types.h>
30#include <linux/crypto.h>
31#include <linux/err.h>
32#include <crypto/algapi.h>
33#include <crypto/serpent.h>
34#include <crypto/cryptd.h>
35#include <crypto/b128ops.h>
36#include <crypto/ctr.h>
37#include <crypto/lrw.h>
38#include <crypto/xts.h>
39#include <asm/xcr.h>
40#include <asm/xsave.h>
41#include <asm/crypto/serpent-avx.h>
42#include <asm/crypto/ablk_helper.h>
43#include <asm/crypto/glue_helper.h>
44
45static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
46{
47 u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
48 unsigned int j;
49
50 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
51 ivs[j] = src[j];
52
53 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
54
55 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
56 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
57}
58
59static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
60{
61 be128 ctrblk;
62
63 u128_to_be128(&ctrblk, iv);
64 u128_inc(iv);
65
66 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
67 u128_xor(dst, src, (u128 *)&ctrblk);
68}
69
70static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
71 u128 *iv)
72{
73 be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
74 unsigned int i;
75
76 for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
77 if (dst != src)
78 dst[i] = src[i];
79
80 u128_to_be128(&ctrblks[i], iv);
81 u128_inc(iv);
82 }
83
84 serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
85}
86
87static const struct common_glue_ctx serpent_enc = {
88 .num_funcs = 2,
89 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
90
91 .funcs = { {
92 .num_blocks = SERPENT_PARALLEL_BLOCKS,
93 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
94 }, {
95 .num_blocks = 1,
96 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
97 } }
98};
99
100static const struct common_glue_ctx serpent_ctr = {
101 .num_funcs = 2,
102 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
103
104 .funcs = { {
105 .num_blocks = SERPENT_PARALLEL_BLOCKS,
106 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
107 }, {
108 .num_blocks = 1,
109 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
110 } }
111};
112
113static const struct common_glue_ctx serpent_dec = {
114 .num_funcs = 2,
115 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
116
117 .funcs = { {
118 .num_blocks = SERPENT_PARALLEL_BLOCKS,
119 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
120 }, {
121 .num_blocks = 1,
122 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
123 } }
124};
125
126static const struct common_glue_ctx serpent_dec_cbc = {
127 .num_funcs = 2,
128 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
129
130 .funcs = { {
131 .num_blocks = SERPENT_PARALLEL_BLOCKS,
132 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
133 }, {
134 .num_blocks = 1,
135 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
136 } }
137};
138
139static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
140 struct scatterlist *src, unsigned int nbytes)
141{
142 return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
143}
144
145static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
146 struct scatterlist *src, unsigned int nbytes)
147{
148 return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
149}
150
151static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
152 struct scatterlist *src, unsigned int nbytes)
153{
154 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
155 dst, src, nbytes);
156}
157
158static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
159 struct scatterlist *src, unsigned int nbytes)
160{
161 return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
162 nbytes);
163}
164
165static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
166 struct scatterlist *src, unsigned int nbytes)
167{
168 return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
169}
170
171static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
172{
173 return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS,
174 NULL, fpu_enabled, nbytes);
175}
176
177static inline void serpent_fpu_end(bool fpu_enabled)
178{
179 glue_fpu_end(fpu_enabled);
180}
181
182struct crypt_priv {
183 struct serpent_ctx *ctx;
184 bool fpu_enabled;
185};
186
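The crypt_priv handed to lrw_crypt()/xts_crypt() below lets the per-chunk callbacks enable the FPU lazily: serpent_fpu_begin() stays a no-op until a chunk of at least SERPENT_PARALLEL_BLOCKS blocks arrives, and the outer lrw_/xts_ handlers release it exactly once with serpent_fpu_end() after the whole request has been processed.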
187static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
188{
189 const unsigned int bsize = SERPENT_BLOCK_SIZE;
190 struct crypt_priv *ctx = priv;
191 int i;
192
193 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
194
195 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
196 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
197 return;
198 }
199
200 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
201 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
202}
203
204static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
205{
206 const unsigned int bsize = SERPENT_BLOCK_SIZE;
207 struct crypt_priv *ctx = priv;
208 int i;
209
210 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
211
212 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
213 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
214 return;
215 }
216
217 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
218 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
219}
220
221struct serpent_lrw_ctx {
222 struct lrw_table_ctx lrw_table;
223 struct serpent_ctx serpent_ctx;
224};
225
226static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
227 unsigned int keylen)
228{
229 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
230 int err;
231
232 err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
233 SERPENT_BLOCK_SIZE);
234 if (err)
235 return err;
236
237 return lrw_init_table(&ctx->lrw_table, key + keylen -
238 SERPENT_BLOCK_SIZE);
239}
240
241static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
242 struct scatterlist *src, unsigned int nbytes)
243{
244 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
245 be128 buf[SERPENT_PARALLEL_BLOCKS];
246 struct crypt_priv crypt_ctx = {
247 .ctx = &ctx->serpent_ctx,
248 .fpu_enabled = false,
249 };
250 struct lrw_crypt_req req = {
251 .tbuf = buf,
252 .tbuflen = sizeof(buf),
253
254 .table_ctx = &ctx->lrw_table,
255 .crypt_ctx = &crypt_ctx,
256 .crypt_fn = encrypt_callback,
257 };
258 int ret;
259
260 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
261 ret = lrw_crypt(desc, dst, src, nbytes, &req);
262 serpent_fpu_end(crypt_ctx.fpu_enabled);
263
264 return ret;
265}
266
267static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
268 struct scatterlist *src, unsigned int nbytes)
269{
270 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
271 be128 buf[SERPENT_PARALLEL_BLOCKS];
272 struct crypt_priv crypt_ctx = {
273 .ctx = &ctx->serpent_ctx,
274 .fpu_enabled = false,
275 };
276 struct lrw_crypt_req req = {
277 .tbuf = buf,
278 .tbuflen = sizeof(buf),
279
280 .table_ctx = &ctx->lrw_table,
281 .crypt_ctx = &crypt_ctx,
282 .crypt_fn = decrypt_callback,
283 };
284 int ret;
285
286 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
287 ret = lrw_crypt(desc, dst, src, nbytes, &req);
288 serpent_fpu_end(crypt_ctx.fpu_enabled);
289
290 return ret;
291}
292
293static void lrw_exit_tfm(struct crypto_tfm *tfm)
294{
295 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
296
297 lrw_free_table(&ctx->lrw_table);
298}
299
300struct serpent_xts_ctx {
301 struct serpent_ctx tweak_ctx;
302 struct serpent_ctx crypt_ctx;
303};
304
305static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
306 unsigned int keylen)
307{
308 struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
309 u32 *flags = &tfm->crt_flags;
310 int err;
311
312 /* key consists of keys of equal size concatenated, therefore
313 * the length must be even
314 */
315 if (keylen % 2) {
316 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
317 return -EINVAL;
318 }
319
320 /* first half of xts-key is for crypt */
321 err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
322 if (err)
323 return err;
324
325 /* second half of xts-key is for tweak */
326 return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
327}
328
329static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
330 struct scatterlist *src, unsigned int nbytes)
331{
332 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
333 be128 buf[SERPENT_PARALLEL_BLOCKS];
334 struct crypt_priv crypt_ctx = {
335 .ctx = &ctx->crypt_ctx,
336 .fpu_enabled = false,
337 };
338 struct xts_crypt_req req = {
339 .tbuf = buf,
340 .tbuflen = sizeof(buf),
341
342 .tweak_ctx = &ctx->tweak_ctx,
343 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
344 .crypt_ctx = &crypt_ctx,
345 .crypt_fn = encrypt_callback,
346 };
347 int ret;
348
349 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
350 ret = xts_crypt(desc, dst, src, nbytes, &req);
351 serpent_fpu_end(crypt_ctx.fpu_enabled);
352
353 return ret;
354}
355
356static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
357 struct scatterlist *src, unsigned int nbytes)
358{
359 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
360 be128 buf[SERPENT_PARALLEL_BLOCKS];
361 struct crypt_priv crypt_ctx = {
362 .ctx = &ctx->crypt_ctx,
363 .fpu_enabled = false,
364 };
365 struct xts_crypt_req req = {
366 .tbuf = buf,
367 .tbuflen = sizeof(buf),
368
369 .tweak_ctx = &ctx->tweak_ctx,
370 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
371 .crypt_ctx = &crypt_ctx,
372 .crypt_fn = decrypt_callback,
373 };
374 int ret;
375
376 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
377 ret = xts_crypt(desc, dst, src, nbytes, &req);
378 serpent_fpu_end(crypt_ctx.fpu_enabled);
379
380 return ret;
381}
382
383static struct crypto_alg serpent_algs[10] = { {
384 .cra_name = "__ecb-serpent-avx",
385 .cra_driver_name = "__driver-ecb-serpent-avx",
386 .cra_priority = 0,
387 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
388 .cra_blocksize = SERPENT_BLOCK_SIZE,
389 .cra_ctxsize = sizeof(struct serpent_ctx),
390 .cra_alignmask = 0,
391 .cra_type = &crypto_blkcipher_type,
392 .cra_module = THIS_MODULE,
393 .cra_list = LIST_HEAD_INIT(serpent_algs[0].cra_list),
394 .cra_u = {
395 .blkcipher = {
396 .min_keysize = SERPENT_MIN_KEY_SIZE,
397 .max_keysize = SERPENT_MAX_KEY_SIZE,
398 .setkey = serpent_setkey,
399 .encrypt = ecb_encrypt,
400 .decrypt = ecb_decrypt,
401 },
402 },
403}, {
404 .cra_name = "__cbc-serpent-avx",
405 .cra_driver_name = "__driver-cbc-serpent-avx",
406 .cra_priority = 0,
407 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
408 .cra_blocksize = SERPENT_BLOCK_SIZE,
409 .cra_ctxsize = sizeof(struct serpent_ctx),
410 .cra_alignmask = 0,
411 .cra_type = &crypto_blkcipher_type,
412 .cra_module = THIS_MODULE,
413 .cra_list = LIST_HEAD_INIT(serpent_algs[1].cra_list),
414 .cra_u = {
415 .blkcipher = {
416 .min_keysize = SERPENT_MIN_KEY_SIZE,
417 .max_keysize = SERPENT_MAX_KEY_SIZE,
418 .setkey = serpent_setkey,
419 .encrypt = cbc_encrypt,
420 .decrypt = cbc_decrypt,
421 },
422 },
423}, {
424 .cra_name = "__ctr-serpent-avx",
425 .cra_driver_name = "__driver-ctr-serpent-avx",
426 .cra_priority = 0,
427 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
428 .cra_blocksize = 1,
429 .cra_ctxsize = sizeof(struct serpent_ctx),
430 .cra_alignmask = 0,
431 .cra_type = &crypto_blkcipher_type,
432 .cra_module = THIS_MODULE,
433 .cra_list = LIST_HEAD_INIT(serpent_algs[2].cra_list),
434 .cra_u = {
435 .blkcipher = {
436 .min_keysize = SERPENT_MIN_KEY_SIZE,
437 .max_keysize = SERPENT_MAX_KEY_SIZE,
438 .ivsize = SERPENT_BLOCK_SIZE,
439 .setkey = serpent_setkey,
440 .encrypt = ctr_crypt,
441 .decrypt = ctr_crypt,
442 },
443 },
444}, {
445 .cra_name = "__lrw-serpent-avx",
446 .cra_driver_name = "__driver-lrw-serpent-avx",
447 .cra_priority = 0,
448 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
449 .cra_blocksize = SERPENT_BLOCK_SIZE,
450 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
451 .cra_alignmask = 0,
452 .cra_type = &crypto_blkcipher_type,
453 .cra_module = THIS_MODULE,
454 .cra_list = LIST_HEAD_INIT(serpent_algs[3].cra_list),
455 .cra_exit = lrw_exit_tfm,
456 .cra_u = {
457 .blkcipher = {
458 .min_keysize = SERPENT_MIN_KEY_SIZE +
459 SERPENT_BLOCK_SIZE,
460 .max_keysize = SERPENT_MAX_KEY_SIZE +
461 SERPENT_BLOCK_SIZE,
462 .ivsize = SERPENT_BLOCK_SIZE,
463 .setkey = lrw_serpent_setkey,
464 .encrypt = lrw_encrypt,
465 .decrypt = lrw_decrypt,
466 },
467 },
468}, {
469 .cra_name = "__xts-serpent-avx",
470 .cra_driver_name = "__driver-xts-serpent-avx",
471 .cra_priority = 0,
472 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
473 .cra_blocksize = SERPENT_BLOCK_SIZE,
474 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
475 .cra_alignmask = 0,
476 .cra_type = &crypto_blkcipher_type,
477 .cra_module = THIS_MODULE,
478 .cra_list = LIST_HEAD_INIT(serpent_algs[4].cra_list),
479 .cra_u = {
480 .blkcipher = {
481 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
482 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
483 .ivsize = SERPENT_BLOCK_SIZE,
484 .setkey = xts_serpent_setkey,
485 .encrypt = xts_encrypt,
486 .decrypt = xts_decrypt,
487 },
488 },
489}, {
490 .cra_name = "ecb(serpent)",
491 .cra_driver_name = "ecb-serpent-avx",
492 .cra_priority = 500,
493 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
494 .cra_blocksize = SERPENT_BLOCK_SIZE,
495 .cra_ctxsize = sizeof(struct async_helper_ctx),
496 .cra_alignmask = 0,
497 .cra_type = &crypto_ablkcipher_type,
498 .cra_module = THIS_MODULE,
499 .cra_list = LIST_HEAD_INIT(serpent_algs[5].cra_list),
500 .cra_init = ablk_init,
501 .cra_exit = ablk_exit,
502 .cra_u = {
503 .ablkcipher = {
504 .min_keysize = SERPENT_MIN_KEY_SIZE,
505 .max_keysize = SERPENT_MAX_KEY_SIZE,
506 .setkey = ablk_set_key,
507 .encrypt = ablk_encrypt,
508 .decrypt = ablk_decrypt,
509 },
510 },
511}, {
512 .cra_name = "cbc(serpent)",
513 .cra_driver_name = "cbc-serpent-avx",
514 .cra_priority = 500,
515 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
516 .cra_blocksize = SERPENT_BLOCK_SIZE,
517 .cra_ctxsize = sizeof(struct async_helper_ctx),
518 .cra_alignmask = 0,
519 .cra_type = &crypto_ablkcipher_type,
520 .cra_module = THIS_MODULE,
521 .cra_list = LIST_HEAD_INIT(serpent_algs[6].cra_list),
522 .cra_init = ablk_init,
523 .cra_exit = ablk_exit,
524 .cra_u = {
525 .ablkcipher = {
526 .min_keysize = SERPENT_MIN_KEY_SIZE,
527 .max_keysize = SERPENT_MAX_KEY_SIZE,
528 .ivsize = SERPENT_BLOCK_SIZE,
529 .setkey = ablk_set_key,
530 .encrypt = __ablk_encrypt,
531 .decrypt = ablk_decrypt,
532 },
533 },
534}, {
535 .cra_name = "ctr(serpent)",
536 .cra_driver_name = "ctr-serpent-avx",
537 .cra_priority = 500,
538 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
539 .cra_blocksize = 1,
540 .cra_ctxsize = sizeof(struct async_helper_ctx),
541 .cra_alignmask = 0,
542 .cra_type = &crypto_ablkcipher_type,
543 .cra_module = THIS_MODULE,
544 .cra_list = LIST_HEAD_INIT(serpent_algs[7].cra_list),
545 .cra_init = ablk_init,
546 .cra_exit = ablk_exit,
547 .cra_u = {
548 .ablkcipher = {
549 .min_keysize = SERPENT_MIN_KEY_SIZE,
550 .max_keysize = SERPENT_MAX_KEY_SIZE,
551 .ivsize = SERPENT_BLOCK_SIZE,
552 .setkey = ablk_set_key,
553 .encrypt = ablk_encrypt,
554 .decrypt = ablk_encrypt,
555 .geniv = "chainiv",
556 },
557 },
558}, {
559 .cra_name = "lrw(serpent)",
560 .cra_driver_name = "lrw-serpent-avx",
561 .cra_priority = 500,
562 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
563 .cra_blocksize = SERPENT_BLOCK_SIZE,
564 .cra_ctxsize = sizeof(struct async_helper_ctx),
565 .cra_alignmask = 0,
566 .cra_type = &crypto_ablkcipher_type,
567 .cra_module = THIS_MODULE,
568 .cra_list = LIST_HEAD_INIT(serpent_algs[8].cra_list),
569 .cra_init = ablk_init,
570 .cra_exit = ablk_exit,
571 .cra_u = {
572 .ablkcipher = {
573 .min_keysize = SERPENT_MIN_KEY_SIZE +
574 SERPENT_BLOCK_SIZE,
575 .max_keysize = SERPENT_MAX_KEY_SIZE +
576 SERPENT_BLOCK_SIZE,
577 .ivsize = SERPENT_BLOCK_SIZE,
578 .setkey = ablk_set_key,
579 .encrypt = ablk_encrypt,
580 .decrypt = ablk_decrypt,
581 },
582 },
583}, {
584 .cra_name = "xts(serpent)",
585 .cra_driver_name = "xts-serpent-avx",
586 .cra_priority = 500,
587 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
588 .cra_blocksize = SERPENT_BLOCK_SIZE,
589 .cra_ctxsize = sizeof(struct async_helper_ctx),
590 .cra_alignmask = 0,
591 .cra_type = &crypto_ablkcipher_type,
592 .cra_module = THIS_MODULE,
593 .cra_list = LIST_HEAD_INIT(serpent_algs[9].cra_list),
594 .cra_init = ablk_init,
595 .cra_exit = ablk_exit,
596 .cra_u = {
597 .ablkcipher = {
598 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
599 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
600 .ivsize = SERPENT_BLOCK_SIZE,
601 .setkey = ablk_set_key,
602 .encrypt = ablk_encrypt,
603 .decrypt = ablk_decrypt,
604 },
605 },
606} };
607
608static int __init serpent_init(void)
609{
610 u64 xcr0;
611
612 if (!cpu_has_avx || !cpu_has_osxsave) {
613 printk(KERN_INFO "AVX instructions are not detected.\n");
614 return -ENODEV;
615 }
616
617 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
618 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
619 printk(KERN_INFO "AVX detected but unusable.\n");
620 return -ENODEV;
621 }
622
623 return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
624}
625
626static void __exit serpent_exit(void)
627{
628 crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
629}
630
631module_init(serpent_init);
632module_exit(serpent_exit);
633
634MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized");
635MODULE_LICENSE("GPL");
636MODULE_ALIAS("serpent");
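The array above registers the internal "__driver-*" blkciphers at priority 0 and the cryptd-backed ablkcipher wrappers ("ecb(serpent)", "cbc(serpent)", and so on) at priority 500, so a lookup by generic name prefers this AVX implementation over lower-priority ones. A rough sketch of how a kernel-side user of that era's crypto API would pick it up, with error handling trimmed and purely for illustration:

#include <linux/err.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>

/* Allocates whichever "ctr(serpent)" implementation currently has the
 * highest priority; with this module loaded on an AVX machine that is the
 * driver registered above. */
static int example_ctr_serpent(struct scatterlist *sg, unsigned int len,
			       const u8 *key, unsigned int keylen, u8 *iv)
{
	struct crypto_ablkcipher *tfm;
	struct ablkcipher_request *req;
	int err;

	tfm = crypto_alloc_ablkcipher("ctr(serpent)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_ablkcipher_setkey(tfm, key, keylen);
	if (err)
		goto out;

	req = ablkcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out;
	}

	/* In-place encryption of 'len' bytes; a real caller would install a
	 * completion callback and handle -EINPROGRESS instead of ignoring it. */
	ablkcipher_request_set_crypt(req, sg, sg, len, iv);
	err = crypto_ablkcipher_encrypt(req);

	ablkcipher_request_free(req);
out:
	crypto_free_ablkcipher(tfm);
	return err;
}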
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 4b21be85e0a1..d679c8675f4a 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -41,358 +41,145 @@
Context (unchanged, lines 41-43):
#include <crypto/ctr.h>
#include <crypto/lrw.h>
#include <crypto/xts.h>

Removed from serpent_sse2_glue.c (old lines 44-201):

#include <asm/i387.h>
#include <asm/serpent.h>
#include <crypto/scatterwalk.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>

struct async_serpent_ctx {
	struct cryptd_ablkcipher *cryptd_tfm;
};

static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
{
	if (fpu_enabled)
		return true;

	/* SSE2 is only used when chunk to be processed is large enough, so
	 * do not enable FPU until it is necessary.
	 */
	if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
		return false;

	kernel_fpu_begin();
	return true;
}

static inline void serpent_fpu_end(bool fpu_enabled)
{
	if (fpu_enabled)
		kernel_fpu_end();
}

static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
		     bool enc)
{
	bool fpu_enabled = false;
	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	const unsigned int bsize = SERPENT_BLOCK_SIZE;
	unsigned int nbytes;
	int err;

	err = blkcipher_walk_virt(desc, walk);
	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;

	while ((nbytes = walk->nbytes)) {
		u8 *wsrc = walk->src.virt.addr;
		u8 *wdst = walk->dst.virt.addr;

		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);

		/* Process multi-block batch */
		if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
			do {
				if (enc)
					serpent_enc_blk_xway(ctx, wdst, wsrc);
				else
					serpent_dec_blk_xway(ctx, wdst, wsrc);

				wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
				wdst += bsize * SERPENT_PARALLEL_BLOCKS;
				nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
			} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);

			if (nbytes < bsize)
				goto done;
		}

		/* Handle leftovers */
		do {
			if (enc)
				__serpent_encrypt(ctx, wdst, wsrc);
			else
				__serpent_decrypt(ctx, wdst, wsrc);

			wsrc += bsize;
			wdst += bsize;
			nbytes -= bsize;
		} while (nbytes >= bsize);

done:
		err = blkcipher_walk_done(desc, walk, nbytes);
	}

	serpent_fpu_end(fpu_enabled);
	return err;
}

static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	struct blkcipher_walk walk;

	blkcipher_walk_init(&walk, dst, src, nbytes);
	return ecb_crypt(desc, &walk, true);
}

static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	struct blkcipher_walk walk;

	blkcipher_walk_init(&walk, dst, src, nbytes);
	return ecb_crypt(desc, &walk, false);
}

static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
				  struct blkcipher_walk *walk)
{
	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	const unsigned int bsize = SERPENT_BLOCK_SIZE;
	unsigned int nbytes = walk->nbytes;
	u128 *src = (u128 *)walk->src.virt.addr;
	u128 *dst = (u128 *)walk->dst.virt.addr;
	u128 *iv = (u128 *)walk->iv;

	do {
		u128_xor(dst, src, iv);
		__serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
		iv = dst;

		src += 1;
		dst += 1;
		nbytes -= bsize;
	} while (nbytes >= bsize);

	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
	return nbytes;
}

static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	struct blkcipher_walk walk;
	int err;

	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt(desc, &walk);

	while ((nbytes = walk.nbytes)) {
		nbytes = __cbc_encrypt(desc, &walk);
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}

	return err;
}

static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
				  struct blkcipher_walk *walk)
{
	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	const unsigned int bsize = SERPENT_BLOCK_SIZE;
	unsigned int nbytes = walk->nbytes;
	u128 *src = (u128 *)walk->src.virt.addr;
	u128 *dst = (u128 *)walk->dst.virt.addr;
	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
	u128 last_iv;
	int i;

	/* Start of the last block. */

Added to serpent_sse2_glue.c (new lines 44-92):

#include <asm/crypto/serpent-sse2.h>
#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>

static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
{
	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
	unsigned int j;

	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
		ivs[j] = src[j];

	serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);

	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}

static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
{
	be128 ctrblk;

	u128_to_be128(&ctrblk, iv);
	u128_inc(iv);

	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
	u128_xor(dst, src, (u128 *)&ctrblk);
}

static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
				   u128 *iv)
{
	be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
	unsigned int i;

	for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
		if (dst != src)
			dst[i] = src[i];

		u128_to_be128(&ctrblks[i], iv);
		u128_inc(iv);
	}

	serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
}

static const struct common_glue_ctx serpent_enc = {
	.num_funcs = 2,
	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
202 src += nbytes / bsize - 1;
203 dst += nbytes / bsize - 1;
204
205 last_iv = *src;
206
207 /* Process multi-block batch */
208 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
209 do {
210 nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
211 src -= SERPENT_PARALLEL_BLOCKS - 1;
212 dst -= SERPENT_PARALLEL_BLOCKS - 1;
213
214 for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
215 ivs[i] = src[i];
216
217 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
218
219 for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
220 u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
221
222 nbytes -= bsize;
223 if (nbytes < bsize)
224 goto done;
225 93
226 u128_xor(dst, dst, src - 1); 94 .funcs = { {
227 src -= 1; 95 .num_blocks = SERPENT_PARALLEL_BLOCKS,
228 dst -= 1; 96 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
229 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); 97 }, {
230 98 .num_blocks = 1,
231 if (nbytes < bsize) 99 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
232 goto done; 100 } }
233 } 101};
234
235 /* Handle leftovers */
236 for (;;) {
237 __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
238
239 nbytes -= bsize;
240 if (nbytes < bsize)
241 break;
242 102
243 u128_xor(dst, dst, src - 1); 103static const struct common_glue_ctx serpent_ctr = {
244 src -= 1; 104 .num_funcs = 2,
245 dst -= 1; 105 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
246 } 106
107 .funcs = { {
108 .num_blocks = SERPENT_PARALLEL_BLOCKS,
109 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
110 }, {
111 .num_blocks = 1,
112 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
113 } }
114};
247 115
248done: 116static const struct common_glue_ctx serpent_dec = {
249 u128_xor(dst, dst, (u128 *)walk->iv); 117 .num_funcs = 2,
250 *(u128 *)walk->iv = last_iv; 118 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
119
120 .funcs = { {
121 .num_blocks = SERPENT_PARALLEL_BLOCKS,
122 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
123 }, {
124 .num_blocks = 1,
125 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
126 } }
127};
251 128
252 return nbytes; 129static const struct common_glue_ctx serpent_dec_cbc = {
253} 130 .num_funcs = 2,
131 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
132
133 .funcs = { {
134 .num_blocks = SERPENT_PARALLEL_BLOCKS,
135 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
136 }, {
137 .num_blocks = 1,
138 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
139 } }
140};
254 141
255static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 142static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
256 struct scatterlist *src, unsigned int nbytes) 143 struct scatterlist *src, unsigned int nbytes)
257{ 144{
258 bool fpu_enabled = false; 145 return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
259 struct blkcipher_walk walk;
260 int err;
261
262 blkcipher_walk_init(&walk, dst, src, nbytes);
263 err = blkcipher_walk_virt(desc, &walk);
264 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
265
266 while ((nbytes = walk.nbytes)) {
267 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
268 nbytes = __cbc_decrypt(desc, &walk);
269 err = blkcipher_walk_done(desc, &walk, nbytes);
270 }
271
272 serpent_fpu_end(fpu_enabled);
273 return err;
274} 146}
275 147
276static inline void u128_to_be128(be128 *dst, const u128 *src) 148static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
149 struct scatterlist *src, unsigned int nbytes)
277{ 150{
278 dst->a = cpu_to_be64(src->a); 151 return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
279 dst->b = cpu_to_be64(src->b);
280} 152}
281 153
282static inline void be128_to_u128(u128 *dst, const be128 *src) 154static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
155 struct scatterlist *src, unsigned int nbytes)
283{ 156{
284 dst->a = be64_to_cpu(src->a); 157 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
285 dst->b = be64_to_cpu(src->b); 158 dst, src, nbytes);
286} 159}
287 160
288static inline void u128_inc(u128 *i) 161static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
162 struct scatterlist *src, unsigned int nbytes)
289{ 163{
290 i->b++; 164 return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
291 if (!i->b) 165 nbytes);
292 i->a++;
293} 166}
294 167
295static void ctr_crypt_final(struct blkcipher_desc *desc, 168static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
296 struct blkcipher_walk *walk) 169 struct scatterlist *src, unsigned int nbytes)
297{ 170{
298 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 171 return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
299 u8 *ctrblk = walk->iv;
300 u8 keystream[SERPENT_BLOCK_SIZE];
301 u8 *src = walk->src.virt.addr;
302 u8 *dst = walk->dst.virt.addr;
303 unsigned int nbytes = walk->nbytes;
304
305 __serpent_encrypt(ctx, keystream, ctrblk);
306 crypto_xor(keystream, src, nbytes);
307 memcpy(dst, keystream, nbytes);
308
309 crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
310} 172}
311 173
312static unsigned int __ctr_crypt(struct blkcipher_desc *desc, 174static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
313 struct blkcipher_walk *walk)
314{ 175{
315 struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 176 return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS,
316 const unsigned int bsize = SERPENT_BLOCK_SIZE; 177 NULL, fpu_enabled, nbytes);
317 unsigned int nbytes = walk->nbytes;
318 u128 *src = (u128 *)walk->src.virt.addr;
319 u128 *dst = (u128 *)walk->dst.virt.addr;
320 u128 ctrblk;
321 be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
322 int i;
323
324 be128_to_u128(&ctrblk, (be128 *)walk->iv);
325
326 /* Process multi-block batch */
327 if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
328 do {
329 /* create ctrblks for parallel encrypt */
330 for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
331 if (dst != src)
332 dst[i] = src[i];
333
334 u128_to_be128(&ctrblocks[i], &ctrblk);
335 u128_inc(&ctrblk);
336 }
337
338 serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
339 (u8 *)ctrblocks);
340
341 src += SERPENT_PARALLEL_BLOCKS;
342 dst += SERPENT_PARALLEL_BLOCKS;
343 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
344 } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
345
346 if (nbytes < bsize)
347 goto done;
348 }
349
350 /* Handle leftovers */
351 do {
352 if (dst != src)
353 *dst = *src;
354
355 u128_to_be128(&ctrblocks[0], &ctrblk);
356 u128_inc(&ctrblk);
357
358 __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
359 u128_xor(dst, dst, (u128 *)ctrblocks);
360
361 src += 1;
362 dst += 1;
363 nbytes -= bsize;
364 } while (nbytes >= bsize);
365
366done:
367 u128_to_be128((be128 *)walk->iv, &ctrblk);
368 return nbytes;
369} 178}
370 179
371static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, 180static inline void serpent_fpu_end(bool fpu_enabled)
372 struct scatterlist *src, unsigned int nbytes)
373{ 181{
374 bool fpu_enabled = false; 182 glue_fpu_end(fpu_enabled);
375 struct blkcipher_walk walk;
376 int err;
377
378 blkcipher_walk_init(&walk, dst, src, nbytes);
379 err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
380 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
381
382 while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
383 fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
384 nbytes = __ctr_crypt(desc, &walk);
385 err = blkcipher_walk_done(desc, &walk, nbytes);
386 }
387
388 serpent_fpu_end(fpu_enabled);
389
390 if (walk.nbytes) {
391 ctr_crypt_final(desc, &walk);
392 err = blkcipher_walk_done(desc, &walk, 0);
393 }
394
395 return err;
396} 183}
397 184
398struct crypt_priv { 185struct crypt_priv {
@@ -596,106 +383,6 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
596 return ret; 383 return ret;
597} 384}
598 385
599static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
600 unsigned int key_len)
601{
602 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
603 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
604 int err;
605
606 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
607 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
608 & CRYPTO_TFM_REQ_MASK);
609 err = crypto_ablkcipher_setkey(child, key, key_len);
610 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
611 & CRYPTO_TFM_RES_MASK);
612 return err;
613}
614
615static int __ablk_encrypt(struct ablkcipher_request *req)
616{
617 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
618 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
619 struct blkcipher_desc desc;
620
621 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
622 desc.info = req->info;
623 desc.flags = 0;
624
625 return crypto_blkcipher_crt(desc.tfm)->encrypt(
626 &desc, req->dst, req->src, req->nbytes);
627}
628
629static int ablk_encrypt(struct ablkcipher_request *req)
630{
631 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
632 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
633
634 if (!irq_fpu_usable()) {
635 struct ablkcipher_request *cryptd_req =
636 ablkcipher_request_ctx(req);
637
638 memcpy(cryptd_req, req, sizeof(*req));
639 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
640
641 return crypto_ablkcipher_encrypt(cryptd_req);
642 } else {
643 return __ablk_encrypt(req);
644 }
645}
646
647static int ablk_decrypt(struct ablkcipher_request *req)
648{
649 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
650 struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
651
652 if (!irq_fpu_usable()) {
653 struct ablkcipher_request *cryptd_req =
654 ablkcipher_request_ctx(req);
655
656 memcpy(cryptd_req, req, sizeof(*req));
657 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
658
659 return crypto_ablkcipher_decrypt(cryptd_req);
660 } else {
661 struct blkcipher_desc desc;
662
663 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
664 desc.info = req->info;
665 desc.flags = 0;
666
667 return crypto_blkcipher_crt(desc.tfm)->decrypt(
668 &desc, req->dst, req->src, req->nbytes);
669 }
670}
671
672static void ablk_exit(struct crypto_tfm *tfm)
673{
674 struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
675
676 cryptd_free_ablkcipher(ctx->cryptd_tfm);
677}
678
679static int ablk_init(struct crypto_tfm *tfm)
680{
681 struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
682 struct cryptd_ablkcipher *cryptd_tfm;
683 char drv_name[CRYPTO_MAX_ALG_NAME];
684
685 snprintf(drv_name, sizeof(drv_name), "__driver-%s",
686 crypto_tfm_alg_driver_name(tfm));
687
688 cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
689 if (IS_ERR(cryptd_tfm))
690 return PTR_ERR(cryptd_tfm);
691
692 ctx->cryptd_tfm = cryptd_tfm;
693 tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
694 crypto_ablkcipher_reqsize(&cryptd_tfm->base);
695
696 return 0;
697}
698
699static struct crypto_alg serpent_algs[10] = { { 386static struct crypto_alg serpent_algs[10] = { {
700 .cra_name = "__ecb-serpent-sse2", 387 .cra_name = "__ecb-serpent-sse2",
701 .cra_driver_name = "__driver-ecb-serpent-sse2", 388 .cra_driver_name = "__driver-ecb-serpent-sse2",
@@ -808,7 +495,7 @@ static struct crypto_alg serpent_algs[10] = { {
 .cra_priority = 400,
 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 .cra_blocksize = SERPENT_BLOCK_SIZE,
-.cra_ctxsize = sizeof(struct async_serpent_ctx),
+.cra_ctxsize = sizeof(struct async_helper_ctx),
 .cra_alignmask = 0,
 .cra_type = &crypto_ablkcipher_type,
 .cra_module = THIS_MODULE,
@@ -830,7 +517,7 @@ static struct crypto_alg serpent_algs[10] = { {
 .cra_priority = 400,
 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 .cra_blocksize = SERPENT_BLOCK_SIZE,
-.cra_ctxsize = sizeof(struct async_serpent_ctx),
+.cra_ctxsize = sizeof(struct async_helper_ctx),
 .cra_alignmask = 0,
 .cra_type = &crypto_ablkcipher_type,
 .cra_module = THIS_MODULE,
@@ -853,7 +540,7 @@ static struct crypto_alg serpent_algs[10] = { {
 .cra_priority = 400,
 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 .cra_blocksize = 1,
-.cra_ctxsize = sizeof(struct async_serpent_ctx),
+.cra_ctxsize = sizeof(struct async_helper_ctx),
 .cra_alignmask = 0,
 .cra_type = &crypto_ablkcipher_type,
 .cra_module = THIS_MODULE,
@@ -877,7 +564,7 @@ static struct crypto_alg serpent_algs[10] = { {
 .cra_priority = 400,
 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 .cra_blocksize = SERPENT_BLOCK_SIZE,
-.cra_ctxsize = sizeof(struct async_serpent_ctx),
+.cra_ctxsize = sizeof(struct async_helper_ctx),
 .cra_alignmask = 0,
 .cra_type = &crypto_ablkcipher_type,
 .cra_module = THIS_MODULE,
@@ -902,7 +589,7 @@ static struct crypto_alg serpent_algs[10] = { {
 .cra_priority = 400,
 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 .cra_blocksize = SERPENT_BLOCK_SIZE,
-.cra_ctxsize = sizeof(struct async_serpent_ctx),
+.cra_ctxsize = sizeof(struct async_helper_ctx),
 .cra_alignmask = 0,
 .cra_type = &crypto_ablkcipher_type,
 .cra_module = THIS_MODULE,
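
[Editor's note] The rewrite above drops the hand-rolled ECB/CBC/CTR walkers and the local ablk_* wrappers in favour of shared helpers: ablk_helper supplies the async wrapper (hence async_helper_ctx), and glue_helper walks a common_glue_ctx whose funcs[] table pairs a batch width with a function pointer, trying the widest entry (SERPENT_PARALLEL_BLOCKS) first and falling back to the one-block routine for the tail. A rough standalone sketch of that table-driven walk, with made-up names (glue_func_entry, ecb_walk) standing in for the real glue_helper types:

    #include <stddef.h>
    #include <stdio.h>

    #define BLOCK_SIZE 16  /* 128-bit cipher block */

    struct glue_func_entry {
        unsigned int num_blocks;  /* blocks consumed per call */
        void (*fn)(void *ctx, unsigned char *dst, const unsigned char *src);
    };

    /* Walk a buffer with the widest routine that still fits, widest first. */
    static void ecb_walk(const struct glue_func_entry *tbl, size_t num_funcs,
                         void *ctx, unsigned char *dst, const unsigned char *src,
                         size_t nbytes)
    {
        size_t i;

        while (nbytes >= BLOCK_SIZE) {
            for (i = 0; i < num_funcs; i++) {
                size_t chunk = (size_t)tbl[i].num_blocks * BLOCK_SIZE;

                if (nbytes < chunk)
                    continue;  /* not enough left for this width */

                tbl[i].fn(ctx, dst, src);
                src += chunk;
                dst += chunk;
                nbytes -= chunk;
                break;         /* retry from the widest entry */
            }
        }
    }

    /* Dummy "ciphers" so the sketch runs: xor each byte with a constant. */
    static void enc_8way(void *ctx, unsigned char *dst, const unsigned char *src)
    {
        int i;

        (void)ctx;
        for (i = 0; i < 8 * BLOCK_SIZE; i++)
            dst[i] = src[i] ^ 0xAA;
    }

    static void enc_1way(void *ctx, unsigned char *dst, const unsigned char *src)
    {
        int i;

        (void)ctx;
        for (i = 0; i < BLOCK_SIZE; i++)
            dst[i] = src[i] ^ 0xAA;
    }

    int main(void)
    {
        static const struct glue_func_entry tbl[] = {
            { 8, enc_8way },  /* e.g. SERPENT_PARALLEL_BLOCKS */
            { 1, enc_1way },
        };
        unsigned char in[11 * BLOCK_SIZE] = { 0 };
        unsigned char out[11 * BLOCK_SIZE];

        ecb_walk(tbl, 2, NULL, out, in, sizeof(in));
        printf("out[0] = %#x\n", out[0]);  /* 0xaa */
        return 0;
    }

Keeping fpu_blocks_limit in the table lets the helper defer kernel_fpu_begin() until at least that many blocks are queued, which is what the now-trivial serpent_fpu_begin() wrapper expresses.
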
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index b2c2f57d70e8..49d6987a73d9 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -468,7 +468,7 @@ W_PRECALC_SSSE3
 */
 SHA1_VECTOR_ASM sha1_transform_ssse3
 
-#ifdef SHA1_ENABLE_AVX_SUPPORT
+#ifdef CONFIG_AS_AVX
 
 .macro W_PRECALC_AVX
 
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index f916499d0abe..4a11a9d72451 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -35,7 +35,7 @@
 
 asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
 unsigned int rounds);
-#ifdef SHA1_ENABLE_AVX_SUPPORT
+#ifdef CONFIG_AS_AVX
 asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
 unsigned int rounds);
 #endif
@@ -184,7 +184,7 @@ static struct shash_alg alg = {
 }
 };
 
-#ifdef SHA1_ENABLE_AVX_SUPPORT
+#ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
 u64 xcr0;
@@ -209,7 +209,7 @@ static int __init sha1_ssse3_mod_init(void)
 if (cpu_has_ssse3)
 sha1_transform_asm = sha1_transform_ssse3;
 
-#ifdef SHA1_ENABLE_AVX_SUPPORT
+#ifdef CONFIG_AS_AVX
 /* allow AVX to override SSSE3, it's a little faster */
 if (avx_usable())
 sha1_transform_asm = sha1_transform_avx;
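
[Editor's note] Replacing the hand-maintained SHA1_ENABLE_AVX_SUPPORT define with CONFIG_AS_AVX ties the AVX code path to a build-time probe of the assembler, while the run-time choice of transform is still made in sha1_ssse3_mod_init() via avx_usable(). A small sketch of that compile-time-plus-run-time selection pattern in plain C; HAVE_AVX_ASM here is a stand-in for CONFIG_AS_AVX, not a real kernel symbol.

    #include <stdio.h>

    typedef void (*sha1_block_fn)(unsigned int *digest, const char *data,
                                  unsigned int rounds);

    static void sha1_blocks_ssse3(unsigned int *digest, const char *data,
                                  unsigned int rounds)
    {
        (void)digest; (void)data; (void)rounds;
        puts("SSSE3 path");
    }

    #ifdef HAVE_AVX_ASM  /* compiled only when the assembler knows AVX */
    static void sha1_blocks_avx(unsigned int *digest, const char *data,
                                unsigned int rounds)
    {
        (void)digest; (void)data; (void)rounds;
        puts("AVX path");
    }
    #endif

    static sha1_block_fn sha1_blocks = sha1_blocks_ssse3;

    static void sha1_select_impl(int avx_is_usable)
    {
    #ifdef HAVE_AVX_ASM
        if (avx_is_usable)
            sha1_blocks = sha1_blocks_avx;  /* AVX overrides SSSE3 */
    #else
        (void)avx_is_usable;
    #endif
    }

    int main(void)
    {
        sha1_select_impl(1);
        sha1_blocks(NULL, NULL, 0);  /* "SSSE3 path" unless built with -DHAVE_AVX_ASM */
        return 0;
    }
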
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..35f45574390d
--- /dev/null
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -0,0 +1,300 @@
1/*
2 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * USA
21 *
22 */
23
24.file "twofish-avx-x86_64-asm_64.S"
25.text
26
27/* structure of crypto context */
28#define s0 0
29#define s1 1024
30#define s2 2048
31#define s3 3072
32#define w 4096
33#define k 4128
34
35/**********************************************************************
36 8-way AVX twofish
37 **********************************************************************/
38#define CTX %rdi
39
40#define RA1 %xmm0
41#define RB1 %xmm1
42#define RC1 %xmm2
43#define RD1 %xmm3
44
45#define RA2 %xmm4
46#define RB2 %xmm5
47#define RC2 %xmm6
48#define RD2 %xmm7
49
50#define RX %xmm8
51#define RY %xmm9
52
53#define RK1 %xmm10
54#define RK2 %xmm11
55
56#define RID1 %rax
57#define RID1b %al
58#define RID2 %rbx
59#define RID2b %bl
60
61#define RGI1 %rdx
62#define RGI1bl %dl
63#define RGI1bh %dh
64#define RGI2 %rcx
65#define RGI2bl %cl
66#define RGI2bh %ch
67
68#define RGS1 %r8
69#define RGS1d %r8d
70#define RGS2 %r9
71#define RGS2d %r9d
72#define RGS3 %r10
73#define RGS3d %r10d
74
75
76#define lookup_32bit(t0, t1, t2, t3, src, dst) \
77 movb src ## bl, RID1b; \
78 movb src ## bh, RID2b; \
79 movl t0(CTX, RID1, 4), dst ## d; \
80 xorl t1(CTX, RID2, 4), dst ## d; \
81 shrq $16, src; \
82 movb src ## bl, RID1b; \
83 movb src ## bh, RID2b; \
84 xorl t2(CTX, RID1, 4), dst ## d; \
85 xorl t3(CTX, RID2, 4), dst ## d;
86
87#define G(a, x, t0, t1, t2, t3) \
88 vmovq a, RGI1; \
89 vpsrldq $8, a, x; \
90 vmovq x, RGI2; \
91 \
92 lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
93 shrq $16, RGI1; \
94 lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
95 shlq $32, RGS2; \
96 orq RGS1, RGS2; \
97 \
98 lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
99 shrq $16, RGI2; \
100 lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
101 shlq $32, RGS3; \
102 orq RGS1, RGS3; \
103 \
104 vmovq RGS2, x; \
105 vpinsrq $1, RGS3, x, x;
106
107#define encround(a, b, c, d, x, y) \
108 G(a, x, s0, s1, s2, s3); \
109 G(b, y, s1, s2, s3, s0); \
110 vpaddd x, y, x; \
111 vpaddd y, x, y; \
112 vpaddd x, RK1, x; \
113 vpaddd y, RK2, y; \
114 vpxor x, c, c; \
115 vpsrld $1, c, x; \
116 vpslld $(32 - 1), c, c; \
117 vpor c, x, c; \
118 vpslld $1, d, x; \
119 vpsrld $(32 - 1), d, d; \
120 vpor d, x, d; \
121 vpxor d, y, d;
122
123#define decround(a, b, c, d, x, y) \
124 G(a, x, s0, s1, s2, s3); \
125 G(b, y, s1, s2, s3, s0); \
126 vpaddd x, y, x; \
127 vpaddd y, x, y; \
128 vpaddd y, RK2, y; \
129 vpxor d, y, d; \
130 vpsrld $1, d, y; \
131 vpslld $(32 - 1), d, d; \
132 vpor d, y, d; \
133 vpslld $1, c, y; \
134 vpsrld $(32 - 1), c, c; \
135 vpor c, y, c; \
136 vpaddd x, RK1, x; \
137 vpxor x, c, c;
138
139#define encrypt_round(n, a, b, c, d) \
140 vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
141 vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
142 encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
143 encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
144
145#define decrypt_round(n, a, b, c, d) \
146 vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
147 vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
148 decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
149 decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
150
151#define encrypt_cycle(n) \
152 encrypt_round((2*n), RA, RB, RC, RD); \
153 encrypt_round(((2*n) + 1), RC, RD, RA, RB);
154
155#define decrypt_cycle(n) \
156 decrypt_round(((2*n) + 1), RC, RD, RA, RB); \
157 decrypt_round((2*n), RA, RB, RC, RD);
158
159
160#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
161 vpunpckldq x1, x0, t0; \
162 vpunpckhdq x1, x0, t2; \
163 vpunpckldq x3, x2, t1; \
164 vpunpckhdq x3, x2, x3; \
165 \
166 vpunpcklqdq t1, t0, x0; \
167 vpunpckhqdq t1, t0, x1; \
168 vpunpcklqdq x3, t2, x2; \
169 vpunpckhqdq x3, t2, x3;
170
171#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
172 vpxor (0*4*4)(in), wkey, x0; \
173 vpxor (1*4*4)(in), wkey, x1; \
174 vpxor (2*4*4)(in), wkey, x2; \
175 vpxor (3*4*4)(in), wkey, x3; \
176 \
177 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
178
179#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
180 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
181 \
182 vpxor x0, wkey, x0; \
183 vmovdqu x0, (0*4*4)(out); \
184 vpxor x1, wkey, x1; \
185 vmovdqu x1, (1*4*4)(out); \
186 vpxor x2, wkey, x2; \
187 vmovdqu x2, (2*4*4)(out); \
188 vpxor x3, wkey, x3; \
189 vmovdqu x3, (3*4*4)(out);
190
191#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
192 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
193 \
194 vpxor x0, wkey, x0; \
195 vpxor (0*4*4)(out), x0, x0; \
196 vmovdqu x0, (0*4*4)(out); \
197 vpxor x1, wkey, x1; \
198 vpxor (1*4*4)(out), x1, x1; \
199 vmovdqu x1, (1*4*4)(out); \
200 vpxor x2, wkey, x2; \
201 vpxor (2*4*4)(out), x2, x2; \
202 vmovdqu x2, (2*4*4)(out); \
203 vpxor x3, wkey, x3; \
204 vpxor (3*4*4)(out), x3, x3; \
205 vmovdqu x3, (3*4*4)(out);
206
207.align 8
208.global __twofish_enc_blk_8way
209.type __twofish_enc_blk_8way,@function;
210
211__twofish_enc_blk_8way:
212 /* input:
213 * %rdi: ctx, CTX
214 * %rsi: dst
215 * %rdx: src
216 * %rcx: bool, if true: xor output
217 */
218
219 pushq %rbx;
220 pushq %rcx;
221
222 vmovdqu w(CTX), RK1;
223
224 leaq (4*4*4)(%rdx), %rax;
225 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
226 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
227
228 xorq RID1, RID1;
229 xorq RID2, RID2;
230
231 encrypt_cycle(0);
232 encrypt_cycle(1);
233 encrypt_cycle(2);
234 encrypt_cycle(3);
235 encrypt_cycle(4);
236 encrypt_cycle(5);
237 encrypt_cycle(6);
238 encrypt_cycle(7);
239
240 vmovdqu (w+4*4)(CTX), RK1;
241
242 popq %rcx;
243 popq %rbx;
244
245 leaq (4*4*4)(%rsi), %rax;
246
247 testb %cl, %cl;
248 jnz __enc_xor8;
249
250 outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
251 outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
252
253 ret;
254
255__enc_xor8:
256 outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
257 outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
258
259 ret;
260
261.align 8
262.global twofish_dec_blk_8way
263.type twofish_dec_blk_8way,@function;
264
265twofish_dec_blk_8way:
266 /* input:
267 * %rdi: ctx, CTX
268 * %rsi: dst
269 * %rdx: src
270 */
271
272 pushq %rbx;
273
274 vmovdqu (w+4*4)(CTX), RK1;
275
276 leaq (4*4*4)(%rdx), %rax;
277 inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
278 inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
279
280 xorq RID1, RID1;
281 xorq RID2, RID2;
282
283 decrypt_cycle(7);
284 decrypt_cycle(6);
285 decrypt_cycle(5);
286 decrypt_cycle(4);
287 decrypt_cycle(3);
288 decrypt_cycle(2);
289 decrypt_cycle(1);
290 decrypt_cycle(0);
291
292 vmovdqu (w)(CTX), RK1;
293
294 popq %rbx;
295
296 leaq (4*4*4)(%rsi), %rax;
297 outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
298 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
299
300 ret;
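
[Editor's note] The inpack_blocks/outunpack_blocks macros above whiten four 16-byte blocks against the key material and then transpose them: viewed as a 4x4 matrix of 32-bit words, the vpunpck*dq/vpunpck*qdq shuffles turn "one register per block" into "one register per word position", so a single vector instruction advances the same round step of four blocks, and the RA1..RD1/RA2..RD2 register sets carry eight blocks per call. A plain-C illustration of that 4x4 word transpose (conceptual only, not the SIMD sequence):

    #include <stdint.h>
    #include <stdio.h>

    /* m[block][word] on input, m[word][block] on output */
    static void transpose_4x4(uint32_t m[4][4])
    {
        int i, j;

        for (i = 0; i < 4; i++) {
            for (j = i + 1; j < 4; j++) {
                uint32_t t = m[i][j];

                m[i][j] = m[j][i];
                m[j][i] = t;
            }
        }
    }

    int main(void)
    {
        uint32_t m[4][4];
        int i, j;

        /* word j of block i */
        for (i = 0; i < 4; i++)
            for (j = 0; j < 4; j++)
                m[i][j] = 10 * i + j;

        transpose_4x4(m);

        /* each row now holds the same word of all four blocks */
        for (i = 0; i < 4; i++)
            printf("%2u %2u %2u %2u\n", (unsigned)m[i][0], (unsigned)m[i][1],
                   (unsigned)m[i][2], (unsigned)m[i][3]);
        return 0;
    }
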
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
new file mode 100644
index 000000000000..782b67ddaf6a
--- /dev/null
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -0,0 +1,624 @@
1/*
2 * Glue Code for AVX assembler version of Twofish Cipher
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * USA
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/hardirq.h>
26#include <linux/types.h>
27#include <linux/crypto.h>
28#include <linux/err.h>
29#include <crypto/algapi.h>
30#include <crypto/twofish.h>
31#include <crypto/cryptd.h>
32#include <crypto/b128ops.h>
33#include <crypto/ctr.h>
34#include <crypto/lrw.h>
35#include <crypto/xts.h>
36#include <asm/i387.h>
37#include <asm/xcr.h>
38#include <asm/xsave.h>
39#include <asm/crypto/twofish.h>
40#include <asm/crypto/ablk_helper.h>
41#include <asm/crypto/glue_helper.h>
42#include <crypto/scatterwalk.h>
43#include <linux/workqueue.h>
44#include <linux/spinlock.h>
45
46#define TWOFISH_PARALLEL_BLOCKS 8
47
48static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
49 const u8 *src)
50{
51 __twofish_enc_blk_3way(ctx, dst, src, false);
52}
53
54/* 8-way parallel cipher functions */
55asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst,
56 const u8 *src, bool xor);
57asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst,
58 const u8 *src);
59
60static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst,
61 const u8 *src)
62{
63 __twofish_enc_blk_8way(ctx, dst, src, false);
64}
65
66static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
67 const u8 *src)
68{
69 __twofish_enc_blk_8way(ctx, dst, src, true);
70}
71
72static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst,
73 const u8 *src)
74{
75 twofish_dec_blk_8way(ctx, dst, src);
76}
77
78static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
79{
80 u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
81 unsigned int j;
82
83 for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
84 ivs[j] = src[j];
85
86 twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
87
88 for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
89 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
90}
91
92static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
93 u128 *iv)
94{
95 be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
96 unsigned int i;
97
98 for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
99 if (dst != src)
100 dst[i] = src[i];
101
102 u128_to_be128(&ctrblks[i], iv);
103 u128_inc(iv);
104 }
105
106 twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
107}
108
109static const struct common_glue_ctx twofish_enc = {
110 .num_funcs = 3,
111 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
112
113 .funcs = { {
114 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
115 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) }
116 }, {
117 .num_blocks = 3,
118 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
119 }, {
120 .num_blocks = 1,
121 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
122 } }
123};
124
125static const struct common_glue_ctx twofish_ctr = {
126 .num_funcs = 3,
127 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
128
129 .funcs = { {
130 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
131 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) }
132 }, {
133 .num_blocks = 3,
134 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
135 }, {
136 .num_blocks = 1,
137 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
138 } }
139};
140
141static const struct common_glue_ctx twofish_dec = {
142 .num_funcs = 3,
143 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
144
145 .funcs = { {
146 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
147 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) }
148 }, {
149 .num_blocks = 3,
150 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
151 }, {
152 .num_blocks = 1,
153 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
154 } }
155};
156
157static const struct common_glue_ctx twofish_dec_cbc = {
158 .num_funcs = 3,
159 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
160
161 .funcs = { {
162 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
163 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) }
164 }, {
165 .num_blocks = 3,
166 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
167 }, {
168 .num_blocks = 1,
169 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
170 } }
171};
172
173static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
174 struct scatterlist *src, unsigned int nbytes)
175{
176 return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
177}
178
179static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
180 struct scatterlist *src, unsigned int nbytes)
181{
182 return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
183}
184
185static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
186 struct scatterlist *src, unsigned int nbytes)
187{
188 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
189 dst, src, nbytes);
190}
191
192static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
193 struct scatterlist *src, unsigned int nbytes)
194{
195 return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
196 nbytes);
197}
198
199static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
200 struct scatterlist *src, unsigned int nbytes)
201{
202 return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
203}
204
205static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
206{
207 return glue_fpu_begin(TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS, NULL,
208 fpu_enabled, nbytes);
209}
210
211static inline void twofish_fpu_end(bool fpu_enabled)
212{
213 glue_fpu_end(fpu_enabled);
214}
215
216struct crypt_priv {
217 struct twofish_ctx *ctx;
218 bool fpu_enabled;
219};
220
221static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
222{
223 const unsigned int bsize = TF_BLOCK_SIZE;
224 struct crypt_priv *ctx = priv;
225 int i;
226
227 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
228
229 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
230 twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst);
231 return;
232 }
233
234 for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
235 twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
236
237 nbytes %= bsize * 3;
238
239 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
240 twofish_enc_blk(ctx->ctx, srcdst, srcdst);
241}
242
243static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
244{
245 const unsigned int bsize = TF_BLOCK_SIZE;
246 struct crypt_priv *ctx = priv;
247 int i;
248
249 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
250
251 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
252 twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst);
253 return;
254 }
255
256 for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
257 twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
258
259 nbytes %= bsize * 3;
260
261 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
262 twofish_dec_blk(ctx->ctx, srcdst, srcdst);
263}
264
265static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
266 struct scatterlist *src, unsigned int nbytes)
267{
268 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
269 be128 buf[TWOFISH_PARALLEL_BLOCKS];
270 struct crypt_priv crypt_ctx = {
271 .ctx = &ctx->twofish_ctx,
272 .fpu_enabled = false,
273 };
274 struct lrw_crypt_req req = {
275 .tbuf = buf,
276 .tbuflen = sizeof(buf),
277
278 .table_ctx = &ctx->lrw_table,
279 .crypt_ctx = &crypt_ctx,
280 .crypt_fn = encrypt_callback,
281 };
282 int ret;
283
284 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
285 ret = lrw_crypt(desc, dst, src, nbytes, &req);
286 twofish_fpu_end(crypt_ctx.fpu_enabled);
287
288 return ret;
289}
290
291static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
292 struct scatterlist *src, unsigned int nbytes)
293{
294 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
295 be128 buf[TWOFISH_PARALLEL_BLOCKS];
296 struct crypt_priv crypt_ctx = {
297 .ctx = &ctx->twofish_ctx,
298 .fpu_enabled = false,
299 };
300 struct lrw_crypt_req req = {
301 .tbuf = buf,
302 .tbuflen = sizeof(buf),
303
304 .table_ctx = &ctx->lrw_table,
305 .crypt_ctx = &crypt_ctx,
306 .crypt_fn = decrypt_callback,
307 };
308 int ret;
309
310 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
311 ret = lrw_crypt(desc, dst, src, nbytes, &req);
312 twofish_fpu_end(crypt_ctx.fpu_enabled);
313
314 return ret;
315}
316
317static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
318 struct scatterlist *src, unsigned int nbytes)
319{
320 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
321 be128 buf[TWOFISH_PARALLEL_BLOCKS];
322 struct crypt_priv crypt_ctx = {
323 .ctx = &ctx->crypt_ctx,
324 .fpu_enabled = false,
325 };
326 struct xts_crypt_req req = {
327 .tbuf = buf,
328 .tbuflen = sizeof(buf),
329
330 .tweak_ctx = &ctx->tweak_ctx,
331 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
332 .crypt_ctx = &crypt_ctx,
333 .crypt_fn = encrypt_callback,
334 };
335 int ret;
336
337 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
338 ret = xts_crypt(desc, dst, src, nbytes, &req);
339 twofish_fpu_end(crypt_ctx.fpu_enabled);
340
341 return ret;
342}
343
344static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
345 struct scatterlist *src, unsigned int nbytes)
346{
347 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
348 be128 buf[TWOFISH_PARALLEL_BLOCKS];
349 struct crypt_priv crypt_ctx = {
350 .ctx = &ctx->crypt_ctx,
351 .fpu_enabled = false,
352 };
353 struct xts_crypt_req req = {
354 .tbuf = buf,
355 .tbuflen = sizeof(buf),
356
357 .tweak_ctx = &ctx->tweak_ctx,
358 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
359 .crypt_ctx = &crypt_ctx,
360 .crypt_fn = decrypt_callback,
361 };
362 int ret;
363
364 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
365 ret = xts_crypt(desc, dst, src, nbytes, &req);
366 twofish_fpu_end(crypt_ctx.fpu_enabled);
367
368 return ret;
369}
370
371static struct crypto_alg twofish_algs[10] = { {
372 .cra_name = "__ecb-twofish-avx",
373 .cra_driver_name = "__driver-ecb-twofish-avx",
374 .cra_priority = 0,
375 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
376 .cra_blocksize = TF_BLOCK_SIZE,
377 .cra_ctxsize = sizeof(struct twofish_ctx),
378 .cra_alignmask = 0,
379 .cra_type = &crypto_blkcipher_type,
380 .cra_module = THIS_MODULE,
381 .cra_list = LIST_HEAD_INIT(twofish_algs[0].cra_list),
382 .cra_u = {
383 .blkcipher = {
384 .min_keysize = TF_MIN_KEY_SIZE,
385 .max_keysize = TF_MAX_KEY_SIZE,
386 .setkey = twofish_setkey,
387 .encrypt = ecb_encrypt,
388 .decrypt = ecb_decrypt,
389 },
390 },
391}, {
392 .cra_name = "__cbc-twofish-avx",
393 .cra_driver_name = "__driver-cbc-twofish-avx",
394 .cra_priority = 0,
395 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
396 .cra_blocksize = TF_BLOCK_SIZE,
397 .cra_ctxsize = sizeof(struct twofish_ctx),
398 .cra_alignmask = 0,
399 .cra_type = &crypto_blkcipher_type,
400 .cra_module = THIS_MODULE,
401 .cra_list = LIST_HEAD_INIT(twofish_algs[1].cra_list),
402 .cra_u = {
403 .blkcipher = {
404 .min_keysize = TF_MIN_KEY_SIZE,
405 .max_keysize = TF_MAX_KEY_SIZE,
406 .setkey = twofish_setkey,
407 .encrypt = cbc_encrypt,
408 .decrypt = cbc_decrypt,
409 },
410 },
411}, {
412 .cra_name = "__ctr-twofish-avx",
413 .cra_driver_name = "__driver-ctr-twofish-avx",
414 .cra_priority = 0,
415 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
416 .cra_blocksize = 1,
417 .cra_ctxsize = sizeof(struct twofish_ctx),
418 .cra_alignmask = 0,
419 .cra_type = &crypto_blkcipher_type,
420 .cra_module = THIS_MODULE,
421 .cra_list = LIST_HEAD_INIT(twofish_algs[2].cra_list),
422 .cra_u = {
423 .blkcipher = {
424 .min_keysize = TF_MIN_KEY_SIZE,
425 .max_keysize = TF_MAX_KEY_SIZE,
426 .ivsize = TF_BLOCK_SIZE,
427 .setkey = twofish_setkey,
428 .encrypt = ctr_crypt,
429 .decrypt = ctr_crypt,
430 },
431 },
432}, {
433 .cra_name = "__lrw-twofish-avx",
434 .cra_driver_name = "__driver-lrw-twofish-avx",
435 .cra_priority = 0,
436 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
437 .cra_blocksize = TF_BLOCK_SIZE,
438 .cra_ctxsize = sizeof(struct twofish_lrw_ctx),
439 .cra_alignmask = 0,
440 .cra_type = &crypto_blkcipher_type,
441 .cra_module = THIS_MODULE,
442 .cra_list = LIST_HEAD_INIT(twofish_algs[3].cra_list),
443 .cra_exit = lrw_twofish_exit_tfm,
444 .cra_u = {
445 .blkcipher = {
446 .min_keysize = TF_MIN_KEY_SIZE +
447 TF_BLOCK_SIZE,
448 .max_keysize = TF_MAX_KEY_SIZE +
449 TF_BLOCK_SIZE,
450 .ivsize = TF_BLOCK_SIZE,
451 .setkey = lrw_twofish_setkey,
452 .encrypt = lrw_encrypt,
453 .decrypt = lrw_decrypt,
454 },
455 },
456}, {
457 .cra_name = "__xts-twofish-avx",
458 .cra_driver_name = "__driver-xts-twofish-avx",
459 .cra_priority = 0,
460 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
461 .cra_blocksize = TF_BLOCK_SIZE,
462 .cra_ctxsize = sizeof(struct twofish_xts_ctx),
463 .cra_alignmask = 0,
464 .cra_type = &crypto_blkcipher_type,
465 .cra_module = THIS_MODULE,
466 .cra_list = LIST_HEAD_INIT(twofish_algs[4].cra_list),
467 .cra_u = {
468 .blkcipher = {
469 .min_keysize = TF_MIN_KEY_SIZE * 2,
470 .max_keysize = TF_MAX_KEY_SIZE * 2,
471 .ivsize = TF_BLOCK_SIZE,
472 .setkey = xts_twofish_setkey,
473 .encrypt = xts_encrypt,
474 .decrypt = xts_decrypt,
475 },
476 },
477}, {
478 .cra_name = "ecb(twofish)",
479 .cra_driver_name = "ecb-twofish-avx",
480 .cra_priority = 400,
481 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
482 .cra_blocksize = TF_BLOCK_SIZE,
483 .cra_ctxsize = sizeof(struct async_helper_ctx),
484 .cra_alignmask = 0,
485 .cra_type = &crypto_ablkcipher_type,
486 .cra_module = THIS_MODULE,
487 .cra_list = LIST_HEAD_INIT(twofish_algs[5].cra_list),
488 .cra_init = ablk_init,
489 .cra_exit = ablk_exit,
490 .cra_u = {
491 .ablkcipher = {
492 .min_keysize = TF_MIN_KEY_SIZE,
493 .max_keysize = TF_MAX_KEY_SIZE,
494 .setkey = ablk_set_key,
495 .encrypt = ablk_encrypt,
496 .decrypt = ablk_decrypt,
497 },
498 },
499}, {
500 .cra_name = "cbc(twofish)",
501 .cra_driver_name = "cbc-twofish-avx",
502 .cra_priority = 400,
503 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
504 .cra_blocksize = TF_BLOCK_SIZE,
505 .cra_ctxsize = sizeof(struct async_helper_ctx),
506 .cra_alignmask = 0,
507 .cra_type = &crypto_ablkcipher_type,
508 .cra_module = THIS_MODULE,
509 .cra_list = LIST_HEAD_INIT(twofish_algs[6].cra_list),
510 .cra_init = ablk_init,
511 .cra_exit = ablk_exit,
512 .cra_u = {
513 .ablkcipher = {
514 .min_keysize = TF_MIN_KEY_SIZE,
515 .max_keysize = TF_MAX_KEY_SIZE,
516 .ivsize = TF_BLOCK_SIZE,
517 .setkey = ablk_set_key,
518 .encrypt = __ablk_encrypt,
519 .decrypt = ablk_decrypt,
520 },
521 },
522}, {
523 .cra_name = "ctr(twofish)",
524 .cra_driver_name = "ctr-twofish-avx",
525 .cra_priority = 400,
526 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
527 .cra_blocksize = 1,
528 .cra_ctxsize = sizeof(struct async_helper_ctx),
529 .cra_alignmask = 0,
530 .cra_type = &crypto_ablkcipher_type,
531 .cra_module = THIS_MODULE,
532 .cra_list = LIST_HEAD_INIT(twofish_algs[7].cra_list),
533 .cra_init = ablk_init,
534 .cra_exit = ablk_exit,
535 .cra_u = {
536 .ablkcipher = {
537 .min_keysize = TF_MIN_KEY_SIZE,
538 .max_keysize = TF_MAX_KEY_SIZE,
539 .ivsize = TF_BLOCK_SIZE,
540 .setkey = ablk_set_key,
541 .encrypt = ablk_encrypt,
542 .decrypt = ablk_encrypt,
543 .geniv = "chainiv",
544 },
545 },
546}, {
547 .cra_name = "lrw(twofish)",
548 .cra_driver_name = "lrw-twofish-avx",
549 .cra_priority = 400,
550 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
551 .cra_blocksize = TF_BLOCK_SIZE,
552 .cra_ctxsize = sizeof(struct async_helper_ctx),
553 .cra_alignmask = 0,
554 .cra_type = &crypto_ablkcipher_type,
555 .cra_module = THIS_MODULE,
556 .cra_list = LIST_HEAD_INIT(twofish_algs[8].cra_list),
557 .cra_init = ablk_init,
558 .cra_exit = ablk_exit,
559 .cra_u = {
560 .ablkcipher = {
561 .min_keysize = TF_MIN_KEY_SIZE +
562 TF_BLOCK_SIZE,
563 .max_keysize = TF_MAX_KEY_SIZE +
564 TF_BLOCK_SIZE,
565 .ivsize = TF_BLOCK_SIZE,
566 .setkey = ablk_set_key,
567 .encrypt = ablk_encrypt,
568 .decrypt = ablk_decrypt,
569 },
570 },
571}, {
572 .cra_name = "xts(twofish)",
573 .cra_driver_name = "xts-twofish-avx",
574 .cra_priority = 400,
575 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
576 .cra_blocksize = TF_BLOCK_SIZE,
577 .cra_ctxsize = sizeof(struct async_helper_ctx),
578 .cra_alignmask = 0,
579 .cra_type = &crypto_ablkcipher_type,
580 .cra_module = THIS_MODULE,
581 .cra_list = LIST_HEAD_INIT(twofish_algs[9].cra_list),
582 .cra_init = ablk_init,
583 .cra_exit = ablk_exit,
584 .cra_u = {
585 .ablkcipher = {
586 .min_keysize = TF_MIN_KEY_SIZE * 2,
587 .max_keysize = TF_MAX_KEY_SIZE * 2,
588 .ivsize = TF_BLOCK_SIZE,
589 .setkey = ablk_set_key,
590 .encrypt = ablk_encrypt,
591 .decrypt = ablk_decrypt,
592 },
593 },
594} };
595
596static int __init twofish_init(void)
597{
598 u64 xcr0;
599
600 if (!cpu_has_avx || !cpu_has_osxsave) {
601 printk(KERN_INFO "AVX instructions are not detected.\n");
602 return -ENODEV;
603 }
604
605 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
606 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
607 printk(KERN_INFO "AVX detected but unusable.\n");
608 return -ENODEV;
609 }
610
611 return crypto_register_algs(twofish_algs, ARRAY_SIZE(twofish_algs));
612}
613
614static void __exit twofish_exit(void)
615{
616 crypto_unregister_algs(twofish_algs, ARRAY_SIZE(twofish_algs));
617}
618
619module_init(twofish_init);
620module_exit(twofish_exit);
621
622MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized");
623MODULE_LICENSE("GPL");
624MODULE_ALIAS("twofish");
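
[Editor's note] Both the AVX glue above and the 3-way glue below drive CTR mode from the same primitive: the IV is kept as a host-endian u128, bumped with u128_inc() (carry from the low into the high 64-bit half), and only converted to big-endian with u128_to_be128() when a counter block is actually fed to the cipher. A standalone sketch of that counter handling, assuming a little-endian host and the GCC/Clang __builtin_bswap64 builtin:

    #include <stdint.h>
    #include <stdio.h>

    struct u128  { uint64_t a, b; };  /* host-endian: a = high half, b = low half */
    struct be128 { uint64_t a, b; };  /* both halves stored big-endian */

    static uint64_t cpu_to_be64(uint64_t x)
    {
        return __builtin_bswap64(x);  /* little-endian host assumed */
    }

    static void u128_inc(struct u128 *i)
    {
        i->b++;
        if (!i->b)  /* low half wrapped around, carry into the high half */
            i->a++;
    }

    static void u128_to_be128(struct be128 *dst, const struct u128 *src)
    {
        dst->a = cpu_to_be64(src->a);
        dst->b = cpu_to_be64(src->b);
    }

    int main(void)
    {
        struct u128 ctr = { 0, UINT64_MAX };  /* next increment carries */
        struct be128 blk;

        u128_to_be128(&blk, &ctr);  /* this is what gets encrypted */
        u128_inc(&ctr);

        printf("after inc: high=%llu low=%llu\n",
               (unsigned long long)ctr.a, (unsigned long long)ctr.b);
        return 0;
    }

The keystream relation ciphertext = plaintext XOR E_K(counter) is also why the x-way CTR helpers can reuse the "encrypt and xor into dst" assembler entry points (serpent_enc_blk_xway_xor, twofish_enc_blk_xway_xor).
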
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 922ab24cce31..15f9347316c8 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -3,11 +3,6 @@
 *
 * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
- * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
- * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
@@ -33,20 +28,13 @@
 #include <crypto/algapi.h>
 #include <crypto/twofish.h>
 #include <crypto/b128ops.h>
+#include <asm/crypto/twofish.h>
+#include <asm/crypto/glue_helper.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
 
-/* regular block cipher functions from twofish_x86_64 module */
-asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src);
-
-/* 3-way parallel cipher functions */
-asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
-asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src);
+EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way);
+EXPORT_SYMBOL_GPL(twofish_dec_blk_3way);
 
 static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 const u8 *src)
@@ -60,311 +48,139 @@ static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst,
60 __twofish_enc_blk_3way(ctx, dst, src, true); 48 __twofish_enc_blk_3way(ctx, dst, src, true);
61} 49}
62 50
63static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, 51void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
64 void (*fn)(struct twofish_ctx *, u8 *, const u8 *),
65 void (*fn_3way)(struct twofish_ctx *, u8 *, const u8 *))
66{
67 struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
68 unsigned int bsize = TF_BLOCK_SIZE;
69 unsigned int nbytes;
70 int err;
71
72 err = blkcipher_walk_virt(desc, walk);
73
74 while ((nbytes = walk->nbytes)) {
75 u8 *wsrc = walk->src.virt.addr;
76 u8 *wdst = walk->dst.virt.addr;
77
78 /* Process three block batch */
79 if (nbytes >= bsize * 3) {
80 do {
81 fn_3way(ctx, wdst, wsrc);
82
83 wsrc += bsize * 3;
84 wdst += bsize * 3;
85 nbytes -= bsize * 3;
86 } while (nbytes >= bsize * 3);
87
88 if (nbytes < bsize)
89 goto done;
90 }
91
92 /* Handle leftovers */
93 do {
94 fn(ctx, wdst, wsrc);
95
96 wsrc += bsize;
97 wdst += bsize;
98 nbytes -= bsize;
99 } while (nbytes >= bsize);
100
101done:
102 err = blkcipher_walk_done(desc, walk, nbytes);
103 }
104
105 return err;
106}
107
108static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
109 struct scatterlist *src, unsigned int nbytes)
110{ 52{
111 struct blkcipher_walk walk; 53 u128 ivs[2];
112 54
113 blkcipher_walk_init(&walk, dst, src, nbytes); 55 ivs[0] = src[0];
114 return ecb_crypt(desc, &walk, twofish_enc_blk, twofish_enc_blk_3way); 56 ivs[1] = src[1];
115}
116 57
117static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 58 twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
118 struct scatterlist *src, unsigned int nbytes)
119{
120 struct blkcipher_walk walk;
121 59
122 blkcipher_walk_init(&walk, dst, src, nbytes); 60 u128_xor(&dst[1], &dst[1], &ivs[0]);
123 return ecb_crypt(desc, &walk, twofish_dec_blk, twofish_dec_blk_3way); 61 u128_xor(&dst[2], &dst[2], &ivs[1]);
124} 62}
63EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
125 64
126static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, 65void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
127 struct blkcipher_walk *walk)
128{
129 struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
130 unsigned int bsize = TF_BLOCK_SIZE;
131 unsigned int nbytes = walk->nbytes;
132 u128 *src = (u128 *)walk->src.virt.addr;
133 u128 *dst = (u128 *)walk->dst.virt.addr;
134 u128 *iv = (u128 *)walk->iv;
135
136 do {
137 u128_xor(dst, src, iv);
138 twofish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
139 iv = dst;
140
141 src += 1;
142 dst += 1;
143 nbytes -= bsize;
144 } while (nbytes >= bsize);
145
146 u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
147 return nbytes;
148}
149
150static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
151 struct scatterlist *src, unsigned int nbytes)
152{ 66{
153 struct blkcipher_walk walk; 67 be128 ctrblk;
154 int err;
155 68
156 blkcipher_walk_init(&walk, dst, src, nbytes); 69 if (dst != src)
157 err = blkcipher_walk_virt(desc, &walk); 70 *dst = *src;
158 71
159 while ((nbytes = walk.nbytes)) { 72 u128_to_be128(&ctrblk, iv);
160 nbytes = __cbc_encrypt(desc, &walk); 73 u128_inc(iv);
161 err = blkcipher_walk_done(desc, &walk, nbytes);
162 }
163 74
164 return err; 75 twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
76 u128_xor(dst, dst, (u128 *)&ctrblk);
165} 77}
78EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
166 79
167static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, 80void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
168 struct blkcipher_walk *walk) 81 u128 *iv)
169{ 82{
170 struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 83 be128 ctrblks[3];
171 unsigned int bsize = TF_BLOCK_SIZE;
172 unsigned int nbytes = walk->nbytes;
173 u128 *src = (u128 *)walk->src.virt.addr;
174 u128 *dst = (u128 *)walk->dst.virt.addr;
175 u128 ivs[3 - 1];
176 u128 last_iv;
177
178 /* Start of the last block. */
179 src += nbytes / bsize - 1;
180 dst += nbytes / bsize - 1;
181
182 last_iv = *src;
183
184 /* Process three block batch */
185 if (nbytes >= bsize * 3) {
186 do {
187 nbytes -= bsize * (3 - 1);
188 src -= 3 - 1;
189 dst -= 3 - 1;
190
191 ivs[0] = src[0];
192 ivs[1] = src[1];
193
194 twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
195
196 u128_xor(dst + 1, dst + 1, ivs + 0);
197 u128_xor(dst + 2, dst + 2, ivs + 1);
198
199 nbytes -= bsize;
200 if (nbytes < bsize)
201 goto done;
202
203 u128_xor(dst, dst, src - 1);
204 src -= 1;
205 dst -= 1;
206 } while (nbytes >= bsize * 3);
207
208 if (nbytes < bsize)
209 goto done;
210 }
211
212 /* Handle leftovers */
213 for (;;) {
214 twofish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
215
216 nbytes -= bsize;
217 if (nbytes < bsize)
218 break;
219 84
220 u128_xor(dst, dst, src - 1); 85 if (dst != src) {
221 src -= 1; 86 dst[0] = src[0];
222 dst -= 1; 87 dst[1] = src[1];
88 dst[2] = src[2];
223 } 89 }
224 90
225done: 91 u128_to_be128(&ctrblks[0], iv);
226 u128_xor(dst, dst, (u128 *)walk->iv); 92 u128_inc(iv);
227 *(u128 *)walk->iv = last_iv; 93 u128_to_be128(&ctrblks[1], iv);
94 u128_inc(iv);
95 u128_to_be128(&ctrblks[2], iv);
96 u128_inc(iv);
228 97
229 return nbytes; 98 twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
230} 99}
100EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way);
101
102static const struct common_glue_ctx twofish_enc = {
103 .num_funcs = 2,
104 .fpu_blocks_limit = -1,
105
106 .funcs = { {
107 .num_blocks = 3,
108 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
109 }, {
110 .num_blocks = 1,
111 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
112 } }
113};
231 114
232static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 115static const struct common_glue_ctx twofish_ctr = {
233 struct scatterlist *src, unsigned int nbytes) 116 .num_funcs = 2,
234{ 117 .fpu_blocks_limit = -1,
235 struct blkcipher_walk walk; 118
236 int err; 119 .funcs = { {
237 120 .num_blocks = 3,
238 blkcipher_walk_init(&walk, dst, src, nbytes); 121 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr_3way) }
239 err = blkcipher_walk_virt(desc, &walk); 122 }, {
123 .num_blocks = 1,
124 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr) }
125 } }
126};
240 127
241 while ((nbytes = walk.nbytes)) { 128static const struct common_glue_ctx twofish_dec = {
242 nbytes = __cbc_decrypt(desc, &walk); 129 .num_funcs = 2,
243 err = blkcipher_walk_done(desc, &walk, nbytes); 130 .fpu_blocks_limit = -1,
244 } 131
132 .funcs = { {
133 .num_blocks = 3,
134 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
135 }, {
136 .num_blocks = 1,
137 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
138 } }
139};
245 140
246 return err; 141static const struct common_glue_ctx twofish_dec_cbc = {
247} 142 .num_funcs = 2,
143 .fpu_blocks_limit = -1,
144
145 .funcs = { {
146 .num_blocks = 3,
147 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
148 }, {
149 .num_blocks = 1,
150 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
151 } }
152};
248 153
249static inline void u128_to_be128(be128 *dst, const u128 *src) 154static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
155 struct scatterlist *src, unsigned int nbytes)
250{ 156{
251 dst->a = cpu_to_be64(src->a); 157 return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
252 dst->b = cpu_to_be64(src->b);
253} 158}
254 159
255static inline void be128_to_u128(u128 *dst, const be128 *src) 160static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
161 struct scatterlist *src, unsigned int nbytes)
256{ 162{
257 dst->a = be64_to_cpu(src->a); 163 return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
258 dst->b = be64_to_cpu(src->b);
259} 164}
260 165
261static inline void u128_inc(u128 *i) 166static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
167 struct scatterlist *src, unsigned int nbytes)
262{ 168{
263 i->b++; 169 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
264 if (!i->b) 170 dst, src, nbytes);
265 i->a++;
266} 171}
267 172
268static void ctr_crypt_final(struct blkcipher_desc *desc, 173static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
269 struct blkcipher_walk *walk) 174 struct scatterlist *src, unsigned int nbytes)
270{ 175{
271 struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 176 return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
272 u8 *ctrblk = walk->iv; 177 nbytes);
273 u8 keystream[TF_BLOCK_SIZE];
274 u8 *src = walk->src.virt.addr;
275 u8 *dst = walk->dst.virt.addr;
276 unsigned int nbytes = walk->nbytes;
277
278 twofish_enc_blk(ctx, keystream, ctrblk);
279 crypto_xor(keystream, src, nbytes);
280 memcpy(dst, keystream, nbytes);
281
282 crypto_inc(ctrblk, TF_BLOCK_SIZE);
283}
284
285static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
286 struct blkcipher_walk *walk)
287{
288 struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
289 unsigned int bsize = TF_BLOCK_SIZE;
290 unsigned int nbytes = walk->nbytes;
291 u128 *src = (u128 *)walk->src.virt.addr;
292 u128 *dst = (u128 *)walk->dst.virt.addr;
293 u128 ctrblk;
294 be128 ctrblocks[3];
295
296 be128_to_u128(&ctrblk, (be128 *)walk->iv);
297
298 /* Process three block batch */
299 if (nbytes >= bsize * 3) {
300 do {
301 if (dst != src) {
302 dst[0] = src[0];
303 dst[1] = src[1];
304 dst[2] = src[2];
305 }
306
307 /* create ctrblks for parallel encrypt */
308 u128_to_be128(&ctrblocks[0], &ctrblk);
309 u128_inc(&ctrblk);
310 u128_to_be128(&ctrblocks[1], &ctrblk);
311 u128_inc(&ctrblk);
312 u128_to_be128(&ctrblocks[2], &ctrblk);
313 u128_inc(&ctrblk);
314
315 twofish_enc_blk_xor_3way(ctx, (u8 *)dst,
316 (u8 *)ctrblocks);
317
318 src += 3;
319 dst += 3;
320 nbytes -= bsize * 3;
321 } while (nbytes >= bsize * 3);
322
323 if (nbytes < bsize)
324 goto done;
325 }
326
327 /* Handle leftovers */
328 do {
329 if (dst != src)
330 *dst = *src;
331
332 u128_to_be128(&ctrblocks[0], &ctrblk);
333 u128_inc(&ctrblk);
334
335 twofish_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
336 u128_xor(dst, dst, (u128 *)ctrblocks);
337
338 src += 1;
339 dst += 1;
340 nbytes -= bsize;
341 } while (nbytes >= bsize);
342
343done:
344 u128_to_be128((be128 *)walk->iv, &ctrblk);
345 return nbytes;
346} 178}
347 179
348static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, 180static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
349 struct scatterlist *src, unsigned int nbytes) 181 struct scatterlist *src, unsigned int nbytes)
350{ 182{
351 struct blkcipher_walk walk; 183 return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
352 int err;
353
354 blkcipher_walk_init(&walk, dst, src, nbytes);
355 err = blkcipher_walk_virt_block(desc, &walk, TF_BLOCK_SIZE);
356
357 while ((nbytes = walk.nbytes) >= TF_BLOCK_SIZE) {
358 nbytes = __ctr_crypt(desc, &walk);
359 err = blkcipher_walk_done(desc, &walk, nbytes);
360 }
361
362 if (walk.nbytes) {
363 ctr_crypt_final(desc, &walk);
364 err = blkcipher_walk_done(desc, &walk, 0);
365 }
366
367 return err;
368} 184}
369 185
370static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) 186static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
@@ -397,13 +213,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
397 twofish_dec_blk(ctx, srcdst, srcdst); 213 twofish_dec_blk(ctx, srcdst, srcdst);
398} 214}
399 215
400struct twofish_lrw_ctx { 216int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
401 struct lrw_table_ctx lrw_table; 217 unsigned int keylen)
402 struct twofish_ctx twofish_ctx;
403};
404
405static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
406 unsigned int keylen)
407{ 218{
408 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); 219 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
409 int err; 220 int err;
@@ -415,6 +226,7 @@ static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
415 226
416 return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE); 227 return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE);
417} 228}
229EXPORT_SYMBOL_GPL(lrw_twofish_setkey);
418 230
419static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 231static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
420 struct scatterlist *src, unsigned int nbytes) 232 struct scatterlist *src, unsigned int nbytes)
@@ -450,20 +262,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
450 return lrw_crypt(desc, dst, src, nbytes, &req); 262 return lrw_crypt(desc, dst, src, nbytes, &req);
451} 263}
452 264
453static void lrw_exit_tfm(struct crypto_tfm *tfm) 265void lrw_twofish_exit_tfm(struct crypto_tfm *tfm)
454{ 266{
455 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); 267 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
456 268
457 lrw_free_table(&ctx->lrw_table); 269 lrw_free_table(&ctx->lrw_table);
458} 270}
271EXPORT_SYMBOL_GPL(lrw_twofish_exit_tfm);
459 272
460struct twofish_xts_ctx { 273int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
461 struct twofish_ctx tweak_ctx; 274 unsigned int keylen)
462 struct twofish_ctx crypt_ctx;
463};
464
465static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
466 unsigned int keylen)
467{ 275{
468 struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm); 276 struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm);
469 u32 *flags = &tfm->crt_flags; 277 u32 *flags = &tfm->crt_flags;
@@ -486,6 +294,7 @@ static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
486 return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, 294 return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
487 flags); 295 flags);
488} 296}
297EXPORT_SYMBOL_GPL(xts_twofish_setkey);
489 298
490static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 299static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
491 struct scatterlist *src, unsigned int nbytes) 300 struct scatterlist *src, unsigned int nbytes)
@@ -596,7 +405,7 @@ static struct crypto_alg tf_algs[5] = { {
596 .cra_type = &crypto_blkcipher_type, 405 .cra_type = &crypto_blkcipher_type,
597 .cra_module = THIS_MODULE, 406 .cra_module = THIS_MODULE,
598 .cra_list = LIST_HEAD_INIT(tf_algs[3].cra_list), 407 .cra_list = LIST_HEAD_INIT(tf_algs[3].cra_list),
599 .cra_exit = lrw_exit_tfm, 408 .cra_exit = lrw_twofish_exit_tfm,
600 .cra_u = { 409 .cra_u = {
601 .blkcipher = { 410 .blkcipher = {
602 .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE, 411 .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE,
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 49331bedc158..70780689599a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -75,23 +75,54 @@ static inline int alternatives_text_reserved(void *start, void *end)
75} 75}
76#endif /* CONFIG_SMP */ 76#endif /* CONFIG_SMP */
77 77
78#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n"
79
80#define b_replacement(number) "663"#number
81#define e_replacement(number) "664"#number
82
83#define alt_slen "662b-661b"
84#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
85
86#define ALTINSTR_ENTRY(feature, number) \
87 " .long 661b - .\n" /* label */ \
88 " .long " b_replacement(number)"f - .\n" /* new instruction */ \
89 " .word " __stringify(feature) "\n" /* feature bit */ \
90 " .byte " alt_slen "\n" /* source len */ \
91 " .byte " alt_rlen(number) "\n" /* replacement len */
92
93#define DISCARD_ENTRY(number) /* rlen <= slen */ \
94 " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
95
96#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \
97 b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
98
78/* alternative assembly primitive: */ 99/* alternative assembly primitive: */
79#define ALTERNATIVE(oldinstr, newinstr, feature) \ 100#define ALTERNATIVE(oldinstr, newinstr, feature) \
80 \ 101 OLDINSTR(oldinstr) \
81 "661:\n\t" oldinstr "\n662:\n" \ 102 ".section .altinstructions,\"a\"\n" \
82 ".section .altinstructions,\"a\"\n" \ 103 ALTINSTR_ENTRY(feature, 1) \
83 " .long 661b - .\n" /* label */ \ 104 ".previous\n" \
84 " .long 663f - .\n" /* new instruction */ \ 105 ".section .discard,\"aw\",@progbits\n" \
85 " .word " __stringify(feature) "\n" /* feature bit */ \ 106 DISCARD_ENTRY(1) \
86 " .byte 662b-661b\n" /* sourcelen */ \ 107 ".previous\n" \
87 " .byte 664f-663f\n" /* replacementlen */ \ 108 ".section .altinstr_replacement, \"ax\"\n" \
88 ".previous\n" \ 109 ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
89 ".section .discard,\"aw\",@progbits\n" \ 110 ".previous"
90 " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ 111
91 ".previous\n" \ 112#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
92 ".section .altinstr_replacement, \"ax\"\n" \ 113 OLDINSTR(oldinstr) \
93 "663:\n\t" newinstr "\n664:\n" /* replacement */ \ 114 ".section .altinstructions,\"a\"\n" \
94 ".previous" 115 ALTINSTR_ENTRY(feature1, 1) \
116 ALTINSTR_ENTRY(feature2, 2) \
117 ".previous\n" \
118 ".section .discard,\"aw\",@progbits\n" \
119 DISCARD_ENTRY(1) \
120 DISCARD_ENTRY(2) \
121 ".previous\n" \
122 ".section .altinstr_replacement, \"ax\"\n" \
123 ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
124 ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
125 ".previous"
95 126
96/* 127/*
97 * This must be included *after* the definition of ALTERNATIVE due to 128 * This must be included *after* the definition of ALTERNATIVE due to
@@ -140,6 +171,19 @@ static inline int alternatives_text_reserved(void *start, void *end)
140 : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) 171 : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
141 172
142/* 173/*
174 * Like alternative_call, but there are two features and respective functions.
175 * If CPU has feature2, function2 is used.
176 * Otherwise, if CPU has feature1, function1 is used.
177 * Otherwise, old function is used.
178 */
179#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \
180 output, input...) \
181 asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
182 "call %P[new2]", feature2) \
183 : output : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
184 [new2] "i" (newfunc2), ## input)
185
186/*
143 * use this macro(s) if you need more than one output parameter 187 * use this macro(s) if you need more than one output parameter
144 * in alternative_io 188 * in alternative_io
145 */ 189 */
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 49ad773f4b9f..b3341e9cd8fd 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -26,10 +26,31 @@ struct amd_l3_cache {
26 u8 subcaches[4]; 26 u8 subcaches[4];
27}; 27};
28 28
29struct threshold_block {
30 unsigned int block;
31 unsigned int bank;
32 unsigned int cpu;
33 u32 address;
34 u16 interrupt_enable;
35 bool interrupt_capable;
36 u16 threshold_limit;
37 struct kobject kobj;
38 struct list_head miscj;
39};
40
41struct threshold_bank {
42 struct kobject *kobj;
43 struct threshold_block *blocks;
44
45 /* initialized to the number of CPUs on the node sharing this bank */
46 atomic_t cpus;
47};
48
29struct amd_northbridge { 49struct amd_northbridge {
30 struct pci_dev *misc; 50 struct pci_dev *misc;
31 struct pci_dev *link; 51 struct pci_dev *link;
32 struct amd_l3_cache l3_cache; 52 struct amd_l3_cache l3_cache;
53 struct threshold_bank *bank4;
33}; 54};
34 55
35struct amd_northbridge_info { 56struct amd_northbridge_info {
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index eaff4790ed96..f34261296ffb 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -306,7 +306,8 @@ struct apic {
306 unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); 306 unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
307 unsigned long (*check_apicid_present)(int apicid); 307 unsigned long (*check_apicid_present)(int apicid);
308 308
309 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); 309 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask,
310 const struct cpumask *mask);
310 void (*init_apic_ldr)(void); 311 void (*init_apic_ldr)(void);
311 312
312 void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); 313 void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
@@ -331,9 +332,9 @@ struct apic {
331 unsigned long (*set_apic_id)(unsigned int id); 332 unsigned long (*set_apic_id)(unsigned int id);
332 unsigned long apic_id_mask; 333 unsigned long apic_id_mask;
333 334
334 unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); 335 int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
335 unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, 336 const struct cpumask *andmask,
336 const struct cpumask *andmask); 337 unsigned int *apicid);
337 338
338 /* ipi */ 339 /* ipi */
339 void (*send_IPI_mask)(const struct cpumask *mask, int vector); 340 void (*send_IPI_mask)(const struct cpumask *mask, int vector);
@@ -464,6 +465,8 @@ static inline u32 safe_apic_wait_icr_idle(void)
464 return apic->safe_wait_icr_idle(); 465 return apic->safe_wait_icr_idle();
465} 466}
466 467
468extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v));
469
467#else /* CONFIG_X86_LOCAL_APIC */ 470#else /* CONFIG_X86_LOCAL_APIC */
468 471
469static inline u32 apic_read(u32 reg) { return 0; } 472static inline u32 apic_read(u32 reg) { return 0; }
@@ -473,6 +476,7 @@ static inline u64 apic_icr_read(void) { return 0; }
473static inline void apic_icr_write(u32 low, u32 high) { } 476static inline void apic_icr_write(u32 low, u32 high) { }
474static inline void apic_wait_icr_idle(void) { } 477static inline void apic_wait_icr_idle(void) { }
475static inline u32 safe_apic_wait_icr_idle(void) { return 0; } 478static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
479static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}
476 480
477#endif /* CONFIG_X86_LOCAL_APIC */ 481#endif /* CONFIG_X86_LOCAL_APIC */
478 482
@@ -537,7 +541,12 @@ static inline const struct cpumask *default_target_cpus(void)
537#endif 541#endif
538} 542}
539 543
540DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); 544static inline const struct cpumask *online_target_cpus(void)
545{
546 return cpu_online_mask;
547}
548
549DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
541 550
542 551
543static inline unsigned int read_apic_id(void) 552static inline unsigned int read_apic_id(void)
@@ -586,21 +595,50 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
586 595
587#endif 596#endif
588 597
589static inline unsigned int 598static inline int
590default_cpu_mask_to_apicid(const struct cpumask *cpumask) 599flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
600 const struct cpumask *andmask,
601 unsigned int *apicid)
591{ 602{
592 return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; 603 unsigned long cpu_mask = cpumask_bits(cpumask)[0] &
604 cpumask_bits(andmask)[0] &
605 cpumask_bits(cpu_online_mask)[0] &
606 APIC_ALL_CPUS;
607
608 if (likely(cpu_mask)) {
609 *apicid = (unsigned int)cpu_mask;
610 return 0;
611 } else {
612 return -EINVAL;
613 }
593} 614}
594 615
595static inline unsigned int 616extern int
596default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 617default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
597 const struct cpumask *andmask) 618 const struct cpumask *andmask,
619 unsigned int *apicid);
620
621static inline void
622flat_vector_allocation_domain(int cpu, struct cpumask *retmask,
623 const struct cpumask *mask)
598{ 624{
599 unsigned long mask1 = cpumask_bits(cpumask)[0]; 625 /* Careful. Some cpus do not strictly honor the set of cpus
600 unsigned long mask2 = cpumask_bits(andmask)[0]; 626 * specified in the interrupt destination when using lowest
601 unsigned long mask3 = cpumask_bits(cpu_online_mask)[0]; 627 * priority interrupt delivery mode.
628 *
629 * In particular there was a hyperthreading cpu observed to
630 * deliver interrupts to the wrong hyperthread when only one
 631 * hyperthread was specified in the interrupt destination.
632 */
633 cpumask_clear(retmask);
634 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
635}
602 636
603 return (unsigned int)(mask1 & mask2 & mask3); 637static inline void
638default_vector_allocation_domain(int cpu, struct cpumask *retmask,
639 const struct cpumask *mask)
640{
641 cpumask_copy(retmask, cpumask_of(cpu));
604} 642}
605 643
606static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) 644static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
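Illustration, not part of the patch: with the new signature, cpu_mask_to_apicid_and() returns -EINVAL and hands the destination APIC ID back through a pointer, so callers can bail out cleanly when the requested mask contains no usable CPU. demo_compose_dest() is a hypothetical caller.

#include <asm/apic.h>

static int demo_compose_dest(const struct cpumask *mask, unsigned int *dest)
{
	int err;

	err = apic->cpu_mask_to_apicid_and(mask, apic->target_cpus(), dest);
	if (err)
		return err;	/* no online CPU in the requested mask */

	/* *dest now holds the destination APIC ID */
	return 0;
}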
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index a6983b277220..72f5009deb5a 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -264,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
264 * This operation is non-atomic and can be reordered. 264 * This operation is non-atomic and can be reordered.
265 * If two examples of this operation race, one can appear to succeed 265 * If two examples of this operation race, one can appear to succeed
266 * but actually fail. You must protect multiple accesses with a lock. 266 * but actually fail. You must protect multiple accesses with a lock.
267 *
268 * Note: the operation is performed atomically with respect to
269 * the local CPU, but not other CPUs. Portable code should not
270 * rely on this behaviour.
271 * KVM relies on this behaviour on x86 for modifying memory that is also
272 * accessed from a hypervisor on the same CPU if running in a VM: don't change
273 * this without also updating arch/x86/kernel/kvm.c
267 */ 274 */
268static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) 275static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
269{ 276{
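Illustration, not part of the patch: because __test_and_clear_bit() is only atomic with respect to the local CPU, cross-CPU users still need their own serialization. A minimal sketch, with hypothetical names:

#include <linux/spinlock.h>
#include <linux/bitops.h>

static DEFINE_SPINLOCK(pending_lock);
static unsigned long pending_flags;

static int take_pending(int nr)
{
	int was_set;

	spin_lock(&pending_lock);
	/* the non-atomic variant is fine here: the lock provides exclusion */
	was_set = __test_and_clear_bit(nr, &pending_flags);
	spin_unlock(&pending_lock);

	return was_set;
}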
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index eb45aa6b1f27..2ad874cb661c 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -66,6 +66,7 @@ struct setup_header {
66 __u64 setup_data; 66 __u64 setup_data;
67 __u64 pref_address; 67 __u64 pref_address;
68 __u32 init_size; 68 __u32 init_size;
69 __u32 handover_offset;
69} __attribute__((packed)); 70} __attribute__((packed));
70 71
71struct sys_desc_table { 72struct sys_desc_table {
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index f91e80f4f180..6b7ee5ff6820 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -207,6 +207,8 @@
207#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ 207#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
208#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ 208#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */
209#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ 209#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */
210#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */
211#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */
210 212
211#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 213#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
212 214
diff --git a/arch/x86/include/asm/crypto/ablk_helper.h b/arch/x86/include/asm/crypto/ablk_helper.h
new file mode 100644
index 000000000000..4f93df50c23e
--- /dev/null
+++ b/arch/x86/include/asm/crypto/ablk_helper.h
@@ -0,0 +1,31 @@
1/*
2 * Shared async block cipher helpers
3 */
4
5#ifndef _CRYPTO_ABLK_HELPER_H
6#define _CRYPTO_ABLK_HELPER_H
7
8#include <linux/crypto.h>
9#include <linux/kernel.h>
10#include <crypto/cryptd.h>
11
12struct async_helper_ctx {
13 struct cryptd_ablkcipher *cryptd_tfm;
14};
15
16extern int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
17 unsigned int key_len);
18
19extern int __ablk_encrypt(struct ablkcipher_request *req);
20
21extern int ablk_encrypt(struct ablkcipher_request *req);
22
23extern int ablk_decrypt(struct ablkcipher_request *req);
24
25extern void ablk_exit(struct crypto_tfm *tfm);
26
27extern int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name);
28
29extern int ablk_init(struct crypto_tfm *tfm);
30
31#endif /* _CRYPTO_ABLK_HELPER_H */
diff --git a/arch/x86/include/asm/aes.h b/arch/x86/include/asm/crypto/aes.h
index 80545a1cbe39..80545a1cbe39 100644
--- a/arch/x86/include/asm/aes.h
+++ b/arch/x86/include/asm/crypto/aes.h
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
new file mode 100644
index 000000000000..3e408bddc96f
--- /dev/null
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -0,0 +1,115 @@
1/*
2 * Shared glue code for 128bit block ciphers
3 */
4
5#ifndef _CRYPTO_GLUE_HELPER_H
6#define _CRYPTO_GLUE_HELPER_H
7
8#include <linux/kernel.h>
9#include <linux/crypto.h>
10#include <asm/i387.h>
11#include <crypto/b128ops.h>
12
13typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
14typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
15typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
16 u128 *iv);
17
18#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
19#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
20#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn))
21
22struct common_glue_func_entry {
23 unsigned int num_blocks; /* number of blocks that @fn will process */
24 union {
25 common_glue_func_t ecb;
26 common_glue_cbc_func_t cbc;
27 common_glue_ctr_func_t ctr;
28 } fn_u;
29};
30
31struct common_glue_ctx {
32 unsigned int num_funcs;
33 int fpu_blocks_limit; /* -1 means fpu not needed at all */
34
35 /*
36 * First funcs entry must have largest num_blocks and last funcs entry
37 * must have num_blocks == 1!
38 */
39 struct common_glue_func_entry funcs[];
40};
41
42static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit,
43 struct blkcipher_desc *desc,
44 bool fpu_enabled, unsigned int nbytes)
45{
46 if (likely(fpu_blocks_limit < 0))
47 return false;
48
49 if (fpu_enabled)
50 return true;
51
52 /*
52 * Vector registers are only used when the chunk to be processed is large
53 * enough, so do not enable the FPU until it is necessary.

55 */
56 if (nbytes < bsize * (unsigned int)fpu_blocks_limit)
57 return false;
58
59 if (desc) {
60 /* prevent sleeping if FPU is in use */
61 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
62 }
63
64 kernel_fpu_begin();
65 return true;
66}
67
68static inline void glue_fpu_end(bool fpu_enabled)
69{
70 if (fpu_enabled)
71 kernel_fpu_end();
72}
73
74static inline void u128_to_be128(be128 *dst, const u128 *src)
75{
76 dst->a = cpu_to_be64(src->a);
77 dst->b = cpu_to_be64(src->b);
78}
79
80static inline void be128_to_u128(u128 *dst, const be128 *src)
81{
82 dst->a = be64_to_cpu(src->a);
83 dst->b = be64_to_cpu(src->b);
84}
85
86static inline void u128_inc(u128 *i)
87{
88 i->b++;
89 if (!i->b)
90 i->a++;
91}
92
93extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
94 struct blkcipher_desc *desc,
95 struct scatterlist *dst,
96 struct scatterlist *src, unsigned int nbytes);
97
98extern int glue_cbc_encrypt_128bit(const common_glue_func_t fn,
99 struct blkcipher_desc *desc,
100 struct scatterlist *dst,
101 struct scatterlist *src,
102 unsigned int nbytes);
103
104extern int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
105 struct blkcipher_desc *desc,
106 struct scatterlist *dst,
107 struct scatterlist *src,
108 unsigned int nbytes);
109
110extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
111 struct blkcipher_desc *desc,
112 struct scatterlist *dst,
113 struct scatterlist *src, unsigned int nbytes);
114
115#endif /* _CRYPTO_GLUE_HELPER_H */
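Illustration, not part of the patch: a minimal user of the shared glue code, assuming a hypothetical cipher "foo" with a 4-way parallel ECB routine; foo_ctx, foo_enc_blk() and foo_enc_blk_4way() are made-up names (the twofish conversion earlier in this patch is the real in-tree example).

#include <linux/crypto.h>
#include <asm/crypto/glue_helper.h>

struct foo_ctx;

asmlinkage void foo_enc_blk(struct foo_ctx *ctx, u8 *dst, const u8 *src);
asmlinkage void foo_enc_blk_4way(struct foo_ctx *ctx, u8 *dst, const u8 *src);

static const struct common_glue_ctx foo_enc = {
	.num_funcs = 2,
	.fpu_blocks_limit = -1,	/* integer-only implementation, no FPU needed */

	.funcs = { {
		.num_blocks = 4,	/* largest batch size first */
		.fn_u = { .ecb = GLUE_FUNC_CAST(foo_enc_blk_4way) }
	}, {
		.num_blocks = 1,	/* last entry must handle single blocks */
		.fn_u = { .ecb = GLUE_FUNC_CAST(foo_enc_blk) }
	} }
};

static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_ecb_crypt_128bit(&foo_enc, desc, dst, src, nbytes);
}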
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
new file mode 100644
index 000000000000..432deedd2945
--- /dev/null
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -0,0 +1,32 @@
1#ifndef ASM_X86_SERPENT_AVX_H
2#define ASM_X86_SERPENT_AVX_H
3
4#include <linux/crypto.h>
5#include <crypto/serpent.h>
6
7#define SERPENT_PARALLEL_BLOCKS 8
8
9asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
10 const u8 *src, bool xor);
11asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
12 const u8 *src);
13
14static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
15 const u8 *src)
16{
17 __serpent_enc_blk_8way_avx(ctx, dst, src, false);
18}
19
20static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
21 const u8 *src)
22{
23 __serpent_enc_blk_8way_avx(ctx, dst, src, true);
24}
25
26static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
27 const u8 *src)
28{
29 serpent_dec_blk_8way_avx(ctx, dst, src);
30}
31
32#endif
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/crypto/serpent-sse2.h
index d3ef63fe0c81..e6e77dffbdab 100644
--- a/arch/x86/include/asm/serpent.h
+++ b/arch/x86/include/asm/crypto/serpent-sse2.h
@@ -1,5 +1,5 @@
1#ifndef ASM_X86_SERPENT_H 1#ifndef ASM_X86_SERPENT_SSE2_H
2#define ASM_X86_SERPENT_H 2#define ASM_X86_SERPENT_SSE2_H
3 3
4#include <linux/crypto.h> 4#include <linux/crypto.h>
5#include <crypto/serpent.h> 5#include <crypto/serpent.h>
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h
new file mode 100644
index 000000000000..9d2c514bd5f9
--- /dev/null
+++ b/arch/x86/include/asm/crypto/twofish.h
@@ -0,0 +1,46 @@
1#ifndef ASM_X86_TWOFISH_H
2#define ASM_X86_TWOFISH_H
3
4#include <linux/crypto.h>
5#include <crypto/twofish.h>
6#include <crypto/lrw.h>
7#include <crypto/b128ops.h>
8
9struct twofish_lrw_ctx {
10 struct lrw_table_ctx lrw_table;
11 struct twofish_ctx twofish_ctx;
12};
13
14struct twofish_xts_ctx {
15 struct twofish_ctx tweak_ctx;
16 struct twofish_ctx crypt_ctx;
17};
18
19/* regular block cipher functions from twofish_x86_64 module */
20asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
21 const u8 *src);
22asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
23 const u8 *src);
24
25/* 3-way parallel cipher functions */
26asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
27 const u8 *src, bool xor);
28asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
29 const u8 *src);
30
31/* helpers from twofish_x86_64-3way module */
32extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
33extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
34 u128 *iv);
35extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
36 u128 *iv);
37
38extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
39 unsigned int keylen);
40
41extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm);
42
43extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
44 unsigned int keylen);
45
46#endif /* ASM_X86_TWOFISH_H */
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
index cc70c1c78ca4..75ce3f47d204 100644
--- a/arch/x86/include/asm/emergency-restart.h
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -4,9 +4,7 @@
4enum reboot_type { 4enum reboot_type {
5 BOOT_TRIPLE = 't', 5 BOOT_TRIPLE = 't',
6 BOOT_KBD = 'k', 6 BOOT_KBD = 'k',
7#ifdef CONFIG_X86_32
8 BOOT_BIOS = 'b', 7 BOOT_BIOS = 'b',
9#endif
10 BOOT_ACPI = 'a', 8 BOOT_ACPI = 'a',
11 BOOT_EFI = 'e', 9 BOOT_EFI = 'e',
12 BOOT_CF9 = 'p', 10 BOOT_CF9 = 'p',
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 0baa628e330c..40afa0005c69 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -15,15 +15,6 @@ BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) 15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) 17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
18
19.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
20 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
21.if NUM_INVALIDATE_TLB_VECTORS > \idx
22BUILD_INTERRUPT3(invalidate_interrupt\idx,
23 (INVALIDATE_TLB_VECTOR_START)+\idx,
24 smp_invalidate_interrupt)
25.endif
26.endr
27#endif 18#endif
28 19
29BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) 20BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
index dbe82a5c5eac..d3d74698dce9 100644
--- a/arch/x86/include/asm/floppy.h
+++ b/arch/x86/include/asm/floppy.h
@@ -99,7 +99,7 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id)
99 virtual_dma_residue += virtual_dma_count; 99 virtual_dma_residue += virtual_dma_count;
100 virtual_dma_count = 0; 100 virtual_dma_count = 0;
101#ifdef TRACE_FLPY_INT 101#ifdef TRACE_FLPY_INT
102 printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", 102 printk(KERN_DEBUG "count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
103 virtual_dma_count, virtual_dma_residue, calls, bytes, 103 virtual_dma_count, virtual_dma_residue, calls, bytes,
104 dma_wait); 104 dma_wait);
105 calls = 0; 105 calls = 0;
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 7a15153c675d..b518c7509933 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper;
49extern const struct hypervisor_x86 x86_hyper_vmware; 49extern const struct hypervisor_x86 x86_hyper_vmware;
50extern const struct hypervisor_x86 x86_hyper_ms_hyperv; 50extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
51extern const struct hypervisor_x86 x86_hyper_xen_hvm; 51extern const struct hypervisor_x86 x86_hyper_xen_hvm;
52extern const struct hypervisor_x86 x86_hyper_kvm;
52 53
53static inline bool hypervisor_x2apic_available(void) 54static inline bool hypervisor_x2apic_available(void)
54{ 55{
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index dffc38ee6255..345c99cef152 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -5,7 +5,6 @@ extern struct dma_map_ops nommu_dma_ops;
5extern int force_iommu, no_iommu; 5extern int force_iommu, no_iommu;
6extern int iommu_detected; 6extern int iommu_detected;
7extern int iommu_pass_through; 7extern int iommu_pass_through;
8extern int iommu_group_mf;
9 8
10/* 10 seconds */ 9/* 10 seconds */
11#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) 10#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 4b4448761e88..1508e518c7e3 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -119,17 +119,6 @@
119 */ 119 */
120#define LOCAL_TIMER_VECTOR 0xef 120#define LOCAL_TIMER_VECTOR 0xef
121 121
122/* up to 32 vectors used for spreading out TLB flushes: */
123#if NR_CPUS <= 32
124# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS)
125#else
126# define NUM_INVALIDATE_TLB_VECTORS (32)
127#endif
128
129#define INVALIDATE_TLB_VECTOR_END (0xee)
130#define INVALIDATE_TLB_VECTOR_START \
131 (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
132
133#define NR_VECTORS 256 122#define NR_VECTORS 256
134 123
135#define FPU_IRQ 13 124#define FPU_IRQ 13
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index e7d1c194d272..246617efd67f 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -12,6 +12,7 @@
12/* Select x86 specific features in <linux/kvm.h> */ 12/* Select x86 specific features in <linux/kvm.h> */
13#define __KVM_HAVE_PIT 13#define __KVM_HAVE_PIT
14#define __KVM_HAVE_IOAPIC 14#define __KVM_HAVE_IOAPIC
15#define __KVM_HAVE_IRQ_LINE
15#define __KVM_HAVE_DEVICE_ASSIGNMENT 16#define __KVM_HAVE_DEVICE_ASSIGNMENT
16#define __KVM_HAVE_MSI 17#define __KVM_HAVE_MSI
17#define __KVM_HAVE_USER_NMI 18#define __KVM_HAVE_USER_NMI
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1ac46c22dd50..c764f43b71c5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -192,8 +192,8 @@ struct x86_emulate_ops {
192 struct x86_instruction_info *info, 192 struct x86_instruction_info *info,
193 enum x86_intercept_stage stage); 193 enum x86_intercept_stage stage);
194 194
195 bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, 195 void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
196 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); 196 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
197}; 197};
198 198
199typedef u32 __attribute__((vector_size(16))) sse128_t; 199typedef u32 __attribute__((vector_size(16))) sse128_t;
@@ -280,9 +280,9 @@ struct x86_emulate_ctxt {
280 u8 modrm_seg; 280 u8 modrm_seg;
281 bool rip_relative; 281 bool rip_relative;
282 unsigned long _eip; 282 unsigned long _eip;
283 struct operand memop;
283 /* Fields above regs are cleared together. */ 284 /* Fields above regs are cleared together. */
284 unsigned long regs[NR_VCPU_REGS]; 285 unsigned long regs[NR_VCPU_REGS];
285 struct operand memop;
286 struct operand *memopp; 286 struct operand *memopp;
287 struct fetch_cache fetch; 287 struct fetch_cache fetch;
288 struct read_cache io_read; 288 struct read_cache io_read;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index db7c1f2709a2..09155d64cf7e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,12 +48,13 @@
48 48
49#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) 49#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
50#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) 50#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
51#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL
51#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 52#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
52 0xFFFFFF0000000000ULL) 53 0xFFFFFF0000000000ULL)
53#define CR4_RESERVED_BITS \ 54#define CR4_RESERVED_BITS \
54 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 55 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
55 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 56 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
56 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 57 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
57 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ 58 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
58 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 59 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
59 60
@@ -175,6 +176,13 @@ enum {
175 176
176/* apic attention bits */ 177/* apic attention bits */
177#define KVM_APIC_CHECK_VAPIC 0 178#define KVM_APIC_CHECK_VAPIC 0
179/*
180 * The following bit is set with PV-EOI, unset on EOI.
 181 * We detect PV-EOI changes made by the guest by comparing
 182 * this bit with the PV-EOI value in guest memory.
183 * See the implementation in apic_update_pv_eoi.
184 */
185#define KVM_APIC_PV_EOI_PENDING 1
178 186
179/* 187/*
180 * We don't want allocation failures within the mmu code, so we preallocate 188 * We don't want allocation failures within the mmu code, so we preallocate
@@ -313,8 +321,8 @@ struct kvm_pmu {
313 u64 counter_bitmask[2]; 321 u64 counter_bitmask[2];
314 u64 global_ctrl_mask; 322 u64 global_ctrl_mask;
315 u8 version; 323 u8 version;
316 struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC]; 324 struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
317 struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED]; 325 struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
318 struct irq_work irq_work; 326 struct irq_work irq_work;
319 u64 reprogram_pmi; 327 u64 reprogram_pmi;
320}; 328};
@@ -484,6 +492,11 @@ struct kvm_vcpu_arch {
484 u64 length; 492 u64 length;
485 u64 status; 493 u64 status;
486 } osvw; 494 } osvw;
495
496 struct {
497 u64 msr_val;
498 struct gfn_to_hva_cache data;
499 } pv_eoi;
487}; 500};
488 501
489struct kvm_lpage_info { 502struct kvm_lpage_info {
@@ -661,6 +674,7 @@ struct kvm_x86_ops {
661 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 674 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
662 int (*get_lpage_level)(void); 675 int (*get_lpage_level)(void);
663 bool (*rdtscp_supported)(void); 676 bool (*rdtscp_supported)(void);
677 bool (*invpcid_supported)(void);
664 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); 678 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
665 679
666 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 680 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -802,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
802void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); 816void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
803bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 817bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
804 818
805int kvm_pic_set_irq(void *opaque, int irq, int level); 819static inline int __kvm_irq_line_state(unsigned long *irq_state,
820 int irq_source_id, int level)
821{
822 /* Logical OR for level trig interrupt */
823 if (level)
824 __set_bit(irq_source_id, irq_state);
825 else
826 __clear_bit(irq_source_id, irq_state);
827
828 return !!(*irq_state);
829}
830
831int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
832void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
806 833
807void kvm_inject_nmi(struct kvm_vcpu *vcpu); 834void kvm_inject_nmi(struct kvm_vcpu *vcpu);
808 835
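Illustration, not part of the patch: the new __kvm_irq_line_state() helper ORs together all sources driving a level-triggered line, so an irqchip model only has to latch the combined level. demo_set_irq_line() and its parameters are hypothetical.

static int demo_set_irq_line(unsigned long *irq_states, int irq,
			     int irq_source_id, int level)
{
	/* fold this source into the per-line bitmap of asserting sources */
	int line_level = __kvm_irq_line_state(&irq_states[irq],
					      irq_source_id, level);

	/* the caller would now program the modelled pin with line_level */
	return line_level;
}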
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 63ab1661d00e..2f7712e08b1e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -22,6 +22,7 @@
22#define KVM_FEATURE_CLOCKSOURCE2 3 22#define KVM_FEATURE_CLOCKSOURCE2 3
23#define KVM_FEATURE_ASYNC_PF 4 23#define KVM_FEATURE_ASYNC_PF 4
24#define KVM_FEATURE_STEAL_TIME 5 24#define KVM_FEATURE_STEAL_TIME 5
25#define KVM_FEATURE_PV_EOI 6
25 26
26/* The last 8 bits are used to indicate how to interpret the flags field 27/* The last 8 bits are used to indicate how to interpret the flags field
27 * in pvclock structure. If no bits are set, all flags are ignored. 28 * in pvclock structure. If no bits are set, all flags are ignored.
@@ -37,6 +38,7 @@
37#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 38#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
38#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 39#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
39#define MSR_KVM_STEAL_TIME 0x4b564d03 40#define MSR_KVM_STEAL_TIME 0x4b564d03
41#define MSR_KVM_PV_EOI_EN 0x4b564d04
40 42
41struct kvm_steal_time { 43struct kvm_steal_time {
42 __u64 steal; 44 __u64 steal;
@@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data {
89 __u32 enabled; 91 __u32 enabled;
90}; 92};
91 93
94#define KVM_PV_EOI_BIT 0
95#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT)
96#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
97#define KVM_PV_EOI_DISABLED 0x0
98
92#ifdef __KERNEL__ 99#ifdef __KERNEL__
93#include <asm/processor.h> 100#include <asm/processor.h>
94 101
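Illustration, not part of the patch: roughly how a guest could enable paravirtual EOI when the host advertises KVM_FEATURE_PV_EOI. The per-cpu variable name is made up, but the MSR layout (flag-word physical address ORed with KVM_PV_EOI_ENABLED) follows the definitions above.

#include <linux/percpu.h>
#include <asm/kvm_para.h>
#include <asm/msr.h>

static DEFINE_PER_CPU(unsigned long, demo_apic_eoi) = KVM_PV_EOI_DISABLED;

static void demo_pv_eoi_enable(void)
{
	unsigned long pa;

	if (!kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		return;

	/* tell the host where this CPU's PV-EOI flag word lives */
	pa = __pa(this_cpu_ptr(&demo_apic_eoi));
	wrmsrl(MSR_KVM_PV_EOI_EN, pa | KVM_PV_EOI_ENABLED);
}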
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 084ef95274cd..813ed103f45e 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -115,8 +115,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
115 115
116extern unsigned long long native_read_tsc(void); 116extern unsigned long long native_read_tsc(void);
117 117
118extern int native_rdmsr_safe_regs(u32 regs[8]); 118extern int rdmsr_safe_regs(u32 regs[8]);
119extern int native_wrmsr_safe_regs(u32 regs[8]); 119extern int wrmsr_safe_regs(u32 regs[8]);
120 120
121static __always_inline unsigned long long __native_read_tsc(void) 121static __always_inline unsigned long long __native_read_tsc(void)
122{ 122{
@@ -187,43 +187,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
187 return err; 187 return err;
188} 188}
189 189
190static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
191{
192 u32 gprs[8] = { 0 };
193 int err;
194
195 gprs[1] = msr;
196 gprs[7] = 0x9c5a203a;
197
198 err = native_rdmsr_safe_regs(gprs);
199
200 *p = gprs[0] | ((u64)gprs[2] << 32);
201
202 return err;
203}
204
205static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
206{
207 u32 gprs[8] = { 0 };
208
209 gprs[0] = (u32)val;
210 gprs[1] = msr;
211 gprs[2] = val >> 32;
212 gprs[7] = 0x9c5a203a;
213
214 return native_wrmsr_safe_regs(gprs);
215}
216
217static inline int rdmsr_safe_regs(u32 regs[8])
218{
219 return native_rdmsr_safe_regs(regs);
220}
221
222static inline int wrmsr_safe_regs(u32 regs[8])
223{
224 return native_wrmsr_safe_regs(regs);
225}
226
227#define rdtscl(low) \ 190#define rdtscl(low) \
228 ((low) = (u32)__native_read_tsc()) 191 ((low) = (u32)__native_read_tsc())
229 192
@@ -237,6 +200,8 @@ do { \
237 (high) = (u32)(_l >> 32); \ 200 (high) = (u32)(_l >> 32); \
238} while (0) 201} while (0)
239 202
203#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
204
240#define rdtscp(low, high, aux) \ 205#define rdtscp(low, high, aux) \
241do { \ 206do { \
242 unsigned long long _val = native_read_tscp(&(aux)); \ 207 unsigned long long _val = native_read_tscp(&(aux)); \
@@ -248,8 +213,7 @@ do { \
248 213
249#endif /* !CONFIG_PARAVIRT */ 214#endif /* !CONFIG_PARAVIRT */
250 215
251 216#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \
252#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \
253 (u32)((val) >> 32)) 217 (u32)((val) >> 32))
254 218
255#define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2)) 219#define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2))
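Illustration, not part of the patch: the renamed wrmsrl_safe() (formerly checking_wrmsrl) and the new rdpmcl() helper in use. DEMO_MSR is a made-up MSR index used only for this sketch.

#include <linux/kernel.h>
#include <asm/msr.h>

#define DEMO_MSR	0x00000123	/* made-up index, illustration only */

static void demo_msr_helpers(void)
{
	u64 ctr0;
	int err;

	/* returns non-zero instead of faulting if the MSR is not writable */
	err = wrmsrl_safe(DEMO_MSR, 0x1ULL);
	if (err)
		pr_warn("DEMO_MSR not writable on this CPU\n");

	/* read raw performance counter 0 via RDPMC */
	rdpmcl(0, ctr0);
	pr_info("PMC0 = %llu\n", ctr0);
}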
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index dc580c42851c..c0fa356e90de 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -44,28 +44,14 @@ struct nmiaction {
44 const char *name; 44 const char *name;
45}; 45};
46 46
47#define register_nmi_handler(t, fn, fg, n) \ 47#define register_nmi_handler(t, fn, fg, n, init...) \
48({ \ 48({ \
49 static struct nmiaction fn##_na = { \ 49 static struct nmiaction init fn##_na = { \
50 .handler = (fn), \ 50 .handler = (fn), \
51 .name = (n), \ 51 .name = (n), \
52 .flags = (fg), \ 52 .flags = (fg), \
53 }; \ 53 }; \
54 __register_nmi_handler((t), &fn##_na); \ 54 __register_nmi_handler((t), &fn##_na); \
55})
56
57/*
58 * For special handlers that register/unregister in the
59 * init section only. This should be considered rare.
60 */
61#define register_nmi_handler_initonly(t, fn, fg, n) \
62({ \
63 static struct nmiaction fn##_na __initdata = { \
64 .handler = (fn), \
65 .name = (n), \
66 .flags = (fg), \
67 }; \
68 __register_nmi_handler((t), &fn##_na); \
69}) 55})
70 56
71int __register_nmi_handler(unsigned int, struct nmiaction *); 57int __register_nmi_handler(unsigned int, struct nmiaction *);
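Illustration, not part of the patch: the new optional init argument lets boot-time-only users place the generated nmiaction in init memory, replacing register_nmi_handler_initonly(). The demo_* names are hypothetical.

#include <linux/init.h>
#include <asm/nmi.h>

static int __init demo_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
	/* claim the NMI during the boot-time self test */
	return NMI_HANDLED;
}

static int __init demo_nmi_setup(void)
{
	/* __initdata puts the generated struct nmiaction in .init.data */
	return register_nmi_handler(NMI_LOCAL, demo_nmi_handler, 0,
				    "demo", __initdata);
}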
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 6cbbabf52707..a0facf3908d7 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -128,21 +128,11 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err)
128 return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); 128 return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
129} 129}
130 130
131static inline int paravirt_rdmsr_regs(u32 *regs)
132{
133 return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs);
134}
135
136static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) 131static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
137{ 132{
138 return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); 133 return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
139} 134}
140 135
141static inline int paravirt_wrmsr_regs(u32 *regs)
142{
143 return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs);
144}
145
146/* These should all do BUG_ON(_err), but our headers are too tangled. */ 136/* These should all do BUG_ON(_err), but our headers are too tangled. */
147#define rdmsr(msr, val1, val2) \ 137#define rdmsr(msr, val1, val2) \
148do { \ 138do { \
@@ -176,9 +166,6 @@ do { \
176 _err; \ 166 _err; \
177}) 167})
178 168
179#define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs)
180#define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs)
181
182static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) 169static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
183{ 170{
184 int err; 171 int err;
@@ -186,32 +173,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
186 *p = paravirt_read_msr(msr, &err); 173 *p = paravirt_read_msr(msr, &err);
187 return err; 174 return err;
188} 175}
189static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
190{
191 u32 gprs[8] = { 0 };
192 int err;
193
194 gprs[1] = msr;
195 gprs[7] = 0x9c5a203a;
196
197 err = paravirt_rdmsr_regs(gprs);
198
199 *p = gprs[0] | ((u64)gprs[2] << 32);
200
201 return err;
202}
203
204static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
205{
206 u32 gprs[8] = { 0 };
207
208 gprs[0] = (u32)val;
209 gprs[1] = msr;
210 gprs[2] = val >> 32;
211 gprs[7] = 0x9c5a203a;
212
213 return paravirt_wrmsr_regs(gprs);
214}
215 176
216static inline u64 paravirt_read_tsc(void) 177static inline u64 paravirt_read_tsc(void)
217{ 178{
@@ -252,6 +213,8 @@ do { \
252 high = _l >> 32; \ 213 high = _l >> 32; \
253} while (0) 214} while (0)
254 215
216#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
217
255static inline unsigned long long paravirt_rdtscp(unsigned int *aux) 218static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
256{ 219{
257 return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); 220 return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
@@ -397,9 +360,10 @@ static inline void __flush_tlb_single(unsigned long addr)
397 360
398static inline void flush_tlb_others(const struct cpumask *cpumask, 361static inline void flush_tlb_others(const struct cpumask *cpumask,
399 struct mm_struct *mm, 362 struct mm_struct *mm,
400 unsigned long va) 363 unsigned long start,
364 unsigned long end)
401{ 365{
402 PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va); 366 PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
403} 367}
404 368
405static inline int paravirt_pgd_alloc(struct mm_struct *mm) 369static inline int paravirt_pgd_alloc(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 8e8b9a4987ee..142236ed83af 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -153,9 +153,7 @@ struct pv_cpu_ops {
153 /* MSR, PMC and TSR operations. 153 /* MSR, PMC and TSR operations.
154 err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ 154 err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
155 u64 (*read_msr)(unsigned int msr, int *err); 155 u64 (*read_msr)(unsigned int msr, int *err);
156 int (*rdmsr_regs)(u32 *regs);
157 int (*write_msr)(unsigned int msr, unsigned low, unsigned high); 156 int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
158 int (*wrmsr_regs)(u32 *regs);
159 157
160 u64 (*read_tsc)(void); 158 u64 (*read_tsc)(void);
161 u64 (*read_pmc)(int counter); 159 u64 (*read_pmc)(int counter);
@@ -250,7 +248,8 @@ struct pv_mmu_ops {
250 void (*flush_tlb_single)(unsigned long addr); 248 void (*flush_tlb_single)(unsigned long addr);
251 void (*flush_tlb_others)(const struct cpumask *cpus, 249 void (*flush_tlb_others)(const struct cpumask *cpus,
252 struct mm_struct *mm, 250 struct mm_struct *mm,
253 unsigned long va); 251 unsigned long start,
252 unsigned long end);
254 253
255 /* Hooks for allocating and freeing a pagetable top-level */ 254 /* Hooks for allocating and freeing a pagetable top-level */
256 int (*pgd_alloc)(struct mm_struct *mm); 255 int (*pgd_alloc)(struct mm_struct *mm);
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b3a531746026..73e8eeff22ee 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -7,9 +7,13 @@
7#undef DEBUG 7#undef DEBUG
8 8
9#ifdef DEBUG 9#ifdef DEBUG
10#define DBG(x...) printk(x) 10#define DBG(fmt, ...) printk(fmt, ##__VA_ARGS__)
11#else 11#else
12#define DBG(x...) 12#define DBG(fmt, ...) \
13do { \
14 if (0) \
15 printk(fmt, ##__VA_ARGS__); \
16} while (0)
13#endif 17#endif
14 18
15#define PCI_PROBE_BIOS 0x0001 19#define PCI_PROBE_BIOS 0x0001
@@ -100,6 +104,7 @@ struct pci_raw_ops {
100extern const struct pci_raw_ops *raw_pci_ops; 104extern const struct pci_raw_ops *raw_pci_ops;
101extern const struct pci_raw_ops *raw_pci_ext_ops; 105extern const struct pci_raw_ops *raw_pci_ext_ops;
102 106
107extern const struct pci_raw_ops pci_mmcfg;
103extern const struct pci_raw_ops pci_direct_conf1; 108extern const struct pci_raw_ops pci_direct_conf1;
104extern bool port_cf9_safe; 109extern bool port_cf9_safe;
105 110
@@ -135,6 +140,12 @@ struct pci_mmcfg_region {
135 140
136extern int __init pci_mmcfg_arch_init(void); 141extern int __init pci_mmcfg_arch_init(void);
137extern void __init pci_mmcfg_arch_free(void); 142extern void __init pci_mmcfg_arch_free(void);
143extern int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg);
144extern void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg);
145extern int __devinit pci_mmconfig_insert(struct device *dev,
146 u16 seg, u8 start,
147 u8 end, phys_addr_t addr);
148extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
138extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus); 149extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
139 150
140extern struct list_head pci_mmcfg_list; 151extern struct list_head pci_mmcfg_list;
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index d9b8e3f7f42a..1104afaba52b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -551,6 +551,12 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);
551 { [0 ... NR_CPUS-1] = _initvalue }; \ 551 { [0 ... NR_CPUS-1] = _initvalue }; \
552 __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map 552 __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
553 553
554#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \
555 DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue; \
556 __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \
557 { [0 ... NR_CPUS-1] = _initvalue }; \
558 __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
559
554#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ 560#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
555 EXPORT_PER_CPU_SYMBOL(_name) 561 EXPORT_PER_CPU_SYMBOL(_name)
556 562
@@ -559,6 +565,11 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);
559 extern __typeof__(_type) *_name##_early_ptr; \ 565 extern __typeof__(_type) *_name##_early_ptr; \
560 extern __typeof__(_type) _name##_early_map[] 566 extern __typeof__(_type) _name##_early_map[]
561 567
568#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
569 DECLARE_PER_CPU_READ_MOSTLY(_type, _name); \
570 extern __typeof__(_type) *_name##_early_ptr; \
571 extern __typeof__(_type) _name##_early_map[]
572
562#define early_per_cpu_ptr(_name) (_name##_early_ptr) 573#define early_per_cpu_ptr(_name) (_name##_early_ptr)
563#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) 574#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
564#define early_per_cpu(_name, _cpu) \ 575#define early_per_cpu(_name, _cpu) \
@@ -570,12 +581,18 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);
570#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ 581#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
571 DEFINE_PER_CPU(_type, _name) = _initvalue 582 DEFINE_PER_CPU(_type, _name) = _initvalue
572 583
584#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \
585 DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue
586
573#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ 587#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
574 EXPORT_PER_CPU_SYMBOL(_name) 588 EXPORT_PER_CPU_SYMBOL(_name)
575 589
576#define DECLARE_EARLY_PER_CPU(_type, _name) \ 590#define DECLARE_EARLY_PER_CPU(_type, _name) \
577 DECLARE_PER_CPU(_type, _name) 591 DECLARE_PER_CPU(_type, _name)
578 592
593#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
594 DECLARE_PER_CPU_READ_MOSTLY(_type, _name)
595
579#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu) 596#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
580#define early_per_cpu_ptr(_name) NULL 597#define early_per_cpu_ptr(_name) NULL
581/* no early_per_cpu_map() */ 598/* no early_per_cpu_map() */
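Illustration, not part of the patch: declaring, defining and reading an early per-cpu variable in the new read-mostly flavour, mirroring what x86_bios_cpu_apicid does in the apic header above; demo_node_id is a made-up variable.

/* in a header */
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, demo_node_id);

/* in exactly one C file */
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, demo_node_id, 0);

static int __init demo_get_node(int cpu)
{
	/* readable both before and after the per-cpu areas are set up */
	return early_per_cpu(demo_node_id, cpu);
}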
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 588f52ea810e..dab39350e51e 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -5,11 +5,10 @@
5 * Performance event hw details: 5 * Performance event hw details:
6 */ 6 */
7 7
8#define X86_PMC_MAX_GENERIC 32 8#define INTEL_PMC_MAX_GENERIC 32
9#define X86_PMC_MAX_FIXED 3 9#define INTEL_PMC_MAX_FIXED 3
10#define INTEL_PMC_IDX_FIXED 32
10 11
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64 12#define X86_PMC_IDX_MAX 64
14 13
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 14#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
@@ -48,8 +47,7 @@
48 (X86_RAW_EVENT_MASK | \ 47 (X86_RAW_EVENT_MASK | \
49 AMD64_EVENTSEL_EVENT) 48 AMD64_EVENTSEL_EVENT)
50#define AMD64_NUM_COUNTERS 4 49#define AMD64_NUM_COUNTERS 4
51#define AMD64_NUM_COUNTERS_F15H 6 50#define AMD64_NUM_COUNTERS_CORE 6
52#define AMD64_NUM_COUNTERS_MAX AMD64_NUM_COUNTERS_F15H
53 51
54#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c 52#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
55#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) 53#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
@@ -121,16 +119,16 @@ struct x86_pmu_capability {
121 119
122/* Instr_Retired.Any: */ 120/* Instr_Retired.Any: */
123#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 121#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
124#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) 122#define INTEL_PMC_IDX_FIXED_INSTRUCTIONS (INTEL_PMC_IDX_FIXED + 0)
125 123
126/* CPU_CLK_Unhalted.Core: */ 124/* CPU_CLK_Unhalted.Core: */
127#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a 125#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
128#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) 126#define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1)
129 127
130/* CPU_CLK_Unhalted.Ref: */ 128/* CPU_CLK_Unhalted.Ref: */
131#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b 129#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
132#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2) 130#define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2)
133#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES) 131#define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES)
134 132
135/* 133/*
136 * We model BTS tracing as another fixed-mode PMC. 134 * We model BTS tracing as another fixed-mode PMC.
@@ -139,7 +137,7 @@ struct x86_pmu_capability {
139 * values are used by actual fixed events and higher values are used 137 * values are used by actual fixed events and higher values are used
140 * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. 138 * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
141 */ 139 */
142#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) 140#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16)
143 141
144/* 142/*
145 * IBS cpuid feature detection 143 * IBS cpuid feature detection
@@ -234,8 +232,9 @@ struct perf_guest_switch_msr {
234 232
235extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); 233extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
236extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); 234extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
235extern void perf_check_microcode(void);
237#else 236#else
238static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) 237static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
239{ 238{
240 *nr = 0; 239 *nr = 0;
241 return NULL; 240 return NULL;
@@ -247,6 +246,7 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
247} 246}
248 247
249static inline void perf_events_lapic_init(void) { } 248static inline void perf_events_lapic_init(void) { }
249static inline void perf_check_microcode(void) { }
250#endif 250#endif
251 251
252#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) 252#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
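The hunk above only renames the X86_PMC_* index constants to INTEL_PMC_* (and drops the unused X86_PMC_IDX_GENERIC); the fixed counters still start at index 32 of the 64-entry counter space. A minimal, self-contained sketch of how those constants combine into a fixed-counter bitmask -- the helper name is invented for illustration and is not part of the patch:

#define INTEL_PMC_MAX_FIXED	3
#define INTEL_PMC_IDX_FIXED	32

/* Set one bit per architectural fixed counter (bits 32..34). */
static unsigned long long intel_fixed_counter_mask_example(void)
{
	unsigned long long mask = 0;
	int i;

	for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
		mask |= 1ULL << (INTEL_PMC_IDX_FIXED + i);

	return mask;	/* 0x0000000700000000 */
}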
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 98391db840c6..f2b489cf1602 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -2,9 +2,9 @@
2#define _ASM_X86_PGTABLE_2LEVEL_H 2#define _ASM_X86_PGTABLE_2LEVEL_H
3 3
4#define pte_ERROR(e) \ 4#define pte_ERROR(e) \
5 printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) 5 pr_err("%s:%d: bad pte %08lx\n", __FILE__, __LINE__, (e).pte_low)
6#define pgd_ERROR(e) \ 6#define pgd_ERROR(e) \
7 printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) 7 pr_err("%s:%d: bad pgd %08lx\n", __FILE__, __LINE__, pgd_val(e))
8 8
9/* 9/*
10 * Certain architectures need to do special things when PTEs 10 * Certain architectures need to do special things when PTEs
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index cb00ccc7d571..4cc9f2b7cdc3 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -9,13 +9,13 @@
9 */ 9 */
10 10
11#define pte_ERROR(e) \ 11#define pte_ERROR(e) \
12 printk("%s:%d: bad pte %p(%08lx%08lx).\n", \ 12 pr_err("%s:%d: bad pte %p(%08lx%08lx)\n", \
13 __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low) 13 __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
14#define pmd_ERROR(e) \ 14#define pmd_ERROR(e) \
15 printk("%s:%d: bad pmd %p(%016Lx).\n", \ 15 pr_err("%s:%d: bad pmd %p(%016Lx)\n", \
16 __FILE__, __LINE__, &(e), pmd_val(e)) 16 __FILE__, __LINE__, &(e), pmd_val(e))
17#define pgd_ERROR(e) \ 17#define pgd_ERROR(e) \
18 printk("%s:%d: bad pgd %p(%016Lx).\n", \ 18 pr_err("%s:%d: bad pgd %p(%016Lx)\n", \
19 __FILE__, __LINE__, &(e), pgd_val(e)) 19 __FILE__, __LINE__, &(e), pgd_val(e))
20 20
21/* Rules for using set_pte: the pte being assigned *must* be 21/* Rules for using set_pte: the pte being assigned *must* be
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 975f709e09ae..8251be02301e 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -26,16 +26,16 @@ extern pgd_t init_level4_pgt[];
26extern void paging_init(void); 26extern void paging_init(void);
27 27
28#define pte_ERROR(e) \ 28#define pte_ERROR(e) \
29 printk("%s:%d: bad pte %p(%016lx).\n", \ 29 pr_err("%s:%d: bad pte %p(%016lx)\n", \
30 __FILE__, __LINE__, &(e), pte_val(e)) 30 __FILE__, __LINE__, &(e), pte_val(e))
31#define pmd_ERROR(e) \ 31#define pmd_ERROR(e) \
32 printk("%s:%d: bad pmd %p(%016lx).\n", \ 32 pr_err("%s:%d: bad pmd %p(%016lx)\n", \
33 __FILE__, __LINE__, &(e), pmd_val(e)) 33 __FILE__, __LINE__, &(e), pmd_val(e))
34#define pud_ERROR(e) \ 34#define pud_ERROR(e) \
35 printk("%s:%d: bad pud %p(%016lx).\n", \ 35 pr_err("%s:%d: bad pud %p(%016lx)\n", \
36 __FILE__, __LINE__, &(e), pud_val(e)) 36 __FILE__, __LINE__, &(e), pud_val(e))
37#define pgd_ERROR(e) \ 37#define pgd_ERROR(e) \
38 printk("%s:%d: bad pgd %p(%016lx).\n", \ 38 pr_err("%s:%d: bad pgd %p(%016lx)\n", \
39 __FILE__, __LINE__, &(e), pgd_val(e)) 39 __FILE__, __LINE__, &(e), pgd_val(e))
40 40
41struct mm_struct; 41struct mm_struct;
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index f8ab3eaad128..aea1d1d848c7 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -44,6 +44,7 @@
44 */ 44 */
45#define X86_CR3_PWT 0x00000008 /* Page Write Through */ 45#define X86_CR3_PWT 0x00000008 /* Page Write Through */
46#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ 46#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
47#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */
47 48
48/* 49/*
49 * Intel CPU features in CR4 50 * Intel CPU features in CR4
@@ -61,6 +62,7 @@
61#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ 62#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
62#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ 63#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
63#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ 64#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
65#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */
64#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ 66#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
65#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ 67#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
66 68
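X86_CR3_PCID_MASK and X86_CR4_PCIDE are plain bit masks over the control registers. A hedged sketch of how they might be used together -- the function and variable names are invented, not taken from the patch:

#define X86_CR3_PCID_MASK	0x00000fff	/* PCID Mask */
#define X86_CR4_PCIDE		0x00020000	/* enable PCID support */

/* Return the PCID encoded in a CR3 value, or -1 if CR4 has PCIDs disabled. */
static long cr3_pcid_example(unsigned long cr3, unsigned long cr4)
{
	if (!(cr4 & X86_CR4_PCIDE))
		return -1;

	return cr3 & X86_CR3_PCID_MASK;		/* low 12 bits of CR3 */
}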
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 39bc5777211a..d048cad9bcad 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -61,6 +61,19 @@ static inline void *current_text_addr(void)
61# define ARCH_MIN_MMSTRUCT_ALIGN 0 61# define ARCH_MIN_MMSTRUCT_ALIGN 0
62#endif 62#endif
63 63
64enum tlb_infos {
65 ENTRIES,
66 NR_INFO
67};
68
69extern u16 __read_mostly tlb_lli_4k[NR_INFO];
70extern u16 __read_mostly tlb_lli_2m[NR_INFO];
71extern u16 __read_mostly tlb_lli_4m[NR_INFO];
72extern u16 __read_mostly tlb_lld_4k[NR_INFO];
73extern u16 __read_mostly tlb_lld_2m[NR_INFO];
74extern u16 __read_mostly tlb_lld_4m[NR_INFO];
75extern s8 __read_mostly tlb_flushall_shift;
76
64/* 77/*
65 * CPU type and hardware bug flags. Kept separately for each CPU. 78 * CPU type and hardware bug flags. Kept separately for each CPU.
66 * Members of this structure are referenced in head.S, so think twice 79 * Members of this structure are referenced in head.S, so think twice
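The new tlb_* arrays are sized by the tlb_infos enum, with ENTRIES currently the only index; they are filled in by the CPU detection code elsewhere in this series. A short illustrative reader, assuming <asm/processor.h> and <linux/printk.h> are available (the function name is invented):

static void report_tlb_sizes_example(void)
{
	pr_info("dTLB entries: 4K=%d 2M=%d 4M=%d\n",
		tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES]);
	pr_info("iTLB entries: 4K=%d 2M=%d 4M=%d\n",
		tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], tlb_lli_4m[ENTRIES]);
	pr_info("tlb_flushall_shift = %d\n", tlb_flushall_shift);
}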
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index fce3f4ae5bd6..fe1ec5bcd846 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -21,8 +21,9 @@ struct real_mode_header {
21 u32 wakeup_header; 21 u32 wakeup_header;
22#endif 22#endif
23 /* APM/BIOS reboot */ 23 /* APM/BIOS reboot */
24#ifdef CONFIG_X86_32
25 u32 machine_real_restart_asm; 24 u32 machine_real_restart_asm;
25#ifdef CONFIG_X86_64
26 u32 machine_real_restart_seg;
26#endif 27#endif
27}; 28};
28 29
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 92f297069e87..a82c4f1b4d83 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -18,8 +18,8 @@ extern struct machine_ops machine_ops;
18 18
19void native_machine_crash_shutdown(struct pt_regs *regs); 19void native_machine_crash_shutdown(struct pt_regs *regs);
20void native_machine_shutdown(void); 20void native_machine_shutdown(void);
21void machine_real_restart(unsigned int type); 21void __noreturn machine_real_restart(unsigned int type);
22/* These must match dispatch_table in reboot_32.S */ 22/* These must match dispatch in arch/x86/realmode/rm/reboot.S */
23#define MRR_BIOS 0 23#define MRR_BIOS 0
24#define MRR_APM 1 24#define MRR_APM 1
25 25
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index f48394513c37..4f19a1526037 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -31,12 +31,12 @@ static inline bool cpu_has_ht_siblings(void)
31 return has_siblings; 31 return has_siblings;
32} 32}
33 33
34DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); 34DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
35DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); 35DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
36/* cpus sharing the last level cache: */ 36/* cpus sharing the last level cache: */
37DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); 37DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
38DECLARE_PER_CPU(u16, cpu_llc_id); 38DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
39DECLARE_PER_CPU(int, cpu_number); 39DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
40 40
41static inline struct cpumask *cpu_sibling_mask(int cpu) 41static inline struct cpumask *cpu_sibling_mask(int cpu)
42{ 42{
@@ -53,10 +53,10 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
53 return per_cpu(cpu_llc_shared_map, cpu); 53 return per_cpu(cpu_llc_shared_map, cpu);
54} 54}
55 55
56DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); 56DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
57DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); 57DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
58#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 58#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
59DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); 59DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
60#endif 60#endif
61 61
62/* Static state in head.S used to set up a CPU */ 62/* Static state in head.S used to set up a CPU */
@@ -169,11 +169,6 @@ void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);
169void smp_store_cpu_info(int id); 169void smp_store_cpu_info(int id);
170#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) 170#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
171 171
172/* We don't mark CPUs online until __cpu_up(), so we need another measure */
173static inline int num_booting_cpus(void)
174{
175 return cpumask_weight(cpu_callout_mask);
176}
177#else /* !CONFIG_SMP */ 172#else /* !CONFIG_SMP */
178#define wbinvd_on_cpu(cpu) wbinvd() 173#define wbinvd_on_cpu(cpu) wbinvd()
179static inline int wbinvd_on_all_cpus(void) 174static inline int wbinvd_on_all_cpus(void)
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 829215fef9ee..4fef20773b8f 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -4,7 +4,14 @@
4#define tlb_start_vma(tlb, vma) do { } while (0) 4#define tlb_start_vma(tlb, vma) do { } while (0)
5#define tlb_end_vma(tlb, vma) do { } while (0) 5#define tlb_end_vma(tlb, vma) do { } while (0)
6#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) 6#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
7#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) 7
8#define tlb_flush(tlb) \
9{ \
10 if (tlb->fullmm == 0) \
11 flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \
12 else \
13 flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \
14}
8 15
9#include <asm-generic/tlb.h> 16#include <asm-generic/tlb.h>
10 17
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 36a1a2ab87d2..74a44333545a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -73,14 +73,10 @@ static inline void __flush_tlb_one(unsigned long addr)
73 * - flush_tlb_page(vma, vmaddr) flushes one page 73 * - flush_tlb_page(vma, vmaddr) flushes one page
74 * - flush_tlb_range(vma, start, end) flushes a range of pages 74 * - flush_tlb_range(vma, start, end) flushes a range of pages
75 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages 75 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
76 * - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus 76 * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus
77 * 77 *
78 * ..but the i386 has somewhat limited tlb flushing capabilities, 78 * ..but the i386 has somewhat limited tlb flushing capabilities,
79 * and page-granular flushes are available only on i486 and up. 79 * and page-granular flushes are available only on i486 and up.
80 *
81 * x86-64 can only flush individual pages or full VMs. For a range flush
82 * we always do the full VM. Might be worth trying if for a small
83 * range a few INVLPGs in a row are a win.
84 */ 80 */
85 81
86#ifndef CONFIG_SMP 82#ifndef CONFIG_SMP
@@ -109,9 +105,17 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
109 __flush_tlb(); 105 __flush_tlb();
110} 106}
111 107
108static inline void flush_tlb_mm_range(struct mm_struct *mm,
109 unsigned long start, unsigned long end, unsigned long vmflag)
110{
111 if (mm == current->active_mm)
112 __flush_tlb();
113}
114
112static inline void native_flush_tlb_others(const struct cpumask *cpumask, 115static inline void native_flush_tlb_others(const struct cpumask *cpumask,
113 struct mm_struct *mm, 116 struct mm_struct *mm,
114 unsigned long va) 117 unsigned long start,
118 unsigned long end)
115{ 119{
116} 120}
117 121
@@ -119,27 +123,35 @@ static inline void reset_lazy_tlbstate(void)
119{ 123{
120} 124}
121 125
126static inline void flush_tlb_kernel_range(unsigned long start,
127 unsigned long end)
128{
129 flush_tlb_all();
130}
131
122#else /* SMP */ 132#else /* SMP */
123 133
124#include <asm/smp.h> 134#include <asm/smp.h>
125 135
126#define local_flush_tlb() __flush_tlb() 136#define local_flush_tlb() __flush_tlb()
127 137
138#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
139
140#define flush_tlb_range(vma, start, end) \
141 flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
142
128extern void flush_tlb_all(void); 143extern void flush_tlb_all(void);
129extern void flush_tlb_current_task(void); 144extern void flush_tlb_current_task(void);
130extern void flush_tlb_mm(struct mm_struct *);
131extern void flush_tlb_page(struct vm_area_struct *, unsigned long); 145extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
146extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
147 unsigned long end, unsigned long vmflag);
148extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
132 149
133#define flush_tlb() flush_tlb_current_task() 150#define flush_tlb() flush_tlb_current_task()
134 151
135static inline void flush_tlb_range(struct vm_area_struct *vma,
136 unsigned long start, unsigned long end)
137{
138 flush_tlb_mm(vma->vm_mm);
139}
140
141void native_flush_tlb_others(const struct cpumask *cpumask, 152void native_flush_tlb_others(const struct cpumask *cpumask,
142 struct mm_struct *mm, unsigned long va); 153 struct mm_struct *mm,
154 unsigned long start, unsigned long end);
143 155
144#define TLBSTATE_OK 1 156#define TLBSTATE_OK 1
145#define TLBSTATE_LAZY 2 157#define TLBSTATE_LAZY 2
@@ -159,13 +171,8 @@ static inline void reset_lazy_tlbstate(void)
159#endif /* SMP */ 171#endif /* SMP */
160 172
161#ifndef CONFIG_PARAVIRT 173#ifndef CONFIG_PARAVIRT
162#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va) 174#define flush_tlb_others(mask, mm, start, end) \
175 native_flush_tlb_others(mask, mm, start, end)
163#endif 176#endif
164 177
165static inline void flush_tlb_kernel_range(unsigned long start,
166 unsigned long end)
167{
168 flush_tlb_all();
169}
170
171#endif /* _ASM_X86_TLBFLUSH_H */ 178#endif /* _ASM_X86_TLBFLUSH_H */
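With this change flush_tlb_mm() and flush_tlb_range() become thin wrappers around the new flush_tlb_mm_range(), which takes the address range plus the VMA flags so the implementation can choose between per-page INVLPG and a full flush (the tlb_flush() macro in tlb.h above feeds it the mmu_gather range the same way). A sketch of the two wrapper expansions as a caller sees them, assuming the modified <asm/tlbflush.h>; the wrapper function and its parameters are illustrative:

static void tlb_flush_call_examples(struct mm_struct *mm,
				    struct vm_area_struct *vma,
				    unsigned long start, unsigned long end)
{
	/* What flush_tlb_mm(mm) now expands to: flush everything for mm. */
	flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL);

	/* What flush_tlb_range(vma, start, end) now expands to. */
	flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags);
}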
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 8e796fbbf9c6..d8def8b3dba0 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -17,6 +17,8 @@
17 17
18/* Handles exceptions in both to and from, but doesn't do access_ok */ 18/* Handles exceptions in both to and from, but doesn't do access_ok */
19__must_check unsigned long 19__must_check unsigned long
20copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
21__must_check unsigned long
20copy_user_generic_string(void *to, const void *from, unsigned len); 22copy_user_generic_string(void *to, const void *from, unsigned len);
21__must_check unsigned long 23__must_check unsigned long
22copy_user_generic_unrolled(void *to, const void *from, unsigned len); 24copy_user_generic_unrolled(void *to, const void *from, unsigned len);
@@ -26,9 +28,16 @@ copy_user_generic(void *to, const void *from, unsigned len)
26{ 28{
27 unsigned ret; 29 unsigned ret;
28 30
29 alternative_call(copy_user_generic_unrolled, 31 /*
32 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
33 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
34 * Otherwise, use copy_user_generic_unrolled.
35 */
36 alternative_call_2(copy_user_generic_unrolled,
30 copy_user_generic_string, 37 copy_user_generic_string,
31 X86_FEATURE_REP_GOOD, 38 X86_FEATURE_REP_GOOD,
39 copy_user_enhanced_fast_string,
40 X86_FEATURE_ERMS,
32 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), 41 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
33 "=d" (len)), 42 "=d" (len)),
34 "1" (to), "2" (from), "3" (len) 43 "1" (to), "2" (from), "3" (len)
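alternative_call_2() patches the call site at boot, so there is no runtime branch; written as ordinary C, the priority order described in the new comment is roughly the following. This is a sketch only -- boot_cpu_has() is the usual feature test, and the real kernel never takes this path at runtime:

static unsigned long copy_user_generic_sketch(void *to, const void *from,
					      unsigned len)
{
	if (boot_cpu_has(X86_FEATURE_ERMS))		/* enhanced REP MOVSB/STOSB */
		return copy_user_enhanced_fast_string(to, from, len);

	if (boot_cpu_has(X86_FEATURE_REP_GOOD))		/* fast REP string ops */
		return copy_user_generic_string(to, from, len);

	return copy_user_generic_unrolled(to, from, len);
}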
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 4437001d8e3d..0d9776e9e2dc 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -15,7 +15,6 @@
15# ifdef CONFIG_X86_32 15# ifdef CONFIG_X86_32
16 16
17# include <asm/unistd_32.h> 17# include <asm/unistd_32.h>
18# define __ARCH_WANT_IPC_PARSE_VERSION
19# define __ARCH_WANT_STAT64 18# define __ARCH_WANT_STAT64
20# define __ARCH_WANT_SYS_IPC 19# define __ARCH_WANT_SYS_IPC
21# define __ARCH_WANT_SYS_OLD_MMAP 20# define __ARCH_WANT_SYS_OLD_MMAP
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 1e9bed14f7ae..f3971bbcd1de 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -48,7 +48,7 @@ struct arch_uprobe_task {
48#endif 48#endif
49}; 49};
50 50
51extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); 51extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr);
52extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); 52extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
53extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); 53extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
54extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); 54extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 3bb9491b7659..b47c2a82ff15 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -15,7 +15,8 @@ extern void uv_nmi_init(void);
15extern void uv_system_init(void); 15extern void uv_system_init(void);
16extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 16extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
17 struct mm_struct *mm, 17 struct mm_struct *mm,
18 unsigned long va, 18 unsigned long start,
 19 unsigned long end,
19 unsigned int cpu); 20 unsigned int cpu);
20 21
21#else /* X86_UV */ 22#else /* X86_UV */
@@ -26,7 +27,7 @@ static inline void uv_cpu_init(void) { }
26static inline void uv_system_init(void) { } 27static inline void uv_system_init(void) { }
27static inline const struct cpumask * 28static inline const struct cpumask *
28uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, 29uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
29 unsigned long va, unsigned int cpu) 30 unsigned long start, unsigned long end, unsigned int cpu)
30{ return cpumask; } 31{ return cpumask; }
31 32
32#endif /* X86_UV */ 33#endif /* X86_UV */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 6149b476d9df..a06983cdc125 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -140,6 +140,9 @@
140#define IPI_RESET_LIMIT 1 140#define IPI_RESET_LIMIT 1
141/* after this # consecutive successes, bump up the throttle if it was lowered */ 141/* after this # consecutive successes, bump up the throttle if it was lowered */
142#define COMPLETE_THRESHOLD 5 142#define COMPLETE_THRESHOLD 5
143/* after this # of giveups (fall back to kernel IPI's) disable the use of
144 the BAU for a period of time */
145#define GIVEUP_LIMIT 100
143 146
144#define UV_LB_SUBNODEID 0x10 147#define UV_LB_SUBNODEID 0x10
145 148
@@ -166,7 +169,6 @@
166#define FLUSH_RETRY_TIMEOUT 2 169#define FLUSH_RETRY_TIMEOUT 2
167#define FLUSH_GIVEUP 3 170#define FLUSH_GIVEUP 3
168#define FLUSH_COMPLETE 4 171#define FLUSH_COMPLETE 4
169#define FLUSH_RETRY_BUSYBUG 5
170 172
171/* 173/*
172 * tuning the action when the numalink network is extremely delayed 174 * tuning the action when the numalink network is extremely delayed
@@ -175,7 +177,7 @@
175 microseconds */ 177 microseconds */
176#define CONGESTED_REPS 10 /* long delays averaged over 178#define CONGESTED_REPS 10 /* long delays averaged over
177 this many broadcasts */ 179 this many broadcasts */
178#define CONGESTED_PERIOD 30 /* time for the bau to be 180#define DISABLED_PERIOD 10 /* time for the bau to be
179 disabled, in seconds */ 181 disabled, in seconds */
180/* see msg_type: */ 182/* see msg_type: */
181#define MSG_NOOP 0 183#define MSG_NOOP 0
@@ -520,6 +522,12 @@ struct ptc_stats {
520 unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ 522 unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */
521 unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ 523 unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */
522 unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ 524 unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */
525 unsigned long s_overipilimit; /* over the ipi reset limit */
526 unsigned long s_giveuplimit; /* disables, over giveup limit*/
527 unsigned long s_enters; /* entries to the driver */
528 unsigned long s_ipifordisabled; /* fall back to IPI; disabled */
529 unsigned long s_plugged; /* plugged by h/w bug*/
530 unsigned long s_congested; /* giveup on long wait */
523 /* destination statistics */ 531 /* destination statistics */
524 unsigned long d_alltlb; /* times all tlb's on this 532 unsigned long d_alltlb; /* times all tlb's on this
525 cpu were flushed */ 533 cpu were flushed */
@@ -586,8 +594,8 @@ struct bau_control {
586 int timeout_tries; 594 int timeout_tries;
587 int ipi_attempts; 595 int ipi_attempts;
588 int conseccompletes; 596 int conseccompletes;
589 int baudisabled; 597 short nobau;
590 int set_bau_off; 598 short baudisabled;
591 short cpu; 599 short cpu;
592 short osnode; 600 short osnode;
593 short uvhub_cpu; 601 short uvhub_cpu;
@@ -596,14 +604,16 @@ struct bau_control {
596 short cpus_in_socket; 604 short cpus_in_socket;
597 short cpus_in_uvhub; 605 short cpus_in_uvhub;
598 short partition_base_pnode; 606 short partition_base_pnode;
599 short using_desc; /* an index, like uvhub_cpu */ 607 short busy; /* all were busy (war) */
600 unsigned int inuse_map;
601 unsigned short message_number; 608 unsigned short message_number;
602 unsigned short uvhub_quiesce; 609 unsigned short uvhub_quiesce;
603 short socket_acknowledge_count[DEST_Q_SIZE]; 610 short socket_acknowledge_count[DEST_Q_SIZE];
604 cycles_t send_message; 611 cycles_t send_message;
612 cycles_t period_end;
613 cycles_t period_time;
605 spinlock_t uvhub_lock; 614 spinlock_t uvhub_lock;
606 spinlock_t queue_lock; 615 spinlock_t queue_lock;
616 spinlock_t disable_lock;
607 /* tunables */ 617 /* tunables */
608 int max_concurr; 618 int max_concurr;
609 int max_concurr_const; 619 int max_concurr_const;
@@ -614,9 +624,9 @@ struct bau_control {
614 int complete_threshold; 624 int complete_threshold;
615 int cong_response_us; 625 int cong_response_us;
616 int cong_reps; 626 int cong_reps;
617 int cong_period; 627 cycles_t disabled_period;
618 unsigned long clocks_per_100_usec; 628 int period_giveups;
619 cycles_t period_time; 629 int giveup_limit;
620 long period_requests; 630 long period_requests;
621 struct hub_and_pnode *thp; 631 struct hub_and_pnode *thp;
622}; 632};
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 31f180c21ce9..74fcb963595b 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -60,6 +60,7 @@
60#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 60#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
61#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 61#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
62#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 62#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
63#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
63 64
64 65
65#define PIN_BASED_EXT_INTR_MASK 0x00000001 66#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -281,6 +282,7 @@ enum vmcs_field {
281#define EXIT_REASON_EPT_MISCONFIG 49 282#define EXIT_REASON_EPT_MISCONFIG 49
282#define EXIT_REASON_WBINVD 54 283#define EXIT_REASON_WBINVD 54
283#define EXIT_REASON_XSETBV 55 284#define EXIT_REASON_XSETBV 55
285#define EXIT_REASON_INVPCID 58
284 286
285/* 287/*
286 * Interruption-information format 288 * Interruption-information format
@@ -404,6 +406,7 @@ enum vmcs_field {
404#define VMX_EPTP_WB_BIT (1ull << 14) 406#define VMX_EPTP_WB_BIT (1ull << 14)
405#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 407#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
406#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) 408#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
409#define VMX_EPT_AD_BIT (1ull << 21)
407#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 410#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
408#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 411#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
409#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 412#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -415,11 +418,14 @@ enum vmcs_field {
415#define VMX_EPT_MAX_GAW 0x4 418#define VMX_EPT_MAX_GAW 0x4
416#define VMX_EPT_MT_EPTE_SHIFT 3 419#define VMX_EPT_MT_EPTE_SHIFT 3
417#define VMX_EPT_GAW_EPTP_SHIFT 3 420#define VMX_EPT_GAW_EPTP_SHIFT 3
421#define VMX_EPT_AD_ENABLE_BIT (1ull << 6)
418#define VMX_EPT_DEFAULT_MT 0x6ull 422#define VMX_EPT_DEFAULT_MT 0x6ull
419#define VMX_EPT_READABLE_MASK 0x1ull 423#define VMX_EPT_READABLE_MASK 0x1ull
420#define VMX_EPT_WRITABLE_MASK 0x2ull 424#define VMX_EPT_WRITABLE_MASK 0x2ull
421#define VMX_EPT_EXECUTABLE_MASK 0x4ull 425#define VMX_EPT_EXECUTABLE_MASK 0x4ull
422#define VMX_EPT_IPAT_BIT (1ull << 6) 426#define VMX_EPT_IPAT_BIT (1ull << 6)
427#define VMX_EPT_ACCESS_BIT (1ull << 8)
428#define VMX_EPT_DIRTY_BIT (1ull << 9)
423 429
424#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 430#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
425 431
diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
index 92e54abf89e0..f90f0a587c66 100644
--- a/arch/x86/include/asm/x2apic.h
+++ b/arch/x86/include/asm/x2apic.h
@@ -9,15 +9,6 @@
9#include <asm/ipi.h> 9#include <asm/ipi.h>
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11 11
12/*
13 * Need to use more than cpu 0, because we need more vectors
14 * when MSI-X are used.
15 */
16static const struct cpumask *x2apic_target_cpus(void)
17{
18 return cpu_online_mask;
19}
20
21static int x2apic_apic_id_valid(int apicid) 12static int x2apic_apic_id_valid(int apicid)
22{ 13{
23 return 1; 14 return 1;
@@ -28,15 +19,6 @@ static int x2apic_apic_id_registered(void)
28 return 1; 19 return 1;
29} 20}
30 21
31/*
32 * For now each logical cpu is in its own vector allocation domain.
33 */
34static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
35{
36 cpumask_clear(retmask);
37 cpumask_set_cpu(cpu, retmask);
38}
39
40static void 22static void
41__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) 23__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
42{ 24{
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index c090af10ac7d..38155f667144 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -156,7 +156,6 @@ struct x86_cpuinit_ops {
156/** 156/**
157 * struct x86_platform_ops - platform specific runtime functions 157 * struct x86_platform_ops - platform specific runtime functions
158 * @calibrate_tsc: calibrate TSC 158 * @calibrate_tsc: calibrate TSC
159 * @wallclock_init: init the wallclock device
160 * @get_wallclock: get time from HW clock like RTC etc. 159 * @get_wallclock: get time from HW clock like RTC etc.
161 * @set_wallclock: set time back to HW clock 160 * @set_wallclock: set time back to HW clock
162 * @is_untracked_pat_range exclude from PAT logic 161 * @is_untracked_pat_range exclude from PAT logic
@@ -164,10 +163,10 @@ struct x86_cpuinit_ops {
164 * @i8042_detect pre-detect if i8042 controller exists 163 * @i8042_detect pre-detect if i8042 controller exists
165 * @save_sched_clock_state: save state for sched_clock() on suspend 164 * @save_sched_clock_state: save state for sched_clock() on suspend
166 * @restore_sched_clock_state: restore state for sched_clock() on resume 165 * @restore_sched_clock_state: restore state for sched_clock() on resume
 166 * @apic_post_init: adjust apic if needed
167 */ 167 */
168struct x86_platform_ops { 168struct x86_platform_ops {
169 unsigned long (*calibrate_tsc)(void); 169 unsigned long (*calibrate_tsc)(void);
170 void (*wallclock_init)(void);
171 unsigned long (*get_wallclock)(void); 170 unsigned long (*get_wallclock)(void);
172 int (*set_wallclock)(unsigned long nowtime); 171 int (*set_wallclock)(unsigned long nowtime);
173 void (*iommu_shutdown)(void); 172 void (*iommu_shutdown)(void);
@@ -177,6 +176,7 @@ struct x86_platform_ops {
177 int (*i8042_detect)(void); 176 int (*i8042_detect)(void);
178 void (*save_sched_clock_state)(void); 177 void (*save_sched_clock_state)(void);
179 void (*restore_sched_clock_state)(void); 178 void (*restore_sched_clock_state)(void);
179 void (*apic_post_init)(void);
180}; 180};
181 181
182struct pci_dev; 182struct pci_dev;
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 5728852fb90f..59c226d120cd 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -48,6 +48,7 @@
48#include <xen/interface/sched.h> 48#include <xen/interface/sched.h>
49#include <xen/interface/physdev.h> 49#include <xen/interface/physdev.h>
50#include <xen/interface/platform.h> 50#include <xen/interface/platform.h>
51#include <xen/interface/xen-mca.h>
51 52
52/* 53/*
53 * The hypercall asms have to meet several constraints: 54 * The hypercall asms have to meet several constraints:
@@ -302,6 +303,13 @@ HYPERVISOR_set_timer_op(u64 timeout)
302} 303}
303 304
304static inline int 305static inline int
306HYPERVISOR_mca(struct xen_mc *mc_op)
307{
308 mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
309 return _hypercall1(int, mca, mc_op);
310}
311
312static inline int
305HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) 313HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
306{ 314{
307 platform_op->interface_version = XENPF_INTERFACE_VERSION; 315 platform_op->interface_version = XENPF_INTERFACE_VERSION;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1f84794f0759..931280ff8299 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) "SMP alternatives: " fmt
2
1#include <linux/module.h> 3#include <linux/module.h>
2#include <linux/sched.h> 4#include <linux/sched.h>
3#include <linux/mutex.h> 5#include <linux/mutex.h>
@@ -63,8 +65,11 @@ static int __init setup_noreplace_paravirt(char *str)
63__setup("noreplace-paravirt", setup_noreplace_paravirt); 65__setup("noreplace-paravirt", setup_noreplace_paravirt);
64#endif 66#endif
65 67
66#define DPRINTK(fmt, args...) if (debug_alternative) \ 68#define DPRINTK(fmt, ...) \
67 printk(KERN_DEBUG fmt, args) 69do { \
70 if (debug_alternative) \
71 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
72} while (0)
68 73
69/* 74/*
70 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes 75 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
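The rewritten DPRINTK wraps its body in do { ... } while (0) so it behaves as a single statement inside if/else, and ##__VA_ARGS__ allows calls with no arguments after the format string; the pr_fmt() definition added at the top of the file is what gives the later pr_info() calls their "SMP alternatives:" prefix. A stand-alone sketch of the same macro pattern (the names are invented for illustration):

static int debug_example_flag;

#define DPRINTK_EXAMPLE(fmt, ...)					\
do {									\
	if (debug_example_flag)						\
		printk(KERN_DEBUG "example: " fmt, ##__VA_ARGS__);	\
} while (0)

/* Both forms are legal, with or without variadic arguments:
 *	DPRINTK_EXAMPLE("patched %d sites\n", nr);
 *	DPRINTK_EXAMPLE("nothing to patch\n");
 */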
@@ -428,7 +433,7 @@ void alternatives_smp_switch(int smp)
428 * If this still occurs then you should see a hang 433 * If this still occurs then you should see a hang
429 * or crash shortly after this line: 434 * or crash shortly after this line:
430 */ 435 */
431 printk("lockdep: fixing up alternatives.\n"); 436 pr_info("lockdep: fixing up alternatives\n");
432#endif 437#endif
433 438
434 if (noreplace_smp || smp_alt_once || skip_smp_alternatives) 439 if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
@@ -444,14 +449,14 @@ void alternatives_smp_switch(int smp)
444 if (smp == smp_mode) { 449 if (smp == smp_mode) {
445 /* nothing */ 450 /* nothing */
446 } else if (smp) { 451 } else if (smp) {
447 printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); 452 pr_info("switching to SMP code\n");
448 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 453 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
449 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 454 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
450 list_for_each_entry(mod, &smp_alt_modules, next) 455 list_for_each_entry(mod, &smp_alt_modules, next)
451 alternatives_smp_lock(mod->locks, mod->locks_end, 456 alternatives_smp_lock(mod->locks, mod->locks_end,
452 mod->text, mod->text_end); 457 mod->text, mod->text_end);
453 } else { 458 } else {
454 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 459 pr_info("switching to UP code\n");
455 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 460 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
456 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 461 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
457 list_for_each_entry(mod, &smp_alt_modules, next) 462 list_for_each_entry(mod, &smp_alt_modules, next)
@@ -546,7 +551,7 @@ void __init alternative_instructions(void)
546#ifdef CONFIG_SMP 551#ifdef CONFIG_SMP
547 if (smp_alt_once) { 552 if (smp_alt_once) {
548 if (1 == num_possible_cpus()) { 553 if (1 == num_possible_cpus()) {
549 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 554 pr_info("switching to UP code\n");
550 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 555 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
551 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 556 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
552 557
@@ -664,7 +669,7 @@ static int __kprobes stop_machine_text_poke(void *data)
664 struct text_poke_param *p; 669 struct text_poke_param *p;
665 int i; 670 int i;
666 671
667 if (atomic_dec_and_test(&stop_machine_first)) { 672 if (atomic_xchg(&stop_machine_first, 0)) {
668 for (i = 0; i < tpp->nparams; i++) { 673 for (i = 0; i < tpp->nparams; i++) {
669 p = &tpp->params[i]; 674 p = &tpp->params[i];
670 text_poke(p->addr, p->opcode, p->len); 675 text_poke(p->addr, p->opcode, p->len);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index be16854591cc..aadf3359e2a7 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -2,6 +2,9 @@
 2 * Shared support code for AMD K8 northbridges and derivatives. 2 * Shared support code for AMD K8 northbridges and derivatives.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */ 4 */
5
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
5#include <linux/types.h> 8#include <linux/types.h>
6#include <linux/slab.h> 9#include <linux/slab.h>
7#include <linux/init.h> 10#include <linux/init.h>
@@ -16,6 +19,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, 19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, 21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
22 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
19 {} 23 {}
20}; 24};
21EXPORT_SYMBOL(amd_nb_misc_ids); 25EXPORT_SYMBOL(amd_nb_misc_ids);
@@ -258,7 +262,7 @@ void amd_flush_garts(void)
258 } 262 }
259 spin_unlock_irqrestore(&gart_lock, flags); 263 spin_unlock_irqrestore(&gart_lock, flags);
260 if (!flushed) 264 if (!flushed)
261 printk("nothing to flush?\n"); 265 pr_notice("nothing to flush?\n");
262} 266}
263EXPORT_SYMBOL_GPL(amd_flush_garts); 267EXPORT_SYMBOL_GPL(amd_flush_garts);
264 268
@@ -269,11 +273,10 @@ static __init int init_amd_nbs(void)
269 err = amd_cache_northbridges(); 273 err = amd_cache_northbridges();
270 274
271 if (err < 0) 275 if (err < 0)
272 printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); 276 pr_notice("Cannot enumerate AMD northbridges\n");
273 277
274 if (amd_cache_gart() < 0) 278 if (amd_cache_gart() < 0)
275 printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " 279 pr_notice("Cannot initialize GART flush words, GART support disabled\n");
276 "GART support disabled.\n");
277 280
278 return err; 281 return err;
279} 282}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 39a222e094af..24deb3082328 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -75,8 +75,8 @@ physid_mask_t phys_cpu_present_map;
75/* 75/*
76 * Map cpu index to physical APIC ID 76 * Map cpu index to physical APIC ID
77 */ 77 */
78DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); 78DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
79DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); 79DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);
80EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); 80EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
81EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); 81EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
82 82
@@ -88,7 +88,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
88 * used for the mapping. This is where the behaviors of x86_64 and 32 88 * used for the mapping. This is where the behaviors of x86_64 and 32
89 * actually diverge. Let's keep it ugly for now. 89 * actually diverge. Let's keep it ugly for now.
90 */ 90 */
91DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID); 91DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
92 92
93/* 93/*
94 * Knob to control our willingness to enable the local APIC. 94 * Knob to control our willingness to enable the local APIC.
@@ -2123,6 +2123,42 @@ void default_init_apic_ldr(void)
2123 apic_write(APIC_LDR, val); 2123 apic_write(APIC_LDR, val);
2124} 2124}
2125 2125
2126int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
2127 const struct cpumask *andmask,
2128 unsigned int *apicid)
2129{
2130 unsigned int cpu;
2131
2132 for_each_cpu_and(cpu, cpumask, andmask) {
2133 if (cpumask_test_cpu(cpu, cpu_online_mask))
2134 break;
2135 }
2136
2137 if (likely(cpu < nr_cpu_ids)) {
2138 *apicid = per_cpu(x86_cpu_to_apicid, cpu);
2139 return 0;
2140 }
2141
2142 return -EINVAL;
2143}
2144
2145/*
2146 * Override the generic EOI implementation with an optimized version.
2147 * Only called during early boot when only one CPU is active and with
2148 * interrupts disabled, so we know this does not race with actual APIC driver
2149 * use.
2150 */
2151void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
2152{
2153 struct apic **drv;
2154
2155 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
2156 /* Should happen once for each apic */
2157 WARN_ON((*drv)->eoi_write == eoi_write);
2158 (*drv)->eoi_write = eoi_write;
2159 }
2160}
2161
2126/* 2162/*
2127 * Power management 2163 * Power management
2128 */ 2164 */
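default_cpu_mask_to_apicid_and() now returns an error code and hands the chosen APIC ID back through a pointer instead of overloading BAD_APICID, and apic_set_eoi_write() lets a platform swap in an optimized EOI routine for every registered APIC driver during early boot. A short sketch of calling the new mask helper -- the caller and mask names are illustrative:

static int pick_irq_destination_example(const struct cpumask *domain,
					const struct cpumask *requested)
{
	unsigned int apicid;
	int err;

	err = default_cpu_mask_to_apicid_and(domain, requested, &apicid);
	if (err)
		return err;		/* no online CPU in both masks */

	return apicid;			/* program this destination APIC ID */
}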
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 0e881c46e8c8..00c77cf78e9e 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -36,25 +36,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
36 return 1; 36 return 1;
37} 37}
38 38
39static const struct cpumask *flat_target_cpus(void)
40{
41 return cpu_online_mask;
42}
43
44static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
45{
46 /* Careful. Some cpus do not strictly honor the set of cpus
47 * specified in the interrupt destination when using lowest
48 * priority interrupt delivery mode.
49 *
50 * In particular there was a hyperthreading cpu observed to
51 * deliver interrupts to the wrong hyperthread when only one
52 * hyperthread was specified in the interrupt desitination.
53 */
54 cpumask_clear(retmask);
55 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
56}
57
58/* 39/*
59 * Set up the logical destination ID. 40 * Set up the logical destination ID.
60 * 41 *
@@ -92,7 +73,7 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
92} 73}
93 74
94static void 75static void
95 flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) 76flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
96{ 77{
97 unsigned long mask = cpumask_bits(cpumask)[0]; 78 unsigned long mask = cpumask_bits(cpumask)[0];
98 int cpu = smp_processor_id(); 79 int cpu = smp_processor_id();
@@ -186,7 +167,7 @@ static struct apic apic_flat = {
186 .irq_delivery_mode = dest_LowestPrio, 167 .irq_delivery_mode = dest_LowestPrio,
187 .irq_dest_mode = 1, /* logical */ 168 .irq_dest_mode = 1, /* logical */
188 169
189 .target_cpus = flat_target_cpus, 170 .target_cpus = online_target_cpus,
190 .disable_esr = 0, 171 .disable_esr = 0,
191 .dest_logical = APIC_DEST_LOGICAL, 172 .dest_logical = APIC_DEST_LOGICAL,
192 .check_apicid_used = NULL, 173 .check_apicid_used = NULL,
@@ -210,8 +191,7 @@ static struct apic apic_flat = {
210 .set_apic_id = set_apic_id, 191 .set_apic_id = set_apic_id,
211 .apic_id_mask = 0xFFu << 24, 192 .apic_id_mask = 0xFFu << 24,
212 193
213 .cpu_mask_to_apicid = default_cpu_mask_to_apicid, 194 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
214 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
215 195
216 .send_IPI_mask = flat_send_IPI_mask, 196 .send_IPI_mask = flat_send_IPI_mask,
217 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, 197 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
@@ -262,17 +242,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
262 return 0; 242 return 0;
263} 243}
264 244
265static const struct cpumask *physflat_target_cpus(void)
266{
267 return cpu_online_mask;
268}
269
270static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
271{
272 cpumask_clear(retmask);
273 cpumask_set_cpu(cpu, retmask);
274}
275
276static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) 245static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
277{ 246{
278 default_send_IPI_mask_sequence_phys(cpumask, vector); 247 default_send_IPI_mask_sequence_phys(cpumask, vector);
@@ -294,38 +263,6 @@ static void physflat_send_IPI_all(int vector)
294 physflat_send_IPI_mask(cpu_online_mask, vector); 263 physflat_send_IPI_mask(cpu_online_mask, vector);
295} 264}
296 265
297static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
298{
299 int cpu;
300
301 /*
302 * We're using fixed IRQ delivery, can only return one phys APIC ID.
303 * May as well be the first.
304 */
305 cpu = cpumask_first(cpumask);
306 if ((unsigned)cpu < nr_cpu_ids)
307 return per_cpu(x86_cpu_to_apicid, cpu);
308 else
309 return BAD_APICID;
310}
311
312static unsigned int
313physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
314 const struct cpumask *andmask)
315{
316 int cpu;
317
318 /*
319 * We're using fixed IRQ delivery, can only return one phys APIC ID.
320 * May as well be the first.
321 */
322 for_each_cpu_and(cpu, cpumask, andmask) {
323 if (cpumask_test_cpu(cpu, cpu_online_mask))
324 break;
325 }
326 return per_cpu(x86_cpu_to_apicid, cpu);
327}
328
329static int physflat_probe(void) 266static int physflat_probe(void)
330{ 267{
331 if (apic == &apic_physflat || num_possible_cpus() > 8) 268 if (apic == &apic_physflat || num_possible_cpus() > 8)
@@ -345,13 +282,13 @@ static struct apic apic_physflat = {
345 .irq_delivery_mode = dest_Fixed, 282 .irq_delivery_mode = dest_Fixed,
346 .irq_dest_mode = 0, /* physical */ 283 .irq_dest_mode = 0, /* physical */
347 284
348 .target_cpus = physflat_target_cpus, 285 .target_cpus = online_target_cpus,
349 .disable_esr = 0, 286 .disable_esr = 0,
350 .dest_logical = 0, 287 .dest_logical = 0,
351 .check_apicid_used = NULL, 288 .check_apicid_used = NULL,
352 .check_apicid_present = NULL, 289 .check_apicid_present = NULL,
353 290
354 .vector_allocation_domain = physflat_vector_allocation_domain, 291 .vector_allocation_domain = default_vector_allocation_domain,
355 /* not needed, but shouldn't hurt: */ 292 /* not needed, but shouldn't hurt: */
356 .init_apic_ldr = flat_init_apic_ldr, 293 .init_apic_ldr = flat_init_apic_ldr,
357 294
@@ -370,8 +307,7 @@ static struct apic apic_physflat = {
370 .set_apic_id = set_apic_id, 307 .set_apic_id = set_apic_id,
371 .apic_id_mask = 0xFFu << 24, 308 .apic_id_mask = 0xFFu << 24,
372 309
373 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 310 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
374 .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
375 311
376 .send_IPI_mask = physflat_send_IPI_mask, 312 .send_IPI_mask = physflat_send_IPI_mask,
377 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, 313 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index a6e4c6e06c08..e145f28b4099 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -100,12 +100,12 @@ static unsigned long noop_check_apicid_present(int bit)
100 return physid_isset(bit, phys_cpu_present_map); 100 return physid_isset(bit, phys_cpu_present_map);
101} 101}
102 102
103static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) 103static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask,
104 const struct cpumask *mask)
104{ 105{
105 if (cpu != 0) 106 if (cpu != 0)
106 pr_warning("APIC: Vector allocated for non-BSP cpu\n"); 107 pr_warning("APIC: Vector allocated for non-BSP cpu\n");
107 cpumask_clear(retmask); 108 cpumask_copy(retmask, cpumask_of(cpu));
108 cpumask_set_cpu(cpu, retmask);
109} 109}
110 110
111static u32 noop_apic_read(u32 reg) 111static u32 noop_apic_read(u32 reg)
@@ -159,8 +159,7 @@ struct apic apic_noop = {
159 .set_apic_id = NULL, 159 .set_apic_id = NULL,
160 .apic_id_mask = 0x0F << 24, 160 .apic_id_mask = 0x0F << 24,
161 161
162 .cpu_mask_to_apicid = default_cpu_mask_to_apicid, 162 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
163 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
164 163
165 .send_IPI_mask = noop_send_IPI_mask, 164 .send_IPI_mask = noop_send_IPI_mask,
166 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, 165 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 6ec6d5d297c3..bc552cff2578 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -72,17 +72,6 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
72 return initial_apic_id >> index_msb; 72 return initial_apic_id >> index_msb;
73} 73}
74 74
75static const struct cpumask *numachip_target_cpus(void)
76{
77 return cpu_online_mask;
78}
79
80static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask)
81{
82 cpumask_clear(retmask);
83 cpumask_set_cpu(cpu, retmask);
84}
85
86static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) 75static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
87{ 76{
88 union numachip_csr_g3_ext_irq_gen int_gen; 77 union numachip_csr_g3_ext_irq_gen int_gen;
@@ -157,38 +146,6 @@ static void numachip_send_IPI_self(int vector)
157 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); 146 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
158} 147}
159 148
160static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask)
161{
162 int cpu;
163
164 /*
165 * We're using fixed IRQ delivery, can only return one phys APIC ID.
166 * May as well be the first.
167 */
168 cpu = cpumask_first(cpumask);
169 if (likely((unsigned)cpu < nr_cpu_ids))
170 return per_cpu(x86_cpu_to_apicid, cpu);
171
172 return BAD_APICID;
173}
174
175static unsigned int
176numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
177 const struct cpumask *andmask)
178{
179 int cpu;
180
181 /*
182 * We're using fixed IRQ delivery, can only return one phys APIC ID.
183 * May as well be the first.
184 */
185 for_each_cpu_and(cpu, cpumask, andmask) {
186 if (cpumask_test_cpu(cpu, cpu_online_mask))
187 break;
188 }
189 return per_cpu(x86_cpu_to_apicid, cpu);
190}
191
192static int __init numachip_probe(void) 149static int __init numachip_probe(void)
193{ 150{
194 return apic == &apic_numachip; 151 return apic == &apic_numachip;
@@ -253,13 +210,13 @@ static struct apic apic_numachip __refconst = {
253 .irq_delivery_mode = dest_Fixed, 210 .irq_delivery_mode = dest_Fixed,
254 .irq_dest_mode = 0, /* physical */ 211 .irq_dest_mode = 0, /* physical */
255 212
256 .target_cpus = numachip_target_cpus, 213 .target_cpus = online_target_cpus,
257 .disable_esr = 0, 214 .disable_esr = 0,
258 .dest_logical = 0, 215 .dest_logical = 0,
259 .check_apicid_used = NULL, 216 .check_apicid_used = NULL,
260 .check_apicid_present = NULL, 217 .check_apicid_present = NULL,
261 218
262 .vector_allocation_domain = numachip_vector_allocation_domain, 219 .vector_allocation_domain = default_vector_allocation_domain,
263 .init_apic_ldr = flat_init_apic_ldr, 220 .init_apic_ldr = flat_init_apic_ldr,
264 221
265 .ioapic_phys_id_map = NULL, 222 .ioapic_phys_id_map = NULL,
@@ -277,8 +234,7 @@ static struct apic apic_numachip __refconst = {
277 .set_apic_id = set_apic_id, 234 .set_apic_id = set_apic_id,
278 .apic_id_mask = 0xffU << 24, 235 .apic_id_mask = 0xffU << 24,
279 236
280 .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid, 237 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
281 .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and,
282 238
283 .send_IPI_mask = numachip_send_IPI_mask, 239 .send_IPI_mask = numachip_send_IPI_mask,
284 .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, 240 .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 31fbdbfbf960..d50e3640d5ae 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -26,15 +26,6 @@ static int bigsmp_apic_id_registered(void)
26 return 1; 26 return 1;
27} 27}
28 28
29static const struct cpumask *bigsmp_target_cpus(void)
30{
31#ifdef CONFIG_SMP
32 return cpu_online_mask;
33#else
34 return cpumask_of(0);
35#endif
36}
37
38static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) 29static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
39{ 30{
40 return 0; 31 return 0;
@@ -105,32 +96,6 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
105 return 1; 96 return 1;
106} 97}
107 98
108/* As we are using single CPU as destination, pick only one CPU here */
109static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
110{
111 int cpu = cpumask_first(cpumask);
112
113 if (cpu < nr_cpu_ids)
114 return cpu_physical_id(cpu);
115 return BAD_APICID;
116}
117
118static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
119 const struct cpumask *andmask)
120{
121 int cpu;
122
123 /*
124 * We're using fixed IRQ delivery, can only return one phys APIC ID.
125 * May as well be the first.
126 */
127 for_each_cpu_and(cpu, cpumask, andmask) {
128 if (cpumask_test_cpu(cpu, cpu_online_mask))
129 return cpu_physical_id(cpu);
130 }
131 return BAD_APICID;
132}
133
134static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 99static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
135{ 100{
136 return cpuid_apic >> index_msb; 101 return cpuid_apic >> index_msb;
@@ -177,12 +142,6 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
177 { } /* NULL entry stops DMI scanning */ 142 { } /* NULL entry stops DMI scanning */
178}; 143};
179 144
180static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask)
181{
182 cpumask_clear(retmask);
183 cpumask_set_cpu(cpu, retmask);
184}
185
186static int probe_bigsmp(void) 145static int probe_bigsmp(void)
187{ 146{
188 if (def_to_bigsmp) 147 if (def_to_bigsmp)
@@ -205,13 +164,13 @@ static struct apic apic_bigsmp = {
205 /* phys delivery to target CPU: */ 164 /* phys delivery to target CPU: */
206 .irq_dest_mode = 0, 165 .irq_dest_mode = 0,
207 166
208 .target_cpus = bigsmp_target_cpus, 167 .target_cpus = default_target_cpus,
209 .disable_esr = 1, 168 .disable_esr = 1,
210 .dest_logical = 0, 169 .dest_logical = 0,
211 .check_apicid_used = bigsmp_check_apicid_used, 170 .check_apicid_used = bigsmp_check_apicid_used,
212 .check_apicid_present = bigsmp_check_apicid_present, 171 .check_apicid_present = bigsmp_check_apicid_present,
213 172
214 .vector_allocation_domain = bigsmp_vector_allocation_domain, 173 .vector_allocation_domain = default_vector_allocation_domain,
215 .init_apic_ldr = bigsmp_init_apic_ldr, 174 .init_apic_ldr = bigsmp_init_apic_ldr,
216 175
217 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, 176 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
@@ -229,8 +188,7 @@ static struct apic apic_bigsmp = {
229 .set_apic_id = NULL, 188 .set_apic_id = NULL,
230 .apic_id_mask = 0xFF << 24, 189 .apic_id_mask = 0xFF << 24,
231 190
232 .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid, 191 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
233 .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
234 192
235 .send_IPI_mask = bigsmp_send_IPI_mask, 193 .send_IPI_mask = bigsmp_send_IPI_mask,
236 .send_IPI_mask_allbutself = NULL, 194 .send_IPI_mask_allbutself = NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index db4ab1be3c79..0874799a98c6 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -394,21 +394,6 @@ static void es7000_enable_apic_mode(void)
394 WARN(1, "Command failed, status = %x\n", mip_status); 394 WARN(1, "Command failed, status = %x\n", mip_status);
395} 395}
396 396
397static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask)
398{
399 /* Careful. Some cpus do not strictly honor the set of cpus
400 * specified in the interrupt destination when using lowest
401 * priority interrupt delivery mode.
402 *
403 * In particular there was a hyperthreading cpu observed to
404 * deliver interrupts to the wrong hyperthread when only one
405 * hyperthread was specified in the interrupt desitination.
406 */
407 cpumask_clear(retmask);
408 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
409}
410
411
412static void es7000_wait_for_init_deassert(atomic_t *deassert) 397static void es7000_wait_for_init_deassert(atomic_t *deassert)
413{ 398{
414 while (!atomic_read(deassert)) 399 while (!atomic_read(deassert))
@@ -540,45 +525,49 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
540 return 1; 525 return 1;
541} 526}
542 527
543static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) 528static inline int
529es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
544{ 530{
545 unsigned int round = 0; 531 unsigned int round = 0;
546 int cpu, uninitialized_var(apicid); 532 unsigned int cpu, uninitialized_var(apicid);
547 533
548 /* 534 /*
549 * The cpus in the mask must all be on the apic cluster. 535 * The cpus in the mask must all be on the apic cluster.
550 */ 536 */
551 for_each_cpu(cpu, cpumask) { 537 for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
552 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); 538 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
553 539
554 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 540 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
555 WARN(1, "Not a valid mask!"); 541 WARN(1, "Not a valid mask!");
556 542
557 return BAD_APICID; 543 return -EINVAL;
558 } 544 }
559 apicid = new_apicid; 545 apicid |= new_apicid;
560 round++; 546 round++;
561 } 547 }
562 return apicid; 548 if (!round)
549 return -EINVAL;
550 *dest_id = apicid;
551 return 0;
563} 552}
564 553
565static unsigned int 554static int
566es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, 555es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
567 const struct cpumask *andmask) 556 const struct cpumask *andmask,
557 unsigned int *apicid)
568{ 558{
569 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
570 cpumask_var_t cpumask; 559 cpumask_var_t cpumask;
560 *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
571 561
572 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 562 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
573 return apicid; 563 return 0;
574 564
575 cpumask_and(cpumask, inmask, andmask); 565 cpumask_and(cpumask, inmask, andmask);
576 cpumask_and(cpumask, cpumask, cpu_online_mask); 566 es7000_cpu_mask_to_apicid(cpumask, apicid);
577 apicid = es7000_cpu_mask_to_apicid(cpumask);
578 567
579 free_cpumask_var(cpumask); 568 free_cpumask_var(cpumask);
580 569
581 return apicid; 570 return 0;
582} 571}
583 572
584static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) 573static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -638,7 +627,7 @@ static struct apic __refdata apic_es7000_cluster = {
638 .check_apicid_used = es7000_check_apicid_used, 627 .check_apicid_used = es7000_check_apicid_used,
639 .check_apicid_present = es7000_check_apicid_present, 628 .check_apicid_present = es7000_check_apicid_present,
640 629
641 .vector_allocation_domain = es7000_vector_allocation_domain, 630 .vector_allocation_domain = flat_vector_allocation_domain,
642 .init_apic_ldr = es7000_init_apic_ldr_cluster, 631 .init_apic_ldr = es7000_init_apic_ldr_cluster,
643 632
644 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 633 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
@@ -656,7 +645,6 @@ static struct apic __refdata apic_es7000_cluster = {
656 .set_apic_id = NULL, 645 .set_apic_id = NULL,
657 .apic_id_mask = 0xFF << 24, 646 .apic_id_mask = 0xFF << 24,
658 647
659 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
660 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, 648 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
661 649
662 .send_IPI_mask = es7000_send_IPI_mask, 650 .send_IPI_mask = es7000_send_IPI_mask,
@@ -705,7 +693,7 @@ static struct apic __refdata apic_es7000 = {
705 .check_apicid_used = es7000_check_apicid_used, 693 .check_apicid_used = es7000_check_apicid_used,
706 .check_apicid_present = es7000_check_apicid_present, 694 .check_apicid_present = es7000_check_apicid_present,
707 695
708 .vector_allocation_domain = es7000_vector_allocation_domain, 696 .vector_allocation_domain = flat_vector_allocation_domain,
709 .init_apic_ldr = es7000_init_apic_ldr, 697 .init_apic_ldr = es7000_init_apic_ldr,
710 698
711 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 699 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
@@ -723,7 +711,6 @@ static struct apic __refdata apic_es7000 = {
723 .set_apic_id = NULL, 711 .set_apic_id = NULL,
724 .apic_id_mask = 0xFF << 24, 712 .apic_id_mask = 0xFF << 24,
725 713
726 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
727 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, 714 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
728 715
729 .send_IPI_mask = es7000_send_IPI_mask, 716 .send_IPI_mask = es7000_send_IPI_mask,
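Note: the es7000 hunks above show the new cpu_mask_to_apicid_and() contract used throughout this patch: the function now returns 0 or a negative errno and hands the APIC ID back through an output parameter, instead of returning BAD_APICID. The following is a minimal userspace model of that contract, not the kernel code; the plain bitmask arguments and mask_to_apicid() name are stand-ins for struct cpumask and the per-driver helpers.

/* Model of the error-returning, out-parameter lookup introduced above. */
#include <errno.h>
#include <stdio.h>

static int mask_to_apicid(unsigned long online, unsigned long mask,
			  unsigned int *dest_id)
{
	unsigned long usable = online & mask;

	if (!usable)
		return -EINVAL;	/* nothing to route to: caller must bail out */

	/* pick the first usable CPU; cluster drivers OR in more members */
	*dest_id = (unsigned int)__builtin_ctzl(usable);
	return 0;
}

int main(void)
{
	unsigned int apicid;

	if (!mask_to_apicid(0xf /* online */, 0x4 /* requested */, &apicid))
		printf("dest apicid: %u\n", apicid);

	if (mask_to_apicid(0xf, 0x0, &apicid) < 0)
		printf("empty mask rejected with -EINVAL\n");

	return 0;
}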
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5f0ff597437c..406eee784684 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -448,8 +448,8 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
448 448
449 entry = alloc_irq_pin_list(node); 449 entry = alloc_irq_pin_list(node);
450 if (!entry) { 450 if (!entry) {
451 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", 451 pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
452 node, apic, pin); 452 node, apic, pin);
453 return -ENOMEM; 453 return -ENOMEM;
454 } 454 }
455 entry->apic = apic; 455 entry->apic = apic;
@@ -661,7 +661,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
661 ioapic_mask_entry(apic, pin); 661 ioapic_mask_entry(apic, pin);
662 entry = ioapic_read_entry(apic, pin); 662 entry = ioapic_read_entry(apic, pin);
663 if (entry.irr) 663 if (entry.irr)
664 printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n", 664 pr_err("Unable to reset IRR for apic: %d, pin :%d\n",
665 mpc_ioapic_id(apic), pin); 665 mpc_ioapic_id(apic), pin);
666} 666}
667 667
@@ -895,7 +895,7 @@ static int irq_polarity(int idx)
895 } 895 }
896 case 2: /* reserved */ 896 case 2: /* reserved */
897 { 897 {
898 printk(KERN_WARNING "broken BIOS!!\n"); 898 pr_warn("broken BIOS!!\n");
899 polarity = 1; 899 polarity = 1;
900 break; 900 break;
901 } 901 }
@@ -906,7 +906,7 @@ static int irq_polarity(int idx)
906 } 906 }
907 default: /* invalid */ 907 default: /* invalid */
908 { 908 {
909 printk(KERN_WARNING "broken BIOS!!\n"); 909 pr_warn("broken BIOS!!\n");
910 polarity = 1; 910 polarity = 1;
911 break; 911 break;
912 } 912 }
@@ -948,7 +948,7 @@ static int irq_trigger(int idx)
948 } 948 }
949 default: 949 default:
950 { 950 {
951 printk(KERN_WARNING "broken BIOS!!\n"); 951 pr_warn("broken BIOS!!\n");
952 trigger = 1; 952 trigger = 1;
953 break; 953 break;
954 } 954 }
@@ -962,7 +962,7 @@ static int irq_trigger(int idx)
962 } 962 }
963 case 2: /* reserved */ 963 case 2: /* reserved */
964 { 964 {
965 printk(KERN_WARNING "broken BIOS!!\n"); 965 pr_warn("broken BIOS!!\n");
966 trigger = 1; 966 trigger = 1;
967 break; 967 break;
968 } 968 }
@@ -973,7 +973,7 @@ static int irq_trigger(int idx)
973 } 973 }
974 default: /* invalid */ 974 default: /* invalid */
975 { 975 {
976 printk(KERN_WARNING "broken BIOS!!\n"); 976 pr_warn("broken BIOS!!\n");
977 trigger = 0; 977 trigger = 0;
978 break; 978 break;
979 } 979 }
@@ -991,7 +991,7 @@ static int pin_2_irq(int idx, int apic, int pin)
991 * Debugging check, we are in big trouble if this message pops up! 991 * Debugging check, we are in big trouble if this message pops up!
992 */ 992 */
993 if (mp_irqs[idx].dstirq != pin) 993 if (mp_irqs[idx].dstirq != pin)
994 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 994 pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
995 995
996 if (test_bit(bus, mp_bus_not_pci)) { 996 if (test_bit(bus, mp_bus_not_pci)) {
997 irq = mp_irqs[idx].srcbusirq; 997 irq = mp_irqs[idx].srcbusirq;
@@ -1112,8 +1112,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1112 * 0x80, because int 0x80 is hm, kind of importantish. ;) 1112 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1113 */ 1113 */
1114 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; 1114 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
1115 static int current_offset = VECTOR_OFFSET_START % 8; 1115 static int current_offset = VECTOR_OFFSET_START % 16;
1116 unsigned int old_vector;
1117 int cpu, err; 1116 int cpu, err;
1118 cpumask_var_t tmp_mask; 1117 cpumask_var_t tmp_mask;
1119 1118
@@ -1123,35 +1122,45 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1123 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) 1122 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
1124 return -ENOMEM; 1123 return -ENOMEM;
1125 1124
1126 old_vector = cfg->vector;
1127 if (old_vector) {
1128 cpumask_and(tmp_mask, mask, cpu_online_mask);
1129 cpumask_and(tmp_mask, cfg->domain, tmp_mask);
1130 if (!cpumask_empty(tmp_mask)) {
1131 free_cpumask_var(tmp_mask);
1132 return 0;
1133 }
1134 }
1135
1136 /* Only try and allocate irqs on cpus that are present */ 1125 /* Only try and allocate irqs on cpus that are present */
1137 err = -ENOSPC; 1126 err = -ENOSPC;
1138 for_each_cpu_and(cpu, mask, cpu_online_mask) { 1127 cpumask_clear(cfg->old_domain);
1139 int new_cpu; 1128 cpu = cpumask_first_and(mask, cpu_online_mask);
1140 int vector, offset; 1129 while (cpu < nr_cpu_ids) {
1130 int new_cpu, vector, offset;
1141 1131
1142 apic->vector_allocation_domain(cpu, tmp_mask); 1132 apic->vector_allocation_domain(cpu, tmp_mask, mask);
1133
1134 if (cpumask_subset(tmp_mask, cfg->domain)) {
1135 err = 0;
1136 if (cpumask_equal(tmp_mask, cfg->domain))
1137 break;
1138 /*
1139 * New cpumask using the vector is a proper subset of
1140 * the current in use mask. So cleanup the vector
1141 * allocation for the members that are not used anymore.
1142 */
1143 cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
1144 cfg->move_in_progress = 1;
1145 cpumask_and(cfg->domain, cfg->domain, tmp_mask);
1146 break;
1147 }
1143 1148
1144 vector = current_vector; 1149 vector = current_vector;
1145 offset = current_offset; 1150 offset = current_offset;
1146next: 1151next:
1147 vector += 8; 1152 vector += 16;
1148 if (vector >= first_system_vector) { 1153 if (vector >= first_system_vector) {
1149 /* If out of vectors on large boxen, must share them. */ 1154 offset = (offset + 1) % 16;
1150 offset = (offset + 1) % 8;
1151 vector = FIRST_EXTERNAL_VECTOR + offset; 1155 vector = FIRST_EXTERNAL_VECTOR + offset;
1152 } 1156 }
1153 if (unlikely(current_vector == vector)) 1157
1158 if (unlikely(current_vector == vector)) {
1159 cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
1160 cpumask_andnot(tmp_mask, mask, cfg->old_domain);
1161 cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
1154 continue; 1162 continue;
1163 }
1155 1164
1156 if (test_bit(vector, used_vectors)) 1165 if (test_bit(vector, used_vectors))
1157 goto next; 1166 goto next;
@@ -1162,7 +1171,7 @@ next:
1162 /* Found one! */ 1171 /* Found one! */
1163 current_vector = vector; 1172 current_vector = vector;
1164 current_offset = offset; 1173 current_offset = offset;
1165 if (old_vector) { 1174 if (cfg->vector) {
1166 cfg->move_in_progress = 1; 1175 cfg->move_in_progress = 1;
1167 cpumask_copy(cfg->old_domain, cfg->domain); 1176 cpumask_copy(cfg->old_domain, cfg->domain);
1168 } 1177 }
@@ -1346,18 +1355,18 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
1346 1355
1347 if (!IO_APIC_IRQ(irq)) 1356 if (!IO_APIC_IRQ(irq))
1348 return; 1357 return;
1349 /*
1350 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1351 * controllers like 8259. Now that IO-APIC can handle this irq, update
1352 * the cfg->domain.
1353 */
1354 if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
1355 apic->vector_allocation_domain(0, cfg->domain);
1356 1358
1357 if (assign_irq_vector(irq, cfg, apic->target_cpus())) 1359 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1358 return; 1360 return;
1359 1361
1360 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 1362 if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(),
1363 &dest)) {
1364 pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
1365 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
1366 __clear_irq_vector(irq, cfg);
1367
1368 return;
1369 }
1361 1370
1362 apic_printk(APIC_VERBOSE,KERN_DEBUG 1371 apic_printk(APIC_VERBOSE,KERN_DEBUG
1363 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1372 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@ -1366,7 +1375,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
1366 cfg->vector, irq, attr->trigger, attr->polarity, dest); 1375 cfg->vector, irq, attr->trigger, attr->polarity, dest);
1367 1376
1368 if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { 1377 if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
1369 pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1378 pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1370 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); 1379 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
1371 __clear_irq_vector(irq, cfg); 1380 __clear_irq_vector(irq, cfg);
1372 1381
@@ -1469,9 +1478,10 @@ void setup_IO_APIC_irq_extra(u32 gsi)
1469 * Set up the timer pin, possibly with the 8259A-master behind. 1478 * Set up the timer pin, possibly with the 8259A-master behind.
1470 */ 1479 */
1471static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, 1480static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
1472 unsigned int pin, int vector) 1481 unsigned int pin, int vector)
1473{ 1482{
1474 struct IO_APIC_route_entry entry; 1483 struct IO_APIC_route_entry entry;
1484 unsigned int dest;
1475 1485
1476 if (irq_remapping_enabled) 1486 if (irq_remapping_enabled)
1477 return; 1487 return;
@@ -1482,9 +1492,13 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
1482 * We use logical delivery to get the timer IRQ 1492 * We use logical delivery to get the timer IRQ
1483 * to the first CPU. 1493 * to the first CPU.
1484 */ 1494 */
1495 if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
1496 apic->target_cpus(), &dest)))
1497 dest = BAD_APICID;
1498
1485 entry.dest_mode = apic->irq_dest_mode; 1499 entry.dest_mode = apic->irq_dest_mode;
1486 entry.mask = 0; /* don't mask IRQ for edge */ 1500 entry.mask = 0; /* don't mask IRQ for edge */
1487 entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); 1501 entry.dest = dest;
1488 entry.delivery_mode = apic->irq_delivery_mode; 1502 entry.delivery_mode = apic->irq_delivery_mode;
1489 entry.polarity = 0; 1503 entry.polarity = 0;
1490 entry.trigger = 0; 1504 entry.trigger = 0;
@@ -1521,7 +1535,6 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1521 reg_03.raw = io_apic_read(ioapic_idx, 3); 1535 reg_03.raw = io_apic_read(ioapic_idx, 3);
1522 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1536 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1523 1537
1524 printk("\n");
1525 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); 1538 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
1526 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1539 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1527 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1540 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
@@ -1578,7 +1591,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1578 i, 1591 i,
1579 ir_entry->index 1592 ir_entry->index
1580 ); 1593 );
1581 printk("%1d %1d %1d %1d %1d " 1594 pr_cont("%1d %1d %1d %1d %1d "
1582 "%1d %1d %X %02X\n", 1595 "%1d %1d %X %02X\n",
1583 ir_entry->format, 1596 ir_entry->format,
1584 ir_entry->mask, 1597 ir_entry->mask,
@@ -1598,7 +1611,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1598 i, 1611 i,
1599 entry.dest 1612 entry.dest
1600 ); 1613 );
1601 printk("%1d %1d %1d %1d %1d " 1614 pr_cont("%1d %1d %1d %1d %1d "
1602 "%1d %1d %02X\n", 1615 "%1d %1d %02X\n",
1603 entry.mask, 1616 entry.mask,
1604 entry.trigger, 1617 entry.trigger,
@@ -1651,8 +1664,8 @@ __apicdebuginit(void) print_IO_APICs(void)
1651 continue; 1664 continue;
1652 printk(KERN_DEBUG "IRQ%d ", irq); 1665 printk(KERN_DEBUG "IRQ%d ", irq);
1653 for_each_irq_pin(entry, cfg->irq_2_pin) 1666 for_each_irq_pin(entry, cfg->irq_2_pin)
1654 printk("-> %d:%d", entry->apic, entry->pin); 1667 pr_cont("-> %d:%d", entry->apic, entry->pin);
1655 printk("\n"); 1668 pr_cont("\n");
1656 } 1669 }
1657 1670
1658 printk(KERN_INFO ".................................... done.\n"); 1671 printk(KERN_INFO ".................................... done.\n");
@@ -1665,9 +1678,9 @@ __apicdebuginit(void) print_APIC_field(int base)
1665 printk(KERN_DEBUG); 1678 printk(KERN_DEBUG);
1666 1679
1667 for (i = 0; i < 8; i++) 1680 for (i = 0; i < 8; i++)
1668 printk(KERN_CONT "%08x", apic_read(base + i*0x10)); 1681 pr_cont("%08x", apic_read(base + i*0x10));
1669 1682
1670 printk(KERN_CONT "\n"); 1683 pr_cont("\n");
1671} 1684}
1672 1685
1673__apicdebuginit(void) print_local_APIC(void *dummy) 1686__apicdebuginit(void) print_local_APIC(void *dummy)
@@ -1769,7 +1782,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1769 printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); 1782 printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
1770 } 1783 }
1771 } 1784 }
1772 printk("\n"); 1785 pr_cont("\n");
1773} 1786}
1774 1787
1775__apicdebuginit(void) print_local_APICs(int maxcpu) 1788__apicdebuginit(void) print_local_APICs(int maxcpu)
@@ -2065,7 +2078,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
2065 reg_00.raw = io_apic_read(ioapic_idx, 0); 2078 reg_00.raw = io_apic_read(ioapic_idx, 0);
2066 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2079 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2067 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) 2080 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
2068 printk("could not set ID!\n"); 2081 pr_cont("could not set ID!\n");
2069 else 2082 else
2070 apic_printk(APIC_VERBOSE, " ok.\n"); 2083 apic_printk(APIC_VERBOSE, " ok.\n");
2071 } 2084 }
@@ -2210,71 +2223,6 @@ void send_cleanup_vector(struct irq_cfg *cfg)
2210 cfg->move_in_progress = 0; 2223 cfg->move_in_progress = 0;
2211} 2224}
2212 2225
2213static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
2214{
2215 int apic, pin;
2216 struct irq_pin_list *entry;
2217 u8 vector = cfg->vector;
2218
2219 for_each_irq_pin(entry, cfg->irq_2_pin) {
2220 unsigned int reg;
2221
2222 apic = entry->apic;
2223 pin = entry->pin;
2224 /*
2225 * With interrupt-remapping, destination information comes
2226 * from interrupt-remapping table entry.
2227 */
2228 if (!irq_remapped(cfg))
2229 io_apic_write(apic, 0x11 + pin*2, dest);
2230 reg = io_apic_read(apic, 0x10 + pin*2);
2231 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2232 reg |= vector;
2233 io_apic_modify(apic, 0x10 + pin*2, reg);
2234 }
2235}
2236
2237/*
2238 * Either sets data->affinity to a valid value, and returns
2239 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2240 * leaves data->affinity untouched.
2241 */
2242int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2243 unsigned int *dest_id)
2244{
2245 struct irq_cfg *cfg = data->chip_data;
2246
2247 if (!cpumask_intersects(mask, cpu_online_mask))
2248 return -1;
2249
2250 if (assign_irq_vector(data->irq, data->chip_data, mask))
2251 return -1;
2252
2253 cpumask_copy(data->affinity, mask);
2254
2255 *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
2256 return 0;
2257}
2258
2259static int
2260ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2261 bool force)
2262{
2263 unsigned int dest, irq = data->irq;
2264 unsigned long flags;
2265 int ret;
2266
2267 raw_spin_lock_irqsave(&ioapic_lock, flags);
2268 ret = __ioapic_set_affinity(data, mask, &dest);
2269 if (!ret) {
2270 /* Only the high 8 bits are valid. */
2271 dest = SET_APIC_LOGICAL_ID(dest);
2272 __target_IO_APIC_irq(irq, dest, data->chip_data);
2273 }
2274 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2275 return ret;
2276}
2277
2278asmlinkage void smp_irq_move_cleanup_interrupt(void) 2226asmlinkage void smp_irq_move_cleanup_interrupt(void)
2279{ 2227{
2280 unsigned vector, me; 2228 unsigned vector, me;
@@ -2362,6 +2310,87 @@ void irq_force_complete_move(int irq)
2362static inline void irq_complete_move(struct irq_cfg *cfg) { } 2310static inline void irq_complete_move(struct irq_cfg *cfg) { }
2363#endif 2311#endif
2364 2312
2313static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
2314{
2315 int apic, pin;
2316 struct irq_pin_list *entry;
2317 u8 vector = cfg->vector;
2318
2319 for_each_irq_pin(entry, cfg->irq_2_pin) {
2320 unsigned int reg;
2321
2322 apic = entry->apic;
2323 pin = entry->pin;
2324 /*
2325 * With interrupt-remapping, destination information comes
2326 * from interrupt-remapping table entry.
2327 */
2328 if (!irq_remapped(cfg))
2329 io_apic_write(apic, 0x11 + pin*2, dest);
2330 reg = io_apic_read(apic, 0x10 + pin*2);
2331 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2332 reg |= vector;
2333 io_apic_modify(apic, 0x10 + pin*2, reg);
2334 }
2335}
2336
2337/*
2338 * Either sets data->affinity to a valid value, and returns
2339 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2340 * leaves data->affinity untouched.
2341 */
2342int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2343 unsigned int *dest_id)
2344{
2345 struct irq_cfg *cfg = data->chip_data;
2346 unsigned int irq = data->irq;
2347 int err;
2348
2349 if (!config_enabled(CONFIG_SMP))
2350 return -1;
2351
2352 if (!cpumask_intersects(mask, cpu_online_mask))
2353 return -EINVAL;
2354
2355 err = assign_irq_vector(irq, cfg, mask);
2356 if (err)
2357 return err;
2358
2359 err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
2360 if (err) {
2361 if (assign_irq_vector(irq, cfg, data->affinity))
2362 pr_err("Failed to recover vector for irq %d\n", irq);
2363 return err;
2364 }
2365
2366 cpumask_copy(data->affinity, mask);
2367
2368 return 0;
2369}
2370
2371static int
2372ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2373 bool force)
2374{
2375 unsigned int dest, irq = data->irq;
2376 unsigned long flags;
2377 int ret;
2378
2379 if (!config_enabled(CONFIG_SMP))
2380 return -1;
2381
2382 raw_spin_lock_irqsave(&ioapic_lock, flags);
2383 ret = __ioapic_set_affinity(data, mask, &dest);
2384 if (!ret) {
2385 /* Only the high 8 bits are valid. */
2386 dest = SET_APIC_LOGICAL_ID(dest);
2387 __target_IO_APIC_irq(irq, dest, data->chip_data);
2388 ret = IRQ_SET_MASK_OK_NOCOPY;
2389 }
2390 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2391 return ret;
2392}
2393
2365static void ack_apic_edge(struct irq_data *data) 2394static void ack_apic_edge(struct irq_data *data)
2366{ 2395{
2367 irq_complete_move(data->chip_data); 2396 irq_complete_move(data->chip_data);
@@ -2541,9 +2570,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
2541 chip->irq_ack = ir_ack_apic_edge; 2570 chip->irq_ack = ir_ack_apic_edge;
2542 chip->irq_eoi = ir_ack_apic_level; 2571 chip->irq_eoi = ir_ack_apic_level;
2543 2572
2544#ifdef CONFIG_SMP
2545 chip->irq_set_affinity = set_remapped_irq_affinity; 2573 chip->irq_set_affinity = set_remapped_irq_affinity;
2546#endif
2547} 2574}
2548#endif /* CONFIG_IRQ_REMAP */ 2575#endif /* CONFIG_IRQ_REMAP */
2549 2576
@@ -2554,9 +2581,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
2554 .irq_unmask = unmask_ioapic_irq, 2581 .irq_unmask = unmask_ioapic_irq,
2555 .irq_ack = ack_apic_edge, 2582 .irq_ack = ack_apic_edge,
2556 .irq_eoi = ack_apic_level, 2583 .irq_eoi = ack_apic_level,
2557#ifdef CONFIG_SMP
2558 .irq_set_affinity = ioapic_set_affinity, 2584 .irq_set_affinity = ioapic_set_affinity,
2559#endif
2560 .irq_retrigger = ioapic_retrigger_irq, 2585 .irq_retrigger = ioapic_retrigger_irq,
2561}; 2586};
2562 2587
@@ -3038,7 +3063,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3038 if (err) 3063 if (err)
3039 return err; 3064 return err;
3040 3065
3041 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3066 err = apic->cpu_mask_to_apicid_and(cfg->domain,
3067 apic->target_cpus(), &dest);
3068 if (err)
3069 return err;
3042 3070
3043 if (irq_remapped(cfg)) { 3071 if (irq_remapped(cfg)) {
3044 compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); 3072 compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
@@ -3072,7 +3100,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3072 return err; 3100 return err;
3073} 3101}
3074 3102
3075#ifdef CONFIG_SMP
3076static int 3103static int
3077msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) 3104msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3078{ 3105{
@@ -3092,9 +3119,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3092 3119
3093 __write_msi_msg(data->msi_desc, &msg); 3120 __write_msi_msg(data->msi_desc, &msg);
3094 3121
3095 return 0; 3122 return IRQ_SET_MASK_OK_NOCOPY;
3096} 3123}
3097#endif /* CONFIG_SMP */
3098 3124
3099/* 3125/*
3100 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, 3126 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
@@ -3105,9 +3131,7 @@ static struct irq_chip msi_chip = {
3105 .irq_unmask = unmask_msi_irq, 3131 .irq_unmask = unmask_msi_irq,
3106 .irq_mask = mask_msi_irq, 3132 .irq_mask = mask_msi_irq,
3107 .irq_ack = ack_apic_edge, 3133 .irq_ack = ack_apic_edge,
3108#ifdef CONFIG_SMP
3109 .irq_set_affinity = msi_set_affinity, 3134 .irq_set_affinity = msi_set_affinity,
3110#endif
3111 .irq_retrigger = ioapic_retrigger_irq, 3135 .irq_retrigger = ioapic_retrigger_irq,
3112}; 3136};
3113 3137
@@ -3192,7 +3216,6 @@ void native_teardown_msi_irq(unsigned int irq)
3192} 3216}
3193 3217
3194#ifdef CONFIG_DMAR_TABLE 3218#ifdef CONFIG_DMAR_TABLE
3195#ifdef CONFIG_SMP
3196static int 3219static int
3197dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, 3220dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3198 bool force) 3221 bool force)
@@ -3214,19 +3237,15 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3214 3237
3215 dmar_msi_write(irq, &msg); 3238 dmar_msi_write(irq, &msg);
3216 3239
3217 return 0; 3240 return IRQ_SET_MASK_OK_NOCOPY;
3218} 3241}
3219 3242
3220#endif /* CONFIG_SMP */
3221
3222static struct irq_chip dmar_msi_type = { 3243static struct irq_chip dmar_msi_type = {
3223 .name = "DMAR_MSI", 3244 .name = "DMAR_MSI",
3224 .irq_unmask = dmar_msi_unmask, 3245 .irq_unmask = dmar_msi_unmask,
3225 .irq_mask = dmar_msi_mask, 3246 .irq_mask = dmar_msi_mask,
3226 .irq_ack = ack_apic_edge, 3247 .irq_ack = ack_apic_edge,
3227#ifdef CONFIG_SMP
3228 .irq_set_affinity = dmar_msi_set_affinity, 3248 .irq_set_affinity = dmar_msi_set_affinity,
3229#endif
3230 .irq_retrigger = ioapic_retrigger_irq, 3249 .irq_retrigger = ioapic_retrigger_irq,
3231}; 3250};
3232 3251
@@ -3247,7 +3266,6 @@ int arch_setup_dmar_msi(unsigned int irq)
3247 3266
3248#ifdef CONFIG_HPET_TIMER 3267#ifdef CONFIG_HPET_TIMER
3249 3268
3250#ifdef CONFIG_SMP
3251static int hpet_msi_set_affinity(struct irq_data *data, 3269static int hpet_msi_set_affinity(struct irq_data *data,
3252 const struct cpumask *mask, bool force) 3270 const struct cpumask *mask, bool force)
3253{ 3271{
@@ -3267,19 +3285,15 @@ static int hpet_msi_set_affinity(struct irq_data *data,
3267 3285
3268 hpet_msi_write(data->handler_data, &msg); 3286 hpet_msi_write(data->handler_data, &msg);
3269 3287
3270 return 0; 3288 return IRQ_SET_MASK_OK_NOCOPY;
3271} 3289}
3272 3290
3273#endif /* CONFIG_SMP */
3274
3275static struct irq_chip hpet_msi_type = { 3291static struct irq_chip hpet_msi_type = {
3276 .name = "HPET_MSI", 3292 .name = "HPET_MSI",
3277 .irq_unmask = hpet_msi_unmask, 3293 .irq_unmask = hpet_msi_unmask,
3278 .irq_mask = hpet_msi_mask, 3294 .irq_mask = hpet_msi_mask,
3279 .irq_ack = ack_apic_edge, 3295 .irq_ack = ack_apic_edge,
3280#ifdef CONFIG_SMP
3281 .irq_set_affinity = hpet_msi_set_affinity, 3296 .irq_set_affinity = hpet_msi_set_affinity,
3282#endif
3283 .irq_retrigger = ioapic_retrigger_irq, 3297 .irq_retrigger = ioapic_retrigger_irq,
3284}; 3298};
3285 3299
@@ -3314,8 +3328,6 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3314 */ 3328 */
3315#ifdef CONFIG_HT_IRQ 3329#ifdef CONFIG_HT_IRQ
3316 3330
3317#ifdef CONFIG_SMP
3318
3319static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) 3331static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3320{ 3332{
3321 struct ht_irq_msg msg; 3333 struct ht_irq_msg msg;
@@ -3340,25 +3352,23 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3340 return -1; 3352 return -1;
3341 3353
3342 target_ht_irq(data->irq, dest, cfg->vector); 3354 target_ht_irq(data->irq, dest, cfg->vector);
3343 return 0; 3355 return IRQ_SET_MASK_OK_NOCOPY;
3344} 3356}
3345 3357
3346#endif
3347
3348static struct irq_chip ht_irq_chip = { 3358static struct irq_chip ht_irq_chip = {
3349 .name = "PCI-HT", 3359 .name = "PCI-HT",
3350 .irq_mask = mask_ht_irq, 3360 .irq_mask = mask_ht_irq,
3351 .irq_unmask = unmask_ht_irq, 3361 .irq_unmask = unmask_ht_irq,
3352 .irq_ack = ack_apic_edge, 3362 .irq_ack = ack_apic_edge,
3353#ifdef CONFIG_SMP
3354 .irq_set_affinity = ht_set_affinity, 3363 .irq_set_affinity = ht_set_affinity,
3355#endif
3356 .irq_retrigger = ioapic_retrigger_irq, 3364 .irq_retrigger = ioapic_retrigger_irq,
3357}; 3365};
3358 3366
3359int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) 3367int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3360{ 3368{
3361 struct irq_cfg *cfg; 3369 struct irq_cfg *cfg;
3370 struct ht_irq_msg msg;
3371 unsigned dest;
3362 int err; 3372 int err;
3363 3373
3364 if (disable_apic) 3374 if (disable_apic)
@@ -3366,36 +3376,37 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3366 3376
3367 cfg = irq_cfg(irq); 3377 cfg = irq_cfg(irq);
3368 err = assign_irq_vector(irq, cfg, apic->target_cpus()); 3378 err = assign_irq_vector(irq, cfg, apic->target_cpus());
3369 if (!err) { 3379 if (err)
3370 struct ht_irq_msg msg; 3380 return err;
3371 unsigned dest; 3381
3382 err = apic->cpu_mask_to_apicid_and(cfg->domain,
3383 apic->target_cpus(), &dest);
3384 if (err)
3385 return err;
3372 3386
3373 dest = apic->cpu_mask_to_apicid_and(cfg->domain, 3387 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
3374 apic->target_cpus());
3375 3388
3376 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); 3389 msg.address_lo =
3390 HT_IRQ_LOW_BASE |
3391 HT_IRQ_LOW_DEST_ID(dest) |
3392 HT_IRQ_LOW_VECTOR(cfg->vector) |
3393 ((apic->irq_dest_mode == 0) ?
3394 HT_IRQ_LOW_DM_PHYSICAL :
3395 HT_IRQ_LOW_DM_LOGICAL) |
3396 HT_IRQ_LOW_RQEOI_EDGE |
3397 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3398 HT_IRQ_LOW_MT_FIXED :
3399 HT_IRQ_LOW_MT_ARBITRATED) |
3400 HT_IRQ_LOW_IRQ_MASKED;
3377 3401
3378 msg.address_lo = 3402 write_ht_irq_msg(irq, &msg);
3379 HT_IRQ_LOW_BASE |
3380 HT_IRQ_LOW_DEST_ID(dest) |
3381 HT_IRQ_LOW_VECTOR(cfg->vector) |
3382 ((apic->irq_dest_mode == 0) ?
3383 HT_IRQ_LOW_DM_PHYSICAL :
3384 HT_IRQ_LOW_DM_LOGICAL) |
3385 HT_IRQ_LOW_RQEOI_EDGE |
3386 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3387 HT_IRQ_LOW_MT_FIXED :
3388 HT_IRQ_LOW_MT_ARBITRATED) |
3389 HT_IRQ_LOW_IRQ_MASKED;
3390 3403
3391 write_ht_irq_msg(irq, &msg); 3404 irq_set_chip_and_handler_name(irq, &ht_irq_chip,
3405 handle_edge_irq, "edge");
3392 3406
3393 irq_set_chip_and_handler_name(irq, &ht_irq_chip, 3407 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
3394 handle_edge_irq, "edge");
3395 3408
3396 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); 3409 return 0;
3397 }
3398 return err;
3399} 3410}
3400#endif /* CONFIG_HT_IRQ */ 3411#endif /* CONFIG_HT_IRQ */
3401 3412
@@ -3563,7 +3574,8 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id)
3563 3574
3564 /* Sanity check */ 3575 /* Sanity check */
3565 if (reg_00.bits.ID != apic_id) { 3576 if (reg_00.bits.ID != apic_id) {
3566 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); 3577 pr_err("IOAPIC[%d]: Unable to change apic_id!\n",
3578 ioapic);
3567 return -1; 3579 return -1;
3568 } 3580 }
3569 } 3581 }
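Note: the io_apic.c hunks above also change the callers, which must now check the destination lookup and roll back the vector assignment on failure (as setup_ioapic_irq() and __ioapic_set_affinity() do) rather than programming a bogus destination. A compact userspace sketch of that caller pattern follows; assign_vector(), clear_vector() and mask_to_apicid() are illustrative stand-ins for the kernel helpers.

/* Model of the "look up destination, undo on error" caller pattern. */
#include <errno.h>
#include <stdio.h>

static int assign_vector(int irq) { (void)irq; return 0; }

static void clear_vector(int irq)
{
	printf("irq %d: vector released\n", irq);
}

static int mask_to_apicid(unsigned long mask, unsigned int *dest)
{
	if (!mask)
		return -EINVAL;
	*dest = (unsigned int)__builtin_ctzl(mask);
	return 0;
}

static int setup_irq_route(int irq, unsigned long mask)
{
	unsigned int dest;
	int err;

	err = assign_vector(irq);
	if (err)
		return err;

	err = mask_to_apicid(mask, &dest);
	if (err) {
		clear_vector(irq);	/* roll back, as setup_ioapic_irq() now does */
		return err;
	}

	printf("irq %d routed to apicid %u\n", irq, dest);
	return 0;
}

int main(void)
{
	setup_irq_route(10, 0x2);	/* succeeds */
	setup_irq_route(11, 0x0);	/* fails, vector is released */
	return 0;
}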
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index f00a68cca37a..d661ee95cabf 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -406,16 +406,13 @@ static inline int numaq_check_phys_apicid_present(int phys_apicid)
406 * We use physical apicids here, not logical, so just return the default 406 * We use physical apicids here, not logical, so just return the default
407 * physical broadcast to stop people from breaking us 407 * physical broadcast to stop people from breaking us
408 */ 408 */
409static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask) 409static int
410{
411 return 0x0F;
412}
413
414static inline unsigned int
415numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 410numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
416 const struct cpumask *andmask) 411 const struct cpumask *andmask,
412 unsigned int *apicid)
417{ 413{
418 return 0x0F; 414 *apicid = 0x0F;
415 return 0;
419} 416}
420 417
421/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ 418/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
@@ -441,20 +438,6 @@ static int probe_numaq(void)
441 return found_numaq; 438 return found_numaq;
442} 439}
443 440
444static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask)
445{
446 /* Careful. Some cpus do not strictly honor the set of cpus
447 * specified in the interrupt destination when using lowest
448 * priority interrupt delivery mode.
449 *
450 * In particular there was a hyperthreading cpu observed to
451 * deliver interrupts to the wrong hyperthread when only one
452 * hyperthread was specified in the interrupt desitination.
453 */
454 cpumask_clear(retmask);
455 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
456}
457
458static void numaq_setup_portio_remap(void) 441static void numaq_setup_portio_remap(void)
459{ 442{
460 int num_quads = num_online_nodes(); 443 int num_quads = num_online_nodes();
@@ -491,7 +474,7 @@ static struct apic __refdata apic_numaq = {
491 .check_apicid_used = numaq_check_apicid_used, 474 .check_apicid_used = numaq_check_apicid_used,
492 .check_apicid_present = numaq_check_apicid_present, 475 .check_apicid_present = numaq_check_apicid_present,
493 476
494 .vector_allocation_domain = numaq_vector_allocation_domain, 477 .vector_allocation_domain = flat_vector_allocation_domain,
495 .init_apic_ldr = numaq_init_apic_ldr, 478 .init_apic_ldr = numaq_init_apic_ldr,
496 479
497 .ioapic_phys_id_map = numaq_ioapic_phys_id_map, 480 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
@@ -509,7 +492,6 @@ static struct apic __refdata apic_numaq = {
509 .set_apic_id = NULL, 492 .set_apic_id = NULL,
510 .apic_id_mask = 0x0F << 24, 493 .apic_id_mask = 0x0F << 24,
511 494
512 .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
513 .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and, 495 .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
514 496
515 .send_IPI_mask = numaq_send_IPI_mask, 497 .send_IPI_mask = numaq_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 1b291da09e60..eb35ef9ee63f 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -66,21 +66,6 @@ static void setup_apic_flat_routing(void)
66#endif 66#endif
67} 67}
68 68
69static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
70{
71 /*
72 * Careful. Some cpus do not strictly honor the set of cpus
73 * specified in the interrupt destination when using lowest
74 * priority interrupt delivery mode.
75 *
76 * In particular there was a hyperthreading cpu observed to
77 * deliver interrupts to the wrong hyperthread when only one
78 * hyperthread was specified in the interrupt desitination.
79 */
80 cpumask_clear(retmask);
81 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
82}
83
84/* should be called last. */ 69/* should be called last. */
85static int probe_default(void) 70static int probe_default(void)
86{ 71{
@@ -105,7 +90,7 @@ static struct apic apic_default = {
105 .check_apicid_used = default_check_apicid_used, 90 .check_apicid_used = default_check_apicid_used,
106 .check_apicid_present = default_check_apicid_present, 91 .check_apicid_present = default_check_apicid_present,
107 92
108 .vector_allocation_domain = default_vector_allocation_domain, 93 .vector_allocation_domain = flat_vector_allocation_domain,
109 .init_apic_ldr = default_init_apic_ldr, 94 .init_apic_ldr = default_init_apic_ldr,
110 95
111 .ioapic_phys_id_map = default_ioapic_phys_id_map, 96 .ioapic_phys_id_map = default_ioapic_phys_id_map,
@@ -123,8 +108,7 @@ static struct apic apic_default = {
123 .set_apic_id = NULL, 108 .set_apic_id = NULL,
124 .apic_id_mask = 0x0F << 24, 109 .apic_id_mask = 0x0F << 24,
125 110
126 .cpu_mask_to_apicid = default_cpu_mask_to_apicid, 111 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
127 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
128 112
129 .send_IPI_mask = default_send_IPI_mask_logical, 113 .send_IPI_mask = default_send_IPI_mask_logical,
130 .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, 114 .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
@@ -208,6 +192,9 @@ void __init default_setup_apic_routing(void)
208 192
209 if (apic->setup_apic_routing) 193 if (apic->setup_apic_routing)
210 apic->setup_apic_routing(); 194 apic->setup_apic_routing();
195
196 if (x86_platform.apic_post_init)
197 x86_platform.apic_post_init();
211} 198}
212 199
213void __init generic_apic_probe(void) 200void __init generic_apic_probe(void)
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 3fe986698929..1793dba7a741 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,11 +23,6 @@
23#include <asm/ipi.h> 23#include <asm/ipi.h>
24#include <asm/setup.h> 24#include <asm/setup.h>
25 25
26static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
27{
28 return hard_smp_processor_id() >> index_msb;
29}
30
31/* 26/*
32 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 27 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
33 */ 28 */
@@ -48,10 +43,8 @@ void __init default_setup_apic_routing(void)
48 } 43 }
49 } 44 }
50 45
51 if (is_vsmp_box()) { 46 if (x86_platform.apic_post_init)
52 /* need to update phys_pkg_id */ 47 x86_platform.apic_post_init();
53 apic->phys_pkg_id = apicid_phys_pkg_id;
54 }
55} 48}
56 49
57/* Same for both flat and physical. */ 50/* Same for both flat and physical. */
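Note: both probe_32.c and probe_64.c above replace ad-hoc fixups with an optional x86_platform.apic_post_init callback that is invoked only when a platform has filled it in. The sketch below models that optional-hook pattern in userspace; everything except the apic_post_init member name is an illustrative assumption, and how a real platform registers the hook is not shown in this patch.

/* Model of the optional post-init hook called by default_setup_apic_routing(). */
#include <stdio.h>

struct platform_ops {
	void (*apic_post_init)(void);	/* optional, may stay NULL */
};

static struct platform_ops x86_platform;

static void example_post_init(void)
{
	printf("platform hook: patching the chosen apic driver\n");
}

static void setup_apic_routing(void)
{
	/* ... choose an apic driver ... */
	if (x86_platform.apic_post_init)
		x86_platform.apic_post_init();
}

int main(void)
{
	setup_apic_routing();				/* no hook registered: nothing extra runs */
	x86_platform.apic_post_init = example_post_init;
	setup_apic_routing();				/* hook registered by the platform */
	return 0;
}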
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 659897c00755..77c95c0e1bf7 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -26,6 +26,8 @@
26 * 26 *
27 */ 27 */
28 28
29#define pr_fmt(fmt) "summit: %s: " fmt, __func__
30
29#include <linux/mm.h> 31#include <linux/mm.h>
30#include <linux/init.h> 32#include <linux/init.h>
31#include <asm/io.h> 33#include <asm/io.h>
@@ -235,8 +237,8 @@ static int summit_apic_id_registered(void)
235 237
236static void summit_setup_apic_routing(void) 238static void summit_setup_apic_routing(void)
237{ 239{
238 printk("Enabling APIC mode: Summit. Using %d I/O APICs\n", 240 pr_info("Enabling APIC mode: Summit. Using %d I/O APICs\n",
239 nr_ioapics); 241 nr_ioapics);
240} 242}
241 243
242static int summit_cpu_present_to_apicid(int mps_cpu) 244static int summit_cpu_present_to_apicid(int mps_cpu)
@@ -263,43 +265,48 @@ static int summit_check_phys_apicid_present(int physical_apicid)
263 return 1; 265 return 1;
264} 266}
265 267
266static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) 268static inline int
269summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
267{ 270{
268 unsigned int round = 0; 271 unsigned int round = 0;
269 int cpu, apicid = 0; 272 unsigned int cpu, apicid = 0;
270 273
271 /* 274 /*
272 * The cpus in the mask must all be on the apic cluster. 275 * The cpus in the mask must all be on the apic cluster.
273 */ 276 */
274 for_each_cpu(cpu, cpumask) { 277 for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
275 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); 278 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
276 279
277 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 280 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
278 printk("%s: Not a valid mask!\n", __func__); 281 pr_err("Not a valid mask!\n");
279 return BAD_APICID; 282 return -EINVAL;
280 } 283 }
281 apicid |= new_apicid; 284 apicid |= new_apicid;
282 round++; 285 round++;
283 } 286 }
284 return apicid; 287 if (!round)
288 return -EINVAL;
289 *dest_id = apicid;
290 return 0;
285} 291}
286 292
287static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, 293static int
288 const struct cpumask *andmask) 294summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
295 const struct cpumask *andmask,
296 unsigned int *apicid)
289{ 297{
290 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
291 cpumask_var_t cpumask; 298 cpumask_var_t cpumask;
299 *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
292 300
293 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 301 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
294 return apicid; 302 return 0;
295 303
296 cpumask_and(cpumask, inmask, andmask); 304 cpumask_and(cpumask, inmask, andmask);
297 cpumask_and(cpumask, cpumask, cpu_online_mask); 305 summit_cpu_mask_to_apicid(cpumask, apicid);
298 apicid = summit_cpu_mask_to_apicid(cpumask);
299 306
300 free_cpumask_var(cpumask); 307 free_cpumask_var(cpumask);
301 308
302 return apicid; 309 return 0;
303} 310}
304 311
305/* 312/*
@@ -320,20 +327,6 @@ static int probe_summit(void)
320 return 0; 327 return 0;
321} 328}
322 329
323static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask)
324{
325 /* Careful. Some cpus do not strictly honor the set of cpus
326 * specified in the interrupt destination when using lowest
327 * priority interrupt delivery mode.
328 *
329 * In particular there was a hyperthreading cpu observed to
330 * deliver interrupts to the wrong hyperthread when only one
331 * hyperthread was specified in the interrupt desitination.
332 */
333 cpumask_clear(retmask);
334 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
335}
336
337#ifdef CONFIG_X86_SUMMIT_NUMA 330#ifdef CONFIG_X86_SUMMIT_NUMA
338static struct rio_table_hdr *rio_table_hdr; 331static struct rio_table_hdr *rio_table_hdr;
339static struct scal_detail *scal_devs[MAX_NUMNODES]; 332static struct scal_detail *scal_devs[MAX_NUMNODES];
@@ -355,7 +348,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
355 } 348 }
356 } 349 }
357 if (i == rio_table_hdr->num_rio_dev) { 350 if (i == rio_table_hdr->num_rio_dev) {
358 printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__); 351 pr_err("Couldn't find owner Cyclone for Winnipeg!\n");
359 return last_bus; 352 return last_bus;
360 } 353 }
361 354
@@ -366,7 +359,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
366 } 359 }
367 } 360 }
368 if (i == rio_table_hdr->num_scal_dev) { 361 if (i == rio_table_hdr->num_scal_dev) {
369 printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__); 362 pr_err("Couldn't find owner Twister for Cyclone!\n");
370 return last_bus; 363 return last_bus;
371 } 364 }
372 365
@@ -396,7 +389,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
396 num_buses = 9; 389 num_buses = 9;
397 break; 390 break;
398 default: 391 default:
399 printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__); 392 pr_info("Unsupported Winnipeg type!\n");
400 return last_bus; 393 return last_bus;
401 } 394 }
402 395
@@ -411,13 +404,15 @@ static int build_detail_arrays(void)
411 int i, scal_detail_size, rio_detail_size; 404 int i, scal_detail_size, rio_detail_size;
412 405
413 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { 406 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
414 printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); 407 pr_warn("MAX_NUMNODES too low! Defined as %d, but system has %d nodes\n",
408 MAX_NUMNODES, rio_table_hdr->num_scal_dev);
415 return 0; 409 return 0;
416 } 410 }
417 411
418 switch (rio_table_hdr->version) { 412 switch (rio_table_hdr->version) {
419 default: 413 default:
420 printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version); 414 pr_warn("Invalid Rio Grande Table Version: %d\n",
415 rio_table_hdr->version);
421 return 0; 416 return 0;
422 case 2: 417 case 2:
423 scal_detail_size = 11; 418 scal_detail_size = 11;
@@ -462,7 +457,7 @@ void setup_summit(void)
462 offset = *((unsigned short *)(ptr + offset)); 457 offset = *((unsigned short *)(ptr + offset));
463 } 458 }
464 if (!rio_table_hdr) { 459 if (!rio_table_hdr) {
465 printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__); 460 pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n");
466 return; 461 return;
467 } 462 }
468 463
@@ -509,7 +504,7 @@ static struct apic apic_summit = {
509 .check_apicid_used = summit_check_apicid_used, 504 .check_apicid_used = summit_check_apicid_used,
510 .check_apicid_present = summit_check_apicid_present, 505 .check_apicid_present = summit_check_apicid_present,
511 506
512 .vector_allocation_domain = summit_vector_allocation_domain, 507 .vector_allocation_domain = flat_vector_allocation_domain,
513 .init_apic_ldr = summit_init_apic_ldr, 508 .init_apic_ldr = summit_init_apic_ldr,
514 509
515 .ioapic_phys_id_map = summit_ioapic_phys_id_map, 510 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
@@ -527,7 +522,6 @@ static struct apic apic_summit = {
527 .set_apic_id = NULL, 522 .set_apic_id = NULL,
528 .apic_id_mask = 0xFF << 24, 523 .apic_id_mask = 0xFF << 24,
529 524
530 .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
531 .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and, 525 .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
532 526
533 .send_IPI_mask = summit_send_IPI_mask, 527 .send_IPI_mask = summit_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index ff35cff0e1a7..c88baa4ff0e5 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -81,7 +81,7 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
81} 81}
82 82
83static void 83static void
84 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 84x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
85{ 85{
86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); 86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
87} 87}
@@ -96,36 +96,37 @@ static void x2apic_send_IPI_all(int vector)
96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); 96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
97} 97}
98 98
99static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 99static int
100x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
101 const struct cpumask *andmask,
102 unsigned int *apicid)
100{ 103{
101 /* 104 u32 dest = 0;
102 * We're using fixed IRQ delivery, can only return one logical APIC ID. 105 u16 cluster;
103 * May as well be the first. 106 int i;
104 */
105 int cpu = cpumask_first(cpumask);
106 107
107 if ((unsigned)cpu < nr_cpu_ids) 108 for_each_cpu_and(i, cpumask, andmask) {
108 return per_cpu(x86_cpu_to_logical_apicid, cpu); 109 if (!cpumask_test_cpu(i, cpu_online_mask))
109 else 110 continue;
110 return BAD_APICID; 111 dest = per_cpu(x86_cpu_to_logical_apicid, i);
111} 112 cluster = x2apic_cluster(i);
113 break;
114 }
112 115
113static unsigned int 116 if (!dest)
114x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 117 return -EINVAL;
115 const struct cpumask *andmask)
116{
117 int cpu;
118 118
119 /* 119 for_each_cpu_and(i, cpumask, andmask) {
120 * We're using fixed IRQ delivery, can only return one logical APIC ID. 120 if (!cpumask_test_cpu(i, cpu_online_mask))
121 * May as well be the first. 121 continue;
122 */ 122 if (cluster != x2apic_cluster(i))
123 for_each_cpu_and(cpu, cpumask, andmask) { 123 continue;
124 if (cpumask_test_cpu(cpu, cpu_online_mask)) 124 dest |= per_cpu(x86_cpu_to_logical_apicid, i);
125 break;
126 } 125 }
127 126
128 return per_cpu(x86_cpu_to_logical_apicid, cpu); 127 *apicid = dest;
128
129 return 0;
129} 130}
130 131
131static void init_x2apic_ldr(void) 132static void init_x2apic_ldr(void)
@@ -208,6 +209,32 @@ static int x2apic_cluster_probe(void)
208 return 0; 209 return 0;
209} 210}
210 211
212static const struct cpumask *x2apic_cluster_target_cpus(void)
213{
214 return cpu_all_mask;
215}
216
217/*
218 * Each x2apic cluster is an allocation domain.
219 */
220static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
221 const struct cpumask *mask)
222{
223 /*
224 * To minimize vector pressure, default case of boot, device bringup
225 * etc will use a single cpu for the interrupt destination.
226 *
227 * On explicit migration requests coming from irqbalance etc,
228 * interrupts will be routed to the x2apic cluster (cluster-id
229 * derived from the first cpu in the mask) members specified
230 * in the mask.
231 */
232 if (mask == x2apic_cluster_target_cpus())
233 cpumask_copy(retmask, cpumask_of(cpu));
234 else
235 cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
236}
237
211static struct apic apic_x2apic_cluster = { 238static struct apic apic_x2apic_cluster = {
212 239
213 .name = "cluster x2apic", 240 .name = "cluster x2apic",
@@ -219,13 +246,13 @@ static struct apic apic_x2apic_cluster = {
219 .irq_delivery_mode = dest_LowestPrio, 246 .irq_delivery_mode = dest_LowestPrio,
220 .irq_dest_mode = 1, /* logical */ 247 .irq_dest_mode = 1, /* logical */
221 248
222 .target_cpus = x2apic_target_cpus, 249 .target_cpus = x2apic_cluster_target_cpus,
223 .disable_esr = 0, 250 .disable_esr = 0,
224 .dest_logical = APIC_DEST_LOGICAL, 251 .dest_logical = APIC_DEST_LOGICAL,
225 .check_apicid_used = NULL, 252 .check_apicid_used = NULL,
226 .check_apicid_present = NULL, 253 .check_apicid_present = NULL,
227 254
228 .vector_allocation_domain = x2apic_vector_allocation_domain, 255 .vector_allocation_domain = cluster_vector_allocation_domain,
229 .init_apic_ldr = init_x2apic_ldr, 256 .init_apic_ldr = init_x2apic_ldr,
230 257
231 .ioapic_phys_id_map = NULL, 258 .ioapic_phys_id_map = NULL,
@@ -243,7 +270,6 @@ static struct apic apic_x2apic_cluster = {
243 .set_apic_id = x2apic_set_apic_id, 270 .set_apic_id = x2apic_set_apic_id,
244 .apic_id_mask = 0xFFFFFFFFu, 271 .apic_id_mask = 0xFFFFFFFFu,
245 272
246 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
247 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, 273 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
248 274
249 .send_IPI_mask = x2apic_send_IPI_mask, 275 .send_IPI_mask = x2apic_send_IPI_mask,
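Note: cluster_vector_allocation_domain() above keeps the default (boot/bringup) case on a single CPU and only widens the domain to the requesting CPU's x2apic cluster when an explicit affinity mask is passed in. The userspace model below illustrates that decision; plain bitmasks replace struct cpumask, the value comparison replaces the kernel's pointer comparison against the target-cpus mask, and the four-CPU cluster layout is an assumption for the example only.

/* Model of the per-cluster vector allocation domain choice. */
#include <stdio.h>

#define ALL_CPUS	0xffUL		/* stand-in for the default target mask */

static unsigned long cluster_of(int cpu)
{
	/* pretend clusters are groups of four consecutive CPUs */
	return 0xfUL << (cpu & ~3);
}

static unsigned long vector_domain(int cpu, unsigned long requested)
{
	if (requested == ALL_CPUS)		/* default case: one CPU only */
		return 1UL << cpu;
	return requested & cluster_of(cpu);	/* explicit request: cluster members */
}

int main(void)
{
	printf("default:  %#lx\n", vector_domain(5, ALL_CPUS));	/* 0x20 */
	printf("explicit: %#lx\n", vector_domain(5, 0x3cUL));	/* 0x30 */
	return 0;
}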
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index c17e982db275..e03a1e180e81 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -76,38 +76,6 @@ static void x2apic_send_IPI_all(int vector)
76 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); 76 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
77} 77}
78 78
79static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
80{
81 /*
82 * We're using fixed IRQ delivery, can only return one phys APIC ID.
83 * May as well be the first.
84 */
85 int cpu = cpumask_first(cpumask);
86
87 if ((unsigned)cpu < nr_cpu_ids)
88 return per_cpu(x86_cpu_to_apicid, cpu);
89 else
90 return BAD_APICID;
91}
92
93static unsigned int
94x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
95 const struct cpumask *andmask)
96{
97 int cpu;
98
99 /*
100 * We're using fixed IRQ delivery, can only return one phys APIC ID.
101 * May as well be the first.
102 */
103 for_each_cpu_and(cpu, cpumask, andmask) {
104 if (cpumask_test_cpu(cpu, cpu_online_mask))
105 break;
106 }
107
108 return per_cpu(x86_cpu_to_apicid, cpu);
109}
110
111static void init_x2apic_ldr(void) 79static void init_x2apic_ldr(void)
112{ 80{
113} 81}
@@ -131,13 +99,13 @@ static struct apic apic_x2apic_phys = {
131 .irq_delivery_mode = dest_Fixed, 99 .irq_delivery_mode = dest_Fixed,
132 .irq_dest_mode = 0, /* physical */ 100 .irq_dest_mode = 0, /* physical */
133 101
134 .target_cpus = x2apic_target_cpus, 102 .target_cpus = online_target_cpus,
135 .disable_esr = 0, 103 .disable_esr = 0,
136 .dest_logical = 0, 104 .dest_logical = 0,
137 .check_apicid_used = NULL, 105 .check_apicid_used = NULL,
138 .check_apicid_present = NULL, 106 .check_apicid_present = NULL,
139 107
140 .vector_allocation_domain = x2apic_vector_allocation_domain, 108 .vector_allocation_domain = default_vector_allocation_domain,
141 .init_apic_ldr = init_x2apic_ldr, 109 .init_apic_ldr = init_x2apic_ldr,
142 110
143 .ioapic_phys_id_map = NULL, 111 .ioapic_phys_id_map = NULL,
@@ -155,8 +123,7 @@ static struct apic apic_x2apic_phys = {
155 .set_apic_id = x2apic_set_apic_id, 123 .set_apic_id = x2apic_set_apic_id,
156 .apic_id_mask = 0xFFFFFFFFu, 124 .apic_id_mask = 0xFFFFFFFFu,
157 125
158 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 126 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
159 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
160 127
161 .send_IPI_mask = x2apic_send_IPI_mask, 128 .send_IPI_mask = x2apic_send_IPI_mask,
162 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, 129 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index c6d03f7a4401..8cfade9510a4 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -185,17 +185,6 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
185unsigned long sn_rtc_cycles_per_second; 185unsigned long sn_rtc_cycles_per_second;
186EXPORT_SYMBOL(sn_rtc_cycles_per_second); 186EXPORT_SYMBOL(sn_rtc_cycles_per_second);
187 187
188static const struct cpumask *uv_target_cpus(void)
189{
190 return cpu_online_mask;
191}
192
193static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
194{
195 cpumask_clear(retmask);
196 cpumask_set_cpu(cpu, retmask);
197}
198
199static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) 188static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
200{ 189{
201#ifdef CONFIG_SMP 190#ifdef CONFIG_SMP
@@ -280,25 +269,12 @@ static void uv_init_apic_ldr(void)
280{ 269{
281} 270}
282 271
283static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) 272static int
284{
285 /*
286 * We're using fixed IRQ delivery, can only return one phys APIC ID.
287 * May as well be the first.
288 */
289 int cpu = cpumask_first(cpumask);
290
291 if ((unsigned)cpu < nr_cpu_ids)
292 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
293 else
294 return BAD_APICID;
295}
296
297static unsigned int
298uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 273uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
299 const struct cpumask *andmask) 274 const struct cpumask *andmask,
275 unsigned int *apicid)
300{ 276{
301 int cpu; 277 int unsigned cpu;
302 278
303 /* 279 /*
304 * We're using fixed IRQ delivery, can only return one phys APIC ID. 280 * We're using fixed IRQ delivery, can only return one phys APIC ID.
@@ -308,7 +284,13 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
308 if (cpumask_test_cpu(cpu, cpu_online_mask)) 284 if (cpumask_test_cpu(cpu, cpu_online_mask))
309 break; 285 break;
310 } 286 }
311 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; 287
288 if (likely(cpu < nr_cpu_ids)) {
289 *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
290 return 0;
291 }
292
293 return -EINVAL;
312} 294}
313 295
314static unsigned int x2apic_get_apic_id(unsigned long x) 296static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -362,13 +344,13 @@ static struct apic __refdata apic_x2apic_uv_x = {
362 .irq_delivery_mode = dest_Fixed, 344 .irq_delivery_mode = dest_Fixed,
363 .irq_dest_mode = 0, /* physical */ 345 .irq_dest_mode = 0, /* physical */
364 346
365 .target_cpus = uv_target_cpus, 347 .target_cpus = online_target_cpus,
366 .disable_esr = 0, 348 .disable_esr = 0,
367 .dest_logical = APIC_DEST_LOGICAL, 349 .dest_logical = APIC_DEST_LOGICAL,
368 .check_apicid_used = NULL, 350 .check_apicid_used = NULL,
369 .check_apicid_present = NULL, 351 .check_apicid_present = NULL,
370 352
371 .vector_allocation_domain = uv_vector_allocation_domain, 353 .vector_allocation_domain = default_vector_allocation_domain,
372 .init_apic_ldr = uv_init_apic_ldr, 354 .init_apic_ldr = uv_init_apic_ldr,
373 355
374 .ioapic_phys_id_map = NULL, 356 .ioapic_phys_id_map = NULL,
@@ -386,7 +368,6 @@ static struct apic __refdata apic_x2apic_uv_x = {
386 .set_apic_id = set_apic_id, 368 .set_apic_id = set_apic_id,
387 .apic_id_mask = 0xFFFFFFFFu, 369 .apic_id_mask = 0xFFFFFFFFu,
388 370
389 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
390 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, 371 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
391 372
392 .send_IPI_mask = uv_send_IPI_mask, 373 .send_IPI_mask = uv_send_IPI_mask,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 07b0c0db466c..d65464e43503 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -201,6 +201,8 @@
201 * http://www.microsoft.com/whdc/archive/amp_12.mspx] 201 * http://www.microsoft.com/whdc/archive/amp_12.mspx]
202 */ 202 */
203 203
204#define pr_fmt(fmt) "apm: " fmt
205
204#include <linux/module.h> 206#include <linux/module.h>
205 207
206#include <linux/poll.h> 208#include <linux/poll.h>
@@ -485,11 +487,11 @@ static void apm_error(char *str, int err)
485 if (error_table[i].key == err) 487 if (error_table[i].key == err)
486 break; 488 break;
487 if (i < ERROR_COUNT) 489 if (i < ERROR_COUNT)
488 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); 490 pr_notice("%s: %s\n", str, error_table[i].msg);
489 else if (err < 0) 491 else if (err < 0)
490 printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err); 492 pr_notice("%s: linux error code %i\n", str, err);
491 else 493 else
492 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", 494 pr_notice("%s: unknown error code %#2.2x\n",
493 str, err); 495 str, err);
494} 496}
495 497
@@ -1184,7 +1186,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
1184 static int notified; 1186 static int notified;
1185 1187
1186 if (notified++ == 0) 1188 if (notified++ == 0)
1187 printk(KERN_ERR "apm: an event queue overflowed\n"); 1189 pr_err("an event queue overflowed\n");
1188 if (++as->event_tail >= APM_MAX_EVENTS) 1190 if (++as->event_tail >= APM_MAX_EVENTS)
1189 as->event_tail = 0; 1191 as->event_tail = 0;
1190 } 1192 }
@@ -1447,7 +1449,7 @@ static void apm_mainloop(void)
1447static int check_apm_user(struct apm_user *as, const char *func) 1449static int check_apm_user(struct apm_user *as, const char *func)
1448{ 1450{
1449 if (as == NULL || as->magic != APM_BIOS_MAGIC) { 1451 if (as == NULL || as->magic != APM_BIOS_MAGIC) {
1450 printk(KERN_ERR "apm: %s passed bad filp\n", func); 1452 pr_err("%s passed bad filp\n", func);
1451 return 1; 1453 return 1;
1452 } 1454 }
1453 return 0; 1455 return 0;
@@ -1586,7 +1588,7 @@ static int do_release(struct inode *inode, struct file *filp)
1586 as1 = as1->next) 1588 as1 = as1->next)
1587 ; 1589 ;
1588 if (as1 == NULL) 1590 if (as1 == NULL)
1589 printk(KERN_ERR "apm: filp not in user list\n"); 1591 pr_err("filp not in user list\n");
1590 else 1592 else
1591 as1->next = as->next; 1593 as1->next = as->next;
1592 } 1594 }
@@ -1600,11 +1602,9 @@ static int do_open(struct inode *inode, struct file *filp)
1600 struct apm_user *as; 1602 struct apm_user *as;
1601 1603
1602 as = kmalloc(sizeof(*as), GFP_KERNEL); 1604 as = kmalloc(sizeof(*as), GFP_KERNEL);
1603 if (as == NULL) { 1605 if (as == NULL)
1604 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1605 sizeof(*as));
1606 return -ENOMEM; 1606 return -ENOMEM;
1607 } 1607
1608 as->magic = APM_BIOS_MAGIC; 1608 as->magic = APM_BIOS_MAGIC;
1609 as->event_tail = as->event_head = 0; 1609 as->event_tail = as->event_head = 0;
1610 as->suspends_pending = as->standbys_pending = 0; 1610 as->suspends_pending = as->standbys_pending = 0;
@@ -2313,16 +2313,16 @@ static int __init apm_init(void)
2313 } 2313 }
2314 2314
2315 if (apm_info.disabled) { 2315 if (apm_info.disabled) {
2316 printk(KERN_NOTICE "apm: disabled on user request.\n"); 2316 pr_notice("disabled on user request.\n");
2317 return -ENODEV; 2317 return -ENODEV;
2318 } 2318 }
2319 if ((num_online_cpus() > 1) && !power_off && !smp) { 2319 if ((num_online_cpus() > 1) && !power_off && !smp) {
2320 printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); 2320 pr_notice("disabled - APM is not SMP safe.\n");
2321 apm_info.disabled = 1; 2321 apm_info.disabled = 1;
2322 return -ENODEV; 2322 return -ENODEV;
2323 } 2323 }
2324 if (!acpi_disabled) { 2324 if (!acpi_disabled) {
2325 printk(KERN_NOTICE "apm: overridden by ACPI.\n"); 2325 pr_notice("overridden by ACPI.\n");
2326 apm_info.disabled = 1; 2326 apm_info.disabled = 1;
2327 return -ENODEV; 2327 return -ENODEV;
2328 } 2328 }
@@ -2356,8 +2356,7 @@ static int __init apm_init(void)
2356 2356
2357 kapmd_task = kthread_create(apm, NULL, "kapmd"); 2357 kapmd_task = kthread_create(apm, NULL, "kapmd");
2358 if (IS_ERR(kapmd_task)) { 2358 if (IS_ERR(kapmd_task)) {
2359 printk(KERN_ERR "apm: disabled - Unable to start kernel " 2359 pr_err("disabled - Unable to start kernel thread\n");
2360 "thread.\n");
2361 err = PTR_ERR(kapmd_task); 2360 err = PTR_ERR(kapmd_task);
2362 kapmd_task = NULL; 2361 kapmd_task = NULL;
2363 remove_proc_entry("apm", NULL); 2362 remove_proc_entry("apm", NULL);
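
The apm_32.c conversion above works because pr_fmt() is defined before the first include, so every pr_notice()/pr_err() call picks up the "apm: " prefix automatically. A minimal sketch of the mechanism, assuming the usual printk.h behaviour; the expansion shown in the comment is approximate.

#define pr_fmt(fmt) "apm: " fmt

#include <linux/printk.h>

static void apm_log_example(void)
{
        /* pr_notice(fmt, ...) expands to roughly
         * printk(KERN_NOTICE pr_fmt(fmt), ...), so this prints
         * "apm: overridden by ACPI." at notice level. */
        pr_notice("overridden by ACPI.\n");
}
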
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 6ab6aa2fdfdd..d30a6a9a0121 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
14 14
15obj-y := intel_cacheinfo.o scattered.o topology.o 15obj-y := intel_cacheinfo.o scattered.o topology.o
16obj-y += proc.o capflags.o powerflags.o common.o 16obj-y += proc.o capflags.o powerflags.o common.o
17obj-y += vmware.o hypervisor.o sched.o mshyperv.o 17obj-y += vmware.o hypervisor.o mshyperv.o
18obj-y += rdrand.o 18obj-y += rdrand.o
19obj-y += match.o 19obj-y += match.o
20 20
@@ -32,7 +32,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
32 32
33ifdef CONFIG_PERF_EVENTS 33ifdef CONFIG_PERF_EVENTS
34obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o 34obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o
35obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o 35obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o
36obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
37obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o
36endif 38endif
37 39
38obj-$(CONFIG_X86_MCE) += mcheck/ 40obj-$(CONFIG_X86_MCE) += mcheck/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 146bb6218eec..9d92e19039f0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -19,6 +19,39 @@
19 19
20#include "cpu.h" 20#include "cpu.h"
21 21
22static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
23{
24 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
25 u32 gprs[8] = { 0 };
26 int err;
27
28 WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__);
29
30 gprs[1] = msr;
31 gprs[7] = 0x9c5a203a;
32
33 err = rdmsr_safe_regs(gprs);
34
35 *p = gprs[0] | ((u64)gprs[2] << 32);
36
37 return err;
38}
39
40static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
41{
42 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
43 u32 gprs[8] = { 0 };
44
45 WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__);
46
47 gprs[0] = (u32)val;
48 gprs[1] = msr;
49 gprs[2] = val >> 32;
50 gprs[7] = 0x9c5a203a;
51
52 return wrmsr_safe_regs(gprs);
53}
54
22#ifdef CONFIG_X86_32 55#ifdef CONFIG_X86_32
23/* 56/*
24 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 57 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
@@ -586,9 +619,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
586 !cpu_has(c, X86_FEATURE_TOPOEXT)) { 619 !cpu_has(c, X86_FEATURE_TOPOEXT)) {
587 u64 val; 620 u64 val;
588 621
589 if (!rdmsrl_amd_safe(0xc0011005, &val)) { 622 if (!rdmsrl_safe(0xc0011005, &val)) {
590 val |= 1ULL << 54; 623 val |= 1ULL << 54;
591 wrmsrl_amd_safe(0xc0011005, val); 624 wrmsrl_safe(0xc0011005, val);
592 rdmsrl(0xc0011005, val); 625 rdmsrl(0xc0011005, val);
593 if (val & (1ULL << 54)) { 626 if (val & (1ULL << 54)) {
594 set_cpu_cap(c, X86_FEATURE_TOPOEXT); 627 set_cpu_cap(c, X86_FEATURE_TOPOEXT);
@@ -679,7 +712,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
679 err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); 712 err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
680 if (err == 0) { 713 if (err == 0) {
681 mask |= (1 << 10); 714 mask |= (1 << 10);
682 checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); 715 wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);
683 } 716 }
684 } 717 }
685 718
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 46674fbb62ba..c97bb7b5a9f8 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -55,8 +55,8 @@ static void __init check_fpu(void)
55 55
56 if (!boot_cpu_data.hard_math) { 56 if (!boot_cpu_data.hard_math) {
57#ifndef CONFIG_MATH_EMULATION 57#ifndef CONFIG_MATH_EMULATION
58 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); 58 pr_emerg("No coprocessor found and no math emulation present\n");
59 printk(KERN_EMERG "Giving up.\n"); 59 pr_emerg("Giving up\n");
60 for (;;) ; 60 for (;;) ;
61#endif 61#endif
62 return; 62 return;
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
86 86
87 boot_cpu_data.fdiv_bug = fdiv_bug; 87 boot_cpu_data.fdiv_bug = fdiv_bug;
88 if (boot_cpu_data.fdiv_bug) 88 if (boot_cpu_data.fdiv_bug)
89 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); 89 pr_warn("Hmm, FPU with FDIV bug\n");
90} 90}
91 91
92static void __init check_hlt(void) 92static void __init check_hlt(void)
@@ -94,16 +94,16 @@ static void __init check_hlt(void)
94 if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) 94 if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
95 return; 95 return;
96 96
97 printk(KERN_INFO "Checking 'hlt' instruction... "); 97 pr_info("Checking 'hlt' instruction... ");
98 if (!boot_cpu_data.hlt_works_ok) { 98 if (!boot_cpu_data.hlt_works_ok) {
99 printk("disabled\n"); 99 pr_cont("disabled\n");
100 return; 100 return;
101 } 101 }
102 halt(); 102 halt();
103 halt(); 103 halt();
104 halt(); 104 halt();
105 halt(); 105 halt();
106 printk(KERN_CONT "OK.\n"); 106 pr_cont("OK\n");
107} 107}
108 108
109/* 109/*
@@ -116,7 +116,7 @@ static void __init check_popad(void)
116#ifndef CONFIG_X86_POPAD_OK 116#ifndef CONFIG_X86_POPAD_OK
117 int res, inp = (int) &res; 117 int res, inp = (int) &res;
118 118
119 printk(KERN_INFO "Checking for popad bug... "); 119 pr_info("Checking for popad bug... ");
120 __asm__ __volatile__( 120 __asm__ __volatile__(
121 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " 121 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
122 : "=&a" (res) 122 : "=&a" (res)
@@ -127,9 +127,9 @@ static void __init check_popad(void)
127 * CPU hard. Too bad. 127 * CPU hard. Too bad.
128 */ 128 */
129 if (res != 12345678) 129 if (res != 12345678)
130 printk(KERN_CONT "Buggy.\n"); 130 pr_cont("Buggy\n");
131 else 131 else
132 printk(KERN_CONT "OK.\n"); 132 pr_cont("OK\n");
133#endif 133#endif
134} 134}
135 135
@@ -161,7 +161,7 @@ void __init check_bugs(void)
161{ 161{
162 identify_boot_cpu(); 162 identify_boot_cpu();
163#ifndef CONFIG_SMP 163#ifndef CONFIG_SMP
164 printk(KERN_INFO "CPU: "); 164 pr_info("CPU: ");
165 print_cpu_info(&boot_cpu_data); 165 print_cpu_info(&boot_cpu_data);
166#endif 166#endif
167 check_config(); 167 check_config();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6b9333b429ba..46d8786d655e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -452,6 +452,35 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
452 c->x86_cache_size = l2size; 452 c->x86_cache_size = l2size;
453} 453}
454 454
455u16 __read_mostly tlb_lli_4k[NR_INFO];
456u16 __read_mostly tlb_lli_2m[NR_INFO];
457u16 __read_mostly tlb_lli_4m[NR_INFO];
458u16 __read_mostly tlb_lld_4k[NR_INFO];
459u16 __read_mostly tlb_lld_2m[NR_INFO];
460u16 __read_mostly tlb_lld_4m[NR_INFO];
461
462/*
463 * tlb_flushall_shift shows the balance point in replacing cr3 write
464 * with multiple 'invlpg'. It will do this replacement when
465 * flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
466 * If tlb_flushall_shift is -1, means the replacement will be disabled.
467 */
468s8 __read_mostly tlb_flushall_shift = -1;
469
470void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)
471{
472 if (this_cpu->c_detect_tlb)
473 this_cpu->c_detect_tlb(c);
474
475 printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
476 "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
477 "tlb_flushall_shift is 0x%x\n",
478 tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
479 tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
480 tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
481 tlb_flushall_shift);
482}
483
455void __cpuinit detect_ht(struct cpuinfo_x86 *c) 484void __cpuinit detect_ht(struct cpuinfo_x86 *c)
456{ 485{
457#ifdef CONFIG_X86_HT 486#ifdef CONFIG_X86_HT
@@ -911,6 +940,8 @@ void __init identify_boot_cpu(void)
911#else 940#else
912 vgetcpu_set_mode(); 941 vgetcpu_set_mode();
913#endif 942#endif
943 if (boot_cpu_data.cpuid_level >= 2)
944 cpu_detect_tlb(&boot_cpu_data);
914} 945}
915 946
916void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 947void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -947,7 +978,7 @@ static void __cpuinit __print_cpu_msr(void)
947 index_max = msr_range_array[i].max; 978 index_max = msr_range_array[i].max;
948 979
949 for (index = index_min; index < index_max; index++) { 980 for (index = index_min; index < index_max; index++) {
950 if (rdmsrl_amd_safe(index, &val)) 981 if (rdmsrl_safe(index, &val))
951 continue; 982 continue;
952 printk(KERN_INFO " MSR%08x: %016llx\n", index, val); 983 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
953 } 984 }
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 8bacc7826fb3..4041c24ae7db 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -20,10 +20,19 @@ struct cpu_dev {
20 void (*c_bsp_init)(struct cpuinfo_x86 *); 20 void (*c_bsp_init)(struct cpuinfo_x86 *);
21 void (*c_init)(struct cpuinfo_x86 *); 21 void (*c_init)(struct cpuinfo_x86 *);
22 void (*c_identify)(struct cpuinfo_x86 *); 22 void (*c_identify)(struct cpuinfo_x86 *);
23 void (*c_detect_tlb)(struct cpuinfo_x86 *);
23 unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); 24 unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
24 int c_x86_vendor; 25 int c_x86_vendor;
25}; 26};
26 27
28struct _tlb_table {
29 unsigned char descriptor;
30 char tlb_type;
31 unsigned int entries;
32 /* unsigned int ways; */
33 char info[128];
34};
35
27#define cpu_dev_register(cpu_devX) \ 36#define cpu_dev_register(cpu_devX) \
28 static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ 37 static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \ 38 __attribute__((__section__(".x86_cpu_dev.init"))) = \
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 755f64fb0743..a8f8fa9769d6 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -37,6 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
37#endif 37#endif
38 &x86_hyper_vmware, 38 &x86_hyper_vmware,
39 &x86_hyper_ms_hyperv, 39 &x86_hyper_ms_hyperv,
40#ifdef CONFIG_KVM_GUEST
41 &x86_hyper_kvm,
42#endif
40}; 43};
41 44
42const struct hypervisor_x86 *x86_hyper; 45const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 3e6ff6cbf42a..0a4ce2980a5a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -491,6 +491,181 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
491} 491}
492#endif 492#endif
493 493
494#define TLB_INST_4K 0x01
495#define TLB_INST_4M 0x02
496#define TLB_INST_2M_4M 0x03
497
498#define TLB_INST_ALL 0x05
499#define TLB_INST_1G 0x06
500
501#define TLB_DATA_4K 0x11
502#define TLB_DATA_4M 0x12
503#define TLB_DATA_2M_4M 0x13
504#define TLB_DATA_4K_4M 0x14
505
506#define TLB_DATA_1G 0x16
507
508#define TLB_DATA0_4K 0x21
509#define TLB_DATA0_4M 0x22
510#define TLB_DATA0_2M_4M 0x23
511
512#define STLB_4K 0x41
513
514static const struct _tlb_table intel_tlb_table[] __cpuinitconst = {
515 { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
516 { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
517 { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
518 { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
519 { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
520 { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
521 { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
522 { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
523 { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
524 { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
525 { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
526 { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
527 { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
528 { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
529 { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
530 { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
531 { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
532 { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
533 { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
534 { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
535 { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
536 { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
537 { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
538 { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
539 { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
540 { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
541 { 0x00, 0, 0 }
542};
543
544static void __cpuinit intel_tlb_lookup(const unsigned char desc)
545{
546 unsigned char k;
547 if (desc == 0)
548 return;
549
550 /* look up this descriptor in the table */
551 for (k = 0; intel_tlb_table[k].descriptor != desc && \
552 intel_tlb_table[k].descriptor != 0; k++)
553 ;
554
555 if (intel_tlb_table[k].tlb_type == 0)
556 return;
557
558 switch (intel_tlb_table[k].tlb_type) {
559 case STLB_4K:
560 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
561 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
562 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
563 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
564 break;
565 case TLB_INST_ALL:
566 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
567 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
568 if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
569 tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
570 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
571 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
572 break;
573 case TLB_INST_4K:
574 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
575 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
576 break;
577 case TLB_INST_4M:
578 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
579 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
580 break;
581 case TLB_INST_2M_4M:
582 if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
583 tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
584 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
585 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
586 break;
587 case TLB_DATA_4K:
588 case TLB_DATA0_4K:
589 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
590 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
591 break;
592 case TLB_DATA_4M:
593 case TLB_DATA0_4M:
594 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
595 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
596 break;
597 case TLB_DATA_2M_4M:
598 case TLB_DATA0_2M_4M:
599 if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
600 tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
601 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
602 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
603 break;
604 case TLB_DATA_4K_4M:
605 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
606 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
607 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
608 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
609 break;
610 }
611}
612
613static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
614{
615 if (!cpu_has_invlpg) {
616 tlb_flushall_shift = -1;
617 return;
618 }
619 switch ((c->x86 << 8) + c->x86_model) {
620 case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
621 case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
622 case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
623 case 0x61d: /* six-core 45 nm xeon "Dunnington" */
624 tlb_flushall_shift = -1;
625 break;
626 case 0x61a: /* 45 nm nehalem, "Bloomfield" */
627 case 0x61e: /* 45 nm nehalem, "Lynnfield" */
628 case 0x625: /* 32 nm nehalem, "Clarkdale" */
629 case 0x62c: /* 32 nm nehalem, "Gulftown" */
630 case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
631 case 0x62f: /* 32 nm Xeon E7 */
632 tlb_flushall_shift = 6;
633 break;
634 case 0x62a: /* SandyBridge */
635 case 0x62d: /* SandyBridge, "Romely-EP" */
636 tlb_flushall_shift = 5;
637 break;
638 case 0x63a: /* Ivybridge */
639 tlb_flushall_shift = 1;
640 break;
641 default:
642 tlb_flushall_shift = 6;
643 }
644}
645
646static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c)
647{
648 int i, j, n;
649 unsigned int regs[4];
650 unsigned char *desc = (unsigned char *)regs;
651 /* Number of times to iterate */
652 n = cpuid_eax(2) & 0xFF;
653
654 for (i = 0 ; i < n ; i++) {
655 cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
656
657 /* If bit 31 is set, this is an unknown format */
658 for (j = 0 ; j < 3 ; j++)
659 if (regs[j] & (1 << 31))
660 regs[j] = 0;
661
662 /* Byte 0 is level count, not a descriptor */
663 for (j = 1 ; j < 16 ; j++)
664 intel_tlb_lookup(desc[j]);
665 }
666 intel_tlb_flushall_shift_set(c);
667}
668
494static const struct cpu_dev __cpuinitconst intel_cpu_dev = { 669static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
495 .c_vendor = "Intel", 670 .c_vendor = "Intel",
496 .c_ident = { "GenuineIntel" }, 671 .c_ident = { "GenuineIntel" },
@@ -546,6 +721,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
546 }, 721 },
547 .c_size_cache = intel_size_cache, 722 .c_size_cache = intel_size_cache,
548#endif 723#endif
724 .c_detect_tlb = intel_detect_tlb,
549 .c_early_init = early_init_intel, 725 .c_early_init = early_init_intel,
550 .c_init = init_intel, 726 .c_init = init_intel,
551 .c_x86_vendor = X86_VENDOR_INTEL, 727 .c_x86_vendor = X86_VENDOR_INTEL,
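
For reference, a worked example of the switch key used by intel_tlb_flushall_shift_set() above; the numbers come straight from the case labels in the hunk, the wrapper function is illustrative.

/* (family << 8) + model: family 6, model 0x2a is the SandyBridge entry. */
static int tlb_shift_key_example(void)
{
        unsigned int x86 = 6, x86_model = 0x2a;

        /* (6 << 8) + 0x2a == 0x62a hits "case 0x62a:" and selects
         * tlb_flushall_shift = 5, i.e. invlpg is preferred while
         * flush_tlb_lines <= active_lines / 32. */
        return (x86 << 8) + x86_model;
}
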
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index da27c5d2168a..5e095f873e3e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -7,6 +7,9 @@
7 * Copyright 2008 Intel Corporation 7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen 8 * Author: Andi Kleen
9 */ 9 */
10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
10#include <linux/thread_info.h> 13#include <linux/thread_info.h>
11#include <linux/capability.h> 14#include <linux/capability.h>
12#include <linux/miscdevice.h> 15#include <linux/miscdevice.h>
@@ -57,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
57 60
58int mce_disabled __read_mostly; 61int mce_disabled __read_mostly;
59 62
60#define MISC_MCELOG_MINOR 227
61
62#define SPINUNIT 100 /* 100ns */ 63#define SPINUNIT 100 /* 100ns */
63 64
64atomic_t mce_entry; 65atomic_t mce_entry;
@@ -210,7 +211,7 @@ static void drain_mcelog_buffer(void)
210 cpu_relax(); 211 cpu_relax();
211 212
212 if (!m->finished && retries >= 4) { 213 if (!m->finished && retries >= 4) {
213 pr_err("MCE: skipping error being logged currently!\n"); 214 pr_err("skipping error being logged currently!\n");
214 break; 215 break;
215 } 216 }
216 } 217 }
@@ -1167,8 +1168,9 @@ int memory_failure(unsigned long pfn, int vector, int flags)
1167{ 1168{
1168 /* mce_severity() should not hand us an ACTION_REQUIRED error */ 1169 /* mce_severity() should not hand us an ACTION_REQUIRED error */
1169 BUG_ON(flags & MF_ACTION_REQUIRED); 1170 BUG_ON(flags & MF_ACTION_REQUIRED);
1170 printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" 1171 pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1171 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); 1172 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1173 pfn);
1172 1174
1173 return 0; 1175 return 0;
1174} 1176}
@@ -1186,6 +1188,7 @@ void mce_notify_process(void)
1186{ 1188{
1187 unsigned long pfn; 1189 unsigned long pfn;
1188 struct mce_info *mi = mce_find_info(); 1190 struct mce_info *mi = mce_find_info();
1191 int flags = MF_ACTION_REQUIRED;
1189 1192
1190 if (!mi) 1193 if (!mi)
1191 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); 1194 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
@@ -1200,8 +1203,9 @@ void mce_notify_process(void)
1200 * doomed. We still need to mark the page as poisoned and alert any 1203 * doomed. We still need to mark the page as poisoned and alert any
1201 * other users of the page. 1204 * other users of the page.
1202 */ 1205 */
1203 if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || 1206 if (!mi->restartable)
1204 mi->restartable == 0) { 1207 flags |= MF_MUST_KILL;
1208 if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
1205 pr_err("Memory error not recovered"); 1209 pr_err("Memory error not recovered");
1206 force_sig(SIGBUS, current); 1210 force_sig(SIGBUS, current);
1207 } 1211 }
@@ -1358,11 +1362,10 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
1358 1362
1359 b = cap & MCG_BANKCNT_MASK; 1363 b = cap & MCG_BANKCNT_MASK;
1360 if (!banks) 1364 if (!banks)
1361 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1365 pr_info("CPU supports %d MCE banks\n", b);
1362 1366
1363 if (b > MAX_NR_BANKS) { 1367 if (b > MAX_NR_BANKS) {
1364 printk(KERN_WARNING 1368 pr_warn("Using only %u machine check banks out of %u\n",
1365 "MCE: Using only %u machine check banks out of %u\n",
1366 MAX_NR_BANKS, b); 1369 MAX_NR_BANKS, b);
1367 b = MAX_NR_BANKS; 1370 b = MAX_NR_BANKS;
1368 } 1371 }
@@ -1419,7 +1422,7 @@ static void __mcheck_cpu_init_generic(void)
1419static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1422static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1420{ 1423{
1421 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1424 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1422 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1425 pr_info("unknown CPU type - not enabling MCE support\n");
1423 return -EOPNOTSUPP; 1426 return -EOPNOTSUPP;
1424 } 1427 }
1425 1428
@@ -1574,7 +1577,7 @@ static void __mcheck_cpu_init_timer(void)
1574/* Handle unconfigured int18 (should never happen) */ 1577/* Handle unconfigured int18 (should never happen) */
1575static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1578static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1576{ 1579{
1577 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", 1580 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1578 smp_processor_id()); 1581 smp_processor_id());
1579} 1582}
1580 1583
@@ -1893,8 +1896,7 @@ static int __init mcheck_enable(char *str)
1893 get_option(&str, &monarch_timeout); 1896 get_option(&str, &monarch_timeout);
1894 } 1897 }
1895 } else { 1898 } else {
1896 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 1899 pr_info("mce argument %s ignored. Please use /sys\n", str);
1897 str);
1898 return 0; 1900 return 0;
1899 } 1901 }
1900 return 1; 1902 return 1;
@@ -2342,7 +2344,7 @@ static __init int mcheck_init_device(void)
2342 2344
2343 return err; 2345 return err;
2344} 2346}
2345device_initcall(mcheck_init_device); 2347device_initcall_sync(mcheck_init_device);
2346 2348
2347/* 2349/*
2348 * Old style boot options parsing. Only for compatibility. 2350 * Old style boot options parsing. Only for compatibility.
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f4873a64f46d..c4e916d77378 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,15 +1,17 @@
1/* 1/*
2 * (c) 2005, 2006 Advanced Micro Devices, Inc. 2 * (c) 2005-2012 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the 3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or 4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
6 * 6 *
7 * Written by Jacob Shin - AMD, Inc. 7 * Written by Jacob Shin - AMD, Inc.
8 * 8 *
9 * Support : jacob.shin@amd.com 9 * Support: borislav.petkov@amd.com
10 * 10 *
11 * April 2006 11 * April 2006
12 * - added support for AMD Family 0x10 processors 12 * - added support for AMD Family 0x10 processors
13 * May 2012
14 * - major scrubbing
13 * 15 *
14 * All MC4_MISCi registers are shared between multi-cores 16 * All MC4_MISCi registers are shared between multi-cores
15 */ 17 */
@@ -25,6 +27,7 @@
25#include <linux/cpu.h> 27#include <linux/cpu.h>
26#include <linux/smp.h> 28#include <linux/smp.h>
27 29
30#include <asm/amd_nb.h>
28#include <asm/apic.h> 31#include <asm/apic.h>
29#include <asm/idle.h> 32#include <asm/idle.h>
30#include <asm/mce.h> 33#include <asm/mce.h>
@@ -45,23 +48,15 @@
45#define MASK_BLKPTR_LO 0xFF000000 48#define MASK_BLKPTR_LO 0xFF000000
46#define MCG_XBLK_ADDR 0xC0000400 49#define MCG_XBLK_ADDR 0xC0000400
47 50
48struct threshold_block { 51static const char * const th_names[] = {
49 unsigned int block; 52 "load_store",
50 unsigned int bank; 53 "insn_fetch",
51 unsigned int cpu; 54 "combined_unit",
52 u32 address; 55 "",
53 u16 interrupt_enable; 56 "northbridge",
54 bool interrupt_capable; 57 "execution_unit",
55 u16 threshold_limit;
56 struct kobject kobj;
57 struct list_head miscj;
58}; 58};
59 59
60struct threshold_bank {
61 struct kobject *kobj;
62 struct threshold_block *blocks;
63 cpumask_var_t cpus;
64};
65static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); 60static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
66 61
67static unsigned char shared_bank[NR_BANKS] = { 62static unsigned char shared_bank[NR_BANKS] = {
@@ -84,6 +79,26 @@ struct thresh_restart {
84 u16 old_limit; 79 u16 old_limit;
85}; 80};
86 81
82static const char * const bank4_names(struct threshold_block *b)
83{
84 switch (b->address) {
85 /* MSR4_MISC0 */
86 case 0x00000413:
87 return "dram";
88
89 case 0xc0000408:
90 return "ht_links";
91
92 case 0xc0000409:
93 return "l3_cache";
94
95 default:
96 WARN(1, "Funny MSR: 0x%08x\n", b->address);
97 return "";
98 }
99};
100
101
 87static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) 102static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
88{ 103{
89 /* 104 /*
@@ -224,8 +239,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
224 239
225 if (!block) 240 if (!block)
226 per_cpu(bank_map, cpu) |= (1 << bank); 241 per_cpu(bank_map, cpu) |= (1 << bank);
227 if (shared_bank[bank] && c->cpu_core_id)
228 break;
229 242
230 memset(&b, 0, sizeof(b)); 243 memset(&b, 0, sizeof(b));
231 b.cpu = cpu; 244 b.cpu = cpu;
@@ -326,7 +339,7 @@ struct threshold_attr {
326#define SHOW_FIELDS(name) \ 339#define SHOW_FIELDS(name) \
327static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ 340static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
328{ \ 341{ \
329 return sprintf(buf, "%lx\n", (unsigned long) b->name); \ 342 return sprintf(buf, "%lu\n", (unsigned long) b->name); \
330} 343}
331SHOW_FIELDS(interrupt_enable) 344SHOW_FIELDS(interrupt_enable)
332SHOW_FIELDS(threshold_limit) 345SHOW_FIELDS(threshold_limit)
@@ -377,38 +390,21 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
377 return size; 390 return size;
378} 391}
379 392
380struct threshold_block_cross_cpu {
381 struct threshold_block *tb;
382 long retval;
383};
384
385static void local_error_count_handler(void *_tbcc)
386{
387 struct threshold_block_cross_cpu *tbcc = _tbcc;
388 struct threshold_block *b = tbcc->tb;
389 u32 low, high;
390
391 rdmsr(b->address, low, high);
392 tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
393}
394
395static ssize_t show_error_count(struct threshold_block *b, char *buf) 393static ssize_t show_error_count(struct threshold_block *b, char *buf)
396{ 394{
397 struct threshold_block_cross_cpu tbcc = { .tb = b, }; 395 u32 lo, hi;
398 396
399 smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); 397 rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
400 return sprintf(buf, "%lx\n", tbcc.retval);
401}
402
403static ssize_t store_error_count(struct threshold_block *b,
404 const char *buf, size_t count)
405{
406 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
407 398
408 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 399 return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
409 return 1; 400 (THRESHOLD_MAX - b->threshold_limit)));
410} 401}
411 402
403static struct threshold_attr error_count = {
404 .attr = {.name = __stringify(error_count), .mode = 0444 },
405 .show = show_error_count,
406};
407
412#define RW_ATTR(val) \ 408#define RW_ATTR(val) \
413static struct threshold_attr val = { \ 409static struct threshold_attr val = { \
414 .attr = {.name = __stringify(val), .mode = 0644 }, \ 410 .attr = {.name = __stringify(val), .mode = 0644 }, \
@@ -418,7 +414,6 @@ static struct threshold_attr val = { \
418 414
419RW_ATTR(interrupt_enable); 415RW_ATTR(interrupt_enable);
420RW_ATTR(threshold_limit); 416RW_ATTR(threshold_limit);
421RW_ATTR(error_count);
422 417
423static struct attribute *default_attrs[] = { 418static struct attribute *default_attrs[] = {
424 &threshold_limit.attr, 419 &threshold_limit.attr,
@@ -517,7 +512,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
517 512
518 err = kobject_init_and_add(&b->kobj, &threshold_ktype, 513 err = kobject_init_and_add(&b->kobj, &threshold_ktype,
519 per_cpu(threshold_banks, cpu)[bank]->kobj, 514 per_cpu(threshold_banks, cpu)[bank]->kobj,
520 "misc%i", block); 515 (bank == 4 ? bank4_names(b) : th_names[bank]));
521 if (err) 516 if (err)
522 goto out_free; 517 goto out_free;
523recurse: 518recurse:
@@ -548,98 +543,91 @@ out_free:
548 return err; 543 return err;
549} 544}
550 545
551static __cpuinit long 546static __cpuinit int __threshold_add_blocks(struct threshold_bank *b)
552local_allocate_threshold_blocks(int cpu, unsigned int bank)
553{ 547{
554 return allocate_threshold_blocks(cpu, bank, 0, 548 struct list_head *head = &b->blocks->miscj;
555 MSR_IA32_MC0_MISC + bank * 4); 549 struct threshold_block *pos = NULL;
550 struct threshold_block *tmp = NULL;
551 int err = 0;
552
553 err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
554 if (err)
555 return err;
556
557 list_for_each_entry_safe(pos, tmp, head, miscj) {
558
559 err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
560 if (err) {
561 list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
562 kobject_del(&pos->kobj);
563
564 return err;
565 }
566 }
567 return err;
556} 568}
557 569
558/* symlinks sibling shared banks to first core. first core owns dir/files. */
559static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) 570static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
560{ 571{
561 int i, err = 0;
562 struct threshold_bank *b = NULL;
563 struct device *dev = per_cpu(mce_device, cpu); 572 struct device *dev = per_cpu(mce_device, cpu);
564 char name[32]; 573 struct amd_northbridge *nb = NULL;
574 struct threshold_bank *b = NULL;
575 const char *name = th_names[bank];
576 int err = 0;
565 577
566 sprintf(name, "threshold_bank%i", bank); 578 if (shared_bank[bank]) {
567 579
568#ifdef CONFIG_SMP 580 nb = node_to_amd_nb(amd_get_nb_id(cpu));
569 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 581 WARN_ON(!nb);
570 i = cpumask_first(cpu_llc_shared_mask(cpu));
571 582
572 /* first core not up yet */ 583 /* threshold descriptor already initialized on this node? */
573 if (cpu_data(i).cpu_core_id) 584 if (nb->bank4) {
574 goto out; 585 /* yes, use it */
586 b = nb->bank4;
587 err = kobject_add(b->kobj, &dev->kobj, name);
588 if (err)
589 goto out;
575 590
576 /* already linked */ 591 per_cpu(threshold_banks, cpu)[bank] = b;
577 if (per_cpu(threshold_banks, cpu)[bank]) 592 atomic_inc(&b->cpus);
578 goto out;
579 593
580 b = per_cpu(threshold_banks, i)[bank]; 594 err = __threshold_add_blocks(b);
581 595
582 if (!b)
583 goto out; 596 goto out;
584 597 }
585 err = sysfs_create_link(&dev->kobj, b->kobj, name);
586 if (err)
587 goto out;
588
589 cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
590 per_cpu(threshold_banks, cpu)[bank] = b;
591
592 goto out;
593 } 598 }
594#endif
595 599
596 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); 600 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
597 if (!b) { 601 if (!b) {
598 err = -ENOMEM; 602 err = -ENOMEM;
599 goto out; 603 goto out;
600 } 604 }
601 if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
602 kfree(b);
603 err = -ENOMEM;
604 goto out;
605 }
606 605
607 b->kobj = kobject_create_and_add(name, &dev->kobj); 606 b->kobj = kobject_create_and_add(name, &dev->kobj);
608 if (!b->kobj) 607 if (!b->kobj) {
608 err = -EINVAL;
609 goto out_free; 609 goto out_free;
610 610 }
611#ifndef CONFIG_SMP
612 cpumask_setall(b->cpus);
613#else
614 cpumask_set_cpu(cpu, b->cpus);
615#endif
616 611
617 per_cpu(threshold_banks, cpu)[bank] = b; 612 per_cpu(threshold_banks, cpu)[bank] = b;
618 613
619 err = local_allocate_threshold_blocks(cpu, bank); 614 if (shared_bank[bank]) {
620 if (err) 615 atomic_set(&b->cpus, 1);
621 goto out_free;
622
623 for_each_cpu(i, b->cpus) {
624 if (i == cpu)
625 continue;
626
627 dev = per_cpu(mce_device, i);
628 if (dev)
629 err = sysfs_create_link(&dev->kobj,b->kobj, name);
630 if (err)
631 goto out;
632 616
633 per_cpu(threshold_banks, i)[bank] = b; 617 /* nb is already initialized, see above */
618 WARN_ON(nb->bank4);
619 nb->bank4 = b;
634 } 620 }
635 621
636 goto out; 622 err = allocate_threshold_blocks(cpu, bank, 0,
623 MSR_IA32_MC0_MISC + bank * 4);
624 if (!err)
625 goto out;
637 626
638out_free: 627 out_free:
639 per_cpu(threshold_banks, cpu)[bank] = NULL;
640 free_cpumask_var(b->cpus);
641 kfree(b); 628 kfree(b);
642out: 629
630 out:
643 return err; 631 return err;
644} 632}
645 633
@@ -660,12 +648,6 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
660 return err; 648 return err;
661} 649}
662 650
663/*
664 * let's be hotplug friendly.
665 * in case of multiple core processors, the first core always takes ownership
666 * of shared sysfs dir/files, and rest of the cores will be symlinked to it.
667 */
668
669static void deallocate_threshold_block(unsigned int cpu, 651static void deallocate_threshold_block(unsigned int cpu,
670 unsigned int bank) 652 unsigned int bank)
671{ 653{
@@ -686,41 +668,42 @@ static void deallocate_threshold_block(unsigned int cpu,
686 per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; 668 per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
687} 669}
688 670
671static void __threshold_remove_blocks(struct threshold_bank *b)
672{
673 struct threshold_block *pos = NULL;
674 struct threshold_block *tmp = NULL;
675
676 kobject_del(b->kobj);
677
678 list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
679 kobject_del(&pos->kobj);
680}
681
689static void threshold_remove_bank(unsigned int cpu, int bank) 682static void threshold_remove_bank(unsigned int cpu, int bank)
690{ 683{
684 struct amd_northbridge *nb;
691 struct threshold_bank *b; 685 struct threshold_bank *b;
692 struct device *dev;
693 char name[32];
694 int i = 0;
695 686
696 b = per_cpu(threshold_banks, cpu)[bank]; 687 b = per_cpu(threshold_banks, cpu)[bank];
697 if (!b) 688 if (!b)
698 return; 689 return;
690
699 if (!b->blocks) 691 if (!b->blocks)
700 goto free_out; 692 goto free_out;
701 693
702 sprintf(name, "threshold_bank%i", bank); 694 if (shared_bank[bank]) {
703 695 if (!atomic_dec_and_test(&b->cpus)) {
704#ifdef CONFIG_SMP 696 __threshold_remove_blocks(b);
705 /* sibling symlink */ 697 per_cpu(threshold_banks, cpu)[bank] = NULL;
706 if (shared_bank[bank] && b->blocks->cpu != cpu) { 698 return;
707 dev = per_cpu(mce_device, cpu); 699 } else {
708 sysfs_remove_link(&dev->kobj, name); 700 /*
709 per_cpu(threshold_banks, cpu)[bank] = NULL; 701 * the last CPU on this node using the shared bank is
710 702 * going away, remove that bank now.
711 return; 703 */
712 } 704 nb = node_to_amd_nb(amd_get_nb_id(cpu));
713#endif 705 nb->bank4 = NULL;
714 706 }
715 /* remove all sibling symlinks before unregistering */
716 for_each_cpu(i, b->cpus) {
717 if (i == cpu)
718 continue;
719
720 dev = per_cpu(mce_device, i);
721 if (dev)
722 sysfs_remove_link(&dev->kobj, name);
723 per_cpu(threshold_banks, i)[bank] = NULL;
724 } 707 }
725 708
726 deallocate_threshold_block(cpu, bank); 709 deallocate_threshold_block(cpu, bank);
@@ -728,7 +711,6 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
728free_out: 711free_out:
729 kobject_del(b->kobj); 712 kobject_del(b->kobj);
730 kobject_put(b->kobj); 713 kobject_put(b->kobj);
731 free_cpumask_var(b->cpus);
732 kfree(b); 714 kfree(b);
733 per_cpu(threshold_banks, cpu)[bank] = NULL; 715 per_cpu(threshold_banks, cpu)[bank] = NULL;
734} 716}
@@ -777,4 +759,24 @@ static __init int threshold_init_device(void)
777 759
778 return 0; 760 return 0;
779} 761}
780device_initcall(threshold_init_device); 762/*
763 * there are 3 funcs which need to be _initcalled in a logic sequence:
764 * 1. xen_late_init_mcelog
765 * 2. mcheck_init_device
766 * 3. threshold_init_device
767 *
768 * xen_late_init_mcelog must register xen_mce_chrdev_device before
769 * native mce_chrdev_device registration if running under xen platform;
770 *
771 * mcheck_init_device should be inited before threshold_init_device to
772 * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
773 *
774 * so we use following _initcalls
775 * 1. device_initcall(xen_late_init_mcelog);
776 * 2. device_initcall_sync(mcheck_init_device);
777 * 3. late_initcall(threshold_init_device);
778 *
779 * when running under xen, the initcall order is 1,2,3;
780 * on baremetal, we skip 1 and we do only 2 and 3.
781 */
782late_initcall(threshold_init_device);
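
The ordering spelled out in the new comment above is enforced purely by initcall levels: "6" runs before "6s", which runs before "7". The three registrations it names are listed below with the level each macro maps to; macro internals differ between kernel versions, so treat this as a sketch of the levels rather than the exact expansion.

device_initcall(xen_late_init_mcelog);      /* level 6  - xen mcelog chardev   */
device_initcall_sync(mcheck_init_device);   /* level 6s - sets up mce_device   */
late_initcall(threshold_init_device);       /* level 7  - threshold sysfs bits */
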
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index bdda2e6c673b..35ffda5d0727 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -258,11 +258,11 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
258 258
259 /* Compute the maximum size with which we can make a range: */ 259 /* Compute the maximum size with which we can make a range: */
260 if (range_startk) 260 if (range_startk)
261 max_align = ffs(range_startk) - 1; 261 max_align = __ffs(range_startk);
262 else 262 else
263 max_align = 32; 263 max_align = BITS_PER_LONG - 1;
264 264
265 align = fls(range_sizek) - 1; 265 align = __fls(range_sizek);
266 if (align > max_align) 266 if (align > max_align)
267 align = max_align; 267 align = max_align;
268 268
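
The cleanup.c change above swaps ffs()/fls() for their 0-based counterparts: ffs() is 1-based, __ffs() and __fls() are 0-based, so "ffs(x) - 1" equals "__ffs(x)" for any non-zero x, while the zero case keeps its own branch (now BITS_PER_LONG - 1, the largest index __ffs() can return, instead of a hard-coded 32). A tiny example with a concrete value:

/* Sketch: both expressions evaluate to 10 for a value with only bit 10 set. */
static void ffs_vs___ffs_example(void)
{
        unsigned long range_startk = 0x400;     /* 1 MiB in KiB units */

        BUG_ON(ffs(range_startk) - 1 != __ffs(range_startk));
}
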
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 75772ae6c65f..e9fe907cd249 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -361,11 +361,7 @@ static void __init print_mtrr_state(void)
361 } 361 }
362 pr_debug("MTRR variable ranges %sabled:\n", 362 pr_debug("MTRR variable ranges %sabled:\n",
363 mtrr_state.enabled & 2 ? "en" : "dis"); 363 mtrr_state.enabled & 2 ? "en" : "dis");
364 if (size_or_mask & 0xffffffffUL) 364 high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
365 high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
366 else
367 high_width = ffs(size_or_mask>>32) + 32 - 1;
368 high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
369 365
370 for (i = 0; i < num_var_ranges; ++i) { 366 for (i = 0; i < num_var_ranges; ++i) {
371 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) 367 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c4706cf9c011..29557aa06dda 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -35,17 +35,6 @@
35 35
36#include "perf_event.h" 36#include "perf_event.h"
37 37
38#if 0
39#undef wrmsrl
40#define wrmsrl(msr, val) \
41do { \
42 trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
43 (unsigned long)(val)); \
44 native_write_msr((msr), (u32)((u64)(val)), \
45 (u32)((u64)(val) >> 32)); \
46} while (0)
47#endif
48
49struct x86_pmu x86_pmu __read_mostly; 38struct x86_pmu x86_pmu __read_mostly;
50 39
51DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 40DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -74,7 +63,7 @@ u64 x86_perf_event_update(struct perf_event *event)
74 int idx = hwc->idx; 63 int idx = hwc->idx;
75 s64 delta; 64 s64 delta;
76 65
77 if (idx == X86_PMC_IDX_FIXED_BTS) 66 if (idx == INTEL_PMC_IDX_FIXED_BTS)
78 return 0; 67 return 0;
79 68
80 /* 69 /*
@@ -86,7 +75,7 @@ u64 x86_perf_event_update(struct perf_event *event)
86 */ 75 */
87again: 76again:
88 prev_raw_count = local64_read(&hwc->prev_count); 77 prev_raw_count = local64_read(&hwc->prev_count);
89 rdmsrl(hwc->event_base, new_raw_count); 78 rdpmcl(hwc->event_base_rdpmc, new_raw_count);
90 79
91 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 80 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
92 new_raw_count) != prev_raw_count) 81 new_raw_count) != prev_raw_count)
@@ -189,7 +178,7 @@ static void release_pmc_hardware(void) {}
189 178
190static bool check_hw_exists(void) 179static bool check_hw_exists(void)
191{ 180{
192 u64 val, val_new = 0; 181 u64 val, val_new = ~0;
193 int i, reg, ret = 0; 182 int i, reg, ret = 0;
194 183
195 /* 184 /*
@@ -222,8 +211,9 @@ static bool check_hw_exists(void)
222 * that don't trap on the MSR access and always return 0s. 211 * that don't trap on the MSR access and always return 0s.
223 */ 212 */
224 val = 0xabcdUL; 213 val = 0xabcdUL;
225 ret = checking_wrmsrl(x86_pmu_event_addr(0), val); 214 reg = x86_pmu_event_addr(0);
226 ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); 215 ret = wrmsrl_safe(reg, val);
216 ret |= rdmsrl_safe(reg, &val_new);
227 if (ret || val != val_new) 217 if (ret || val != val_new)
228 goto msr_fail; 218 goto msr_fail;
229 219
@@ -240,6 +230,7 @@ bios_fail:
240 230
241msr_fail: 231msr_fail:
242 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); 232 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
233 printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new);
243 234
244 return false; 235 return false;
245} 236}
@@ -388,7 +379,7 @@ int x86_pmu_hw_config(struct perf_event *event)
388 int precise = 0; 379 int precise = 0;
389 380
390 /* Support for constant skid */ 381 /* Support for constant skid */
391 if (x86_pmu.pebs_active) { 382 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
392 precise++; 383 precise++;
393 384
394 /* Support for IP fixup */ 385 /* Support for IP fixup */
@@ -637,8 +628,8 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
637 c = sched->constraints[sched->state.event]; 628 c = sched->constraints[sched->state.event];
638 629
639 /* Prefer fixed purpose counters */ 630 /* Prefer fixed purpose counters */
640 if (x86_pmu.num_counters_fixed) { 631 if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
641 idx = X86_PMC_IDX_FIXED; 632 idx = INTEL_PMC_IDX_FIXED;
642 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { 633 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
643 if (!__test_and_set_bit(idx, sched->state.used)) 634 if (!__test_and_set_bit(idx, sched->state.used))
644 goto done; 635 goto done;
@@ -646,7 +637,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
646 } 637 }
647 /* Grab the first unused counter starting with idx */ 638 /* Grab the first unused counter starting with idx */
648 idx = sched->state.counter; 639 idx = sched->state.counter;
649 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { 640 for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
650 if (!__test_and_set_bit(idx, sched->state.used)) 641 if (!__test_and_set_bit(idx, sched->state.used))
651 goto done; 642 goto done;
652 } 643 }
@@ -704,8 +695,8 @@ static bool perf_sched_next_event(struct perf_sched *sched)
704/* 695/*
705 * Assign a counter for each event. 696 * Assign a counter for each event.
706 */ 697 */
707static int perf_assign_events(struct event_constraint **constraints, int n, 698int perf_assign_events(struct event_constraint **constraints, int n,
708 int wmin, int wmax, int *assign) 699 int wmin, int wmax, int *assign)
709{ 700{
710 struct perf_sched sched; 701 struct perf_sched sched;
711 702
@@ -824,15 +815,17 @@ static inline void x86_assign_hw_event(struct perf_event *event,
824 hwc->last_cpu = smp_processor_id(); 815 hwc->last_cpu = smp_processor_id();
825 hwc->last_tag = ++cpuc->tags[i]; 816 hwc->last_tag = ++cpuc->tags[i];
826 817
827 if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { 818 if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
828 hwc->config_base = 0; 819 hwc->config_base = 0;
829 hwc->event_base = 0; 820 hwc->event_base = 0;
830 } else if (hwc->idx >= X86_PMC_IDX_FIXED) { 821 } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
831 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 822 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
832 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); 823 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
824 hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
833 } else { 825 } else {
834 hwc->config_base = x86_pmu_config_addr(hwc->idx); 826 hwc->config_base = x86_pmu_config_addr(hwc->idx);
835 hwc->event_base = x86_pmu_event_addr(hwc->idx); 827 hwc->event_base = x86_pmu_event_addr(hwc->idx);
828 hwc->event_base_rdpmc = hwc->idx;
836 } 829 }
837} 830}
838 831
@@ -930,7 +923,7 @@ int x86_perf_event_set_period(struct perf_event *event)
930 s64 period = hwc->sample_period; 923 s64 period = hwc->sample_period;
931 int ret = 0, idx = hwc->idx; 924 int ret = 0, idx = hwc->idx;
932 925
933 if (idx == X86_PMC_IDX_FIXED_BTS) 926 if (idx == INTEL_PMC_IDX_FIXED_BTS)
934 return 0; 927 return 0;
935 928
936 /* 929 /*
@@ -1316,7 +1309,6 @@ static struct attribute_group x86_pmu_format_group = {
1316static int __init init_hw_perf_events(void) 1309static int __init init_hw_perf_events(void)
1317{ 1310{
1318 struct x86_pmu_quirk *quirk; 1311 struct x86_pmu_quirk *quirk;
1319 struct event_constraint *c;
1320 int err; 1312 int err;
1321 1313
1322 pr_info("Performance Events: "); 1314 pr_info("Performance Events: ");
@@ -1347,21 +1339,8 @@ static int __init init_hw_perf_events(void)
1347 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) 1339 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
1348 quirk->func(); 1340 quirk->func();
1349 1341
1350 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 1342 if (!x86_pmu.intel_ctrl)
1351 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1343 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1352 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1353 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1354 }
1355 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1356
1357 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1358 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1359 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1360 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1361 }
1362
1363 x86_pmu.intel_ctrl |=
1364 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1365 1344
1366 perf_events_lapic_init(); 1345 perf_events_lapic_init();
1367 register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); 1346 register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
@@ -1370,22 +1349,6 @@ static int __init init_hw_perf_events(void)
1370 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 1349 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1371 0, x86_pmu.num_counters, 0); 1350 0, x86_pmu.num_counters, 0);
1372 1351
1373 if (x86_pmu.event_constraints) {
1374 /*
1375 * event on fixed counter2 (REF_CYCLES) only works on this
1376 * counter, so do not extend mask to generic counters
1377 */
1378 for_each_event_constraint(c, x86_pmu.event_constraints) {
1379 if (c->cmask != X86_RAW_EVENT_MASK
1380 || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
1381 continue;
1382 }
1383
1384 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1385 c->weight += x86_pmu.num_counters;
1386 }
1387 }
1388
1389 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ 1352 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
1390 x86_pmu_format_group.attrs = x86_pmu.format_attrs; 1353 x86_pmu_format_group.attrs = x86_pmu.format_attrs;
1391 1354
@@ -1620,8 +1583,8 @@ static int x86_pmu_event_idx(struct perf_event *event)
1620 if (!x86_pmu.attr_rdpmc) 1583 if (!x86_pmu.attr_rdpmc)
1621 return 0; 1584 return 0;
1622 1585
1623 if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { 1586 if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
1624 idx -= X86_PMC_IDX_FIXED; 1587 idx -= INTEL_PMC_IDX_FIXED;
1625 idx |= 1 << 30; 1588 idx |= 1 << 30;
1626 } 1589 }
1627 1590
@@ -1649,7 +1612,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
1649 struct device_attribute *attr, 1612 struct device_attribute *attr,
1650 const char *buf, size_t count) 1613 const char *buf, size_t count)
1651{ 1614{
1652 unsigned long val = simple_strtoul(buf, NULL, 0); 1615 unsigned long val;
1616 ssize_t ret;
1617
1618 ret = kstrtoul(buf, 0, &val);
1619 if (ret)
1620 return ret;
1653 1621
1654 if (!!val != !!x86_pmu.attr_rdpmc) { 1622 if (!!val != !!x86_pmu.attr_rdpmc) {
1655 x86_pmu.attr_rdpmc = !!val; 1623 x86_pmu.attr_rdpmc = !!val;
@@ -1682,13 +1650,20 @@ static void x86_pmu_flush_branch_stack(void)
1682 x86_pmu.flush_branch_stack(); 1650 x86_pmu.flush_branch_stack();
1683} 1651}
1684 1652
1653void perf_check_microcode(void)
1654{
1655 if (x86_pmu.check_microcode)
1656 x86_pmu.check_microcode();
1657}
1658EXPORT_SYMBOL_GPL(perf_check_microcode);
1659
1685static struct pmu pmu = { 1660static struct pmu pmu = {
1686 .pmu_enable = x86_pmu_enable, 1661 .pmu_enable = x86_pmu_enable,
1687 .pmu_disable = x86_pmu_disable, 1662 .pmu_disable = x86_pmu_disable,
1688 1663
1689 .attr_groups = x86_pmu_attr_groups, 1664 .attr_groups = x86_pmu_attr_groups,
1690 1665
1691 .event_init = x86_pmu_event_init, 1666 .event_init = x86_pmu_event_init,
1692 1667
1693 .add = x86_pmu_add, 1668 .add = x86_pmu_add,
1694 .del = x86_pmu_del, 1669 .del = x86_pmu_del,
@@ -1696,11 +1671,11 @@ static struct pmu pmu = {
1696 .stop = x86_pmu_stop, 1671 .stop = x86_pmu_stop,
1697 .read = x86_pmu_read, 1672 .read = x86_pmu_read,
1698 1673
1699 .start_txn = x86_pmu_start_txn, 1674 .start_txn = x86_pmu_start_txn,
1700 .cancel_txn = x86_pmu_cancel_txn, 1675 .cancel_txn = x86_pmu_cancel_txn,
1701 .commit_txn = x86_pmu_commit_txn, 1676 .commit_txn = x86_pmu_commit_txn,
1702 1677
1703 .event_idx = x86_pmu_event_idx, 1678 .event_idx = x86_pmu_event_idx,
1704 .flush_branch_stack = x86_pmu_flush_branch_stack, 1679 .flush_branch_stack = x86_pmu_flush_branch_stack,
1705}; 1680};
1706 1681
@@ -1863,7 +1838,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
1863 else 1838 else
1864 misc |= PERF_RECORD_MISC_GUEST_KERNEL; 1839 misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1865 } else { 1840 } else {
1866 if (user_mode(regs)) 1841 if (!kernel_ip(regs->ip))
1867 misc |= PERF_RECORD_MISC_USER; 1842 misc |= PERF_RECORD_MISC_USER;
1868 else 1843 else
1869 misc |= PERF_RECORD_MISC_KERNEL; 1844 misc |= PERF_RECORD_MISC_KERNEL;
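
Several perf_event.c hunks above switch counter reads to rdpmcl() and cache a ready-made RDPMC index in hwc->event_base_rdpmc, with bit 30 set for fixed-function counters (the same encoding x86_pmu_event_idx() exposes to user space). A minimal sketch of what that index means at the instruction level; the wrapper is illustrative and not part of this patch.

/* RDPMC takes the counter selector in ECX and returns the count in EDX:EAX;
 * fixed-function counters are selected by setting bit 30 of the selector. */
static inline u64 rdpmc_sketch(u32 idx, int fixed)
{
        u32 lo, hi;
        u32 ecx = fixed ? (idx | (1U << 30)) : idx;

        asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (ecx));
        return ((u64)hi << 32) | lo;
}
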
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 7241e2fc3c17..821d53b696d1 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -14,6 +14,18 @@
14 14
15#include <linux/perf_event.h> 15#include <linux/perf_event.h>
16 16
17#if 0
18#undef wrmsrl
19#define wrmsrl(msr, val) \
20do { \
21 unsigned int _msr = (msr); \
22 u64 _val = (val); \
23 trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \
24 (unsigned long long)(_val)); \
25 native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \
26} while (0)
27#endif
28
17/* 29/*
18 * | NHM/WSM | SNB | 30 * | NHM/WSM | SNB |
19 * register ------------------------------- 31 * register -------------------------------
@@ -57,7 +69,7 @@ struct amd_nb {
57}; 69};
58 70
59/* The maximal number of PEBS events: */ 71/* The maximal number of PEBS events: */
60#define MAX_PEBS_EVENTS 4 72#define MAX_PEBS_EVENTS 8
61 73
62/* 74/*
63 * A debug store configuration. 75 * A debug store configuration.
@@ -349,6 +361,8 @@ struct x86_pmu {
349 void (*cpu_starting)(int cpu); 361 void (*cpu_starting)(int cpu);
350 void (*cpu_dying)(int cpu); 362 void (*cpu_dying)(int cpu);
351 void (*cpu_dead)(int cpu); 363 void (*cpu_dead)(int cpu);
364
365 void (*check_microcode)(void);
352 void (*flush_branch_stack)(void); 366 void (*flush_branch_stack)(void);
353 367
354 /* 368 /*
@@ -360,12 +374,16 @@ struct x86_pmu {
360 /* 374 /*
361 * Intel DebugStore bits 375 * Intel DebugStore bits
362 */ 376 */
363 int bts, pebs; 377 unsigned int bts :1,
364 int bts_active, pebs_active; 378 bts_active :1,
379 pebs :1,
380 pebs_active :1,
381 pebs_broken :1;
365 int pebs_record_size; 382 int pebs_record_size;
366 void (*drain_pebs)(struct pt_regs *regs); 383 void (*drain_pebs)(struct pt_regs *regs);
367 struct event_constraint *pebs_constraints; 384 struct event_constraint *pebs_constraints;
368 void (*pebs_aliases)(struct perf_event *event); 385 void (*pebs_aliases)(struct perf_event *event);
386 int max_pebs_events;
369 387
370 /* 388 /*
371 * Intel LBR 389 * Intel LBR
@@ -468,6 +486,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
468 486
469void x86_pmu_enable_all(int added); 487void x86_pmu_enable_all(int added);
470 488
489int perf_assign_events(struct event_constraint **constraints, int n,
490 int wmin, int wmax, int *assign);
471int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); 491int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
472 492
473void x86_pmu_stop(struct perf_event *event, int flags); 493void x86_pmu_stop(struct perf_event *event, int flags);
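
The perf_event.h hunk above folds the old int-sized bts/pebs/bts_active/pebs_active fields into single-bit bitfields so the new pebs_broken flag can be added without growing struct x86_pmu. A small standalone comparison of the two layouts (field names mirror the diff; exact sizes depend on the ABI and are printed only to illustrate the packing):

#include <stdio.h>

struct flags_as_ints {		/* old layout: one int per boolean */
	int bts, bts_active;
	int pebs, pebs_active;
};

struct flags_as_bits {		/* new layout: all five flags share one word */
	unsigned int bts	:1,
		     bts_active	:1,
		     pebs	:1,
		     pebs_active:1,
		     pebs_broken:1;
};

int main(void)
{
	struct flags_as_bits f = { .pebs = 1 };

	f.pebs_broken = 1;	/* individual flags still read/write naturally */

	printf("ints: %zu bytes, bitfields: %zu bytes\n",
	       sizeof(struct flags_as_ints), sizeof(struct flags_as_bits));
	printf("pebs=%u pebs_broken=%u\n", f.pebs, f.pebs_broken);
	return 0;
}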
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 11a4eb9131d5..4528ae7b6ec4 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -366,7 +366,7 @@ static void amd_pmu_cpu_starting(int cpu)
366 366
367 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; 367 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
368 368
369 if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15) 369 if (boot_cpu_data.x86_max_cores < 2)
370 return; 370 return;
371 371
372 nb_id = amd_get_nb_id(cpu); 372 nb_id = amd_get_nb_id(cpu);
@@ -422,35 +422,6 @@ static struct attribute *amd_format_attr[] = {
422 NULL, 422 NULL,
423}; 423};
424 424
425static __initconst const struct x86_pmu amd_pmu = {
426 .name = "AMD",
427 .handle_irq = x86_pmu_handle_irq,
428 .disable_all = x86_pmu_disable_all,
429 .enable_all = x86_pmu_enable_all,
430 .enable = x86_pmu_enable_event,
431 .disable = x86_pmu_disable_event,
432 .hw_config = amd_pmu_hw_config,
433 .schedule_events = x86_schedule_events,
434 .eventsel = MSR_K7_EVNTSEL0,
435 .perfctr = MSR_K7_PERFCTR0,
436 .event_map = amd_pmu_event_map,
437 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
438 .num_counters = AMD64_NUM_COUNTERS,
439 .cntval_bits = 48,
440 .cntval_mask = (1ULL << 48) - 1,
441 .apic = 1,
442 /* use highest bit to detect overflow */
443 .max_period = (1ULL << 47) - 1,
444 .get_event_constraints = amd_get_event_constraints,
445 .put_event_constraints = amd_put_event_constraints,
446
447 .format_attrs = amd_format_attr,
448
449 .cpu_prepare = amd_pmu_cpu_prepare,
450 .cpu_starting = amd_pmu_cpu_starting,
451 .cpu_dead = amd_pmu_cpu_dead,
452};
453
454/* AMD Family 15h */ 425/* AMD Family 15h */
455 426
456#define AMD_EVENT_TYPE_MASK 0x000000F0ULL 427#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
@@ -597,8 +568,8 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
597 } 568 }
598} 569}
599 570
600static __initconst const struct x86_pmu amd_pmu_f15h = { 571static __initconst const struct x86_pmu amd_pmu = {
601 .name = "AMD Family 15h", 572 .name = "AMD",
602 .handle_irq = x86_pmu_handle_irq, 573 .handle_irq = x86_pmu_handle_irq,
603 .disable_all = x86_pmu_disable_all, 574 .disable_all = x86_pmu_disable_all,
604 .enable_all = x86_pmu_enable_all, 575 .enable_all = x86_pmu_enable_all,
@@ -606,50 +577,68 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
606 .disable = x86_pmu_disable_event, 577 .disable = x86_pmu_disable_event,
607 .hw_config = amd_pmu_hw_config, 578 .hw_config = amd_pmu_hw_config,
608 .schedule_events = x86_schedule_events, 579 .schedule_events = x86_schedule_events,
609 .eventsel = MSR_F15H_PERF_CTL, 580 .eventsel = MSR_K7_EVNTSEL0,
610 .perfctr = MSR_F15H_PERF_CTR, 581 .perfctr = MSR_K7_PERFCTR0,
611 .event_map = amd_pmu_event_map, 582 .event_map = amd_pmu_event_map,
612 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 583 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
613 .num_counters = AMD64_NUM_COUNTERS_F15H, 584 .num_counters = AMD64_NUM_COUNTERS,
614 .cntval_bits = 48, 585 .cntval_bits = 48,
615 .cntval_mask = (1ULL << 48) - 1, 586 .cntval_mask = (1ULL << 48) - 1,
616 .apic = 1, 587 .apic = 1,
617 /* use highest bit to detect overflow */ 588 /* use highest bit to detect overflow */
618 .max_period = (1ULL << 47) - 1, 589 .max_period = (1ULL << 47) - 1,
619 .get_event_constraints = amd_get_event_constraints_f15h, 590 .get_event_constraints = amd_get_event_constraints,
 620	/* northbridge counters not yet implemented: */
621#if 0
622 .put_event_constraints = amd_put_event_constraints, 591 .put_event_constraints = amd_put_event_constraints,
623 592
593 .format_attrs = amd_format_attr,
594
624 .cpu_prepare = amd_pmu_cpu_prepare, 595 .cpu_prepare = amd_pmu_cpu_prepare,
625 .cpu_dead = amd_pmu_cpu_dead,
626#endif
627 .cpu_starting = amd_pmu_cpu_starting, 596 .cpu_starting = amd_pmu_cpu_starting,
628 .format_attrs = amd_format_attr, 597 .cpu_dead = amd_pmu_cpu_dead,
629}; 598};
630 599
600static int setup_event_constraints(void)
601{
602 if (boot_cpu_data.x86 >= 0x15)
603 x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
604 return 0;
605}
606
607static int setup_perfctr_core(void)
608{
609 if (!cpu_has_perfctr_core) {
610 WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h,
611 KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!");
612 return -ENODEV;
613 }
614
615 WARN(x86_pmu.get_event_constraints == amd_get_event_constraints,
616 KERN_ERR "hw perf events core counters need constraints handler!");
617
618 /*
619 * If core performance counter extensions exists, we must use
620 * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
621 * x86_pmu_addr_offset().
622 */
623 x86_pmu.eventsel = MSR_F15H_PERF_CTL;
624 x86_pmu.perfctr = MSR_F15H_PERF_CTR;
625 x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
626
627 printk(KERN_INFO "perf: AMD core performance counters detected\n");
628
629 return 0;
630}
631
631__init int amd_pmu_init(void) 632__init int amd_pmu_init(void)
632{ 633{
633 /* Performance-monitoring supported from K7 and later: */ 634 /* Performance-monitoring supported from K7 and later: */
634 if (boot_cpu_data.x86 < 6) 635 if (boot_cpu_data.x86 < 6)
635 return -ENODEV; 636 return -ENODEV;
636 637
637 /* 638 x86_pmu = amd_pmu;
638 * If core performance counter extensions exists, it must be 639
639 * family 15h, otherwise fail. See x86_pmu_addr_offset(). 640 setup_event_constraints();
640 */ 641 setup_perfctr_core();
641 switch (boot_cpu_data.x86) {
642 case 0x15:
643 if (!cpu_has_perfctr_core)
644 return -ENODEV;
645 x86_pmu = amd_pmu_f15h;
646 break;
647 default:
648 if (cpu_has_perfctr_core)
649 return -ENODEV;
650 x86_pmu = amd_pmu;
651 break;
652 }
653 642
654 /* Events are common for all AMDs */ 643 /* Events are common for all AMDs */
655 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 644 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
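
After this rewrite amd_pmu_init() always starts from the single amd_pmu template and lets setup_event_constraints()/setup_perfctr_core() patch in the family-15h constraint handler and the core-perfctr MSR bases at runtime, instead of choosing between two complete x86_pmu structs. A standalone sketch of that detect-then-override pattern; the MSR numbers are quoted from the kernel's MSR definitions and the feature flag is faked, so treat the values as illustrative:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative MSR bases, as used by the patched code above. */
#define MSR_K7_EVNTSEL0		0xc0010000
#define MSR_K7_PERFCTR0		0xc0010004
#define MSR_F15H_PERF_CTL	0xc0010200
#define MSR_F15H_PERF_CTR	0xc0010201

struct pmu_template {
	unsigned int eventsel, perfctr;
	int num_counters;
};

/* Base template: works on every AMD CPU from K7 on. */
static const struct pmu_template amd_base = {
	.eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .num_counters = 4,
};

static struct pmu_template pmu;

/* Optional upgrade, applied only when the CPU advertises core perfctrs. */
static void setup_perfctr_core(bool has_perfctr_core)
{
	if (!has_perfctr_core)
		return;
	pmu.eventsel = MSR_F15H_PERF_CTL;
	pmu.perfctr = MSR_F15H_PERF_CTR;
	pmu.num_counters = 6;
}

int main(void)
{
	pmu = amd_base;			/* always start from the template */
	setup_perfctr_core(true);	/* pretend CPUID reported the feature */
	printf("eventsel=%#x perfctr=%#x counters=%d\n",
	       pmu.eventsel, pmu.perfctr, pmu.num_counters);
	return 0;
}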
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 187c294bc658..382366977d4c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -5,6 +5,8 @@
5 * among events on a single PMU. 5 * among events on a single PMU.
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
8#include <linux/stddef.h> 10#include <linux/stddef.h>
9#include <linux/types.h> 11#include <linux/types.h>
10#include <linux/init.h> 12#include <linux/init.h>
@@ -21,14 +23,14 @@
21 */ 23 */
22static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = 24static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
23{ 25{
24 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, 26 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
25 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 27 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
26 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, 28 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
27 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, 29 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
28 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, 30 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
29 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 31 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
30 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 32 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
31 [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ 33 [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
32}; 34};
33 35
34static struct event_constraint intel_core_event_constraints[] __read_mostly = 36static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -136,6 +138,84 @@ static u64 intel_pmu_event_map(int hw_event)
136 return intel_perfmon_event_map[hw_event]; 138 return intel_perfmon_event_map[hw_event];
137} 139}
138 140
141#define SNB_DMND_DATA_RD (1ULL << 0)
142#define SNB_DMND_RFO (1ULL << 1)
143#define SNB_DMND_IFETCH (1ULL << 2)
144#define SNB_DMND_WB (1ULL << 3)
145#define SNB_PF_DATA_RD (1ULL << 4)
146#define SNB_PF_RFO (1ULL << 5)
147#define SNB_PF_IFETCH (1ULL << 6)
148#define SNB_LLC_DATA_RD (1ULL << 7)
149#define SNB_LLC_RFO (1ULL << 8)
150#define SNB_LLC_IFETCH (1ULL << 9)
151#define SNB_BUS_LOCKS (1ULL << 10)
152#define SNB_STRM_ST (1ULL << 11)
153#define SNB_OTHER (1ULL << 15)
154#define SNB_RESP_ANY (1ULL << 16)
155#define SNB_NO_SUPP (1ULL << 17)
156#define SNB_LLC_HITM (1ULL << 18)
157#define SNB_LLC_HITE (1ULL << 19)
158#define SNB_LLC_HITS (1ULL << 20)
159#define SNB_LLC_HITF (1ULL << 21)
160#define SNB_LOCAL (1ULL << 22)
161#define SNB_REMOTE (0xffULL << 23)
162#define SNB_SNP_NONE (1ULL << 31)
163#define SNB_SNP_NOT_NEEDED (1ULL << 32)
164#define SNB_SNP_MISS (1ULL << 33)
165#define SNB_NO_FWD (1ULL << 34)
166#define SNB_SNP_FWD (1ULL << 35)
167#define SNB_HITM (1ULL << 36)
168#define SNB_NON_DRAM (1ULL << 37)
169
170#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
171#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO)
172#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
173
174#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
175 SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
176 SNB_HITM)
177
178#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
179#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY)
180
181#define SNB_L3_ACCESS SNB_RESP_ANY
182#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM)
183
184static __initconst const u64 snb_hw_cache_extra_regs
185 [PERF_COUNT_HW_CACHE_MAX]
186 [PERF_COUNT_HW_CACHE_OP_MAX]
187 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
188{
189 [ C(LL ) ] = {
190 [ C(OP_READ) ] = {
191 [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
192 [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS,
193 },
194 [ C(OP_WRITE) ] = {
195 [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
196 [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS,
197 },
198 [ C(OP_PREFETCH) ] = {
199 [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
200 [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
201 },
202 },
203 [ C(NODE) ] = {
204 [ C(OP_READ) ] = {
205 [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
206 [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
207 },
208 [ C(OP_WRITE) ] = {
209 [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
210 [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
211 },
212 [ C(OP_PREFETCH) ] = {
213 [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
214 [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
215 },
216 },
217};
218
139static __initconst const u64 snb_hw_cache_event_ids 219static __initconst const u64 snb_hw_cache_event_ids
140 [PERF_COUNT_HW_CACHE_MAX] 220 [PERF_COUNT_HW_CACHE_MAX]
141 [PERF_COUNT_HW_CACHE_OP_MAX] 221 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -233,16 +313,16 @@ static __initconst const u64 snb_hw_cache_event_ids
233 }, 313 },
234 [ C(NODE) ] = { 314 [ C(NODE) ] = {
235 [ C(OP_READ) ] = { 315 [ C(OP_READ) ] = {
236 [ C(RESULT_ACCESS) ] = -1, 316 [ C(RESULT_ACCESS) ] = 0x01b7,
237 [ C(RESULT_MISS) ] = -1, 317 [ C(RESULT_MISS) ] = 0x01b7,
238 }, 318 },
239 [ C(OP_WRITE) ] = { 319 [ C(OP_WRITE) ] = {
240 [ C(RESULT_ACCESS) ] = -1, 320 [ C(RESULT_ACCESS) ] = 0x01b7,
241 [ C(RESULT_MISS) ] = -1, 321 [ C(RESULT_MISS) ] = 0x01b7,
242 }, 322 },
243 [ C(OP_PREFETCH) ] = { 323 [ C(OP_PREFETCH) ] = {
244 [ C(RESULT_ACCESS) ] = -1, 324 [ C(RESULT_ACCESS) ] = 0x01b7,
245 [ C(RESULT_MISS) ] = -1, 325 [ C(RESULT_MISS) ] = 0x01b7,
246 }, 326 },
247 }, 327 },
248 328
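
The two hunks above route the LL and NODE cache events through OFFCORE_RESPONSE: the event code becomes 0x01b7 and the detailed request/response selection moves into the new snb_hw_cache_extra_regs table, built from the SNB_* bits defined earlier in this patch. A standalone sketch of how those bits compose, using a subset of the definitions above (the comment about the off-core response MSR is background, not shown in this diff):

#include <stdio.h>

/* Subset of the SNB_* bits from the hunk above. */
#define SNB_DMND_DATA_RD	(1ULL << 0)
#define SNB_LLC_DATA_RD		(1ULL << 7)
#define SNB_RESP_ANY		(1ULL << 16)

#define SNB_DMND_READ		(SNB_DMND_DATA_RD | SNB_LLC_DATA_RD)
#define SNB_L3_ACCESS		SNB_RESP_ANY

int main(void)
{
	/* LL read access: demand/LLC data reads, any response. */
	unsigned long long extra = SNB_DMND_READ | SNB_L3_ACCESS;

	/* The generic event itself is always the OFFCORE_RESPONSE encoding
	 * 0x01b7 (event 0xb7, umask 0x01); the mask computed here is what
	 * gets programmed into the matching off-core response extra register. */
	printf("event=0x01b7 extra_reg=%#llx\n", extra);
	return 0;
}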
@@ -747,7 +827,7 @@ static void intel_pmu_disable_all(void)
747 827
748 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 828 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
749 829
750 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) 830 if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
751 intel_pmu_disable_bts(); 831 intel_pmu_disable_bts();
752 832
753 intel_pmu_pebs_disable_all(); 833 intel_pmu_pebs_disable_all();
@@ -763,9 +843,9 @@ static void intel_pmu_enable_all(int added)
763 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 843 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
764 x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); 844 x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
765 845
766 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 846 if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
767 struct perf_event *event = 847 struct perf_event *event =
768 cpuc->events[X86_PMC_IDX_FIXED_BTS]; 848 cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
769 849
770 if (WARN_ON_ONCE(!event)) 850 if (WARN_ON_ONCE(!event))
771 return; 851 return;
@@ -871,7 +951,7 @@ static inline void intel_pmu_ack_status(u64 ack)
871 951
872static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) 952static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
873{ 953{
874 int idx = hwc->idx - X86_PMC_IDX_FIXED; 954 int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
875 u64 ctrl_val, mask; 955 u64 ctrl_val, mask;
876 956
877 mask = 0xfULL << (idx * 4); 957 mask = 0xfULL << (idx * 4);
@@ -886,7 +966,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
886 struct hw_perf_event *hwc = &event->hw; 966 struct hw_perf_event *hwc = &event->hw;
887 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 967 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
888 968
889 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { 969 if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
890 intel_pmu_disable_bts(); 970 intel_pmu_disable_bts();
891 intel_pmu_drain_bts_buffer(); 971 intel_pmu_drain_bts_buffer();
892 return; 972 return;
@@ -915,7 +995,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
915 995
916static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) 996static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
917{ 997{
918 int idx = hwc->idx - X86_PMC_IDX_FIXED; 998 int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
919 u64 ctrl_val, bits, mask; 999 u64 ctrl_val, bits, mask;
920 1000
921 /* 1001 /*
@@ -949,7 +1029,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
949 struct hw_perf_event *hwc = &event->hw; 1029 struct hw_perf_event *hwc = &event->hw;
950 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1030 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
951 1031
952 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { 1032 if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
953 if (!__this_cpu_read(cpu_hw_events.enabled)) 1033 if (!__this_cpu_read(cpu_hw_events.enabled))
954 return; 1034 return;
955 1035
@@ -1000,14 +1080,14 @@ static void intel_pmu_reset(void)
1000 1080
1001 local_irq_save(flags); 1081 local_irq_save(flags);
1002 1082
1003 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 1083 pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
1004 1084
1005 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1085 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1006 checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); 1086 wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
1007 checking_wrmsrl(x86_pmu_event_addr(idx), 0ull); 1087 wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
1008 } 1088 }
1009 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) 1089 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
1010 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 1090 wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1011 1091
1012 if (ds) 1092 if (ds)
1013 ds->bts_index = ds->bts_buffer_base; 1093 ds->bts_index = ds->bts_buffer_base;
@@ -1707,16 +1787,61 @@ static __init void intel_clovertown_quirk(void)
1707 * But taken together it might just make sense to not enable PEBS on 1787 * But taken together it might just make sense to not enable PEBS on
1708 * these chips. 1788 * these chips.
1709 */ 1789 */
1710 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); 1790 pr_warn("PEBS disabled due to CPU errata\n");
1711 x86_pmu.pebs = 0; 1791 x86_pmu.pebs = 0;
1712 x86_pmu.pebs_constraints = NULL; 1792 x86_pmu.pebs_constraints = NULL;
1713} 1793}
1714 1794
1795static int intel_snb_pebs_broken(int cpu)
1796{
1797 u32 rev = UINT_MAX; /* default to broken for unknown models */
1798
1799 switch (cpu_data(cpu).x86_model) {
1800 case 42: /* SNB */
1801 rev = 0x28;
1802 break;
1803
1804 case 45: /* SNB-EP */
1805 switch (cpu_data(cpu).x86_mask) {
1806 case 6: rev = 0x618; break;
1807 case 7: rev = 0x70c; break;
1808 }
1809 }
1810
1811 return (cpu_data(cpu).microcode < rev);
1812}
1813
1814static void intel_snb_check_microcode(void)
1815{
1816 int pebs_broken = 0;
1817 int cpu;
1818
1819 get_online_cpus();
1820 for_each_online_cpu(cpu) {
1821 if ((pebs_broken = intel_snb_pebs_broken(cpu)))
1822 break;
1823 }
1824 put_online_cpus();
1825
1826 if (pebs_broken == x86_pmu.pebs_broken)
1827 return;
1828
1829 /*
1830 * Serialized by the microcode lock..
1831 */
1832 if (x86_pmu.pebs_broken) {
1833 pr_info("PEBS enabled due to microcode update\n");
1834 x86_pmu.pebs_broken = 0;
1835 } else {
1836 pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
1837 x86_pmu.pebs_broken = 1;
1838 }
1839}
1840
1715static __init void intel_sandybridge_quirk(void) 1841static __init void intel_sandybridge_quirk(void)
1716{ 1842{
1717 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); 1843 x86_pmu.check_microcode = intel_snb_check_microcode;
1718 x86_pmu.pebs = 0; 1844 intel_snb_check_microcode();
1719 x86_pmu.pebs_constraints = NULL;
1720} 1845}
1721 1846
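
intel_snb_pebs_broken() above encodes, per model and stepping, the first microcode revision in which the SandyBridge PEBS erratum is fixed, and treats unknown models or steppings as broken (rev stays UINT_MAX). The comparison logic in isolation, as a standalone sketch:

#include <limits.h>
#include <stdio.h>

/* Mirror of the table in intel_snb_pebs_broken(): minimum good microcode. */
static int snb_pebs_broken(unsigned int model, unsigned int stepping,
			   unsigned int microcode)
{
	unsigned int rev = UINT_MAX;	/* unknown model/stepping: assume broken */

	switch (model) {
	case 42:			/* SandyBridge */
		rev = 0x28;
		break;
	case 45:			/* SandyBridge-EP */
		if (stepping == 6)
			rev = 0x618;
		else if (stepping == 7)
			rev = 0x70c;
		break;
	}

	return microcode < rev;
}

int main(void)
{
	printf("model 42, ucode 0x17:   broken=%d\n", snb_pebs_broken(42, 7, 0x17));
	printf("model 42, ucode 0x28:   broken=%d\n", snb_pebs_broken(42, 7, 0x28));
	printf("model 45/7, ucode 0x70c: broken=%d\n", snb_pebs_broken(45, 7, 0x70c));
	return 0;
}

intel_snb_check_microcode() then flips x86_pmu.pebs_broken only when the aggregate answer across online CPUs changes, so PEBS can be re-enabled after a microcode update without reloading the PMU driver.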
1722static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { 1847static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
@@ -1736,8 +1861,8 @@ static __init void intel_arch_events_quirk(void)
1736	/* disable event that reported as not present by cpuid */		1861
1737 for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { 1862 for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
1738 intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; 1863 intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
1739 printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", 1864 pr_warn("CPUID marked event: \'%s\' unavailable\n",
1740 intel_arch_events_map[bit].name); 1865 intel_arch_events_map[bit].name);
1741 } 1866 }
1742} 1867}
1743 1868
@@ -1756,7 +1881,7 @@ static __init void intel_nehalem_quirk(void)
1756 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; 1881 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1757 ebx.split.no_branch_misses_retired = 0; 1882 ebx.split.no_branch_misses_retired = 0;
1758 x86_pmu.events_maskl = ebx.full; 1883 x86_pmu.events_maskl = ebx.full;
1759 printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); 1884 pr_info("CPU erratum AAJ80 worked around\n");
1760 } 1885 }
1761} 1886}
1762 1887
@@ -1765,6 +1890,7 @@ __init int intel_pmu_init(void)
1765 union cpuid10_edx edx; 1890 union cpuid10_edx edx;
1766 union cpuid10_eax eax; 1891 union cpuid10_eax eax;
1767 union cpuid10_ebx ebx; 1892 union cpuid10_ebx ebx;
1893 struct event_constraint *c;
1768 unsigned int unused; 1894 unsigned int unused;
1769 int version; 1895 int version;
1770 1896
@@ -1800,6 +1926,8 @@ __init int intel_pmu_init(void)
1800 x86_pmu.events_maskl = ebx.full; 1926 x86_pmu.events_maskl = ebx.full;
1801 x86_pmu.events_mask_len = eax.split.mask_length; 1927 x86_pmu.events_mask_len = eax.split.mask_length;
1802 1928
1929 x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
1930
1803 /* 1931 /*
1804 * Quirk: v2 perfmon does not report fixed-purpose events, so 1932 * Quirk: v2 perfmon does not report fixed-purpose events, so
1805 * assume at least 3 events: 1933 * assume at least 3 events:
@@ -1914,6 +2042,8 @@ __init int intel_pmu_init(void)
1914 case 58: /* IvyBridge */ 2042 case 58: /* IvyBridge */
1915 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 2043 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1916 sizeof(hw_cache_event_ids)); 2044 sizeof(hw_cache_event_ids));
2045 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
2046 sizeof(hw_cache_extra_regs));
1917 2047
1918 intel_pmu_lbr_init_snb(); 2048 intel_pmu_lbr_init_snb();
1919 2049
@@ -1951,5 +2081,37 @@ __init int intel_pmu_init(void)
1951 } 2081 }
1952 } 2082 }
1953 2083
2084 if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
2085 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
2086 x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
2087 x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
2088 }
2089 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
2090
2091 if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
2092 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
2093 x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
2094 x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
2095 }
2096
2097 x86_pmu.intel_ctrl |=
2098 ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
2099
2100 if (x86_pmu.event_constraints) {
2101 /*
2102 * event on fixed counter2 (REF_CYCLES) only works on this
2103 * counter, so do not extend mask to generic counters
2104 */
2105 for_each_event_constraint(c, x86_pmu.event_constraints) {
2106 if (c->cmask != X86_RAW_EVENT_MASK
2107 || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) {
2108 continue;
2109 }
2110
2111 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
2112 c->weight += x86_pmu.num_counters;
2113 }
2114 }
2115
1954 return 0; 2116 return 0;
1955} 2117}
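
The final block added to intel_pmu_init() clamps the counter counts, rebuilds intel_ctrl, and then widens every "all generic counters" event constraint so any extra generic counters are included, while leaving the REF_CYCLES constraint pinned to its fixed counter. The mask arithmetic, as a standalone sketch (INTEL_PMC_IDX_FIXED is taken as 32 here, matching the kernel's definition; the counter counts are made up for the example):

#include <stdio.h>

#define INTEL_PMC_IDX_FIXED	32	/* fixed counters start at bit 32 */

struct constraint {
	unsigned long long idxmsk;	/* which counters the event may use */
	int weight;			/* number of usable counters */
};

/* Extend a generic-counter constraint to cover num_counters counters. */
static void extend_to_generic(struct constraint *c, int num_counters)
{
	c->idxmsk |= (1ULL << num_counters) - 1;
	c->weight += num_counters;
}

int main(void)
{
	int num_counters = 8;		/* e.g. 8 generic counters */
	int num_fixed = 3;

	/* Global enable mask: generic counters plus the fixed counters. */
	unsigned long long intel_ctrl =
		((1ULL << num_counters) - 1) |
		(((1ULL << num_fixed) - 1) << INTEL_PMC_IDX_FIXED);

	struct constraint generic = { .idxmsk = 0, .weight = 0 };
	extend_to_generic(&generic, num_counters);

	printf("intel_ctrl=%#llx generic idxmsk=%#llx weight=%d\n",
	       intel_ctrl, generic.idxmsk, generic.weight);
	return 0;
}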
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 35e2192df9f4..629ae0b7ad90 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -248,7 +248,7 @@ void reserve_ds_buffers(void)
248 */ 248 */
249 249
250struct event_constraint bts_constraint = 250struct event_constraint bts_constraint =
251 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); 251 EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
252 252
253void intel_pmu_enable_bts(u64 config) 253void intel_pmu_enable_bts(u64 config)
254{ 254{
@@ -295,7 +295,7 @@ int intel_pmu_drain_bts_buffer(void)
295 u64 to; 295 u64 to;
296 u64 flags; 296 u64 flags;
297 }; 297 };
298 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; 298 struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
299 struct bts_record *at, *top; 299 struct bts_record *at, *top;
300 struct perf_output_handle handle; 300 struct perf_output_handle handle;
301 struct perf_event_header header; 301 struct perf_event_header header;
@@ -620,7 +620,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
620 * Should not happen, we program the threshold at 1 and do not 620 * Should not happen, we program the threshold at 1 and do not
621 * set a reset value. 621 * set a reset value.
622 */ 622 */
623 WARN_ON_ONCE(n > 1); 623 WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
624 at += n - 1; 624 at += n - 1;
625 625
626 __intel_pmu_pebs_event(event, iregs, at); 626 __intel_pmu_pebs_event(event, iregs, at);
@@ -651,10 +651,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
651 * Should not happen, we program the threshold at 1 and do not 651 * Should not happen, we program the threshold at 1 and do not
652 * set a reset value. 652 * set a reset value.
653 */ 653 */
654 WARN_ON_ONCE(n > MAX_PEBS_EVENTS); 654 WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs records %d\n", n);
655 655
656 for ( ; at < top; at++) { 656 for ( ; at < top; at++) {
657 for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) { 657 for_each_set_bit(bit, (unsigned long *)&at->status, x86_pmu.max_pebs_events) {
658 event = cpuc->events[bit]; 658 event = cpuc->events[bit];
659 if (!test_bit(bit, cpuc->active_mask)) 659 if (!test_bit(bit, cpuc->active_mask))
660 continue; 660 continue;
@@ -670,7 +670,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
670 break; 670 break;
671 } 671 }
672 672
673 if (!event || bit >= MAX_PEBS_EVENTS) 673 if (!event || bit >= x86_pmu.max_pebs_events)
674 continue; 674 continue;
675 675
676 __intel_pmu_pebs_event(event, iregs, at); 676 __intel_pmu_pebs_event(event, iregs, at);
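
The perf_event_intel_ds.c hunks above replace the compile-time MAX_PEBS_EVENTS bound with the runtime x86_pmu.max_pebs_events when walking the status bits of each PEBS record: MAX_PEBS_EVENTS was raised to 8, but older CPUs only expose 4 counters, so the walk must stop at min(MAX_PEBS_EVENTS, num_counters). A standalone sketch of the per-record bit walk; for_each_set_bit is open-coded, and unlike the kernel (which stops at the first active counter that claims a record) this version simply enumerates all candidates:

#include <stdio.h>

/* Walk the low max_pebs_events bits of a PEBS record's status field and
 * report which active counters could have produced the record. */
static void drain_one_record(unsigned long long status, int max_pebs_events,
			     unsigned long long active_mask)
{
	int bit;

	for (bit = 0; bit < max_pebs_events; bit++) {
		if (!(status & (1ULL << bit)))
			continue;
		if (!(active_mask & (1ULL << bit)))
			continue;	/* counter not in use: skip */
		printf("  record belongs to counter %d\n", bit);
	}
}

int main(void)
{
	unsigned long long active = 0x5;	/* counters 0 and 2 active */
	int max_pebs_events = 4;		/* min(MAX_PEBS_EVENTS, num_counters) */

	drain_one_record(0x04, max_pebs_events, active);	/* counter 2 */
	drain_one_record(0x20, max_pebs_events, active);	/* bit 5: out of range, ignored */
	return 0;
}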
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
new file mode 100644
index 000000000000..7563fda9f033
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -0,0 +1,2900 @@
1#include "perf_event_intel_uncore.h"
2
3static struct intel_uncore_type *empty_uncore[] = { NULL, };
4static struct intel_uncore_type **msr_uncores = empty_uncore;
5static struct intel_uncore_type **pci_uncores = empty_uncore;
6/* pci bus to socket mapping */
7static int pcibus_to_physid[256] = { [0 ... 255] = -1, };
8
9static DEFINE_RAW_SPINLOCK(uncore_box_lock);
10
11/* mask of cpus that collect uncore events */
12static cpumask_t uncore_cpu_mask;
13
14/* constraint for the fixed counter */
15static struct event_constraint constraint_fixed =
16 EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
17static struct event_constraint constraint_empty =
18 EVENT_CONSTRAINT(0, 0, 0);
19
20DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
21DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
22DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
23DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
24DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
25DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
26DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
27DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
28DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
29DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
30DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
31DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
32DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
33DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
34DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
35DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
36DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
37DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
38DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
39DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
40DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
41
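
The DEFINE_UNCORE_FORMAT_ATTR() lines above publish, via sysfs, where each event field lives inside the perf_event_attr config words (for example "event" in config bits 0-7, "umask" in 8-15, "edge" in bit 18, "thresh8" in 24-31). Tools assemble a raw config from those descriptions; the same composition as a standalone sketch, reusing the bit positions from the format strings above:

#include <stdio.h>

/* Pack the SNB-EP style uncore fields at the bit positions advertised in
 * the format attributes above. */
static unsigned long long snbep_config(unsigned int event, unsigned int umask,
				       unsigned int edge, unsigned int thresh)
{
	return  ((unsigned long long)(event  & 0xff))        |	/* config:0-7   */
		((unsigned long long)(umask  & 0xff) << 8)   |	/* config:8-15  */
		((unsigned long long)(edge   & 0x1)  << 18)  |	/* config:18    */
		((unsigned long long)(thresh & 0xff) << 24);	/* config:24-31 */
}

int main(void)
{
	/* e.g. the imc cas_count_read alias below: event=0x04, umask=0x03 */
	printf("config=%#llx\n", snbep_config(0x04, 0x03, 0, 0));
	return 0;
}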
42static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
43{
44 u64 count;
45
46 rdmsrl(event->hw.event_base, count);
47
48 return count;
49}
50
51/*
52 * generic get constraint function for shared match/mask registers.
53 */
54static struct event_constraint *
55uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
56{
57 struct intel_uncore_extra_reg *er;
58 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
59 struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
60 unsigned long flags;
61 bool ok = false;
62
63 /*
64 * reg->alloc can be set due to existing state, so for fake box we
65 * need to ignore this, otherwise we might fail to allocate proper
66 * fake state for this extra reg constraint.
67 */
68 if (reg1->idx == EXTRA_REG_NONE ||
69 (!uncore_box_is_fake(box) && reg1->alloc))
70 return NULL;
71
72 er = &box->shared_regs[reg1->idx];
73 raw_spin_lock_irqsave(&er->lock, flags);
74 if (!atomic_read(&er->ref) ||
75 (er->config1 == reg1->config && er->config2 == reg2->config)) {
76 atomic_inc(&er->ref);
77 er->config1 = reg1->config;
78 er->config2 = reg2->config;
79 ok = true;
80 }
81 raw_spin_unlock_irqrestore(&er->lock, flags);
82
83 if (ok) {
84 if (!uncore_box_is_fake(box))
85 reg1->alloc = 1;
86 return NULL;
87 }
88
89 return &constraint_empty;
90}
91
92static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
93{
94 struct intel_uncore_extra_reg *er;
95 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
96
97 /*
98 * Only put constraint if extra reg was actually allocated. Also
 99	 * takes care of events which do not use an extra shared reg.
100 *
101 * Also, if this is a fake box we shouldn't touch any event state
102 * (reg->alloc) and we don't care about leaving inconsistent box
103 * state either since it will be thrown out.
104 */
105 if (uncore_box_is_fake(box) || !reg1->alloc)
106 return;
107
108 er = &box->shared_regs[reg1->idx];
109 atomic_dec(&er->ref);
110 reg1->alloc = 0;
111}
112
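
uncore_get_constraint()/uncore_put_constraint() above arbitrate a shared match/mask register: the first event takes a reference and programs config1/config2, later events are admitted only if they request exactly the same configuration, and anything else gets the empty constraint and cannot be scheduled. The refcount-plus-compare idea in a single-threaded standalone sketch (the kernel version holds er->lock and uses an atomic reference count, and also handles the fake-box case described in the comments above):

#include <stdbool.h>
#include <stdio.h>

struct shared_reg {
	int ref;
	unsigned long long config1, config2;
};

static bool shared_reg_get(struct shared_reg *er,
			   unsigned long long c1, unsigned long long c2)
{
	if (er->ref == 0 || (er->config1 == c1 && er->config2 == c2)) {
		er->ref++;
		er->config1 = c1;
		er->config2 = c2;
		return true;		/* event may use the register */
	}
	return false;			/* conflicting filter: reject */
}

static void shared_reg_put(struct shared_reg *er)
{
	er->ref--;
}

int main(void)
{
	struct shared_reg er = { 0 };

	printf("first user:       %d\n", shared_reg_get(&er, 0x10, 0));	/* granted */
	printf("same config:      %d\n", shared_reg_get(&er, 0x10, 0));	/* granted */
	printf("different config: %d\n", shared_reg_get(&er, 0x20, 0));	/* rejected */
	shared_reg_put(&er);
	shared_reg_put(&er);
	return 0;
}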
113/* Sandy Bridge-EP uncore support */
114static struct intel_uncore_type snbep_uncore_cbox;
115static struct intel_uncore_type snbep_uncore_pcu;
116
117static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
118{
119 struct pci_dev *pdev = box->pci_dev;
120 int box_ctl = uncore_pci_box_ctl(box);
121 u32 config;
122
123 pci_read_config_dword(pdev, box_ctl, &config);
124 config |= SNBEP_PMON_BOX_CTL_FRZ;
125 pci_write_config_dword(pdev, box_ctl, config);
126}
127
128static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
129{
130 struct pci_dev *pdev = box->pci_dev;
131 int box_ctl = uncore_pci_box_ctl(box);
132 u32 config;
133
134 pci_read_config_dword(pdev, box_ctl, &config);
135 config &= ~SNBEP_PMON_BOX_CTL_FRZ;
136 pci_write_config_dword(pdev, box_ctl, config);
137}
138
139static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
140{
141 struct pci_dev *pdev = box->pci_dev;
142 struct hw_perf_event *hwc = &event->hw;
143
144 pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
145}
146
147static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
148{
149 struct pci_dev *pdev = box->pci_dev;
150 struct hw_perf_event *hwc = &event->hw;
151
152 pci_write_config_dword(pdev, hwc->config_base, hwc->config);
153}
154
155static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
156{
157 struct pci_dev *pdev = box->pci_dev;
158 struct hw_perf_event *hwc = &event->hw;
159 u64 count;
160
161 pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
162 pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
163
164 return count;
165}
166
167static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
168{
169 struct pci_dev *pdev = box->pci_dev;
170
171 pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
172}
173
174static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
175{
176 u64 config;
177 unsigned msr;
178
179 msr = uncore_msr_box_ctl(box);
180 if (msr) {
181 rdmsrl(msr, config);
182 config |= SNBEP_PMON_BOX_CTL_FRZ;
183 wrmsrl(msr, config);
184 }
185}
186
187static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
188{
189 u64 config;
190 unsigned msr;
191
192 msr = uncore_msr_box_ctl(box);
193 if (msr) {
194 rdmsrl(msr, config);
195 config &= ~SNBEP_PMON_BOX_CTL_FRZ;
196 wrmsrl(msr, config);
197 }
198}
199
200static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
201{
202 struct hw_perf_event *hwc = &event->hw;
203 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
204
205 if (reg1->idx != EXTRA_REG_NONE)
206 wrmsrl(reg1->reg, reg1->config);
207
208 wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
209}
210
211static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
212 struct perf_event *event)
213{
214 struct hw_perf_event *hwc = &event->hw;
215
216 wrmsrl(hwc->config_base, hwc->config);
217}
218
219static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
220{
221 unsigned msr = uncore_msr_box_ctl(box);
222
223 if (msr)
224 wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
225}
226
227static int snbep_uncore_hw_config(struct intel_uncore_box *box, struct perf_event *event)
228{
229 struct hw_perf_event *hwc = &event->hw;
230 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
231
232 if (box->pmu->type == &snbep_uncore_cbox) {
233 reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
234 SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
235 reg1->config = event->attr.config1 &
236 SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK;
237 } else {
238 if (box->pmu->type == &snbep_uncore_pcu) {
239 reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
240 reg1->config = event->attr.config1 & SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK;
241 } else {
242 return 0;
243 }
244 }
245 reg1->idx = 0;
246
247 return 0;
248}
249
250static struct attribute *snbep_uncore_formats_attr[] = {
251 &format_attr_event.attr,
252 &format_attr_umask.attr,
253 &format_attr_edge.attr,
254 &format_attr_inv.attr,
255 &format_attr_thresh8.attr,
256 NULL,
257};
258
259static struct attribute *snbep_uncore_ubox_formats_attr[] = {
260 &format_attr_event.attr,
261 &format_attr_umask.attr,
262 &format_attr_edge.attr,
263 &format_attr_inv.attr,
264 &format_attr_thresh5.attr,
265 NULL,
266};
267
268static struct attribute *snbep_uncore_cbox_formats_attr[] = {
269 &format_attr_event.attr,
270 &format_attr_umask.attr,
271 &format_attr_edge.attr,
272 &format_attr_tid_en.attr,
273 &format_attr_inv.attr,
274 &format_attr_thresh8.attr,
275 &format_attr_filter_tid.attr,
276 &format_attr_filter_nid.attr,
277 &format_attr_filter_state.attr,
278 &format_attr_filter_opc.attr,
279 NULL,
280};
281
282static struct attribute *snbep_uncore_pcu_formats_attr[] = {
283 &format_attr_event.attr,
284 &format_attr_occ_sel.attr,
285 &format_attr_edge.attr,
286 &format_attr_inv.attr,
287 &format_attr_thresh5.attr,
288 &format_attr_occ_invert.attr,
289 &format_attr_occ_edge.attr,
290 &format_attr_filter_band0.attr,
291 &format_attr_filter_band1.attr,
292 &format_attr_filter_band2.attr,
293 &format_attr_filter_band3.attr,
294 NULL,
295};
296
297static struct attribute *snbep_uncore_qpi_formats_attr[] = {
298 &format_attr_event_ext.attr,
299 &format_attr_umask.attr,
300 &format_attr_edge.attr,
301 &format_attr_inv.attr,
302 &format_attr_thresh8.attr,
303 NULL,
304};
305
306static struct uncore_event_desc snbep_uncore_imc_events[] = {
307 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
308 INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
309 INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
310 { /* end: all zeroes */ },
311};
312
313static struct uncore_event_desc snbep_uncore_qpi_events[] = {
314 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
315 INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
316 INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"),
317 INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"),
318 { /* end: all zeroes */ },
319};
320
321static struct attribute_group snbep_uncore_format_group = {
322 .name = "format",
323 .attrs = snbep_uncore_formats_attr,
324};
325
326static struct attribute_group snbep_uncore_ubox_format_group = {
327 .name = "format",
328 .attrs = snbep_uncore_ubox_formats_attr,
329};
330
331static struct attribute_group snbep_uncore_cbox_format_group = {
332 .name = "format",
333 .attrs = snbep_uncore_cbox_formats_attr,
334};
335
336static struct attribute_group snbep_uncore_pcu_format_group = {
337 .name = "format",
338 .attrs = snbep_uncore_pcu_formats_attr,
339};
340
341static struct attribute_group snbep_uncore_qpi_format_group = {
342 .name = "format",
343 .attrs = snbep_uncore_qpi_formats_attr,
344};
345
346static struct intel_uncore_ops snbep_uncore_msr_ops = {
347 .init_box = snbep_uncore_msr_init_box,
348 .disable_box = snbep_uncore_msr_disable_box,
349 .enable_box = snbep_uncore_msr_enable_box,
350 .disable_event = snbep_uncore_msr_disable_event,
351 .enable_event = snbep_uncore_msr_enable_event,
352 .read_counter = uncore_msr_read_counter,
353 .get_constraint = uncore_get_constraint,
354 .put_constraint = uncore_put_constraint,
355 .hw_config = snbep_uncore_hw_config,
356};
357
358static struct intel_uncore_ops snbep_uncore_pci_ops = {
359 .init_box = snbep_uncore_pci_init_box,
360 .disable_box = snbep_uncore_pci_disable_box,
361 .enable_box = snbep_uncore_pci_enable_box,
362 .disable_event = snbep_uncore_pci_disable_event,
363 .enable_event = snbep_uncore_pci_enable_event,
364 .read_counter = snbep_uncore_pci_read_counter,
365};
366
367static struct event_constraint snbep_uncore_cbox_constraints[] = {
368 UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
369 UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
370 UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
371 UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
372 UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
373 UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
374 UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
375 UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
376 UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
377 UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
378 UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
379 UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
380 EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
381 UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
382 UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
383 UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
384 UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
385 UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
386 UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
387 UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
388 UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
389 UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
390 UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
391 UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
392 UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
393 EVENT_CONSTRAINT_END
394};
395
396static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
397 UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
398 UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
399 UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
400 UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
401 UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
402 UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
403 UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
404 UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
405 UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
406 UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
407 EVENT_CONSTRAINT_END
408};
409
410static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
411 UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
412 UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
413 UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
414 UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
415 UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
416 UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
417 UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
418 UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
419 UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
420 UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
421 UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
422 UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
423 UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
424 UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
425 UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
426 UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
427 UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
428 UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
429 EVENT_CONSTRAINT_END
430};
431
432static struct intel_uncore_type snbep_uncore_ubox = {
433 .name = "ubox",
434 .num_counters = 2,
435 .num_boxes = 1,
436 .perf_ctr_bits = 44,
437 .fixed_ctr_bits = 48,
438 .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
439 .event_ctl = SNBEP_U_MSR_PMON_CTL0,
440 .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
441 .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
442 .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
443 .ops = &snbep_uncore_msr_ops,
444 .format_group = &snbep_uncore_ubox_format_group,
445};
446
447static struct intel_uncore_type snbep_uncore_cbox = {
448 .name = "cbox",
449 .num_counters = 4,
450 .num_boxes = 8,
451 .perf_ctr_bits = 44,
452 .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
453 .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
454 .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
455 .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
456 .msr_offset = SNBEP_CBO_MSR_OFFSET,
457 .num_shared_regs = 1,
458 .constraints = snbep_uncore_cbox_constraints,
459 .ops = &snbep_uncore_msr_ops,
460 .format_group = &snbep_uncore_cbox_format_group,
461};
462
463static struct intel_uncore_type snbep_uncore_pcu = {
464 .name = "pcu",
465 .num_counters = 4,
466 .num_boxes = 1,
467 .perf_ctr_bits = 48,
468 .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
469 .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
470 .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
471 .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
472 .num_shared_regs = 1,
473 .ops = &snbep_uncore_msr_ops,
474 .format_group = &snbep_uncore_pcu_format_group,
475};
476
477static struct intel_uncore_type *snbep_msr_uncores[] = {
478 &snbep_uncore_ubox,
479 &snbep_uncore_cbox,
480 &snbep_uncore_pcu,
481 NULL,
482};
483
484#define SNBEP_UNCORE_PCI_COMMON_INIT() \
485 .perf_ctr = SNBEP_PCI_PMON_CTR0, \
486 .event_ctl = SNBEP_PCI_PMON_CTL0, \
487 .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
488 .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
489 .ops = &snbep_uncore_pci_ops, \
490 .format_group = &snbep_uncore_format_group
491
492static struct intel_uncore_type snbep_uncore_ha = {
493 .name = "ha",
494 .num_counters = 4,
495 .num_boxes = 1,
496 .perf_ctr_bits = 48,
497 SNBEP_UNCORE_PCI_COMMON_INIT(),
498};
499
500static struct intel_uncore_type snbep_uncore_imc = {
501 .name = "imc",
502 .num_counters = 4,
503 .num_boxes = 4,
504 .perf_ctr_bits = 48,
505 .fixed_ctr_bits = 48,
506 .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
507 .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
508 .event_descs = snbep_uncore_imc_events,
509 SNBEP_UNCORE_PCI_COMMON_INIT(),
510};
511
512static struct intel_uncore_type snbep_uncore_qpi = {
513 .name = "qpi",
514 .num_counters = 4,
515 .num_boxes = 2,
516 .perf_ctr_bits = 48,
517 .perf_ctr = SNBEP_PCI_PMON_CTR0,
518 .event_ctl = SNBEP_PCI_PMON_CTL0,
519 .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
520 .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
521 .ops = &snbep_uncore_pci_ops,
522 .event_descs = snbep_uncore_qpi_events,
523 .format_group = &snbep_uncore_qpi_format_group,
524};
525
526
527static struct intel_uncore_type snbep_uncore_r2pcie = {
528 .name = "r2pcie",
529 .num_counters = 4,
530 .num_boxes = 1,
531 .perf_ctr_bits = 44,
532 .constraints = snbep_uncore_r2pcie_constraints,
533 SNBEP_UNCORE_PCI_COMMON_INIT(),
534};
535
536static struct intel_uncore_type snbep_uncore_r3qpi = {
537 .name = "r3qpi",
538 .num_counters = 3,
539 .num_boxes = 2,
540 .perf_ctr_bits = 44,
541 .constraints = snbep_uncore_r3qpi_constraints,
542 SNBEP_UNCORE_PCI_COMMON_INIT(),
543};
544
545static struct intel_uncore_type *snbep_pci_uncores[] = {
546 &snbep_uncore_ha,
547 &snbep_uncore_imc,
548 &snbep_uncore_qpi,
549 &snbep_uncore_r2pcie,
550 &snbep_uncore_r3qpi,
551 NULL,
552};
553
554static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = {
555 { /* Home Agent */
556 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
557 .driver_data = (unsigned long)&snbep_uncore_ha,
558 },
559 { /* MC Channel 0 */
560 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
561 .driver_data = (unsigned long)&snbep_uncore_imc,
562 },
563 { /* MC Channel 1 */
564 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
565 .driver_data = (unsigned long)&snbep_uncore_imc,
566 },
567 { /* MC Channel 2 */
568 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
569 .driver_data = (unsigned long)&snbep_uncore_imc,
570 },
571 { /* MC Channel 3 */
572 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
573 .driver_data = (unsigned long)&snbep_uncore_imc,
574 },
575 { /* QPI Port 0 */
576 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
577 .driver_data = (unsigned long)&snbep_uncore_qpi,
578 },
579 { /* QPI Port 1 */
580 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
581 .driver_data = (unsigned long)&snbep_uncore_qpi,
582 },
583 { /* P2PCIe */
584 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
585 .driver_data = (unsigned long)&snbep_uncore_r2pcie,
586 },
587 { /* R3QPI Link 0 */
588 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
589 .driver_data = (unsigned long)&snbep_uncore_r3qpi,
590 },
591 { /* R3QPI Link 1 */
592 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
593 .driver_data = (unsigned long)&snbep_uncore_r3qpi,
594 },
595 { /* end: all zeroes */ }
596};
597
598static struct pci_driver snbep_uncore_pci_driver = {
599 .name = "snbep_uncore",
600 .id_table = snbep_uncore_pci_ids,
601};
602
603/*
604 * build pci bus to socket mapping
605 */
606static void snbep_pci2phy_map_init(void)
607{
608 struct pci_dev *ubox_dev = NULL;
609 int i, bus, nodeid;
610 u32 config;
611
612 while (1) {
613 /* find the UBOX device */
614 ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL,
615 PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX,
616 ubox_dev);
617 if (!ubox_dev)
618 break;
619 bus = ubox_dev->bus->number;
620 /* get the Node ID of the local register */
621 pci_read_config_dword(ubox_dev, 0x40, &config);
622 nodeid = config;
623 /* get the Node ID mapping */
624 pci_read_config_dword(ubox_dev, 0x54, &config);
625 /*
626 * every three bits in the Node ID mapping register maps
627 * to a particular node.
628 */
629 for (i = 0; i < 8; i++) {
630 if (nodeid == ((config >> (3 * i)) & 0x7)) {
631 pcibus_to_physid[bus] = i;
632 break;
633 }
634 }
635 };
636 return;
637}
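
snbep_pci2phy_map_init() above reads the UBOX device's local Node ID (config register 0x40) and the Node ID mapping register (0x54), in which each 3-bit field gives the node ID of one socket; the index of the field that matches the local ID is the physical socket recorded for that PCI bus. The decode step as a standalone sketch, with a made-up two-socket mapping value:

#include <stdio.h>

/* Given the local node id and the 0x54 mapping register, return the
 * socket (physical id) whose 3-bit field matches, or -1 if none does. */
static int node_to_socket(unsigned int nodeid, unsigned int mapping)
{
	int i;

	for (i = 0; i < 8; i++) {
		if (((mapping >> (3 * i)) & 0x7) == (nodeid & 0x7))
			return i;
	}
	return -1;
}

int main(void)
{
	/* Hypothetical 2-socket layout: field 0 = node 0, field 1 = node 1. */
	unsigned int mapping = (0 << 0) | (1 << 3);

	printf("node 1 -> socket %d\n", node_to_socket(1, mapping));
	printf("node 5 -> socket %d\n", node_to_socket(5, mapping));
	return 0;
}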
638/* end of Sandy Bridge-EP uncore support */
639
640/* Sandy Bridge uncore support */
641static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
642{
643 struct hw_perf_event *hwc = &event->hw;
644
645 if (hwc->idx < UNCORE_PMC_IDX_FIXED)
646 wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
647 else
648 wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
649}
650
651static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
652{
653 wrmsrl(event->hw.config_base, 0);
654}
655
656static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
657{
658 if (box->pmu->pmu_idx == 0) {
659 wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
660 SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
661 }
662}
663
664static struct attribute *snb_uncore_formats_attr[] = {
665 &format_attr_event.attr,
666 &format_attr_umask.attr,
667 &format_attr_edge.attr,
668 &format_attr_inv.attr,
669 &format_attr_cmask5.attr,
670 NULL,
671};
672
673static struct attribute_group snb_uncore_format_group = {
674 .name = "format",
675 .attrs = snb_uncore_formats_attr,
676};
677
678static struct intel_uncore_ops snb_uncore_msr_ops = {
679 .init_box = snb_uncore_msr_init_box,
680 .disable_event = snb_uncore_msr_disable_event,
681 .enable_event = snb_uncore_msr_enable_event,
682 .read_counter = uncore_msr_read_counter,
683};
684
685static struct event_constraint snb_uncore_cbox_constraints[] = {
686 UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
687 UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
688 EVENT_CONSTRAINT_END
689};
690
691static struct intel_uncore_type snb_uncore_cbox = {
692 .name = "cbox",
693 .num_counters = 2,
694 .num_boxes = 4,
695 .perf_ctr_bits = 44,
696 .fixed_ctr_bits = 48,
697 .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
698 .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
699 .fixed_ctr = SNB_UNC_FIXED_CTR,
700 .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
701 .single_fixed = 1,
702 .event_mask = SNB_UNC_RAW_EVENT_MASK,
703 .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
704 .constraints = snb_uncore_cbox_constraints,
705 .ops = &snb_uncore_msr_ops,
706 .format_group = &snb_uncore_format_group,
707};
708
709static struct intel_uncore_type *snb_msr_uncores[] = {
710 &snb_uncore_cbox,
711 NULL,
712};
713/* end of Sandy Bridge uncore support */
714
715/* Nehalem uncore support */
716static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
717{
718 wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
719}
720
721static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
722{
723 wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
724}
725
726static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
727{
728 struct hw_perf_event *hwc = &event->hw;
729
730 if (hwc->idx < UNCORE_PMC_IDX_FIXED)
731 wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
732 else
733 wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
734}
735
736static struct attribute *nhm_uncore_formats_attr[] = {
737 &format_attr_event.attr,
738 &format_attr_umask.attr,
739 &format_attr_edge.attr,
740 &format_attr_inv.attr,
741 &format_attr_cmask8.attr,
742 NULL,
743};
744
745static struct attribute_group nhm_uncore_format_group = {
746 .name = "format",
747 .attrs = nhm_uncore_formats_attr,
748};
749
750static struct uncore_event_desc nhm_uncore_events[] = {
751 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
752 INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
753 INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
754 INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
755 INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
756 INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
757 INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
758 INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
759 INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
760 { /* end: all zeroes */ },
761};
762
763static struct intel_uncore_ops nhm_uncore_msr_ops = {
764 .disable_box = nhm_uncore_msr_disable_box,
765 .enable_box = nhm_uncore_msr_enable_box,
766 .disable_event = snb_uncore_msr_disable_event,
767 .enable_event = nhm_uncore_msr_enable_event,
768 .read_counter = uncore_msr_read_counter,
769};
770
771static struct intel_uncore_type nhm_uncore = {
772 .name = "",
773 .num_counters = 8,
774 .num_boxes = 1,
775 .perf_ctr_bits = 48,
776 .fixed_ctr_bits = 48,
777 .event_ctl = NHM_UNC_PERFEVTSEL0,
778 .perf_ctr = NHM_UNC_UNCORE_PMC0,
779 .fixed_ctr = NHM_UNC_FIXED_CTR,
780 .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
781 .event_mask = NHM_UNC_RAW_EVENT_MASK,
782 .event_descs = nhm_uncore_events,
783 .ops = &nhm_uncore_msr_ops,
784 .format_group = &nhm_uncore_format_group,
785};
786
787static struct intel_uncore_type *nhm_msr_uncores[] = {
788 &nhm_uncore,
789 NULL,
790};
791/* end of Nehalem uncore support */
792
793/* Nehalem-EX uncore support */
794#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
795 ((1ULL << (n)) - 1)))
796
797DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
798DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
799DEFINE_UNCORE_FORMAT_ATTR(mm_cfg, mm_cfg, "config:63");
800DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
801DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
802
803static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
804{
805 wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
806}
807
808static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
809{
810 unsigned msr = uncore_msr_box_ctl(box);
811 u64 config;
812
813 if (msr) {
814 rdmsrl(msr, config);
815 config &= ~((1ULL << uncore_num_counters(box)) - 1);
816 /* WBox has a fixed counter */
817 if (uncore_msr_fixed_ctl(box))
818 config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
819 wrmsrl(msr, config);
820 }
821}
822
823static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
824{
825 unsigned msr = uncore_msr_box_ctl(box);
826 u64 config;
827
828 if (msr) {
829 rdmsrl(msr, config);
830 config |= (1ULL << uncore_num_counters(box)) - 1;
831 /* WBox has a fixed counter */
832 if (uncore_msr_fixed_ctl(box))
833 config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
834 wrmsrl(msr, config);
835 }
836}
837
838static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
839{
840 wrmsrl(event->hw.config_base, 0);
841}
842
843static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
844{
845 struct hw_perf_event *hwc = &event->hw;
846
847 if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
848 wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
849 else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
850 wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
851 else
852 wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
853}
854
855#define NHMEX_UNCORE_OPS_COMMON_INIT() \
856 .init_box = nhmex_uncore_msr_init_box, \
857 .disable_box = nhmex_uncore_msr_disable_box, \
858 .enable_box = nhmex_uncore_msr_enable_box, \
859 .disable_event = nhmex_uncore_msr_disable_event, \
860 .read_counter = uncore_msr_read_counter
861
862static struct intel_uncore_ops nhmex_uncore_ops = {
863 NHMEX_UNCORE_OPS_COMMON_INIT(),
864 .enable_event = nhmex_uncore_msr_enable_event,
865};
866
867static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
868 &format_attr_event.attr,
869 &format_attr_edge.attr,
870 NULL,
871};
872
873static struct attribute_group nhmex_uncore_ubox_format_group = {
874 .name = "format",
875 .attrs = nhmex_uncore_ubox_formats_attr,
876};
877
878static struct intel_uncore_type nhmex_uncore_ubox = {
879 .name = "ubox",
880 .num_counters = 1,
881 .num_boxes = 1,
882 .perf_ctr_bits = 48,
883 .event_ctl = NHMEX_U_MSR_PMON_EV_SEL,
884 .perf_ctr = NHMEX_U_MSR_PMON_CTR,
885 .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK,
886 .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL,
887 .ops = &nhmex_uncore_ops,
888 .format_group = &nhmex_uncore_ubox_format_group
889};
890
891static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
892 &format_attr_event.attr,
893 &format_attr_umask.attr,
894 &format_attr_edge.attr,
895 &format_attr_inv.attr,
896 &format_attr_thresh8.attr,
897 NULL,
898};
899
900static struct attribute_group nhmex_uncore_cbox_format_group = {
901 .name = "format",
902 .attrs = nhmex_uncore_cbox_formats_attr,
903};
904
905static struct intel_uncore_type nhmex_uncore_cbox = {
906 .name = "cbox",
907 .num_counters = 6,
908 .num_boxes = 8,
909 .perf_ctr_bits = 48,
910 .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0,
911 .perf_ctr = NHMEX_C0_MSR_PMON_CTR0,
912 .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
913 .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
914 .msr_offset = NHMEX_C_MSR_OFFSET,
915 .pair_ctr_ctl = 1,
916 .ops = &nhmex_uncore_ops,
917 .format_group = &nhmex_uncore_cbox_format_group
918};
919
920static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
921 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
922 { /* end: all zeroes */ },
923};
924
925static struct intel_uncore_type nhmex_uncore_wbox = {
926 .name = "wbox",
927 .num_counters = 4,
928 .num_boxes = 1,
929 .perf_ctr_bits = 48,
930 .event_ctl = NHMEX_W_MSR_PMON_CNT0,
931 .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0,
932 .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR,
933 .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL,
934 .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
935 .box_ctl = NHMEX_W_MSR_GLOBAL_CTL,
936 .pair_ctr_ctl = 1,
937 .event_descs = nhmex_uncore_wbox_events,
938 .ops = &nhmex_uncore_ops,
939 .format_group = &nhmex_uncore_cbox_format_group
940};
941
942static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
943{
944 struct hw_perf_event *hwc = &event->hw;
945 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
946 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
947 int ctr, ev_sel;
948
949 ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
950 NHMEX_B_PMON_CTR_SHIFT;
951 ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
952 NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
953
954 /* events that do not use the match/mask registers */
955 if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
956 (ctr == 2 && ev_sel != 0x4) || ctr == 3)
957 return 0;
958
959 if (box->pmu->pmu_idx == 0)
960 reg1->reg = NHMEX_B0_MSR_MATCH;
961 else
962 reg1->reg = NHMEX_B1_MSR_MATCH;
963 reg1->idx = 0;
964 reg1->config = event->attr.config1;
965 reg2->config = event->attr.config2;
966 return 0;
967}
968
969static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
970{
971 struct hw_perf_event *hwc = &event->hw;
972 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
973 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
974
975 if (reg1->idx != EXTRA_REG_NONE) {
976 wrmsrl(reg1->reg, reg1->config);
977 wrmsrl(reg1->reg + 1, reg2->config);
978 }
979 wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
980 (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
981}
982
983/*
984 * The Bbox has 4 counters, but each counter monitors different events.
 985 * Use bits 6-7 in the event config to select a counter.
986 */
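/*
 * Each constraint below maps one value of those two bits (0x00, 0x40,
 * 0x80, 0xc0) to the single counter allowed to count it (counter mask
 * 1, 2, 4 and 8 respectively).
 */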
987static struct event_constraint nhmex_uncore_bbox_constraints[] = {
988 EVENT_CONSTRAINT(0 , 1, 0xc0),
989 EVENT_CONSTRAINT(0x40, 2, 0xc0),
990 EVENT_CONSTRAINT(0x80, 4, 0xc0),
991 EVENT_CONSTRAINT(0xc0, 8, 0xc0),
992 EVENT_CONSTRAINT_END,
993};
994
995static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
996 &format_attr_event5.attr,
997 &format_attr_counter.attr,
998 &format_attr_match.attr,
999 &format_attr_mask.attr,
1000 NULL,
1001};
1002
1003static struct attribute_group nhmex_uncore_bbox_format_group = {
1004 .name = "format",
1005 .attrs = nhmex_uncore_bbox_formats_attr,
1006};
1007
1008static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
1009 NHMEX_UNCORE_OPS_COMMON_INIT(),
1010 .enable_event = nhmex_bbox_msr_enable_event,
1011 .hw_config = nhmex_bbox_hw_config,
1012 .get_constraint = uncore_get_constraint,
1013 .put_constraint = uncore_put_constraint,
1014};
1015
1016static struct intel_uncore_type nhmex_uncore_bbox = {
1017 .name = "bbox",
1018 .num_counters = 4,
1019 .num_boxes = 2,
1020 .perf_ctr_bits = 48,
1021 .event_ctl = NHMEX_B0_MSR_PMON_CTL0,
1022 .perf_ctr = NHMEX_B0_MSR_PMON_CTR0,
1023 .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK,
1024 .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
1025 .msr_offset = NHMEX_B_MSR_OFFSET,
1026 .pair_ctr_ctl = 1,
1027 .num_shared_regs = 1,
1028 .constraints = nhmex_uncore_bbox_constraints,
1029 .ops = &nhmex_uncore_bbox_ops,
1030 .format_group = &nhmex_uncore_bbox_format_group
1031};
1032
1033static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
1034{
1035 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
1036 struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
1037
1038 if (event->attr.config & NHMEX_S_PMON_MM_CFG_EN) {
1039 reg1->config = event->attr.config1;
1040 reg2->config = event->attr.config2;
1041 } else {
1042 reg1->config = ~0ULL;
1043 reg2->config = ~0ULL;
1044 }
1045
1046 if (box->pmu->pmu_idx == 0)
1047 reg1->reg = NHMEX_S0_MSR_MM_CFG;
1048 else
1049 reg1->reg = NHMEX_S1_MSR_MM_CFG;
1050
1051 reg1->idx = 0;
1052
1053 return 0;
1054}
1055
1056static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
1057{
1058 struct hw_perf_event *hwc = &event->hw;
1059 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1060 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
1061
1062 wrmsrl(reg1->reg, 0);
1063 if (reg1->config != ~0ULL || reg2->config != ~0ULL) {
1064 wrmsrl(reg1->reg + 1, reg1->config);
1065 wrmsrl(reg1->reg + 2, reg2->config);
1066 wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
1067 }
1068 wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
1069}
1070
1071static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
1072 &format_attr_event.attr,
1073 &format_attr_umask.attr,
1074 &format_attr_edge.attr,
1075 &format_attr_inv.attr,
1076 &format_attr_thresh8.attr,
1077 &format_attr_mm_cfg.attr,
1078 &format_attr_match.attr,
1079 &format_attr_mask.attr,
1080 NULL,
1081};
1082
1083static struct attribute_group nhmex_uncore_sbox_format_group = {
1084 .name = "format",
1085 .attrs = nhmex_uncore_sbox_formats_attr,
1086};
1087
1088static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
1089 NHMEX_UNCORE_OPS_COMMON_INIT(),
1090 .enable_event = nhmex_sbox_msr_enable_event,
1091 .hw_config = nhmex_sbox_hw_config,
1092 .get_constraint = uncore_get_constraint,
1093 .put_constraint = uncore_put_constraint,
1094};
1095
1096static struct intel_uncore_type nhmex_uncore_sbox = {
1097 .name = "sbox",
1098 .num_counters = 4,
1099 .num_boxes = 2,
1100 .perf_ctr_bits = 48,
1101 .event_ctl = NHMEX_S0_MSR_PMON_CTL0,
1102 .perf_ctr = NHMEX_S0_MSR_PMON_CTR0,
1103 .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
1104 .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
1105 .msr_offset = NHMEX_S_MSR_OFFSET,
1106 .pair_ctr_ctl = 1,
1107 .num_shared_regs = 1,
1108 .ops = &nhmex_uncore_sbox_ops,
1109 .format_group = &nhmex_uncore_sbox_format_group
1110};
1111
1112enum {
1113 EXTRA_REG_NHMEX_M_FILTER,
1114 EXTRA_REG_NHMEX_M_DSP,
1115 EXTRA_REG_NHMEX_M_ISS,
1116 EXTRA_REG_NHMEX_M_MAP,
1117 EXTRA_REG_NHMEX_M_MSC_THR,
1118 EXTRA_REG_NHMEX_M_PGT,
1119 EXTRA_REG_NHMEX_M_PLD,
1120 EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
1121};
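/*
 * These values index box->shared_regs, which is why nhmex_uncore_mbox
 * below sets num_shared_regs = 8.  The ZDP_CTL_FVC entry keeps four
 * packed 8-bit reference counts in er->ref, one per FVC field.
 */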
1122
1123static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
1124 MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
1125 MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
1126 MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
1127 MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
1128 /* event 0xa uses two extra registers */
1129 MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
1130 MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
1131 MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
1132 /* events 0xd ~ 0x10 use the same extra register */
1133 MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
1134 MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
1135 MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
1136 MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
1137 MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
1138 MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
1139 MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
1140 MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
1141 MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
1142 EVENT_EXTRA_END
1143};
1144
1145static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
1146{
1147 struct intel_uncore_extra_reg *er;
1148 unsigned long flags;
1149 bool ret = false;
1150 u64 mask;
1151
1152 if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
1153 er = &box->shared_regs[idx];
1154 raw_spin_lock_irqsave(&er->lock, flags);
1155 if (!atomic_read(&er->ref) || er->config == config) {
1156 atomic_inc(&er->ref);
1157 er->config = config;
1158 ret = true;
1159 }
1160 raw_spin_unlock_irqrestore(&er->lock, flags);
1161
1162 return ret;
1163 }
1164 /*
1165 * The ZDP_CTL_FVC MSR has 4 fields which are used to control
1166 * events 0xd ~ 0x10. Besides these 4 fields, there are additional
1167 * fields which are shared.
1168 */
1169 idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
1170 if (WARN_ON_ONCE(idx >= 4))
1171 return false;
1172
1173 /* mask of the shared fields */
1174 mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
1175 er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
1176
1177 raw_spin_lock_irqsave(&er->lock, flags);
1178 /* add mask of the non-shared field if it's in use */
1179 if (__BITS_VALUE(atomic_read(&er->ref), idx, 8))
1180 mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
1181
1182 if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
1183 atomic_add(1 << (idx * 8), &er->ref);
1184 mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
1185 NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
1186 er->config &= ~mask;
1187 er->config |= (config & mask);
1188 ret = true;
1189 }
1190 raw_spin_unlock_irqrestore(&er->lock, flags);
1191
1192 return ret;
1193}
1194
1195static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
1196{
1197 struct intel_uncore_extra_reg *er;
1198
1199 if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
1200 er = &box->shared_regs[idx];
1201 atomic_dec(&er->ref);
1202 return;
1203 }
1204
1205 idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
1206 er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
1207 atomic_sub(1 << (idx * 8), &er->ref);
1208}
1209
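/*
 * Move an event that uses one of the four ZDP_CTL_FVC fields to another
 * field: shift its non-shared control bits into the new position and,
 * if 'modify' is set, also adjust the inc_sel event selector and the
 * cached reg1->idx/config to match.
 */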
1210u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
1211{
1212 struct hw_perf_event *hwc = &event->hw;
1213 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1214 int idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
1215 u64 config = reg1->config;
1216
1217 /* get the non-shared control bits and shift them */
1218 idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
1219 config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
1220 if (new_idx > orig_idx) {
1221 idx = new_idx - orig_idx;
1222 config <<= 3 * idx;
1223 } else {
1224 idx = orig_idx - new_idx;
1225 config >>= 3 * idx;
1226 }
1227
1228 /* add the shared control bits back */
1229 config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
1230 if (modify) {
1231 /* adjust the main event selector */
1232 if (new_idx > orig_idx)
1233 hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
1234 else
1235 hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
1236 reg1->config = config;
1237 reg1->idx = ~0xff | new_idx;
1238 }
1239 return config;
1240}
1241
1242static struct event_constraint *
1243nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
1244{
1245 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
1246 struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
1247 int i, idx[2], alloc = 0;
1248 u64 config1 = reg1->config;
1249
1250 idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
1251 idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
1252again:
1253 for (i = 0; i < 2; i++) {
1254 if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
1255 idx[i] = 0xff;
1256
1257 if (idx[i] == 0xff)
1258 continue;
1259
1260 if (!nhmex_mbox_get_shared_reg(box, idx[i],
1261 __BITS_VALUE(config1, i, 32)))
1262 goto fail;
1263 alloc |= (0x1 << i);
1264 }
1265
1266 /* for the match/mask registers */
1267 if ((uncore_box_is_fake(box) || !reg2->alloc) &&
1268 !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
1269 goto fail;
1270
1271 /*
1272 * If it's a fake box -- as per validate_{group,event}() we
1273 * shouldn't touch event state and we can avoid doing so
1274 * since both will only call get_event_constraints() once
1275 * on each event, this avoids the need for reg->alloc.
1276 */
1277 if (!uncore_box_is_fake(box)) {
1278 if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
1279 nhmex_mbox_alter_er(event, idx[0], true);
1280 reg1->alloc |= alloc;
1281 reg2->alloc = 1;
1282 }
1283 return NULL;
1284fail:
1285 if (idx[0] != 0xff && !(alloc & 0x1) &&
1286 idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
1287 /*
1288 * events 0xd ~ 0x10 are functionally identical, but are
1289 * controlled by different fields in the ZDP_CTL_FVC
1290 * register. If we failed to take one field, try the
1291 * other 3 choices.
1292 */
1293 BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
1294 idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
1295 idx[0] = (idx[0] + 1) % 4;
1296 idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
1297 if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
1298 config1 = nhmex_mbox_alter_er(event, idx[0], false);
1299 goto again;
1300 }
1301 }
1302
1303 if (alloc & 0x1)
1304 nhmex_mbox_put_shared_reg(box, idx[0]);
1305 if (alloc & 0x2)
1306 nhmex_mbox_put_shared_reg(box, idx[1]);
1307 return &constraint_empty;
1308}
1309
1310static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
1311{
1312 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
1313 struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
1314
1315 if (uncore_box_is_fake(box))
1316 return;
1317
1318 if (reg1->alloc & 0x1)
1319 nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
1320 if (reg1->alloc & 0x2)
1321 nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
1322 reg1->alloc = 0;
1323
1324 if (reg2->alloc) {
1325 nhmex_mbox_put_shared_reg(box, reg2->idx);
1326 reg2->alloc = 0;
1327 }
1328}
1329
1330static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
1331{
1332 if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
1333 return er->idx;
1334 return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
1335}
1336
1337static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
1338{
1339 struct intel_uncore_type *type = box->pmu->type;
1340 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
1341 struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
1342 struct extra_reg *er;
1343 unsigned msr;
1344 int reg_idx = 0;
1345
1346 if (WARN_ON_ONCE(reg1->idx != -1))
1347 return -EINVAL;
1348 /*
1349 * The mbox events may require at most 2 extra MSRs. But only
1350 * the lower 32 bits in these MSRs are significant, so we can use
1351 * config1 to pass both MSRs' config.
1352 */
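 /*
 * reg1->idx packs the two 8-bit extra-register indices and reg1->reg
 * packs the two 16-bit MSR addresses; __BITS_VALUE() picks either
 * half back out when the event is scheduled and enabled.
 */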
1353 for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
1354 if (er->event != (event->hw.config & er->config_mask))
1355 continue;
1356 if (event->attr.config1 & ~er->valid_mask)
1357 return -EINVAL;
1358 if (er->idx == __BITS_VALUE(reg1->idx, 0, 8) ||
1359 er->idx == __BITS_VALUE(reg1->idx, 1, 8))
1360 continue;
1361 if (WARN_ON_ONCE(reg_idx >= 2))
1362 return -EINVAL;
1363
1364 msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
1365 if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
1366 return -EINVAL;
1367
1368 /* always use bits 32~63 to pass the PLD config */
1369 if (er->idx == EXTRA_REG_NHMEX_M_PLD)
1370 reg_idx = 1;
1371
1372 reg1->idx &= ~(0xff << (reg_idx * 8));
1373 reg1->reg &= ~(0xffff << (reg_idx * 16));
1374 reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
1375 reg1->reg |= msr << (reg_idx * 16);
1376 reg1->config = event->attr.config1;
1377 reg_idx++;
1378 }
1379 /* use config2 to pass the filter config */
1380 reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
1381 if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
1382 reg2->config = event->attr.config2;
1383 else
1384 reg2->config = ~0ULL;
1385 if (box->pmu->pmu_idx == 0)
1386 reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
1387 else
1388 reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
1389
1390 return 0;
1391}
1392
1393static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
1394{
1395 struct intel_uncore_extra_reg *er;
1396 unsigned long flags;
1397 u64 config;
1398
1399 if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
1400 return box->shared_regs[idx].config;
1401
1402 er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
1403 raw_spin_lock_irqsave(&er->lock, flags);
1404 config = er->config;
1405 raw_spin_unlock_irqrestore(&er->lock, flags);
1406 return config;
1407}
1408
1409static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
1410{
1411 struct hw_perf_event *hwc = &event->hw;
1412 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1413 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
1414 int idx;
1415
1416 idx = __BITS_VALUE(reg1->idx, 0, 8);
1417 if (idx != 0xff)
1418 wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
1419 nhmex_mbox_shared_reg_config(box, idx));
1420 idx = __BITS_VALUE(reg1->idx, 1, 8);
1421 if (idx != 0xff)
1422 wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
1423 nhmex_mbox_shared_reg_config(box, idx));
1424
1425 wrmsrl(reg2->reg, 0);
1426 if (reg2->config != ~0ULL) {
1427 wrmsrl(reg2->reg + 1,
1428 reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
1429 wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
1430 (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
1431 wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
1432 }
1433
1434 wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
1435}
1436
1437DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3");
1438DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5");
1439DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6");
1440DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7");
1441DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13");
1442DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21");
1443DEFINE_UNCORE_FORMAT_ATTR(filter_cfg, filter_cfg, "config2:63");
1444DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33");
1445DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61");
1446DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31");
1447DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31");
1448DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31");
1449DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31");
1450DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31");
1451DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31");
1452DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63");
1453
1454static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
1455 &format_attr_count_mode.attr,
1456 &format_attr_storage_mode.attr,
1457 &format_attr_wrap_mode.attr,
1458 &format_attr_flag_mode.attr,
1459 &format_attr_inc_sel.attr,
1460 &format_attr_set_flag_sel.attr,
1461 &format_attr_filter_cfg.attr,
1462 &format_attr_filter_match.attr,
1463 &format_attr_filter_mask.attr,
1464 &format_attr_dsp.attr,
1465 &format_attr_thr.attr,
1466 &format_attr_fvc.attr,
1467 &format_attr_pgt.attr,
1468 &format_attr_map.attr,
1469 &format_attr_iss.attr,
1470 &format_attr_pld.attr,
1471 NULL,
1472};
1473
1474static struct attribute_group nhmex_uncore_mbox_format_group = {
1475 .name = "format",
1476 .attrs = nhmex_uncore_mbox_formats_attr,
1477};
1478
1479static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
1480 INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
1481 INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
1482 { /* end: all zeroes */ },
1483};
1484
1485static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
1486 NHMEX_UNCORE_OPS_COMMON_INIT(),
1487 .enable_event = nhmex_mbox_msr_enable_event,
1488 .hw_config = nhmex_mbox_hw_config,
1489 .get_constraint = nhmex_mbox_get_constraint,
1490 .put_constraint = nhmex_mbox_put_constraint,
1491};
1492
1493static struct intel_uncore_type nhmex_uncore_mbox = {
1494 .name = "mbox",
1495 .num_counters = 6,
1496 .num_boxes = 2,
1497 .perf_ctr_bits = 48,
1498 .event_ctl = NHMEX_M0_MSR_PMU_CTL0,
1499 .perf_ctr = NHMEX_M0_MSR_PMU_CNT0,
1500 .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK,
1501 .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL,
1502 .msr_offset = NHMEX_M_MSR_OFFSET,
1503 .pair_ctr_ctl = 1,
1504 .num_shared_regs = 8,
1505 .event_descs = nhmex_uncore_mbox_events,
1506 .ops = &nhmex_uncore_mbox_ops,
1507 .format_group = &nhmex_uncore_mbox_format_group,
1508};
1509
1510void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
1511{
1512 struct hw_perf_event *hwc = &event->hw;
1513 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1514 int port;
1515
1516 /* adjust the main event selector */
1517 if (reg1->idx % 2) {
1518 reg1->idx--;
1519 hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
1520 } else {
1521 reg1->idx++;
1522 hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
1523 }
1524
1525 /* adjust address or config of extra register */
1526 port = reg1->idx / 6 + box->pmu->pmu_idx * 4;
1527 switch (reg1->idx % 6) {
1528 case 0:
1529 reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port);
1530 break;
1531 case 1:
1532 reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port);
1533 break;
1534 case 2:
1535 /* move bits 8~15 down to bits 0~7 */
1536 reg1->config >>= 8;
1537 break;
1538 case 3:
1539 /* move bits 0~7 up to bits 8~15 */
1540 reg1->config <<= 8;
1541 break;
1542 case 4:
1543 reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port);
1544 break;
1545 case 5:
1546 reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port);
1547 break;
1548 };
1549}
1550
1551/*
1552 * Each rbox has 4 event sets which monitor QPI ports 0~3 or 4~7.
1553 * An event set consists of 6 events; the 3rd and 4th events in
1554 * an event set use the same extra register, so an event set uses
1555 * 5 extra registers.
1556 */
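/*
 * Within a set, events 0 and 1 map to shared registers 0 and 1, events
 * 2 and 3 share register 2, and events 4 and 5 map to registers 3 and
 * 4; each subsequent set adds 5, giving the rbox its 20 shared regs.
 */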
1557static struct event_constraint *
1558nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
1559{
1560 struct hw_perf_event *hwc = &event->hw;
1561 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1562 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
1563 struct intel_uncore_extra_reg *er;
1564 unsigned long flags;
1565 int idx, er_idx;
1566 u64 config1;
1567 bool ok = false;
1568
1569 if (!uncore_box_is_fake(box) && reg1->alloc)
1570 return NULL;
1571
1572 idx = reg1->idx % 6;
1573 config1 = reg1->config;
1574again:
1575 er_idx = idx;
1576 /* the 3rd and 4th events use the same extra register */
1577 if (er_idx > 2)
1578 er_idx--;
1579 er_idx += (reg1->idx / 6) * 5;
1580
1581 er = &box->shared_regs[er_idx];
1582 raw_spin_lock_irqsave(&er->lock, flags);
1583 if (idx < 2) {
1584 if (!atomic_read(&er->ref) || er->config == reg1->config) {
1585 atomic_inc(&er->ref);
1586 er->config = reg1->config;
1587 ok = true;
1588 }
1589 } else if (idx == 2 || idx == 3) {
1590 /*
1591 * these two events use different fields in an extra register,
1592 * bits 0~7 and bits 8~15 respectively.
1593 */
1594 u64 mask = 0xff << ((idx - 2) * 8);
1595 if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
1596 !((er->config ^ config1) & mask)) {
1597 atomic_add(1 << ((idx - 2) * 8), &er->ref);
1598 er->config &= ~mask;
1599 er->config |= config1 & mask;
1600 ok = true;
1601 }
1602 } else {
1603 if (!atomic_read(&er->ref) ||
1604 (er->config == (hwc->config >> 32) &&
1605 er->config1 == reg1->config &&
1606 er->config2 == reg2->config)) {
1607 atomic_inc(&er->ref);
1608 er->config = (hwc->config >> 32);
1609 er->config1 = reg1->config;
1610 er->config2 = reg2->config;
1611 ok = true;
1612 }
1613 }
1614 raw_spin_unlock_irqrestore(&er->lock, flags);
1615
1616 if (!ok) {
1617 /*
1618 * The Rbox events are always in pairs. The paired
1619 * events are functionally identical, but use different
1620 * extra registers. If we failed to take an extra
1621 * register, try the alternative.
1622 */
1623 if (idx % 2)
1624 idx--;
1625 else
1626 idx++;
1627 if (idx != reg1->idx % 6) {
1628 if (idx == 2)
1629 config1 >>= 8;
1630 else if (idx == 3)
1631 config1 <<= 8;
1632 goto again;
1633 }
1634 } else {
1635 if (!uncore_box_is_fake(box)) {
1636 if (idx != reg1->idx % 6)
1637 nhmex_rbox_alter_er(box, event);
1638 reg1->alloc = 1;
1639 }
1640 return NULL;
1641 }
1642 return &constraint_empty;
1643}
1644
1645static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
1646{
1647 struct intel_uncore_extra_reg *er;
1648 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
1649 int idx, er_idx;
1650
1651 if (uncore_box_is_fake(box) || !reg1->alloc)
1652 return;
1653
1654 idx = reg1->idx % 6;
1655 er_idx = idx;
1656 if (er_idx > 2)
1657 er_idx--;
1658 er_idx += (reg1->idx / 6) * 5;
1659
1660 er = &box->shared_regs[er_idx];
1661 if (idx == 2 || idx == 3)
1662 atomic_sub(1 << ((idx - 2) * 8), &er->ref);
1663 else
1664 atomic_dec(&er->ref);
1665
1666 reg1->alloc = 0;
1667}
1668
1669static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
1670{
1671 struct hw_perf_event *hwc = &event->hw;
1672 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
1673 struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
1674 int port, idx;
1675
1676 idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
1677 NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
1678 if (idx >= 0x18)
1679 return -EINVAL;
1680
1681 reg1->idx = idx;
1682 reg1->config = event->attr.config1;
1683
1684 port = idx / 6 + box->pmu->pmu_idx * 4;
1685 idx %= 6;
1686 switch (idx) {
1687 case 0:
1688 reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port);
1689 break;
1690 case 1:
1691 reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port);
1692 break;
1693 case 2:
1694 case 3:
1695 reg1->reg = NHMEX_R_MSR_PORTN_QLX_CFG(port);
1696 break;
1697 case 4:
1698 case 5:
1699 if (idx == 4)
1700 reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port);
1701 else
1702 reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port);
1703 reg2->config = event->attr.config2;
1704 hwc->config |= event->attr.config & (~0ULL << 32);
1705 break;
1706 };
1707 return 0;
1708}
1709
1710static u64 nhmex_rbox_shared_reg_config(struct intel_uncore_box *box, int idx)
1711{
1712 struct intel_uncore_extra_reg *er;
1713 unsigned long flags;
1714 u64 config;
1715
1716 er = &box->shared_regs[idx];
1717
1718 raw_spin_lock_irqsave(&er->lock, flags);
1719 config = er->config;
1720 raw_spin_unlock_irqrestore(&er->lock, flags);
1721
1722 return config;
1723}
1724
1725static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
1726{
1727 struct hw_perf_event *hwc = &event->hw;
1728 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1729 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
1730 int idx, er_idx;
1731
1732 idx = reg1->idx % 6;
1733 er_idx = idx;
1734 if (er_idx > 2)
1735 er_idx--;
1736 er_idx += (reg1->idx / 6) * 5;
1737
1738 switch (idx) {
1739 case 0:
1740 case 1:
1741 wrmsrl(reg1->reg, reg1->config);
1742 break;
1743 case 2:
1744 case 3:
1745 wrmsrl(reg1->reg, nhmex_rbox_shared_reg_config(box, er_idx));
1746 break;
1747 case 4:
1748 case 5:
1749 wrmsrl(reg1->reg, reg1->config);
1750 wrmsrl(reg1->reg + 1, hwc->config >> 32);
1751 wrmsrl(reg1->reg + 2, reg2->config);
1752 break;
1753 };
1754
1755 wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
1756 (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
1757}
1758
1759DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config:32-63");
1760DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config1:0-63");
1761DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
1762DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
1763DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
1764
1765static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
1766 &format_attr_event5.attr,
1767 &format_attr_xbr_mm_cfg.attr,
1768 &format_attr_xbr_match.attr,
1769 &format_attr_xbr_mask.attr,
1770 &format_attr_qlx_cfg.attr,
1771 &format_attr_iperf_cfg.attr,
1772 NULL,
1773};
1774
1775static struct attribute_group nhmex_uncore_rbox_format_group = {
1776 .name = "format",
1777 .attrs = nhmex_uncore_rbox_formats_attr,
1778};
1779
1780static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
1781 INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"),
1782 INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"),
1783 INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"),
1784 INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"),
1785 INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"),
1786 INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"),
1787 { /* end: all zeroes */ },
1788};
1789
1790static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
1791 NHMEX_UNCORE_OPS_COMMON_INIT(),
1792 .enable_event = nhmex_rbox_msr_enable_event,
1793 .hw_config = nhmex_rbox_hw_config,
1794 .get_constraint = nhmex_rbox_get_constraint,
1795 .put_constraint = nhmex_rbox_put_constraint,
1796};
1797
1798static struct intel_uncore_type nhmex_uncore_rbox = {
1799 .name = "rbox",
1800 .num_counters = 8,
1801 .num_boxes = 2,
1802 .perf_ctr_bits = 48,
1803 .event_ctl = NHMEX_R_MSR_PMON_CTL0,
1804 .perf_ctr = NHMEX_R_MSR_PMON_CNT0,
1805 .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK,
1806 .box_ctl = NHMEX_R_MSR_GLOBAL_CTL,
1807 .msr_offset = NHMEX_R_MSR_OFFSET,
1808 .pair_ctr_ctl = 1,
1809 .num_shared_regs = 20,
1810 .event_descs = nhmex_uncore_rbox_events,
1811 .ops = &nhmex_uncore_rbox_ops,
1812 .format_group = &nhmex_uncore_rbox_format_group
1813};
1814
1815static struct intel_uncore_type *nhmex_msr_uncores[] = {
1816 &nhmex_uncore_ubox,
1817 &nhmex_uncore_cbox,
1818 &nhmex_uncore_bbox,
1819 &nhmex_uncore_sbox,
1820 &nhmex_uncore_mbox,
1821 &nhmex_uncore_rbox,
1822 &nhmex_uncore_wbox,
1823 NULL,
1824};
1825/* end of Nehalem-EX uncore support */
1826
1827static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
1828{
1829 struct hw_perf_event *hwc = &event->hw;
1830
1831 hwc->idx = idx;
1832 hwc->last_tag = ++box->tags[idx];
1833
1834 if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
1835 hwc->event_base = uncore_fixed_ctr(box);
1836 hwc->config_base = uncore_fixed_ctl(box);
1837 return;
1838 }
1839
1840 hwc->config_base = uncore_event_ctl(box, hwc->idx);
1841 hwc->event_base = uncore_perf_ctr(box, hwc->idx);
1842}
1843
1844static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
1845{
1846 u64 prev_count, new_count, delta;
1847 int shift;
1848
1849 if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
1850 shift = 64 - uncore_fixed_ctr_bits(box);
1851 else
1852 shift = 64 - uncore_perf_ctr_bits(box);
1853
1854 /* the hrtimer might modify the previous event value */
1855again:
1856 prev_count = local64_read(&event->hw.prev_count);
1857 new_count = uncore_read_counter(box, event);
1858 if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
1859 goto again;
1860
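 /*
 * Shift both values up so the bits above the counter width fall off,
 * then shift the difference back down; the subtraction thus wraps
 * correctly when the hardware counter overflows.
 */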
1861 delta = (new_count << shift) - (prev_count << shift);
1862 delta >>= shift;
1863
1864 local64_add(delta, &event->count);
1865}
1866
1867/*
1868 * The overflow interrupt is unavailable for SandyBridge-EP and is
1869 * broken for SandyBridge. So we use an hrtimer to periodically poll
1870 * the counters before they can wrap undetected.
1871 */
1872static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
1873{
1874 struct intel_uncore_box *box;
1875 unsigned long flags;
1876 int bit;
1877
1878 box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
1879 if (!box->n_active || box->cpu != smp_processor_id())
1880 return HRTIMER_NORESTART;
1881 /*
1882 * disable local interrupts to prevent uncore_pmu_event_start/stop
1883 * from interrupting the update process
1884 */
1885 local_irq_save(flags);
1886
1887 for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
1888 uncore_perf_event_update(box, box->events[bit]);
1889
1890 local_irq_restore(flags);
1891
1892 hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL));
1893 return HRTIMER_RESTART;
1894}
1895
1896static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
1897{
1898 __hrtimer_start_range_ns(&box->hrtimer,
1899 ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0,
1900 HRTIMER_MODE_REL_PINNED, 0);
1901}
1902
1903static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
1904{
1905 hrtimer_cancel(&box->hrtimer);
1906}
1907
1908static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
1909{
1910 hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1911 box->hrtimer.function = uncore_pmu_hrtimer;
1912}
1913
1914struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cpu)
1915{
1916 struct intel_uncore_box *box;
1917 int i, size;
1918
1919 size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);
1920
1921 box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu));
1922 if (!box)
1923 return NULL;
1924
1925 for (i = 0; i < type->num_shared_regs; i++)
1926 raw_spin_lock_init(&box->shared_regs[i].lock);
1927
1928 uncore_pmu_init_hrtimer(box);
1929 atomic_set(&box->refcnt, 1);
1930 box->cpu = -1;
1931 box->phys_id = -1;
1932
1933 return box;
1934}
1935
1936static struct intel_uncore_box *
1937uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
1938{
1939 struct intel_uncore_box *box;
1940
1941 box = *per_cpu_ptr(pmu->box, cpu);
1942 if (box)
1943 return box;
1944
1945 raw_spin_lock(&uncore_box_lock);
1946 list_for_each_entry(box, &pmu->box_list, list) {
1947 if (box->phys_id == topology_physical_package_id(cpu)) {
1948 atomic_inc(&box->refcnt);
1949 *per_cpu_ptr(pmu->box, cpu) = box;
1950 break;
1951 }
1952 }
1953 raw_spin_unlock(&uncore_box_lock);
1954
1955 return *per_cpu_ptr(pmu->box, cpu);
1956}
1957
1958static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
1959{
1960 return container_of(event->pmu, struct intel_uncore_pmu, pmu);
1961}
1962
1963static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
1964{
1965 /*
1966 * perf core schedules events on a per-cpu basis; uncore events are
1967 * collected by one of the cpus inside a physical package.
1968 */
1969 return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
1970}
1971
1972static int
1973uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp)
1974{
1975 struct perf_event *event;
1976 int n, max_count;
1977
1978 max_count = box->pmu->type->num_counters;
1979 if (box->pmu->type->fixed_ctl)
1980 max_count++;
1981
1982 if (box->n_events >= max_count)
1983 return -EINVAL;
1984
1985 n = box->n_events;
1986 box->event_list[n] = leader;
1987 n++;
1988 if (!dogrp)
1989 return n;
1990
1991 list_for_each_entry(event, &leader->sibling_list, group_entry) {
1992 if (event->state <= PERF_EVENT_STATE_OFF)
1993 continue;
1994
1995 if (n >= max_count)
1996 return -EINVAL;
1997
1998 box->event_list[n] = event;
1999 n++;
2000 }
2001 return n;
2002}
2003
2004static struct event_constraint *
2005uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
2006{
2007 struct intel_uncore_type *type = box->pmu->type;
2008 struct event_constraint *c;
2009
2010 if (type->ops->get_constraint) {
2011 c = type->ops->get_constraint(box, event);
2012 if (c)
2013 return c;
2014 }
2015
2016 if (event->hw.config == ~0ULL)
2017 return &constraint_fixed;
2018
2019 if (type->constraints) {
2020 for_each_event_constraint(c, type->constraints) {
2021 if ((event->hw.config & c->cmask) == c->code)
2022 return c;
2023 }
2024 }
2025
2026 return &type->unconstrainted;
2027}
2028
2029static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
2030{
2031 if (box->pmu->type->ops->put_constraint)
2032 box->pmu->type->ops->put_constraint(box, event);
2033}
2034
2035static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
2036{
2037 unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
2038 struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX];
2039 int i, wmin, wmax, ret = 0;
2040 struct hw_perf_event *hwc;
2041
2042 bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
2043
2044 for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
2045 c = uncore_get_event_constraint(box, box->event_list[i]);
2046 constraints[i] = c;
2047 wmin = min(wmin, c->weight);
2048 wmax = max(wmax, c->weight);
2049 }
2050
2051 /* fastpath, try to reuse previous register */
2052 for (i = 0; i < n; i++) {
2053 hwc = &box->event_list[i]->hw;
2054 c = constraints[i];
2055
2056 /* never assigned */
2057 if (hwc->idx == -1)
2058 break;
2059
2060 /* constraint still honored */
2061 if (!test_bit(hwc->idx, c->idxmsk))
2062 break;
2063
2064 /* not already used */
2065 if (test_bit(hwc->idx, used_mask))
2066 break;
2067
2068 __set_bit(hwc->idx, used_mask);
2069 if (assign)
2070 assign[i] = hwc->idx;
2071 }
2072 /* slow path */
2073 if (i != n)
2074 ret = perf_assign_events(constraints, n, wmin, wmax, assign);
2075
2076 if (!assign || ret) {
2077 for (i = 0; i < n; i++)
2078 uncore_put_event_constraint(box, box->event_list[i]);
2079 }
2080 return ret ? -EINVAL : 0;
2081}
2082
2083static void uncore_pmu_event_start(struct perf_event *event, int flags)
2084{
2085 struct intel_uncore_box *box = uncore_event_to_box(event);
2086 int idx = event->hw.idx;
2087
2088 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
2089 return;
2090
2091 if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
2092 return;
2093
2094 event->hw.state = 0;
2095 box->events[idx] = event;
2096 box->n_active++;
2097 __set_bit(idx, box->active_mask);
2098
2099 local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
2100 uncore_enable_event(box, event);
2101
2102 if (box->n_active == 1) {
2103 uncore_enable_box(box);
2104 uncore_pmu_start_hrtimer(box);
2105 }
2106}
2107
2108static void uncore_pmu_event_stop(struct perf_event *event, int flags)
2109{
2110 struct intel_uncore_box *box = uncore_event_to_box(event);
2111 struct hw_perf_event *hwc = &event->hw;
2112
2113 if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
2114 uncore_disable_event(box, event);
2115 box->n_active--;
2116 box->events[hwc->idx] = NULL;
2117 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
2118 hwc->state |= PERF_HES_STOPPED;
2119
2120 if (box->n_active == 0) {
2121 uncore_disable_box(box);
2122 uncore_pmu_cancel_hrtimer(box);
2123 }
2124 }
2125
2126 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
2127 /*
2128 * Drain the remaining delta count out of an event
2129 * that we are disabling:
2130 */
2131 uncore_perf_event_update(box, event);
2132 hwc->state |= PERF_HES_UPTODATE;
2133 }
2134}
2135
2136static int uncore_pmu_event_add(struct perf_event *event, int flags)
2137{
2138 struct intel_uncore_box *box = uncore_event_to_box(event);
2139 struct hw_perf_event *hwc = &event->hw;
2140 int assign[UNCORE_PMC_IDX_MAX];
2141 int i, n, ret;
2142
2143 if (!box)
2144 return -ENODEV;
2145
2146 ret = n = uncore_collect_events(box, event, false);
2147 if (ret < 0)
2148 return ret;
2149
2150 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
2151 if (!(flags & PERF_EF_START))
2152 hwc->state |= PERF_HES_ARCH;
2153
2154 ret = uncore_assign_events(box, assign, n);
2155 if (ret)
2156 return ret;
2157
2158 /* save events moving to new counters */
2159 for (i = 0; i < box->n_events; i++) {
2160 event = box->event_list[i];
2161 hwc = &event->hw;
2162
2163 if (hwc->idx == assign[i] &&
2164 hwc->last_tag == box->tags[assign[i]])
2165 continue;
2166 /*
2167 * Ensure we don't accidentally enable a stopped
2168 * counter simply because we rescheduled.
2169 */
2170 if (hwc->state & PERF_HES_STOPPED)
2171 hwc->state |= PERF_HES_ARCH;
2172
2173 uncore_pmu_event_stop(event, PERF_EF_UPDATE);
2174 }
2175
2176 /* reprogram moved events into new counters */
2177 for (i = 0; i < n; i++) {
2178 event = box->event_list[i];
2179 hwc = &event->hw;
2180
2181 if (hwc->idx != assign[i] ||
2182 hwc->last_tag != box->tags[assign[i]])
2183 uncore_assign_hw_event(box, event, assign[i]);
2184 else if (i < box->n_events)
2185 continue;
2186
2187 if (hwc->state & PERF_HES_ARCH)
2188 continue;
2189
2190 uncore_pmu_event_start(event, 0);
2191 }
2192 box->n_events = n;
2193
2194 return 0;
2195}
2196
2197static void uncore_pmu_event_del(struct perf_event *event, int flags)
2198{
2199 struct intel_uncore_box *box = uncore_event_to_box(event);
2200 int i;
2201
2202 uncore_pmu_event_stop(event, PERF_EF_UPDATE);
2203
2204 for (i = 0; i < box->n_events; i++) {
2205 if (event == box->event_list[i]) {
2206 uncore_put_event_constraint(box, event);
2207
2208 while (++i < box->n_events)
2209 box->event_list[i - 1] = box->event_list[i];
2210
2211 --box->n_events;
2212 break;
2213 }
2214 }
2215
2216 event->hw.idx = -1;
2217 event->hw.last_tag = ~0ULL;
2218}
2219
2220static void uncore_pmu_event_read(struct perf_event *event)
2221{
2222 struct intel_uncore_box *box = uncore_event_to_box(event);
2223 uncore_perf_event_update(box, event);
2224}
2225
2226/*
2227 * validation ensures the group can be loaded onto the
2228 * PMU as if it were the only group available.
2229 */
2230static int uncore_validate_group(struct intel_uncore_pmu *pmu,
2231 struct perf_event *event)
2232{
2233 struct perf_event *leader = event->group_leader;
2234 struct intel_uncore_box *fake_box;
2235 int ret = -EINVAL, n;
2236
2237 fake_box = uncore_alloc_box(pmu->type, smp_processor_id());
2238 if (!fake_box)
2239 return -ENOMEM;
2240
2241 fake_box->pmu = pmu;
2242 /*
2243 * the event is not yet connected with its
2244 * siblings; therefore we must first collect
2245 * the existing siblings, then add the new event
2246 * before we can simulate the scheduling
2247 */
2248 n = uncore_collect_events(fake_box, leader, true);
2249 if (n < 0)
2250 goto out;
2251
2252 fake_box->n_events = n;
2253 n = uncore_collect_events(fake_box, event, false);
2254 if (n < 0)
2255 goto out;
2256
2257 fake_box->n_events = n;
2258
2259 ret = uncore_assign_events(fake_box, NULL, n);
2260out:
2261 kfree(fake_box);
2262 return ret;
2263}
2264
2265int uncore_pmu_event_init(struct perf_event *event)
2266{
2267 struct intel_uncore_pmu *pmu;
2268 struct intel_uncore_box *box;
2269 struct hw_perf_event *hwc = &event->hw;
2270 int ret;
2271
2272 if (event->attr.type != event->pmu->type)
2273 return -ENOENT;
2274
2275 pmu = uncore_event_to_pmu(event);
2276 /* no device found for this pmu */
2277 if (pmu->func_id < 0)
2278 return -ENOENT;
2279
2280 /*
2281 * The uncore PMU always measures at all privilege levels,
2282 * so it doesn't make sense to specify any exclude bits.
2283 */
2284 if (event->attr.exclude_user || event->attr.exclude_kernel ||
2285 event->attr.exclude_hv || event->attr.exclude_idle)
2286 return -EINVAL;
2287
2288 /* Sampling not supported yet */
2289 if (hwc->sample_period)
2290 return -EINVAL;
2291
2292 /*
2293 * Place all uncore events for a particular physical package
2294 * onto a single cpu
2295 */
2296 if (event->cpu < 0)
2297 return -EINVAL;
2298 box = uncore_pmu_to_box(pmu, event->cpu);
2299 if (!box || box->cpu < 0)
2300 return -EINVAL;
2301 event->cpu = box->cpu;
2302
2303 event->hw.idx = -1;
2304 event->hw.last_tag = ~0ULL;
2305 event->hw.extra_reg.idx = EXTRA_REG_NONE;
2306
2307 if (event->attr.config == UNCORE_FIXED_EVENT) {
2308 /* no fixed counter */
2309 if (!pmu->type->fixed_ctl)
2310 return -EINVAL;
2311 /*
2312 * if there is only one fixed counter, only the first pmu
2313 * can access the fixed counter
2314 */
2315 if (pmu->type->single_fixed && pmu->pmu_idx > 0)
2316 return -EINVAL;
2317 hwc->config = ~0ULL;
2318 } else {
2319 hwc->config = event->attr.config & pmu->type->event_mask;
2320 if (pmu->type->ops->hw_config) {
2321 ret = pmu->type->ops->hw_config(box, event);
2322 if (ret)
2323 return ret;
2324 }
2325 }
2326
2327 if (event->group_leader != event)
2328 ret = uncore_validate_group(pmu, event);
2329 else
2330 ret = 0;
2331
2332 return ret;
2333}
2334
2335static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
2336{
2337 int ret;
2338
2339 pmu->pmu = (struct pmu) {
2340 .attr_groups = pmu->type->attr_groups,
2341 .task_ctx_nr = perf_invalid_context,
2342 .event_init = uncore_pmu_event_init,
2343 .add = uncore_pmu_event_add,
2344 .del = uncore_pmu_event_del,
2345 .start = uncore_pmu_event_start,
2346 .stop = uncore_pmu_event_stop,
2347 .read = uncore_pmu_event_read,
2348 };
2349
2350 if (pmu->type->num_boxes == 1) {
2351 if (strlen(pmu->type->name) > 0)
2352 sprintf(pmu->name, "uncore_%s", pmu->type->name);
2353 else
2354 sprintf(pmu->name, "uncore");
2355 } else {
2356 sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
2357 pmu->pmu_idx);
2358 }
2359
2360 ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
2361 return ret;
2362}
2363
2364static void __init uncore_type_exit(struct intel_uncore_type *type)
2365{
2366 int i;
2367
2368 for (i = 0; i < type->num_boxes; i++)
2369 free_percpu(type->pmus[i].box);
2370 kfree(type->pmus);
2371 type->pmus = NULL;
2372 kfree(type->attr_groups[1]);
2373 type->attr_groups[1] = NULL;
2374}
2375
2376static void uncore_types_exit(struct intel_uncore_type **types)
2377{
2378 int i;
2379 for (i = 0; types[i]; i++)
2380 uncore_type_exit(types[i]);
2381}
2382
2383static int __init uncore_type_init(struct intel_uncore_type *type)
2384{
2385 struct intel_uncore_pmu *pmus;
2386 struct attribute_group *events_group;
2387 struct attribute **attrs;
2388 int i, j;
2389
2390 pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
2391 if (!pmus)
2392 return -ENOMEM;
2393
2394 type->unconstrainted = (struct event_constraint)
2395 __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
2396 0, type->num_counters, 0);
2397
2398 for (i = 0; i < type->num_boxes; i++) {
2399 pmus[i].func_id = -1;
2400 pmus[i].pmu_idx = i;
2401 pmus[i].type = type;
2402 INIT_LIST_HEAD(&pmus[i].box_list);
2403 pmus[i].box = alloc_percpu(struct intel_uncore_box *);
2404 if (!pmus[i].box)
2405 goto fail;
2406 }
2407
2408 if (type->event_descs) {
2409 i = 0;
2410 while (type->event_descs[i].attr.attr.name)
2411 i++;
2412
2413 events_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
2414 sizeof(*events_group), GFP_KERNEL);
2415 if (!events_group)
2416 goto fail;
2417
2418 attrs = (struct attribute **)(events_group + 1);
2419 events_group->name = "events";
2420 events_group->attrs = attrs;
2421
2422 for (j = 0; j < i; j++)
2423 attrs[j] = &type->event_descs[j].attr.attr;
2424
2425 type->attr_groups[1] = events_group;
2426 }
2427
2428 type->pmus = pmus;
2429 return 0;
2430fail:
2431 uncore_type_exit(type);
2432 return -ENOMEM;
2433}
2434
2435static int __init uncore_types_init(struct intel_uncore_type **types)
2436{
2437 int i, ret;
2438
2439 for (i = 0; types[i]; i++) {
2440 ret = uncore_type_init(types[i]);
2441 if (ret)
2442 goto fail;
2443 }
2444 return 0;
2445fail:
2446 while (--i >= 0)
2447 uncore_type_exit(types[i]);
2448 return ret;
2449}
2450
2451static struct pci_driver *uncore_pci_driver;
2452static bool pcidrv_registered;
2453
2454/*
2455 * add a pci uncore device
2456 */
2457static int __devinit uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev)
2458{
2459 struct intel_uncore_pmu *pmu;
2460 struct intel_uncore_box *box;
2461 int i, phys_id;
2462
2463 phys_id = pcibus_to_physid[pdev->bus->number];
2464 if (phys_id < 0)
2465 return -ENODEV;
2466
2467 box = uncore_alloc_box(type, 0);
2468 if (!box)
2469 return -ENOMEM;
2470
2471 /*
2472 * for a performance monitoring unit with multiple boxes,
2473 * each box has a different function id.
2474 */
2475 for (i = 0; i < type->num_boxes; i++) {
2476 pmu = &type->pmus[i];
2477 if (pmu->func_id == pdev->devfn)
2478 break;
2479 if (pmu->func_id < 0) {
2480 pmu->func_id = pdev->devfn;
2481 break;
2482 }
2483 pmu = NULL;
2484 }
2485
2486 if (!pmu) {
2487 kfree(box);
2488 return -EINVAL;
2489 }
2490
2491 box->phys_id = phys_id;
2492 box->pci_dev = pdev;
2493 box->pmu = pmu;
2494 uncore_box_init(box);
2495 pci_set_drvdata(pdev, box);
2496
2497 raw_spin_lock(&uncore_box_lock);
2498 list_add_tail(&box->list, &pmu->box_list);
2499 raw_spin_unlock(&uncore_box_lock);
2500
2501 return 0;
2502}
2503
2504static void uncore_pci_remove(struct pci_dev *pdev)
2505{
2506 struct intel_uncore_box *box = pci_get_drvdata(pdev);
2507 struct intel_uncore_pmu *pmu = box->pmu;
2508 int cpu, phys_id = pcibus_to_physid[pdev->bus->number];
2509
2510 if (WARN_ON_ONCE(phys_id != box->phys_id))
2511 return;
2512
2513 raw_spin_lock(&uncore_box_lock);
2514 list_del(&box->list);
2515 raw_spin_unlock(&uncore_box_lock);
2516
2517 for_each_possible_cpu(cpu) {
2518 if (*per_cpu_ptr(pmu->box, cpu) == box) {
2519 *per_cpu_ptr(pmu->box, cpu) = NULL;
2520 atomic_dec(&box->refcnt);
2521 }
2522 }
2523
2524 WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
2525 kfree(box);
2526}
2527
2528static int __devinit uncore_pci_probe(struct pci_dev *pdev,
2529 const struct pci_device_id *id)
2530{
2531 struct intel_uncore_type *type;
2532
2533 type = (struct intel_uncore_type *)id->driver_data;
2534
2535 return uncore_pci_add(type, pdev);
2536}
2537
2538static int __init uncore_pci_init(void)
2539{
2540 int ret;
2541
2542 switch (boot_cpu_data.x86_model) {
2543 case 45: /* Sandy Bridge-EP */
2544 pci_uncores = snbep_pci_uncores;
2545 uncore_pci_driver = &snbep_uncore_pci_driver;
2546 snbep_pci2phy_map_init();
2547 break;
2548 default:
2549 return 0;
2550 }
2551
2552 ret = uncore_types_init(pci_uncores);
2553 if (ret)
2554 return ret;
2555
2556 uncore_pci_driver->probe = uncore_pci_probe;
2557 uncore_pci_driver->remove = uncore_pci_remove;
2558
2559 ret = pci_register_driver(uncore_pci_driver);
2560 if (ret == 0)
2561 pcidrv_registered = true;
2562 else
2563 uncore_types_exit(pci_uncores);
2564
2565 return ret;
2566}
2567
2568static void __init uncore_pci_exit(void)
2569{
2570 if (pcidrv_registered) {
2571 pcidrv_registered = false;
2572 pci_unregister_driver(uncore_pci_driver);
2573 uncore_types_exit(pci_uncores);
2574 }
2575}
2576
2577static void __cpuinit uncore_cpu_dying(int cpu)
2578{
2579 struct intel_uncore_type *type;
2580 struct intel_uncore_pmu *pmu;
2581 struct intel_uncore_box *box;
2582 int i, j;
2583
2584 for (i = 0; msr_uncores[i]; i++) {
2585 type = msr_uncores[i];
2586 for (j = 0; j < type->num_boxes; j++) {
2587 pmu = &type->pmus[j];
2588 box = *per_cpu_ptr(pmu->box, cpu);
2589 *per_cpu_ptr(pmu->box, cpu) = NULL;
2590 if (box && atomic_dec_and_test(&box->refcnt))
2591 kfree(box);
2592 }
2593 }
2594}
2595
2596static int __cpuinit uncore_cpu_starting(int cpu)
2597{
2598 struct intel_uncore_type *type;
2599 struct intel_uncore_pmu *pmu;
2600 struct intel_uncore_box *box, *exist;
2601 int i, j, k, phys_id;
2602
2603 phys_id = topology_physical_package_id(cpu);
2604
2605 for (i = 0; msr_uncores[i]; i++) {
2606 type = msr_uncores[i];
2607 for (j = 0; j < type->num_boxes; j++) {
2608 pmu = &type->pmus[j];
2609 box = *per_cpu_ptr(pmu->box, cpu);
2610 /* was this box set up by uncore_cpu_prepare() from uncore_cpu_init()? */
2611 if (box && box->phys_id >= 0) {
2612 uncore_box_init(box);
2613 continue;
2614 }
2615
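 /*
 * Another cpu in the same package may already own a box for this
 * pmu; if so, share that box and free the one allocated for this cpu.
 */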
2616 for_each_online_cpu(k) {
2617 exist = *per_cpu_ptr(pmu->box, k);
2618 if (exist && exist->phys_id == phys_id) {
2619 atomic_inc(&exist->refcnt);
2620 *per_cpu_ptr(pmu->box, cpu) = exist;
2621 kfree(box);
2622 box = NULL;
2623 break;
2624 }
2625 }
2626
2627 if (box) {
2628 box->phys_id = phys_id;
2629 uncore_box_init(box);
2630 }
2631 }
2632 }
2633 return 0;
2634}
2635
2636static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id)
2637{
2638 struct intel_uncore_type *type;
2639 struct intel_uncore_pmu *pmu;
2640 struct intel_uncore_box *box;
2641 int i, j;
2642
2643 for (i = 0; msr_uncores[i]; i++) {
2644 type = msr_uncores[i];
2645 for (j = 0; j < type->num_boxes; j++) {
2646 pmu = &type->pmus[j];
2647 if (pmu->func_id < 0)
2648 pmu->func_id = j;
2649
2650 box = uncore_alloc_box(type, cpu);
2651 if (!box)
2652 return -ENOMEM;
2653
2654 box->pmu = pmu;
2655 box->phys_id = phys_id;
2656 *per_cpu_ptr(pmu->box, cpu) = box;
2657 }
2658 }
2659 return 0;
2660}
2661
2662static void __cpuinit
2663uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu)
2664{
2665 struct intel_uncore_type *type;
2666 struct intel_uncore_pmu *pmu;
2667 struct intel_uncore_box *box;
2668 int i, j;
2669
2670 for (i = 0; uncores[i]; i++) {
2671 type = uncores[i];
2672 for (j = 0; j < type->num_boxes; j++) {
2673 pmu = &type->pmus[j];
2674 if (old_cpu < 0)
2675 box = uncore_pmu_to_box(pmu, new_cpu);
2676 else
2677 box = uncore_pmu_to_box(pmu, old_cpu);
2678 if (!box)
2679 continue;
2680
2681 if (old_cpu < 0) {
2682 WARN_ON_ONCE(box->cpu != -1);
2683 box->cpu = new_cpu;
2684 continue;
2685 }
2686
2687 WARN_ON_ONCE(box->cpu != old_cpu);
2688 if (new_cpu >= 0) {
2689 uncore_pmu_cancel_hrtimer(box);
2690 perf_pmu_migrate_context(&pmu->pmu,
2691 old_cpu, new_cpu);
2692 box->cpu = new_cpu;
2693 } else {
2694 box->cpu = -1;
2695 }
2696 }
2697 }
2698}
2699
2700static void __cpuinit uncore_event_exit_cpu(int cpu)
2701{
2702 int i, phys_id, target;
2703
2704 /* if the exiting cpu is used for collecting uncore events */
2705 if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
2706 return;
2707
2708 /* find a new cpu to collect uncore events */
2709 phys_id = topology_physical_package_id(cpu);
2710 target = -1;
2711 for_each_online_cpu(i) {
2712 if (i == cpu)
2713 continue;
2714 if (phys_id == topology_physical_package_id(i)) {
2715 target = i;
2716 break;
2717 }
2718 }
2719
2720 /* migrate uncore events to the new cpu */
2721 if (target >= 0)
2722 cpumask_set_cpu(target, &uncore_cpu_mask);
2723
2724 uncore_change_context(msr_uncores, cpu, target);
2725 uncore_change_context(pci_uncores, cpu, target);
2726}
2727
2728static void __cpuinit uncore_event_init_cpu(int cpu)
2729{
2730 int i, phys_id;
2731
2732 phys_id = topology_physical_package_id(cpu);
2733 for_each_cpu(i, &uncore_cpu_mask) {
2734 if (phys_id == topology_physical_package_id(i))
2735 return;
2736 }
2737
2738 cpumask_set_cpu(cpu, &uncore_cpu_mask);
2739
2740 uncore_change_context(msr_uncores, -1, cpu);
2741 uncore_change_context(pci_uncores, -1, cpu);
2742}
2743
2744static int
2745 __cpuinit uncore_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
2746{
2747 unsigned int cpu = (long)hcpu;
2748
2749 /* allocate/free data structure for uncore box */
2750 switch (action & ~CPU_TASKS_FROZEN) {
2751 case CPU_UP_PREPARE:
2752 uncore_cpu_prepare(cpu, -1);
2753 break;
2754 case CPU_STARTING:
2755 uncore_cpu_starting(cpu);
2756 break;
2757 case CPU_UP_CANCELED:
2758 case CPU_DYING:
2759 uncore_cpu_dying(cpu);
2760 break;
2761 default:
2762 break;
2763 }
2764
2765 /* select the cpu that collects uncore events */
2766 switch (action & ~CPU_TASKS_FROZEN) {
2767 case CPU_DOWN_FAILED:
2768 case CPU_STARTING:
2769 uncore_event_init_cpu(cpu);
2770 break;
2771 case CPU_DOWN_PREPARE:
2772 uncore_event_exit_cpu(cpu);
2773 break;
2774 default:
2775 break;
2776 }
2777
2778 return NOTIFY_OK;
2779}
2780
2781static struct notifier_block uncore_cpu_nb __cpuinitdata = {
2782 .notifier_call = uncore_cpu_notifier,
2783 /*
2784 * to migrate uncore events, our notifier should be executed
2785 * before perf core's notifier.
2786 */
2787 .priority = CPU_PRI_PERF + 1,
2788};
2789
2790static void __init uncore_cpu_setup(void *dummy)
2791{
2792 uncore_cpu_starting(smp_processor_id());
2793}
2794
2795static int __init uncore_cpu_init(void)
2796{
2797 int ret, cpu, max_cores;
2798
2799 max_cores = boot_cpu_data.x86_max_cores;
2800 switch (boot_cpu_data.x86_model) {
2801 case 26: /* Nehalem */
2802 case 30:
2803 case 37: /* Westmere */
2804 case 44:
2805 msr_uncores = nhm_msr_uncores;
2806 break;
2807 case 42: /* Sandy Bridge */
2808 if (snb_uncore_cbox.num_boxes > max_cores)
2809 snb_uncore_cbox.num_boxes = max_cores;
2810 msr_uncores = snb_msr_uncores;
2811 break;
2812 case 45: /* Sandy Bridge-EP */
2813 if (snbep_uncore_cbox.num_boxes > max_cores)
2814 snbep_uncore_cbox.num_boxes = max_cores;
2815 msr_uncores = snbep_msr_uncores;
2816 break;
2817 case 46: /* Nehalem-EX */
2818 msr_uncores = nhmex_msr_uncores;
2819 break;
2820 default:
2821 return 0;
2822 }
2823
2824 ret = uncore_types_init(msr_uncores);
2825 if (ret)
2826 return ret;
2827
2828 get_online_cpus();
2829
2830 for_each_online_cpu(cpu) {
2831 int i, phys_id = topology_physical_package_id(cpu);
2832
2833 for_each_cpu(i, &uncore_cpu_mask) {
2834 if (phys_id == topology_physical_package_id(i)) {
2835 phys_id = -1;
2836 break;
2837 }
2838 }
2839 if (phys_id < 0)
2840 continue;
2841
2842 uncore_cpu_prepare(cpu, phys_id);
2843 uncore_event_init_cpu(cpu);
2844 }
2845 on_each_cpu(uncore_cpu_setup, NULL, 1);
2846
2847 register_cpu_notifier(&uncore_cpu_nb);
2848
2849 put_online_cpus();
2850
2851 return 0;
2852}
2853
2854static int __init uncore_pmus_register(void)
2855{
2856 struct intel_uncore_pmu *pmu;
2857 struct intel_uncore_type *type;
2858 int i, j;
2859
2860 for (i = 0; msr_uncores[i]; i++) {
2861 type = msr_uncores[i];
2862 for (j = 0; j < type->num_boxes; j++) {
2863 pmu = &type->pmus[j];
2864 uncore_pmu_register(pmu);
2865 }
2866 }
2867
2868 for (i = 0; pci_uncores[i]; i++) {
2869 type = pci_uncores[i];
2870 for (j = 0; j < type->num_boxes; j++) {
2871 pmu = &type->pmus[j];
2872 uncore_pmu_register(pmu);
2873 }
2874 }
2875
2876 return 0;
2877}
2878
2879static int __init intel_uncore_init(void)
2880{
2881 int ret;
2882
2883 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
2884 return -ENODEV;
2885
2886 ret = uncore_pci_init();
2887 if (ret)
2888 goto fail;
2889 ret = uncore_cpu_init();
2890 if (ret) {
2891 uncore_pci_exit();
2892 goto fail;
2893 }
2894
2895 uncore_pmus_register();
2896 return 0;
2897fail:
2898 return ret;
2899}
2900device_initcall(intel_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
new file mode 100644
index 000000000000..f3851892e077
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -0,0 +1,621 @@
1#include <linux/module.h>
2#include <linux/slab.h>
3#include <linux/pci.h>
4#include <linux/perf_event.h>
5#include "perf_event.h"
6
7#define UNCORE_PMU_NAME_LEN 32
8#define UNCORE_PMU_HRTIMER_INTERVAL (60 * NSEC_PER_SEC)
9
10#define UNCORE_FIXED_EVENT 0xff
11#define UNCORE_PMC_IDX_MAX_GENERIC 8
12#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
13#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1)
14
15#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
16
17/* SNB event control */
18#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
19#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
20#define SNB_UNC_CTL_EDGE_DET (1 << 18)
21#define SNB_UNC_CTL_EN (1 << 22)
22#define SNB_UNC_CTL_INVERT (1 << 23)
23#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
24#define NHM_UNC_CTL_CMASK_MASK 0xff000000
25#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
26
27#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
28 SNB_UNC_CTL_UMASK_MASK | \
29 SNB_UNC_CTL_EDGE_DET | \
30 SNB_UNC_CTL_INVERT | \
31 SNB_UNC_CTL_CMASK_MASK)
32
33#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
34 SNB_UNC_CTL_UMASK_MASK | \
35 SNB_UNC_CTL_EDGE_DET | \
36 SNB_UNC_CTL_INVERT | \
37 NHM_UNC_CTL_CMASK_MASK)
38
39/* SNB global control register */
40#define SNB_UNC_PERF_GLOBAL_CTL 0x391
41#define SNB_UNC_FIXED_CTR_CTRL 0x394
42#define SNB_UNC_FIXED_CTR 0x395
43
44/* SNB uncore global control */
45#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
46#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
47
48/* SNB Cbo register */
49#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
50#define SNB_UNC_CBO_0_PER_CTR0 0x706
51#define SNB_UNC_CBO_MSR_OFFSET 0x10
52
53/* NHM global control register */
54#define NHM_UNC_PERF_GLOBAL_CTL 0x391
55#define NHM_UNC_FIXED_CTR 0x394
56#define NHM_UNC_FIXED_CTR_CTRL 0x395
57
58/* NHM uncore global control */
59#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
60#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
61
62/* NHM uncore register */
63#define NHM_UNC_PERFEVTSEL0 0x3c0
64#define NHM_UNC_UNCORE_PMC0 0x3b0
65
66/* SNB-EP Box level control */
67#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
68#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
69#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8)
70#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16)
71#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
72 SNBEP_PMON_BOX_CTL_RST_CTRS | \
73 SNBEP_PMON_BOX_CTL_FRZ_EN)
74/* SNB-EP event control */
75#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff
76#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00
77#define SNBEP_PMON_CTL_RST (1 << 17)
78#define SNBEP_PMON_CTL_EDGE_DET (1 << 18)
79#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) /* only for QPI */
80#define SNBEP_PMON_CTL_EN (1 << 22)
81#define SNBEP_PMON_CTL_INVERT (1 << 23)
82#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000
83#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
84 SNBEP_PMON_CTL_UMASK_MASK | \
85 SNBEP_PMON_CTL_EDGE_DET | \
86 SNBEP_PMON_CTL_INVERT | \
87 SNBEP_PMON_CTL_TRESH_MASK)
88
89/* SNB-EP Ubox event control */
90#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000
91#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \
92 (SNBEP_PMON_CTL_EV_SEL_MASK | \
93 SNBEP_PMON_CTL_UMASK_MASK | \
94 SNBEP_PMON_CTL_EDGE_DET | \
95 SNBEP_PMON_CTL_INVERT | \
96 SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
97
98#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19)
99#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
100 SNBEP_CBO_PMON_CTL_TID_EN)
101
102/* SNB-EP PCU event control */
103#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000
104#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000
105#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30)
106#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31)
107#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
108 (SNBEP_PMON_CTL_EV_SEL_MASK | \
109 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
110 SNBEP_PMON_CTL_EDGE_DET | \
111 SNBEP_PMON_CTL_INVERT | \
112 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
113 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
114 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
115
116#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
117 (SNBEP_PMON_RAW_EVENT_MASK | \
118 SNBEP_PMON_CTL_EV_SEL_EXT)
119
120/* SNB-EP pci control register */
121#define SNBEP_PCI_PMON_BOX_CTL 0xf4
122#define SNBEP_PCI_PMON_CTL0 0xd8
123/* SNB-EP pci counter register */
124#define SNBEP_PCI_PMON_CTR0 0xa0
125
126/* SNB-EP home agent register */
127#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40
128#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44
129#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48
130/* SNB-EP memory controller register */
131#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0
132#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0
133/* SNB-EP QPI register */
134#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228
135#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c
136#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238
137#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c
138
139/* SNB-EP Ubox register */
140#define SNBEP_U_MSR_PMON_CTR0 0xc16
141#define SNBEP_U_MSR_PMON_CTL0 0xc10
142
143#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08
144#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09
145
146/* SNB-EP Cbo register */
147#define SNBEP_C0_MSR_PMON_CTR0 0xd16
148#define SNBEP_C0_MSR_PMON_CTL0 0xd10
149#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04
150#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14
151#define SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK 0xfffffc1f
152#define SNBEP_CBO_MSR_OFFSET 0x20
153
154/* SNB-EP PCU register */
155#define SNBEP_PCU_MSR_PMON_CTR0 0xc36
156#define SNBEP_PCU_MSR_PMON_CTL0 0xc30
157#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24
158#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34
159#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff
160#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc
161#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd
162
163/* NHM-EX event control */
164#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff
165#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00
166#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0)
167#define NHMEX_PMON_CTL_EDGE_DET (1 << 18)
168#define NHMEX_PMON_CTL_PMI_EN (1 << 20)
169#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22)
170#define NHMEX_PMON_CTL_INVERT (1 << 23)
171#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000
172#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \
173 NHMEX_PMON_CTL_UMASK_MASK | \
174 NHMEX_PMON_CTL_EDGE_DET | \
175 NHMEX_PMON_CTL_INVERT | \
176 NHMEX_PMON_CTL_TRESH_MASK)
177
178/* NHM-EX Ubox */
179#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00
180#define NHMEX_U_MSR_PMON_CTR 0xc11
181#define NHMEX_U_MSR_PMON_EV_SEL 0xc10
182
183#define NHMEX_U_PMON_GLOBAL_EN (1 << 0)
184#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e
185#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28)
186#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29)
187#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
188
189#define NHMEX_U_PMON_RAW_EVENT_MASK \
190 (NHMEX_PMON_CTL_EV_SEL_MASK | \
191 NHMEX_PMON_CTL_EDGE_DET)
192
193/* NHM-EX Cbox */
194#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00
195#define NHMEX_C0_MSR_PMON_CTR0 0xd11
196#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10
197#define NHMEX_C_MSR_OFFSET 0x20
198
199/* NHM-EX Bbox */
200#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20
201#define NHMEX_B0_MSR_PMON_CTR0 0xc31
202#define NHMEX_B0_MSR_PMON_CTL0 0xc30
203#define NHMEX_B_MSR_OFFSET 0x40
204#define NHMEX_B0_MSR_MATCH 0xe45
205#define NHMEX_B0_MSR_MASK 0xe46
206#define NHMEX_B1_MSR_MATCH 0xe4d
207#define NHMEX_B1_MSR_MASK 0xe4e
208
209#define NHMEX_B_PMON_CTL_EN (1 << 0)
210#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1
211#define NHMEX_B_PMON_CTL_EV_SEL_MASK \
212 (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
213#define NHMEX_B_PMON_CTR_SHIFT 6
214#define NHMEX_B_PMON_CTR_MASK \
215 (0x3 << NHMEX_B_PMON_CTR_SHIFT)
216#define NHMEX_B_PMON_RAW_EVENT_MASK \
217 (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
218 NHMEX_B_PMON_CTR_MASK)
219
220/* NHM-EX Sbox */
221#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40
222#define NHMEX_S0_MSR_PMON_CTR0 0xc51
223#define NHMEX_S0_MSR_PMON_CTL0 0xc50
224#define NHMEX_S_MSR_OFFSET 0x80
225#define NHMEX_S0_MSR_MM_CFG 0xe48
226#define NHMEX_S0_MSR_MATCH 0xe49
227#define NHMEX_S0_MSR_MASK 0xe4a
228#define NHMEX_S1_MSR_MM_CFG 0xe58
229#define NHMEX_S1_MSR_MATCH 0xe59
230#define NHMEX_S1_MSR_MASK 0xe5a
231
232#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63)
233
234/* NHM-EX Mbox */
235#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0
236#define NHMEX_M0_MSR_PMU_DSP 0xca5
237#define NHMEX_M0_MSR_PMU_ISS 0xca6
238#define NHMEX_M0_MSR_PMU_MAP 0xca7
239#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8
240#define NHMEX_M0_MSR_PMU_PGT 0xca9
241#define NHMEX_M0_MSR_PMU_PLD 0xcaa
242#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab
243#define NHMEX_M0_MSR_PMU_CTL0 0xcb0
244#define NHMEX_M0_MSR_PMU_CNT0 0xcb1
245#define NHMEX_M_MSR_OFFSET 0x40
246#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54
247#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c
248
249#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63)
250#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL
251#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL
252#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34
253
254#define NHMEX_M_PMON_CTL_EN (1 << 0)
255#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1)
256#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2
257#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \
258 (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
259#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4
260#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \
261 (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
262#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6)
263#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7)
264#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9
265#define NHMEX_M_PMON_CTL_INC_SEL_MASK \
266 (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
267#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19
268#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \
269 (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
270#define NHMEX_M_PMON_RAW_EVENT_MASK \
271 (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \
272 NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \
273 NHMEX_M_PMON_CTL_WRAP_MODE | \
274 NHMEX_M_PMON_CTL_FLAG_MODE | \
275 NHMEX_M_PMON_CTL_INC_SEL_MASK | \
276 NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
277
278
279#define NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK 0x1f
280#define NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK (0x7 << 5)
281#define NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK (0x7 << 8)
282#define NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR (1 << 23)
283#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK \
284 (NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK | \
285 NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK | \
286 NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK | \
287 NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR)
288#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (11 + 3 * (n)))
289
290/*
291 * Use bits 9~13 to select the event if bit 7 is not set,
292 * otherwise use bits 19~21 to select the event.
293 */
294#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
295#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
296 NHMEX_M_PMON_CTL_FLAG_MODE)
297#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
298 NHMEX_M_PMON_CTL_FLAG_MODE)
299#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
300 NHMEX_M_PMON_CTL_FLAG_MODE)
301#define MBOX_INC_SEL_EXTAR_REG(c, r) \
302 EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
303 MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
304#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
305 EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
306 MBOX_SET_FLAG_SEL_MASK, \
307 (u64)-1, NHMEX_M_##r)
308
309/* NHM-EX Rbox */
310#define NHMEX_R_MSR_GLOBAL_CTL 0xe00
311#define NHMEX_R_MSR_PMON_CTL0 0xe10
312#define NHMEX_R_MSR_PMON_CNT0 0xe11
313#define NHMEX_R_MSR_OFFSET 0x20
314
315#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \
316 ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
317#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n))
318#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n))
319#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \
320 (((n) < 4 ? 0 : 0x10) + (n) * 4)
321#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \
322 (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
323#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \
324 (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
325#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \
326 (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
327#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \
328 (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
329#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \
330 (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
331#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \
332 (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
333
334#define NHMEX_R_PMON_CTL_EN (1 << 0)
335#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1
336#define NHMEX_R_PMON_CTL_EV_SEL_MASK \
337 (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
338#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6)
339#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK
340
341/* NHM-EX Wbox */
342#define NHMEX_W_MSR_GLOBAL_CTL 0xc80
343#define NHMEX_W_MSR_PMON_CNT0 0xc90
344#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91
345#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394
346#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395
347
348#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31)
349
350struct intel_uncore_ops;
351struct intel_uncore_pmu;
352struct intel_uncore_box;
353struct uncore_event_desc;
354
355struct intel_uncore_type {
356 const char *name;
357 int num_counters;
358 int num_boxes;
359 int perf_ctr_bits;
360 int fixed_ctr_bits;
361 unsigned perf_ctr;
362 unsigned event_ctl;
363 unsigned event_mask;
364 unsigned fixed_ctr;
365 unsigned fixed_ctl;
366 unsigned box_ctl;
367 unsigned msr_offset;
368 unsigned num_shared_regs:8;
369 unsigned single_fixed:1;
370 unsigned pair_ctr_ctl:1;
371 struct event_constraint unconstrainted;
372 struct event_constraint *constraints;
373 struct intel_uncore_pmu *pmus;
374 struct intel_uncore_ops *ops;
375 struct uncore_event_desc *event_descs;
376 const struct attribute_group *attr_groups[3];
377};
378
379#define format_group attr_groups[0]
380
381struct intel_uncore_ops {
382 void (*init_box)(struct intel_uncore_box *);
383 void (*disable_box)(struct intel_uncore_box *);
384 void (*enable_box)(struct intel_uncore_box *);
385 void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
386 void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
387 u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
388 int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
389 struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
390 struct perf_event *);
391 void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
392};
393
394struct intel_uncore_pmu {
395 struct pmu pmu;
396 char name[UNCORE_PMU_NAME_LEN];
397 int pmu_idx;
398 int func_id;
399 struct intel_uncore_type *type;
400 struct intel_uncore_box ** __percpu box;
401 struct list_head box_list;
402};
403
404struct intel_uncore_extra_reg {
405 raw_spinlock_t lock;
406 u64 config, config1, config2;
407 atomic_t ref;
408};
409
410struct intel_uncore_box {
411 int phys_id;
412 int n_active; /* number of active events */
413 int n_events;
414 int cpu; /* cpu to collect events */
415 unsigned long flags;
416 atomic_t refcnt;
417 struct perf_event *events[UNCORE_PMC_IDX_MAX];
418 struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
419 unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
420 u64 tags[UNCORE_PMC_IDX_MAX];
421 struct pci_dev *pci_dev;
422 struct intel_uncore_pmu *pmu;
423 struct hrtimer hrtimer;
424 struct list_head list;
425 struct intel_uncore_extra_reg shared_regs[0];
426};
427
428#define UNCORE_BOX_FLAG_INITIATED 0
429
430struct uncore_event_desc {
431 struct kobj_attribute attr;
432 const char *config;
433};
434
435#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
436{ \
437 .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
438 .config = _config, \
439}
440
441#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
442static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
443 struct kobj_attribute *attr, \
444 char *page) \
445{ \
446 BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
447 return sprintf(page, _format "\n"); \
448} \
449static struct kobj_attribute format_attr_##_var = \
450 __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
451
452
453static ssize_t uncore_event_show(struct kobject *kobj,
454 struct kobj_attribute *attr, char *buf)
455{
456 struct uncore_event_desc *event =
457 container_of(attr, struct uncore_event_desc, attr);
458 return sprintf(buf, "%s", event->config);
459}
460
461static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
462{
463 return box->pmu->type->box_ctl;
464}
465
466static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
467{
468 return box->pmu->type->fixed_ctl;
469}
470
471static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
472{
473 return box->pmu->type->fixed_ctr;
474}
475
476static inline
477unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
478{
479 return idx * 4 + box->pmu->type->event_ctl;
480}
481
482static inline
483unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
484{
485 return idx * 8 + box->pmu->type->perf_ctr;
486}
487
488static inline
489unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
490{
491 if (!box->pmu->type->box_ctl)
492 return 0;
493 return box->pmu->type->box_ctl +
494 box->pmu->type->msr_offset * box->pmu->pmu_idx;
495}
496
497static inline
498unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
499{
500 if (!box->pmu->type->fixed_ctl)
501 return 0;
502 return box->pmu->type->fixed_ctl +
503 box->pmu->type->msr_offset * box->pmu->pmu_idx;
504}
505
506static inline
507unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
508{
509 return box->pmu->type->fixed_ctr +
510 box->pmu->type->msr_offset * box->pmu->pmu_idx;
511}
512
513static inline
514unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
515{
516 return box->pmu->type->event_ctl +
517 (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
518 box->pmu->type->msr_offset * box->pmu->pmu_idx;
519}
520
521static inline
522unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
523{
524 return box->pmu->type->perf_ctr +
525 (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
526 box->pmu->type->msr_offset * box->pmu->pmu_idx;
527}
528
529static inline
530unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
531{
532 if (box->pci_dev)
533 return uncore_pci_fixed_ctl(box);
534 else
535 return uncore_msr_fixed_ctl(box);
536}
537
538static inline
539unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
540{
541 if (box->pci_dev)
542 return uncore_pci_fixed_ctr(box);
543 else
544 return uncore_msr_fixed_ctr(box);
545}
546
547static inline
548unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
549{
550 if (box->pci_dev)
551 return uncore_pci_event_ctl(box, idx);
552 else
553 return uncore_msr_event_ctl(box, idx);
554}
555
556static inline
557unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
558{
559 if (box->pci_dev)
560 return uncore_pci_perf_ctr(box, idx);
561 else
562 return uncore_msr_perf_ctr(box, idx);
563}
564
565static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
566{
567 return box->pmu->type->perf_ctr_bits;
568}
569
570static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
571{
572 return box->pmu->type->fixed_ctr_bits;
573}
574
575static inline int uncore_num_counters(struct intel_uncore_box *box)
576{
577 return box->pmu->type->num_counters;
578}
579
580static inline void uncore_disable_box(struct intel_uncore_box *box)
581{
582 if (box->pmu->type->ops->disable_box)
583 box->pmu->type->ops->disable_box(box);
584}
585
586static inline void uncore_enable_box(struct intel_uncore_box *box)
587{
588 if (box->pmu->type->ops->enable_box)
589 box->pmu->type->ops->enable_box(box);
590}
591
592static inline void uncore_disable_event(struct intel_uncore_box *box,
593 struct perf_event *event)
594{
595 box->pmu->type->ops->disable_event(box, event);
596}
597
598static inline void uncore_enable_event(struct intel_uncore_box *box,
599 struct perf_event *event)
600{
601 box->pmu->type->ops->enable_event(box, event);
602}
603
604static inline u64 uncore_read_counter(struct intel_uncore_box *box,
605 struct perf_event *event)
606{
607 return box->pmu->type->ops->read_counter(box, event);
608}
609
610static inline void uncore_box_init(struct intel_uncore_box *box)
611{
612 if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
613 if (box->pmu->type->ops->init_box)
614 box->pmu->type->ops->init_box(box);
615 }
616}
617
618static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
619{
620 return (box->phys_id < 0);
621}
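The uncore_msr_*() helpers above all compute an MSR address as a base register plus a per-index stride plus a per-box offset; with pair_ctr_ctl set, the stride doubles because each counter register sits next to its control register. Here is a small user-space sketch of that arithmetic reusing the Sandy Bridge client Cbox constants from the defines above; the struct and function names are illustrative only, not kernel symbols.

#include <stdio.h>

/*
 * Minimal sketch of the uncore MSR addressing scheme: control registers
 * start at event_ctl, counters at perf_ctr, boxes are msr_offset apart,
 * and pair_ctr_ctl doubles the per-index stride.  Illustration only;
 * a real part's layout may differ.
 */
struct uncore_layout {
	unsigned event_ctl;	/* first event-select register */
	unsigned perf_ctr;	/* first counter register */
	unsigned msr_offset;	/* distance between boxes */
	unsigned pair_ctr_ctl;	/* 1 if ctl/ctr registers are interleaved */
};

static unsigned event_ctl_addr(const struct uncore_layout *l, int box, int idx)
{
	return l->event_ctl + (l->pair_ctr_ctl ? 2 * idx : idx) +
	       l->msr_offset * box;
}

static unsigned perf_ctr_addr(const struct uncore_layout *l, int box, int idx)
{
	return l->perf_ctr + (l->pair_ctr_ctl ? 2 * idx : idx) +
	       l->msr_offset * box;
}

int main(void)
{
	/* Sandy Bridge client Cbox layout, from the defines above. */
	struct uncore_layout snb_cbox = {
		.event_ctl    = 0x700,	/* SNB_UNC_CBO_0_PERFEVTSEL0 */
		.perf_ctr     = 0x706,	/* SNB_UNC_CBO_0_PER_CTR0 */
		.msr_offset   = 0x10,	/* SNB_UNC_CBO_MSR_OFFSET */
		.pair_ctr_ctl = 0,
	};
	int box, idx;

	for (box = 0; box < 2; box++)
		for (idx = 0; idx < 2; idx++)
			printf("box %d ctr %d: ctl 0x%x, ctr 0x%x\n",
			       box, idx,
			       event_ctl_addr(&snb_cbox, box, idx),
			       perf_ctr_addr(&snb_cbox, box, idx));
	return 0;
}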
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 47124a73dd73..92c7e39a079f 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -895,8 +895,8 @@ static void p4_pmu_disable_pebs(void)
895 * So for the moment let's leave metrics turned on forever -- it's	895 * So for the moment let's leave metrics turned on forever -- it's
896 * ok for now but needs to be revisited!	896 * ok for now but needs to be revisited!
897 * 897 *
898 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); 898 * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)0);
899 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); 899 * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
900 */ 900 */
901} 901}
902 902
@@ -909,7 +909,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
909 * state we need to clear P4_CCCR_OVF, otherwise interrupt get 909 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
910 * asserted again and again 910 * asserted again and again
911 */ 911 */
912 (void)checking_wrmsrl(hwc->config_base, 912 (void)wrmsrl_safe(hwc->config_base,
913 (u64)(p4_config_unpack_cccr(hwc->config)) & 913 (u64)(p4_config_unpack_cccr(hwc->config)) &
914 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); 914 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
915} 915}
@@ -943,8 +943,8 @@ static void p4_pmu_enable_pebs(u64 config)
943 943
944 bind = &p4_pebs_bind_map[idx]; 944 bind = &p4_pebs_bind_map[idx];
945 945
946 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); 946 (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
947 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); 947 (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
948} 948}
949 949
950static void p4_pmu_enable_event(struct perf_event *event) 950static void p4_pmu_enable_event(struct perf_event *event)
@@ -978,8 +978,8 @@ static void p4_pmu_enable_event(struct perf_event *event)
978 */ 978 */
979 p4_pmu_enable_pebs(hwc->config); 979 p4_pmu_enable_pebs(hwc->config);
980 980
981 (void)checking_wrmsrl(escr_addr, escr_conf); 981 (void)wrmsrl_safe(escr_addr, escr_conf);
982 (void)checking_wrmsrl(hwc->config_base, 982 (void)wrmsrl_safe(hwc->config_base,
983 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); 983 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
984} 984}
985 985
@@ -1325,7 +1325,7 @@ __init int p4_pmu_init(void)
1325 unsigned int low, high; 1325 unsigned int low, high;
1326 1326
1327 /* If we get stripped -- indexing fails */ 1327 /* If we get stripped -- indexing fails */
1328 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); 1328 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
1329 1329
1330 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 1330 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
1331 if (!(low & (1 << 7))) { 1331 if (!(low & (1 << 7))) {
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 32bcfc7dd230..e4dd0f7a0453 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -71,7 +71,7 @@ p6_pmu_disable_event(struct perf_event *event)
71 if (cpuc->enabled) 71 if (cpuc->enabled)
72 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 72 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
73 73
74 (void)checking_wrmsrl(hwc->config_base, val); 74 (void)wrmsrl_safe(hwc->config_base, val);
75} 75}
76 76
77static void p6_pmu_enable_event(struct perf_event *event) 77static void p6_pmu_enable_event(struct perf_event *event)
@@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
84 if (cpuc->enabled) 84 if (cpuc->enabled)
85 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 85 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
86 86
87 (void)checking_wrmsrl(hwc->config_base, val); 87 (void)wrmsrl_safe(hwc->config_base, val);
88} 88}
89 89
90PMU_FORMAT_ATTR(event, "config:0-7" ); 90PMU_FORMAT_ATTR(event, "config:0-7" );
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
deleted file mode 100644
index a640ae5ad201..000000000000
--- a/arch/x86/kernel/cpu/sched.c
+++ /dev/null
@@ -1,55 +0,0 @@
1#include <linux/sched.h>
2#include <linux/math64.h>
3#include <linux/percpu.h>
4#include <linux/irqflags.h>
5
6#include <asm/cpufeature.h>
7#include <asm/processor.h>
8
9#ifdef CONFIG_SMP
10
11static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
12
13static unsigned long scale_aperfmperf(void)
14{
15 struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
16 unsigned long ratio, flags;
17
18 local_irq_save(flags);
19 get_aperfmperf(&val);
20 local_irq_restore(flags);
21
22 ratio = calc_aperfmperf_ratio(old, &val);
23 *old = val;
24
25 return ratio;
26}
27
28unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
29{
30 /*
31 * do aperf/mperf on the cpu level because it includes things
32 * like turbo mode, which are relevant to full cores.
33 */
34 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
35 return scale_aperfmperf();
36
37 /*
38 * maybe have something cpufreq here
39 */
40
41 return default_scale_freq_power(sd, cpu);
42}
43
44unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
45{
46 /*
47 * aperf/mperf already includes the smt gain
48 */
49 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
50 return SCHED_LOAD_SCALE;
51
52 return default_scale_smt_power(sd, cpu);
53}
54
55#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 571246d81edf..ae42418bc50f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,8 +27,8 @@ static int die_counter;
27 27
28void printk_address(unsigned long address, int reliable) 28void printk_address(unsigned long address, int reliable)
29{ 29{
30 printk(" [<%p>] %s%pB\n", (void *) address, 30 pr_cont(" [<%p>] %s%pB\n",
31 reliable ? "" : "? ", (void *) address); 31 (void *)address, reliable ? "" : "? ", (void *)address);
32} 32}
33 33
34#ifdef CONFIG_FUNCTION_GRAPH_TRACER 34#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -271,6 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
271 current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) 271 current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
272 return 1; 272 return 1;
273 273
274 print_modules();
274 show_regs(regs); 275 show_regs(regs);
275#ifdef CONFIG_X86_32 276#ifdef CONFIG_X86_32
276 if (user_mode_vm(regs)) { 277 if (user_mode_vm(regs)) {
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index e0b1d783daab..1038a417ea53 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -73,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
73 if (kstack_end(stack)) 73 if (kstack_end(stack))
74 break; 74 break;
75 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 75 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
76 printk(KERN_CONT "\n"); 76 pr_cont("\n");
77 printk(KERN_CONT " %08lx", *stack++); 77 pr_cont(" %08lx", *stack++);
78 touch_nmi_watchdog(); 78 touch_nmi_watchdog();
79 } 79 }
80 printk(KERN_CONT "\n"); 80 pr_cont("\n");
81 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 81 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
82} 82}
83 83
@@ -86,12 +86,11 @@ void show_regs(struct pt_regs *regs)
86{ 86{
87 int i; 87 int i;
88 88
89 print_modules();
90 __show_regs(regs, !user_mode_vm(regs)); 89 __show_regs(regs, !user_mode_vm(regs));
91 90
92 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", 91 pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
93 TASK_COMM_LEN, current->comm, task_pid_nr(current), 92 TASK_COMM_LEN, current->comm, task_pid_nr(current),
94 current_thread_info(), current, task_thread_info(current)); 93 current_thread_info(), current, task_thread_info(current));
95 /* 94 /*
96 * When in-kernel, we also print out the stack and code at the 95 * When in-kernel, we also print out the stack and code at the
97 * time of the fault.. 96 * time of the fault..
@@ -102,10 +101,10 @@ void show_regs(struct pt_regs *regs)
102 unsigned char c; 101 unsigned char c;
103 u8 *ip; 102 u8 *ip;
104 103
105 printk(KERN_EMERG "Stack:\n"); 104 pr_emerg("Stack:\n");
106 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG); 105 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
107 106
108 printk(KERN_EMERG "Code: "); 107 pr_emerg("Code:");
109 108
110 ip = (u8 *)regs->ip - code_prologue; 109 ip = (u8 *)regs->ip - code_prologue;
111 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { 110 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
@@ -116,16 +115,16 @@ void show_regs(struct pt_regs *regs)
116 for (i = 0; i < code_len; i++, ip++) { 115 for (i = 0; i < code_len; i++, ip++) {
117 if (ip < (u8 *)PAGE_OFFSET || 116 if (ip < (u8 *)PAGE_OFFSET ||
118 probe_kernel_address(ip, c)) { 117 probe_kernel_address(ip, c)) {
119 printk(KERN_CONT " Bad EIP value."); 118 pr_cont(" Bad EIP value.");
120 break; 119 break;
121 } 120 }
122 if (ip == (u8 *)regs->ip) 121 if (ip == (u8 *)regs->ip)
123 printk(KERN_CONT "<%02x> ", c); 122 pr_cont(" <%02x>", c);
124 else 123 else
125 printk(KERN_CONT "%02x ", c); 124 pr_cont(" %02x", c);
126 } 125 }
127 } 126 }
128 printk(KERN_CONT "\n"); 127 pr_cont("\n");
129} 128}
130 129
131int is_valid_bugaddr(unsigned long ip) 130int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 791b76122aa8..b653675d5288 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -228,20 +228,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
228 if (stack >= irq_stack && stack <= irq_stack_end) { 228 if (stack >= irq_stack && stack <= irq_stack_end) {
229 if (stack == irq_stack_end) { 229 if (stack == irq_stack_end) {
230 stack = (unsigned long *) (irq_stack_end[-1]); 230 stack = (unsigned long *) (irq_stack_end[-1]);
231 printk(KERN_CONT " <EOI> "); 231 pr_cont(" <EOI> ");
232 } 232 }
233 } else { 233 } else {
234 if (((long) stack & (THREAD_SIZE-1)) == 0) 234 if (((long) stack & (THREAD_SIZE-1)) == 0)
235 break; 235 break;
236 } 236 }
237 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 237 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
238 printk(KERN_CONT "\n"); 238 pr_cont("\n");
239 printk(KERN_CONT " %016lx", *stack++); 239 pr_cont(" %016lx", *stack++);
240 touch_nmi_watchdog(); 240 touch_nmi_watchdog();
241 } 241 }
242 preempt_enable(); 242 preempt_enable();
243 243
244 printk(KERN_CONT "\n"); 244 pr_cont("\n");
245 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 245 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
246} 246}
247 247
@@ -254,10 +254,9 @@ void show_regs(struct pt_regs *regs)
254 254
255 sp = regs->sp; 255 sp = regs->sp;
256 printk("CPU %d ", cpu); 256 printk("CPU %d ", cpu);
257 print_modules();
258 __show_regs(regs, 1); 257 __show_regs(regs, 1);
259 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 258 printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n",
260 cur->comm, cur->pid, task_thread_info(cur), cur); 259 cur->comm, cur->pid, task_thread_info(cur), cur);
261 260
262 /* 261 /*
263 * When in-kernel, we also print out the stack and code at the 262 * When in-kernel, we also print out the stack and code at the
@@ -284,16 +283,16 @@ void show_regs(struct pt_regs *regs)
284 for (i = 0; i < code_len; i++, ip++) { 283 for (i = 0; i < code_len; i++, ip++) {
285 if (ip < (u8 *)PAGE_OFFSET || 284 if (ip < (u8 *)PAGE_OFFSET ||
286 probe_kernel_address(ip, c)) { 285 probe_kernel_address(ip, c)) {
287 printk(KERN_CONT " Bad RIP value."); 286 pr_cont(" Bad RIP value.");
288 break; 287 break;
289 } 288 }
290 if (ip == (u8 *)regs->ip) 289 if (ip == (u8 *)regs->ip)
291 printk(KERN_CONT "<%02x> ", c); 290 pr_cont("<%02x> ", c);
292 else 291 else
293 printk(KERN_CONT "%02x ", c); 292 pr_cont("%02x ", c);
294 } 293 }
295 } 294 }
296 printk(KERN_CONT "\n"); 295 pr_cont("\n");
297} 296}
298 297
299int is_valid_bugaddr(unsigned long ip) 298int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 41857970517f..ed858e9e9a74 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -944,7 +944,7 @@ void __init e820_reserve_resources(void)
944 for (i = 0; i < e820_saved.nr_map; i++) { 944 for (i = 0; i < e820_saved.nr_map; i++) {
945 struct e820entry *entry = &e820_saved.map[i]; 945 struct e820entry *entry = &e820_saved.map[i];
946 firmware_map_add_early(entry->addr, 946 firmware_map_add_early(entry->addr,
947 entry->addr + entry->size - 1, 947 entry->addr + entry->size,
948 e820_type_to_string(entry->type)); 948 e820_type_to_string(entry->type));
949 } 949 }
950} 950}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 7d65133b51be..69babd8c834f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1048,24 +1048,6 @@ apicinterrupt LOCAL_TIMER_VECTOR \
1048apicinterrupt X86_PLATFORM_IPI_VECTOR \ 1048apicinterrupt X86_PLATFORM_IPI_VECTOR \
1049 x86_platform_ipi smp_x86_platform_ipi 1049 x86_platform_ipi smp_x86_platform_ipi
1050 1050
1051#ifdef CONFIG_SMP
1052 ALIGN
1053 INTR_FRAME
1054.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
1055 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
1056.if NUM_INVALIDATE_TLB_VECTORS > \idx
1057ENTRY(invalidate_interrupt\idx)
1058 pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
1059 jmp .Lcommon_invalidate_interrupt0
1060 CFI_ADJUST_CFA_OFFSET -8
1061END(invalidate_interrupt\idx)
1062.endif
1063.endr
1064 CFI_ENDPROC
1065apicinterrupt INVALIDATE_TLB_VECTOR_START, \
1066 invalidate_interrupt0, smp_invalidate_interrupt
1067#endif
1068
1069apicinterrupt THRESHOLD_APIC_VECTOR \ 1051apicinterrupt THRESHOLD_APIC_VECTOR \
1070 threshold_interrupt smp_threshold_interrupt 1052 threshold_interrupt smp_threshold_interrupt
1071apicinterrupt THERMAL_APIC_VECTOR \ 1053apicinterrupt THERMAL_APIC_VECTOR \
@@ -1758,10 +1740,30 @@ end_repeat_nmi:
1758 */ 1740 */
1759 call save_paranoid 1741 call save_paranoid
1760 DEFAULT_FRAME 0 1742 DEFAULT_FRAME 0
1743
1744 /*
1745 * Save off the CR2 register. If we take a page fault in the NMI then
1746 * it could corrupt the CR2 value. If the NMI preempts a page fault
1747 * handler before it was able to read the CR2 register, and then the
1748 * NMI itself takes a page fault, the page fault that was preempted
1749 * will read the information from the NMI page fault and not the
1750 * origin fault. Save it off and restore it if it changes.
1751 * Use the r12 callee-saved register.
1752 */
1753 movq %cr2, %r12
1754
1761 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1755 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1762 movq %rsp,%rdi 1756 movq %rsp,%rdi
1763 movq $-1,%rsi 1757 movq $-1,%rsi
1764 call do_nmi 1758 call do_nmi
1759
1760 /* Did the NMI take a page fault? Restore cr2 if it did */
1761 movq %cr2, %rcx
1762 cmpq %rcx, %r12
1763 je 1f
1764 movq %r12, %cr2
17651:
1766
1765 testl %ebx,%ebx /* swapgs needed? */ 1767 testl %ebx,%ebx /* swapgs needed? */
1766 jnz nmi_restore 1768 jnz nmi_restore
1767nmi_swapgs: 1769nmi_swapgs:
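The entry_64.S hunk above saves %cr2 around do_nmi() and writes it back only if the handler changed it. A user-space sketch of that save/compare/restore pattern follows, with an ordinary variable standing in for CR2 and a made-up nmi_body() standing in for do_nmi(); neither name exists in the kernel.

#include <stdio.h>

/*
 * Sketch of the CR2 save/restore added to the NMI path: fake_cr2 plays
 * the role of %cr2, and nmi_body() may "fault" and clobber it.
 */
static unsigned long fake_cr2;

static void nmi_body(void)
{
	fake_cr2 = 0xdeadbeef;	/* pretend the NMI handler took a page fault */
}

static void nmi_with_cr2_protection(void)
{
	unsigned long saved = fake_cr2;	/* movq %cr2, %r12 */

	nmi_body();			/* call do_nmi */

	if (fake_cr2 != saved)		/* cmpq %rcx, %r12; je 1f */
		fake_cr2 = saved;	/* movq %r12, %cr2 */
}

int main(void)
{
	fake_cr2 = 0x1000;		/* fault address of the preempted handler */
	nmi_with_cr2_protection();
	printf("cr2 after NMI: 0x%lx\n", fake_cr2);	/* still 0x1000 */
	return 0;
}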
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3dafc6003b7c..1f5f1d5d2a02 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -294,9 +294,9 @@ void fixup_irqs(void)
294 raw_spin_unlock(&desc->lock); 294 raw_spin_unlock(&desc->lock);
295 295
296 if (break_affinity && set_affinity) 296 if (break_affinity && set_affinity)
297 printk("Broke affinity for irq %i\n", irq); 297 pr_notice("Broke affinity for irq %i\n", irq);
298 else if (!set_affinity) 298 else if (!set_affinity)
299 printk("Cannot set affinity for irq %i\n", irq); 299 pr_notice("Cannot set affinity for irq %i\n", irq);
300 } 300 }
301 301
302 /* 302 /*
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 252981afd6c4..6e03b0d69138 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -171,79 +171,6 @@ static void __init smp_intr_init(void)
171 */ 171 */
172 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 172 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
173 173
174 /* IPIs for invalidation */
175#define ALLOC_INVTLB_VEC(NR) \
176 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
177 invalidate_interrupt##NR)
178
179 switch (NUM_INVALIDATE_TLB_VECTORS) {
180 default:
181 ALLOC_INVTLB_VEC(31);
182 case 31:
183 ALLOC_INVTLB_VEC(30);
184 case 30:
185 ALLOC_INVTLB_VEC(29);
186 case 29:
187 ALLOC_INVTLB_VEC(28);
188 case 28:
189 ALLOC_INVTLB_VEC(27);
190 case 27:
191 ALLOC_INVTLB_VEC(26);
192 case 26:
193 ALLOC_INVTLB_VEC(25);
194 case 25:
195 ALLOC_INVTLB_VEC(24);
196 case 24:
197 ALLOC_INVTLB_VEC(23);
198 case 23:
199 ALLOC_INVTLB_VEC(22);
200 case 22:
201 ALLOC_INVTLB_VEC(21);
202 case 21:
203 ALLOC_INVTLB_VEC(20);
204 case 20:
205 ALLOC_INVTLB_VEC(19);
206 case 19:
207 ALLOC_INVTLB_VEC(18);
208 case 18:
209 ALLOC_INVTLB_VEC(17);
210 case 17:
211 ALLOC_INVTLB_VEC(16);
212 case 16:
213 ALLOC_INVTLB_VEC(15);
214 case 15:
215 ALLOC_INVTLB_VEC(14);
216 case 14:
217 ALLOC_INVTLB_VEC(13);
218 case 13:
219 ALLOC_INVTLB_VEC(12);
220 case 12:
221 ALLOC_INVTLB_VEC(11);
222 case 11:
223 ALLOC_INVTLB_VEC(10);
224 case 10:
225 ALLOC_INVTLB_VEC(9);
226 case 9:
227 ALLOC_INVTLB_VEC(8);
228 case 8:
229 ALLOC_INVTLB_VEC(7);
230 case 7:
231 ALLOC_INVTLB_VEC(6);
232 case 6:
233 ALLOC_INVTLB_VEC(5);
234 case 5:
235 ALLOC_INVTLB_VEC(4);
236 case 4:
237 ALLOC_INVTLB_VEC(3);
238 case 3:
239 ALLOC_INVTLB_VEC(2);
240 case 2:
241 ALLOC_INVTLB_VEC(1);
242 case 1:
243 ALLOC_INVTLB_VEC(0);
244 break;
245 }
246
247 /* IPI for generic function call */ 174 /* IPI for generic function call */
248 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 175 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
249 176
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e554e5ad2fe8..c1d61ee4b4f1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,6 +39,9 @@
39#include <asm/desc.h> 39#include <asm/desc.h>
40#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
41#include <asm/idle.h> 41#include <asm/idle.h>
42#include <asm/apic.h>
43#include <asm/apicdef.h>
44#include <asm/hypervisor.h>
42 45
43static int kvmapf = 1; 46static int kvmapf = 1;
44 47
@@ -283,6 +286,22 @@ static void kvm_register_steal_time(void)
283 cpu, __pa(st)); 286 cpu, __pa(st));
284} 287}
285 288
289static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
290
291static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
292{
293 /**
294 * This relies on __test_and_clear_bit to modify the memory
295 * in a way that is atomic with respect to the local CPU.
296 * The hypervisor only accesses this memory from the local CPU so
297 * there's no need for lock or memory barriers.
298 * An optimization barrier is implied in apic write.
299 */
300 if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi)))
301 return;
302 apic_write(APIC_EOI, APIC_EOI_ACK);
303}
304
286void __cpuinit kvm_guest_cpu_init(void) 305void __cpuinit kvm_guest_cpu_init(void)
287{ 306{
288 if (!kvm_para_available()) 307 if (!kvm_para_available())
@@ -300,11 +319,20 @@ void __cpuinit kvm_guest_cpu_init(void)
300 smp_processor_id()); 319 smp_processor_id());
301 } 320 }
302 321
322 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
323 unsigned long pa;
324 /* Size alignment is implied but just to make it explicit. */
325 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
326 __get_cpu_var(kvm_apic_eoi) = 0;
327 pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED;
328 wrmsrl(MSR_KVM_PV_EOI_EN, pa);
329 }
330
303 if (has_steal_clock) 331 if (has_steal_clock)
304 kvm_register_steal_time(); 332 kvm_register_steal_time();
305} 333}
306 334
307static void kvm_pv_disable_apf(void *unused) 335static void kvm_pv_disable_apf(void)
308{ 336{
309 if (!__get_cpu_var(apf_reason).enabled) 337 if (!__get_cpu_var(apf_reason).enabled)
310 return; 338 return;
@@ -316,11 +344,23 @@ static void kvm_pv_disable_apf(void *unused)
316 smp_processor_id()); 344 smp_processor_id());
317} 345}
318 346
347static void kvm_pv_guest_cpu_reboot(void *unused)
348{
349 /*
350 * We disable PV EOI before we load a new kernel by kexec,
351 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
352 * New kernel can re-enable when it boots.
353 */
354 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
355 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
356 kvm_pv_disable_apf();
357}
358
319static int kvm_pv_reboot_notify(struct notifier_block *nb, 359static int kvm_pv_reboot_notify(struct notifier_block *nb,
320 unsigned long code, void *unused) 360 unsigned long code, void *unused)
321{ 361{
322 if (code == SYS_RESTART) 362 if (code == SYS_RESTART)
323 on_each_cpu(kvm_pv_disable_apf, NULL, 1); 363 on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
324 return NOTIFY_DONE; 364 return NOTIFY_DONE;
325} 365}
326 366
@@ -371,7 +411,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
371static void kvm_guest_cpu_offline(void *dummy) 411static void kvm_guest_cpu_offline(void *dummy)
372{ 412{
373 kvm_disable_steal_time(); 413 kvm_disable_steal_time();
374 kvm_pv_disable_apf(NULL); 414 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
415 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
416 kvm_pv_disable_apf();
375 apf_task_wake_all(); 417 apf_task_wake_all();
376} 418}
377 419
@@ -424,6 +466,9 @@ void __init kvm_guest_init(void)
424 pv_time_ops.steal_clock = kvm_steal_clock; 466 pv_time_ops.steal_clock = kvm_steal_clock;
425 } 467 }
426 468
469 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
470 apic_set_eoi_write(kvm_guest_apic_eoi_write);
471
427#ifdef CONFIG_SMP 472#ifdef CONFIG_SMP
428 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 473 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
429 register_cpu_notifier(&kvm_cpu_notifier); 474 register_cpu_notifier(&kvm_cpu_notifier);
@@ -432,6 +477,19 @@ void __init kvm_guest_init(void)
432#endif 477#endif
433} 478}
434 479
480static bool __init kvm_detect(void)
481{
482 if (!kvm_para_available())
483 return false;
484 return true;
485}
486
487const struct hypervisor_x86 x86_hyper_kvm __refconst = {
488 .name = "KVM",
489 .detect = kvm_detect,
490};
491EXPORT_SYMBOL_GPL(x86_hyper_kvm);
492
435static __init int activate_jump_labels(void) 493static __init int activate_jump_labels(void)
436{ 494{
437 if (has_steal_clock) { 495 if (has_steal_clock) {
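kvm_guest_apic_eoi_write() above turns an EOI into a single test-and-clear of a flag shared with the hypervisor, falling back to the real APIC write only when the flag was not set. Below is a hedged user-space sketch of that fast path; all names are illustrative, not kernel symbols.

#include <stdio.h>
#include <stdbool.h>

/*
 * Sketch of the PV EOI fast path: if the hypervisor set the per-CPU
 * flag, acknowledging the interrupt only needs the flag cleared and the
 * (trapping) APIC EOI write can be skipped.
 */
static unsigned long pv_eoi_flag;	/* bit 0 plays the role of KVM_PV_EOI_BIT */

static bool test_and_clear_bit0(unsigned long *w)
{
	bool was_set = *w & 1UL;

	*w &= ~1UL;
	return was_set;
}

static void apic_eoi_write(void)
{
	printf("slow path: write to APIC_EOI\n");
}

static void guest_apic_eoi(void)
{
	if (test_and_clear_bit0(&pv_eoi_flag))
		return;			/* fast path: no APIC write needed */
	apic_eoi_write();
}

int main(void)
{
	pv_eoi_flag = 1;	/* hypervisor indicated EOI is not required */
	guest_apic_eoi();	/* fast path, no output */

	pv_eoi_flag = 0;
	guest_apic_eoi();	/* slow path */
	return 0;
}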
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fbdfc6917180..4873e62db6a1 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -87,6 +87,7 @@
87#include <asm/microcode.h> 87#include <asm/microcode.h>
88#include <asm/processor.h> 88#include <asm/processor.h>
89#include <asm/cpu_device_id.h> 89#include <asm/cpu_device_id.h>
90#include <asm/perf_event.h>
90 91
91MODULE_DESCRIPTION("Microcode Update Driver"); 92MODULE_DESCRIPTION("Microcode Update Driver");
92MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 93MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -277,7 +278,6 @@ static int reload_for_cpu(int cpu)
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 278 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278 int err = 0; 279 int err = 0;
279 280
280 mutex_lock(&microcode_mutex);
281 if (uci->valid) { 281 if (uci->valid) {
282 enum ucode_state ustate; 282 enum ucode_state ustate;
283 283
@@ -288,7 +288,6 @@ static int reload_for_cpu(int cpu)
288 if (ustate == UCODE_ERROR) 288 if (ustate == UCODE_ERROR)
289 err = -EINVAL; 289 err = -EINVAL;
290 } 290 }
291 mutex_unlock(&microcode_mutex);
292 291
293 return err; 292 return err;
294} 293}
@@ -298,19 +297,31 @@ static ssize_t reload_store(struct device *dev,
298 const char *buf, size_t size) 297 const char *buf, size_t size)
299{ 298{
300 unsigned long val; 299 unsigned long val;
301 int cpu = dev->id; 300 int cpu;
302 ssize_t ret = 0; 301 ssize_t ret = 0, tmp_ret;
303 302
304 ret = kstrtoul(buf, 0, &val); 303 ret = kstrtoul(buf, 0, &val);
305 if (ret) 304 if (ret)
306 return ret; 305 return ret;
307 306
308 if (val == 1) { 307 if (val != 1)
309 get_online_cpus(); 308 return size;
310 if (cpu_online(cpu)) 309
311 ret = reload_for_cpu(cpu); 310 get_online_cpus();
312 put_online_cpus(); 311 mutex_lock(&microcode_mutex);
312 for_each_online_cpu(cpu) {
313 tmp_ret = reload_for_cpu(cpu);
314 if (tmp_ret != 0)
315 pr_warn("Error reloading microcode on CPU %d\n", cpu);
316
317 /* save retval of the first encountered reload error */
318 if (!ret)
319 ret = tmp_ret;
313 } 320 }
321 if (!ret)
322 perf_check_microcode();
323 mutex_unlock(&microcode_mutex);
324 put_online_cpus();
314 325
315 if (!ret) 326 if (!ret)
316 ret = size; 327 ret = size;
@@ -339,7 +350,6 @@ static DEVICE_ATTR(version, 0400, version_show, NULL);
339static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); 350static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
340 351
341static struct attribute *mc_default_attrs[] = { 352static struct attribute *mc_default_attrs[] = {
342 &dev_attr_reload.attr,
343 &dev_attr_version.attr, 353 &dev_attr_version.attr,
344 &dev_attr_processor_flags.attr, 354 &dev_attr_processor_flags.attr,
345 NULL 355 NULL
@@ -504,7 +514,7 @@ static struct notifier_block __refdata mc_cpu_notifier = {
504 514
505#ifdef MODULE 515#ifdef MODULE
506/* Autoload on Intel and AMD systems */ 516/* Autoload on Intel and AMD systems */
507static const struct x86_cpu_id microcode_id[] = { 517static const struct x86_cpu_id __initconst microcode_id[] = {
508#ifdef CONFIG_MICROCODE_INTEL 518#ifdef CONFIG_MICROCODE_INTEL
509 { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, 519 { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
510#endif 520#endif
@@ -516,6 +526,16 @@ static const struct x86_cpu_id microcode_id[] = {
516MODULE_DEVICE_TABLE(x86cpu, microcode_id); 526MODULE_DEVICE_TABLE(x86cpu, microcode_id);
517#endif 527#endif
518 528
529static struct attribute *cpu_root_microcode_attrs[] = {
530 &dev_attr_reload.attr,
531 NULL
532};
533
534static struct attribute_group cpu_root_microcode_group = {
535 .name = "microcode",
536 .attrs = cpu_root_microcode_attrs,
537};
538
519static int __init microcode_init(void) 539static int __init microcode_init(void)
520{ 540{
521 struct cpuinfo_x86 *c = &cpu_data(0); 541 struct cpuinfo_x86 *c = &cpu_data(0);
@@ -540,16 +560,25 @@ static int __init microcode_init(void)
540 mutex_lock(&microcode_mutex); 560 mutex_lock(&microcode_mutex);
541 561
542 error = subsys_interface_register(&mc_cpu_interface); 562 error = subsys_interface_register(&mc_cpu_interface);
543 563 if (!error)
564 perf_check_microcode();
544 mutex_unlock(&microcode_mutex); 565 mutex_unlock(&microcode_mutex);
545 put_online_cpus(); 566 put_online_cpus();
546 567
547 if (error) 568 if (error)
548 goto out_pdev; 569 goto out_pdev;
549 570
571 error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
572 &cpu_root_microcode_group);
573
574 if (error) {
575 pr_err("Error creating microcode group!\n");
576 goto out_driver;
577 }
578
550 error = microcode_dev_init(); 579 error = microcode_dev_init();
551 if (error) 580 if (error)
552 goto out_driver; 581 goto out_ucode_group;
553 582
554 register_syscore_ops(&mc_syscore_ops); 583 register_syscore_ops(&mc_syscore_ops);
555 register_hotcpu_notifier(&mc_cpu_notifier); 584 register_hotcpu_notifier(&mc_cpu_notifier);
@@ -559,7 +588,11 @@ static int __init microcode_init(void)
559 588
560 return 0; 589 return 0;
561 590
562out_driver: 591 out_ucode_group:
592 sysfs_remove_group(&cpu_subsys.dev_root->kobj,
593 &cpu_root_microcode_group);
594
595 out_driver:
563 get_online_cpus(); 596 get_online_cpus();
564 mutex_lock(&microcode_mutex); 597 mutex_lock(&microcode_mutex);
565 598
@@ -568,7 +601,7 @@ out_driver:
568 mutex_unlock(&microcode_mutex); 601 mutex_unlock(&microcode_mutex);
569 put_online_cpus(); 602 put_online_cpus();
570 603
571out_pdev: 604 out_pdev:
572 platform_device_unregister(microcode_pdev); 605 platform_device_unregister(microcode_pdev);
573 return error; 606 return error;
574 607
@@ -584,6 +617,9 @@ static void __exit microcode_exit(void)
584 unregister_hotcpu_notifier(&mc_cpu_notifier); 617 unregister_hotcpu_notifier(&mc_cpu_notifier);
585 unregister_syscore_ops(&mc_syscore_ops); 618 unregister_syscore_ops(&mc_syscore_ops);
586 619
620 sysfs_remove_group(&cpu_subsys.dev_root->kobj,
621 &cpu_root_microcode_group);
622
587 get_online_cpus(); 623 get_online_cpus();
588 mutex_lock(&microcode_mutex); 624 mutex_lock(&microcode_mutex);
589 625
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f21fd94ac897..216a4d754b0c 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -15,6 +15,9 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
18#include <linux/moduleloader.h> 21#include <linux/moduleloader.h>
19#include <linux/elf.h> 22#include <linux/elf.h>
20#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
@@ -30,9 +33,14 @@
30#include <asm/pgtable.h> 33#include <asm/pgtable.h>
31 34
32#if 0 35#if 0
33#define DEBUGP printk 36#define DEBUGP(fmt, ...) \
37 printk(KERN_DEBUG fmt, ##__VA_ARGS__)
34#else 38#else
35#define DEBUGP(fmt...) 39#define DEBUGP(fmt, ...) \
40do { \
41 if (0) \
42 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
43} while (0)
36#endif 44#endif
37 45
38void *module_alloc(unsigned long size) 46void *module_alloc(unsigned long size)
@@ -56,8 +64,8 @@ int apply_relocate(Elf32_Shdr *sechdrs,
56 Elf32_Sym *sym; 64 Elf32_Sym *sym;
57 uint32_t *location; 65 uint32_t *location;
58 66
59 DEBUGP("Applying relocate section %u to %u\n", relsec, 67 DEBUGP("Applying relocate section %u to %u\n",
60 sechdrs[relsec].sh_info); 68 relsec, sechdrs[relsec].sh_info);
61 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { 69 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
62 /* This is where to make the change */ 70 /* This is where to make the change */
63 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr 71 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -73,11 +81,11 @@ int apply_relocate(Elf32_Shdr *sechdrs,
73 *location += sym->st_value; 81 *location += sym->st_value;
74 break; 82 break;
75 case R_386_PC32: 83 case R_386_PC32:
76 /* Add the value, subtract its postition */ 84 /* Add the value, subtract its position */
77 *location += sym->st_value - (uint32_t)location; 85 *location += sym->st_value - (uint32_t)location;
78 break; 86 break;
79 default: 87 default:
80 printk(KERN_ERR "module %s: Unknown relocation: %u\n", 88 pr_err("%s: Unknown relocation: %u\n",
81 me->name, ELF32_R_TYPE(rel[i].r_info)); 89 me->name, ELF32_R_TYPE(rel[i].r_info));
82 return -ENOEXEC; 90 return -ENOEXEC;
83 } 91 }
@@ -97,8 +105,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
97 void *loc; 105 void *loc;
98 u64 val; 106 u64 val;
99 107
100 DEBUGP("Applying relocate section %u to %u\n", relsec, 108 DEBUGP("Applying relocate section %u to %u\n",
101 sechdrs[relsec].sh_info); 109 relsec, sechdrs[relsec].sh_info);
102 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { 110 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
103 /* This is where to make the change */ 111 /* This is where to make the change */
104 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr 112 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -110,8 +118,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
110 + ELF64_R_SYM(rel[i].r_info); 118 + ELF64_R_SYM(rel[i].r_info);
111 119
112 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", 120 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
113 (int)ELF64_R_TYPE(rel[i].r_info), 121 (int)ELF64_R_TYPE(rel[i].r_info),
114 sym->st_value, rel[i].r_addend, (u64)loc); 122 sym->st_value, rel[i].r_addend, (u64)loc);
115 123
116 val = sym->st_value + rel[i].r_addend; 124 val = sym->st_value + rel[i].r_addend;
117 125
@@ -140,7 +148,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
140#endif 148#endif
141 break; 149 break;
142 default: 150 default:
143 printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", 151 pr_err("%s: Unknown rela relocation: %llu\n",
144 me->name, ELF64_R_TYPE(rel[i].r_info)); 152 me->name, ELF64_R_TYPE(rel[i].r_info));
145 return -ENOEXEC; 153 return -ENOEXEC;
146 } 154 }
@@ -148,9 +156,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
148 return 0; 156 return 0;
149 157
150overflow: 158overflow:
151 printk(KERN_ERR "overflow in relocation type %d val %Lx\n", 159 pr_err("overflow in relocation type %d val %Lx\n",
152 (int)ELF64_R_TYPE(rel[i].r_info), val); 160 (int)ELF64_R_TYPE(rel[i].r_info), val);
153 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", 161 pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
154 me->name); 162 me->name);
155 return -ENOEXEC; 163 return -ENOEXEC;
156} 164}
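The relocation cases above boil down to simple arithmetic on a 32-bit slot: R_386_32 adds the symbol value, while R_386_PC32 adds the symbol value minus the slot's own address (a PC-relative reference). A small stand-alone C sketch of those two computations on a fake section; the addresses are invented for illustration.

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch of the two i386 relocation types handled in apply_relocate():
 * absolute (R_386_32) and PC-relative (R_386_PC32) fixups on a slot.
 */
int main(void)
{
	uint32_t section[2] = { 0, 0 };		/* slots to be relocated */
	uint32_t sym_value = 0x08049000;	/* made-up symbol address */

	/* R_386_32: absolute reference, just add the symbol value */
	section[0] += sym_value;

	/* R_386_PC32: PC-relative, subtract the slot's own position */
	section[1] += sym_value - (uint32_t)(uintptr_t)&section[1];

	printf("R_386_32   slot: 0x%08x\n", section[0]);
	printf("R_386_PC32 slot: 0x%08x\n", section[1]);
	return 0;
}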
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index a0b2f84457be..f84f5c57de35 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -365,8 +365,9 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
365#ifdef CONFIG_X86_32 365#ifdef CONFIG_X86_32
366/* 366/*
367 * For i386, NMIs use the same stack as the kernel, and we can 367 * For i386, NMIs use the same stack as the kernel, and we can
368 * add a workaround to the iret problem in C. Simply have 3 states 368 * add a workaround to the iret problem in C (preventing nested
369 * the NMI can be in. 369 * NMIs if an NMI takes a trap). Simply have 3 states the NMI
370 * can be in:
370 * 371 *
371 * 1) not running 372 * 1) not running
372 * 2) executing 373 * 2) executing
@@ -383,32 +384,50 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
383 * If an NMI hits a breakpoint that executes an iret, another 384 * If an NMI hits a breakpoint that executes an iret, another
384 * NMI can preempt it. We do not want to allow this new NMI 385 * NMI can preempt it. We do not want to allow this new NMI
385 * to run, but we want to execute it when the first one finishes. 386 * to run, but we want to execute it when the first one finishes.
386 * We set the state to "latched", and the first NMI will perform 387 * We set the state to "latched", and the exit of the first NMI will
387 * an cmpxchg on the state, and if it doesn't successfully 388 * perform a dec_return, if the result is zero (NOT_RUNNING), then
388 * reset the state to "not running" it will restart the next 389 * it will simply exit the NMI handler. If not, the dec_return
389 * NMI. 390 * would have set the state to NMI_EXECUTING (what we want it to
391 * be when we are running). In this case, we simply jump back
392 * to rerun the NMI handler again, and restart the 'latched' NMI.
393 *
394 * No trap (breakpoint or page fault) should be hit before nmi_restart,
395 * thus there is no race between the first check of state for NOT_RUNNING
396 * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
397 * at this point.
398 *
399 * In case the NMI takes a page fault, we need to save off the CR2
400 * because the NMI could have preempted another page fault and corrupt
401 * the CR2 that is about to be read. As nested NMIs must be restarted
402 * and they can not take breakpoints or page faults, the update of the
403 * CR2 must be done before converting the nmi state back to NOT_RUNNING.
404 * Otherwise, there would be a race of another nested NMI coming in
405 * after setting state to NOT_RUNNING but before updating the nmi_cr2.
390 */ 406 */
391enum nmi_states { 407enum nmi_states {
392 NMI_NOT_RUNNING, 408 NMI_NOT_RUNNING = 0,
393 NMI_EXECUTING, 409 NMI_EXECUTING,
394 NMI_LATCHED, 410 NMI_LATCHED,
395}; 411};
396static DEFINE_PER_CPU(enum nmi_states, nmi_state); 412static DEFINE_PER_CPU(enum nmi_states, nmi_state);
413static DEFINE_PER_CPU(unsigned long, nmi_cr2);
397 414
398#define nmi_nesting_preprocess(regs) \ 415#define nmi_nesting_preprocess(regs) \
399 do { \ 416 do { \
400 if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ 417 if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
401 __get_cpu_var(nmi_state) = NMI_LATCHED; \ 418 this_cpu_write(nmi_state, NMI_LATCHED); \
402 return; \ 419 return; \
403 } \ 420 } \
404 nmi_restart: \ 421 this_cpu_write(nmi_state, NMI_EXECUTING); \
405 __get_cpu_var(nmi_state) = NMI_EXECUTING; \ 422 this_cpu_write(nmi_cr2, read_cr2()); \
406 } while (0) 423 } while (0); \
424 nmi_restart:
407 425
408#define nmi_nesting_postprocess() \ 426#define nmi_nesting_postprocess() \
409 do { \ 427 do { \
410 if (cmpxchg(&__get_cpu_var(nmi_state), \ 428 if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
411 NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ 429 write_cr2(this_cpu_read(nmi_cr2)); \
430 if (this_cpu_dec_return(nmi_state)) \
412 goto nmi_restart; \ 431 goto nmi_restart; \
413 } while (0) 432 } while (0)
414#else /* x86_64 */ 433#else /* x86_64 */
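The reworked nesting logic above encodes NOT_RUNNING/EXECUTING/LATCHED as 0/1/2 so that a single decrement at exit both detects a latched NMI and resets the state. A user-space sketch of that state machine, using a plain int where the kernel uses a per-CPU variable; handle_one_nmi() and the pending counter are invented for the example.

#include <stdio.h>

/*
 * Sketch of the 3-state nesting scheme: dec_return() mimics
 * this_cpu_dec_return().  If a second NMI arrived mid-handler
 * (state == NMI_LATCHED), the decrement leaves NMI_EXECUTING and the
 * handler loops; otherwise it lands on NMI_NOT_RUNNING and exits.
 */
enum nmi_states { NMI_NOT_RUNNING = 0, NMI_EXECUTING, NMI_LATCHED };

static int nmi_state;

static int dec_return(int *v)
{
	return --(*v);
}

static void handle_one_nmi(int *pending)
{
	if (*pending) {
		/* model a nested NMI arriving while we are executing */
		if (nmi_state != NMI_NOT_RUNNING)
			nmi_state = NMI_LATCHED;
		(*pending)--;
	}
}

static void nmi_entry(int nested_pending)
{
	if (nmi_state != NMI_NOT_RUNNING) {
		nmi_state = NMI_LATCHED;
		return;
	}
	nmi_state = NMI_EXECUTING;
nmi_restart:
	handle_one_nmi(&nested_pending);
	if (dec_return(&nmi_state))
		goto nmi_restart;	/* was LATCHED: run the latched NMI */
	/* was EXECUTING: now NOT_RUNNING, done */
}

int main(void)
{
	nmi_entry(1);	/* one nested NMI latched during the first */
	printf("final state: %d (0 == NMI_NOT_RUNNING)\n", nmi_state);
	return 0;
}

The decrement trick is what lets the exit path distinguish "a second NMI arrived" (result NMI_EXECUTING) from "we are done" (result NMI_NOT_RUNNING) without the cmpxchg the old code used.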
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 149b8d9c6ad4..6d9582ec0324 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -42,7 +42,8 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
42static void __init init_nmi_testsuite(void) 42static void __init init_nmi_testsuite(void)
43{ 43{
44 /* trap all the unknown NMIs we may generate */ 44 /* trap all the unknown NMIs we may generate */
45 register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); 45 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk",
46 __initdata);
46} 47}
47 48
48static void __init cleanup_nmi_testsuite(void) 49static void __init cleanup_nmi_testsuite(void)
@@ -64,8 +65,8 @@ static void __init test_nmi_ipi(struct cpumask *mask)
64{ 65{
65 unsigned long timeout; 66 unsigned long timeout;
66 67
67 if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback, 68 if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
68 NMI_FLAG_FIRST, "nmi_selftest")) { 69 NMI_FLAG_FIRST, "nmi_selftest", __initdata)) {
69 nmi_fail = FAILURE; 70 nmi_fail = FAILURE;
70 return; 71 return;
71 } 72 }
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 9ce885996fd7..17fff18a1031 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -352,9 +352,7 @@ struct pv_cpu_ops pv_cpu_ops = {
352#endif 352#endif
353 .wbinvd = native_wbinvd, 353 .wbinvd = native_wbinvd,
354 .read_msr = native_read_msr_safe, 354 .read_msr = native_read_msr_safe,
355 .rdmsr_regs = native_rdmsr_safe_regs,
356 .write_msr = native_write_msr_safe, 355 .write_msr = native_write_msr_safe,
357 .wrmsr_regs = native_wrmsr_safe_regs,
358 .read_tsc = native_read_tsc, 356 .read_tsc = native_read_tsc,
359 .read_pmc = native_read_pmc, 357 .read_pmc = native_read_pmc,
360 .read_tscp = native_read_tscp, 358 .read_tscp = native_read_tscp,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index b72838bae64a..299d49302e7d 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -22,6 +22,8 @@
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */ 23 */
24 24
25#define pr_fmt(fmt) "Calgary: " fmt
26
25#include <linux/kernel.h> 27#include <linux/kernel.h>
26#include <linux/init.h> 28#include <linux/init.h>
27#include <linux/types.h> 29#include <linux/types.h>
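The added pr_fmt() definition is what lets the later hunks in this file drop the repeated "Calgary: " literals: the kernel's pr_warn()/pr_emerg() macros paste pr_fmt(fmt) into every format string, and the define must appear before the printk headers so their default (empty) pr_fmt does not win. A tiny userspace analogue of the pasting trick, with illustrative macro bodies that are not the kernel API, looks like this.

#include <stdio.h>

/* Define the prefix once, before the pr_*-style macros, and every
 * message picks it up automatically (GNU-style variadic macros). */
#define pr_fmt(fmt) "Calgary: " fmt

#define pr_warn(fmt, ...)  printf("WARNING: " pr_fmt(fmt), ##__VA_ARGS__)
#define pr_emerg(fmt, ...) printf("EMERG: "   pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
    pr_warn("IOMMU full\n");              /* -> "WARNING: Calgary: IOMMU full" */
    pr_emerg("DMA error on Calgary PHB 0x%x\n", 3);
    return 0;
}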
@@ -245,7 +247,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
245 offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, 247 offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
246 npages, 0, boundary_size, 0); 248 npages, 0, boundary_size, 0);
247 if (offset == ~0UL) { 249 if (offset == ~0UL) {
248 printk(KERN_WARNING "Calgary: IOMMU full.\n"); 250 pr_warn("IOMMU full\n");
249 spin_unlock_irqrestore(&tbl->it_lock, flags); 251 spin_unlock_irqrestore(&tbl->it_lock, flags);
250 if (panic_on_overflow) 252 if (panic_on_overflow)
251 panic("Calgary: fix the allocator.\n"); 253 panic("Calgary: fix the allocator.\n");
@@ -271,8 +273,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
271 entry = iommu_range_alloc(dev, tbl, npages); 273 entry = iommu_range_alloc(dev, tbl, npages);
272 274
273 if (unlikely(entry == DMA_ERROR_CODE)) { 275 if (unlikely(entry == DMA_ERROR_CODE)) {
274 printk(KERN_WARNING "Calgary: failed to allocate %u pages in " 276 pr_warn("failed to allocate %u pages in iommu %p\n",
275 "iommu %p\n", npages, tbl); 277 npages, tbl);
276 return DMA_ERROR_CODE; 278 return DMA_ERROR_CODE;
277 } 279 }
278 280
@@ -561,8 +563,7 @@ static void calgary_tce_cache_blast(struct iommu_table *tbl)
561 i++; 563 i++;
562 } while ((val & 0xff) != 0xff && i < 100); 564 } while ((val & 0xff) != 0xff && i < 100);
563 if (i == 100) 565 if (i == 100)
564 printk(KERN_WARNING "Calgary: PCI bus not quiesced, " 566 pr_warn("PCI bus not quiesced, continuing anyway\n");
565 "continuing anyway\n");
566 567
567 /* invalidate TCE cache */ 568 /* invalidate TCE cache */
568 target = calgary_reg(bbar, tar_offset(tbl->it_busno)); 569 target = calgary_reg(bbar, tar_offset(tbl->it_busno));
@@ -604,8 +605,7 @@ begin:
604 i++; 605 i++;
605 } while ((val64 & 0xff) != 0xff && i < 100); 606 } while ((val64 & 0xff) != 0xff && i < 100);
606 if (i == 100) 607 if (i == 100)
607 printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " 608 pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n");
608 "continuing anyway\n");
609 609
610 /* 3. poll Page Migration DEBUG for SoftStopFault */ 610 /* 3. poll Page Migration DEBUG for SoftStopFault */
611 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); 611 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
@@ -617,8 +617,7 @@ begin:
617 if (++count < 100) 617 if (++count < 100)
618 goto begin; 618 goto begin;
619 else { 619 else {
620 printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " 620 pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n");
621 "aborting TCE cache flush sequence!\n");
622 return; /* pray for the best */ 621 return; /* pray for the best */
623 } 622 }
624 } 623 }
@@ -840,8 +839,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl)
840 plssr = be32_to_cpu(readl(target)); 839 plssr = be32_to_cpu(readl(target));
841 840
842 /* If no error, the agent ID in the CSR is not valid */ 841 /* If no error, the agent ID in the CSR is not valid */
843 printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " 842 pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n",
844 "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); 843 tbl->it_busno, csr, plssr);
845} 844}
846 845
847static void calioc2_dump_error_regs(struct iommu_table *tbl) 846static void calioc2_dump_error_regs(struct iommu_table *tbl)
@@ -867,22 +866,21 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl)
867 target = calgary_reg(bbar, phboff | 0x800); 866 target = calgary_reg(bbar, phboff | 0x800);
868 mck = be32_to_cpu(readl(target)); 867 mck = be32_to_cpu(readl(target));
869 868
870 printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", 869 pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno);
871 tbl->it_busno);
872 870
873 printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", 871 pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
874 csr, plssr, csmr, mck); 872 csr, plssr, csmr, mck);
875 873
876 /* dump rest of error regs */ 874 /* dump rest of error regs */
877 printk(KERN_EMERG "Calgary: "); 875 pr_emerg("");
878 for (i = 0; i < ARRAY_SIZE(errregs); i++) { 876 for (i = 0; i < ARRAY_SIZE(errregs); i++) {
879 /* err regs are at 0x810 - 0x870 */ 877 /* err regs are at 0x810 - 0x870 */
880 erroff = (0x810 + (i * 0x10)); 878 erroff = (0x810 + (i * 0x10));
881 target = calgary_reg(bbar, phboff | erroff); 879 target = calgary_reg(bbar, phboff | erroff);
882 errregs[i] = be32_to_cpu(readl(target)); 880 errregs[i] = be32_to_cpu(readl(target));
883 printk("0x%08x@0x%lx ", errregs[i], erroff); 881 pr_cont("0x%08x@0x%lx ", errregs[i], erroff);
884 } 882 }
885 printk("\n"); 883 pr_cont("\n");
886 884
887 /* root complex status */ 885 /* root complex status */
888 target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); 886 target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index c0f420f76cd3..de2b7ad70273 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,15 +45,6 @@ int iommu_detected __read_mostly = 0;
45 */ 45 */
46int iommu_pass_through __read_mostly; 46int iommu_pass_through __read_mostly;
47 47
48/*
49 * Group multi-function PCI devices into a single device-group for the
50 * iommu_device_group interface. This tells the iommu driver to pretend
51 * it cannot distinguish between functions of a device, exposing only one
52 * group for the device. Useful for disallowing use of individual PCI
53 * functions from userspace drivers.
54 */
55int iommu_group_mf __read_mostly;
56
57extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; 48extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
58 49
59/* Dummy device used for NULL arguments (normally ISA). */ 50/* Dummy device used for NULL arguments (normally ISA). */
@@ -194,8 +185,6 @@ static __init int iommu_setup(char *p)
194#endif 185#endif
195 if (!strncmp(p, "pt", 2)) 186 if (!strncmp(p, "pt", 2))
196 iommu_pass_through = 1; 187 iommu_pass_through = 1;
197 if (!strncmp(p, "group_mf", 8))
198 iommu_group_mf = 1;
199 188
200 gart_parse_options(p); 189 gart_parse_options(p);
201 190
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 735279e54e59..ef6a8456f719 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/errno.h> 3#include <linux/errno.h>
2#include <linux/kernel.h> 4#include <linux/kernel.h>
3#include <linux/mm.h> 5#include <linux/mm.h>
@@ -145,16 +147,14 @@ void show_regs_common(void)
145 /* Board Name is optional */ 147 /* Board Name is optional */
146 board = dmi_get_system_info(DMI_BOARD_NAME); 148 board = dmi_get_system_info(DMI_BOARD_NAME);
147 149
148 printk(KERN_CONT "\n"); 150 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n",
149 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s", 151 current->pid, current->comm, print_tainted(),
150 current->pid, current->comm, print_tainted(), 152 init_utsname()->release,
151 init_utsname()->release, 153 (int)strcspn(init_utsname()->version, " "),
152 (int)strcspn(init_utsname()->version, " "), 154 init_utsname()->version,
153 init_utsname()->version); 155 vendor, product,
154 printk(KERN_CONT " %s %s", vendor, product); 156 board ? "/" : "",
155 if (board) 157 board ? board : "");
156 printk(KERN_CONT "/%s", board);
157 printk(KERN_CONT "\n");
158} 158}
159 159
160void flush_thread(void) 160void flush_thread(void)
@@ -645,7 +645,7 @@ static void amd_e400_idle(void)
645 amd_e400_c1e_detected = true; 645 amd_e400_c1e_detected = true;
646 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 646 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
647 mark_tsc_unstable("TSC halt in AMD C1E"); 647 mark_tsc_unstable("TSC halt in AMD C1E");
648 printk(KERN_INFO "System has AMD C1E enabled\n"); 648 pr_info("System has AMD C1E enabled\n");
649 } 649 }
650 } 650 }
651 651
@@ -659,8 +659,7 @@ static void amd_e400_idle(void)
659 */ 659 */
660 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, 660 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
661 &cpu); 661 &cpu);
662 printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", 662 pr_info("Switch to broadcast mode on CPU%d\n", cpu);
663 cpu);
664 } 663 }
665 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 664 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
666 665
@@ -681,8 +680,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
681{ 680{
682#ifdef CONFIG_SMP 681#ifdef CONFIG_SMP
683 if (pm_idle == poll_idle && smp_num_siblings > 1) { 682 if (pm_idle == poll_idle && smp_num_siblings > 1) {
684 printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," 683 pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
685 " performance may degrade.\n");
686 } 684 }
687#endif 685#endif
688 if (pm_idle) 686 if (pm_idle)
@@ -692,11 +690,11 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
692 /* 690 /*
693 * One CPU supports mwait => All CPUs supports mwait 691 * One CPU supports mwait => All CPUs supports mwait
694 */ 692 */
695 printk(KERN_INFO "using mwait in idle threads.\n"); 693 pr_info("using mwait in idle threads\n");
696 pm_idle = mwait_idle; 694 pm_idle = mwait_idle;
697 } else if (cpu_has_amd_erratum(amd_erratum_400)) { 695 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
698 /* E400: APIC timer interrupt does not wake up CPU from C1e */ 696 /* E400: APIC timer interrupt does not wake up CPU from C1e */
699 printk(KERN_INFO "using AMD E400 aware idle routine\n"); 697 pr_info("using AMD E400 aware idle routine\n");
700 pm_idle = amd_e400_idle; 698 pm_idle = amd_e400_idle;
701 } else 699 } else
702 pm_idle = default_idle; 700 pm_idle = default_idle;
@@ -715,7 +713,7 @@ static int __init idle_setup(char *str)
715 return -EINVAL; 713 return -EINVAL;
716 714
717 if (!strcmp(str, "poll")) { 715 if (!strcmp(str, "poll")) {
718 printk("using polling idle threads.\n"); 716 pr_info("using polling idle threads\n");
719 pm_idle = poll_idle; 717 pm_idle = poll_idle;
720 boot_option_idle_override = IDLE_POLL; 718 boot_option_idle_override = IDLE_POLL;
721 } else if (!strcmp(str, "mwait")) { 719 } else if (!strcmp(str, "mwait")) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 61cdf7fdf099..0a980c9d7cb8 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -117,10 +117,10 @@ void release_thread(struct task_struct *dead_task)
117{ 117{
118 if (dead_task->mm) { 118 if (dead_task->mm) {
119 if (dead_task->mm->context.size) { 119 if (dead_task->mm->context.size) {
120 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", 120 pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n",
121 dead_task->comm, 121 dead_task->comm,
122 dead_task->mm->context.ldt, 122 dead_task->mm->context.ldt,
123 dead_task->mm->context.size); 123 dead_task->mm->context.size);
124 BUG(); 124 BUG();
125 } 125 }
126 } 126 }
@@ -466,7 +466,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
466 task->thread.gs = addr; 466 task->thread.gs = addr;
467 if (doit) { 467 if (doit) {
468 load_gs_index(0); 468 load_gs_index(0);
469 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 469 ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
470 } 470 }
471 } 471 }
472 put_cpu(); 472 put_cpu();
@@ -494,7 +494,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
494 /* set the selector to 0 to not confuse 494 /* set the selector to 0 to not confuse
495 __switch_to */ 495 __switch_to */
496 loadsegment(fs, 0); 496 loadsegment(fs, 0);
497 ret = checking_wrmsrl(MSR_FS_BASE, addr); 497 ret = wrmsrl_safe(MSR_FS_BASE, addr);
498 } 498 }
499 } 499 }
500 put_cpu(); 500 put_cpu();
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 03920a15a632..1b27de563561 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
512 512
513#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) 513#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
514/* Set correct numa_node information for AMD NB functions */ 514/* Set correct numa_node information for AMD NB functions */
515static void __init quirk_amd_nb_node(struct pci_dev *dev) 515static void __devinit quirk_amd_nb_node(struct pci_dev *dev)
516{ 516{
517 struct pci_dev *nb_ht; 517 struct pci_dev *nb_ht;
518 unsigned int devfn; 518 unsigned int devfn;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 5de92f1abd76..52190a938b4a 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/module.h> 3#include <linux/module.h>
2#include <linux/reboot.h> 4#include <linux/reboot.h>
3#include <linux/init.h> 5#include <linux/init.h>
@@ -20,14 +22,12 @@
20#include <asm/virtext.h> 22#include <asm/virtext.h>
21#include <asm/cpu.h> 23#include <asm/cpu.h>
22#include <asm/nmi.h> 24#include <asm/nmi.h>
25#include <asm/smp.h>
23 26
24#ifdef CONFIG_X86_32 27#include <linux/ctype.h>
25# include <linux/ctype.h> 28#include <linux/mc146818rtc.h>
26# include <linux/mc146818rtc.h> 29#include <asm/realmode.h>
27# include <asm/realmode.h> 30#include <asm/x86_init.h>
28#else
29# include <asm/x86_init.h>
30#endif
31 31
32/* 32/*
33 * Power off function, if any 33 * Power off function, if any
@@ -49,7 +49,7 @@ int reboot_force;
49 */ 49 */
50static int reboot_default = 1; 50static int reboot_default = 1;
51 51
52#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 52#ifdef CONFIG_SMP
53static int reboot_cpu = -1; 53static int reboot_cpu = -1;
54#endif 54#endif
55 55
@@ -67,8 +67,8 @@ bool port_cf9_safe = false;
67 * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] 67 * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
68 * warm Don't set the cold reboot flag 68 * warm Don't set the cold reboot flag
69 * cold Set the cold reboot flag 69 * cold Set the cold reboot flag
70 * bios Reboot by jumping through the BIOS (only for X86_32) 70 * bios Reboot by jumping through the BIOS
71 * smp Reboot by executing reset on BSP or other CPU (only for X86_32) 71 * smp Reboot by executing reset on BSP or other CPU
72 * triple Force a triple fault (init) 72 * triple Force a triple fault (init)
73 * kbd Use the keyboard controller. cold reset (default) 73 * kbd Use the keyboard controller. cold reset (default)
74 * acpi Use the RESET_REG in the FADT 74 * acpi Use the RESET_REG in the FADT
@@ -95,7 +95,6 @@ static int __init reboot_setup(char *str)
95 reboot_mode = 0; 95 reboot_mode = 0;
96 break; 96 break;
97 97
98#ifdef CONFIG_X86_32
99#ifdef CONFIG_SMP 98#ifdef CONFIG_SMP
100 case 's': 99 case 's':
101 if (isdigit(*(str+1))) { 100 if (isdigit(*(str+1))) {
@@ -112,7 +111,6 @@ static int __init reboot_setup(char *str)
112#endif /* CONFIG_SMP */ 111#endif /* CONFIG_SMP */
113 112
114 case 'b': 113 case 'b':
115#endif
116 case 'a': 114 case 'a':
117 case 'k': 115 case 'k':
118 case 't': 116 case 't':
@@ -138,7 +136,6 @@ static int __init reboot_setup(char *str)
138__setup("reboot=", reboot_setup); 136__setup("reboot=", reboot_setup);
139 137
140 138
141#ifdef CONFIG_X86_32
142/* 139/*
143 * Reboot options and system auto-detection code provided by 140 * Reboot options and system auto-detection code provided by
144 * Dell Inc. so their systems "just work". :-) 141 * Dell Inc. so their systems "just work". :-)
@@ -152,16 +149,14 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
152{ 149{
153 if (reboot_type != BOOT_BIOS) { 150 if (reboot_type != BOOT_BIOS) {
154 reboot_type = BOOT_BIOS; 151 reboot_type = BOOT_BIOS;
155 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); 152 pr_info("%s series board detected. Selecting %s-method for reboots.\n",
153 "BIOS", d->ident);
156 } 154 }
157 return 0; 155 return 0;
158} 156}
159 157
160void machine_real_restart(unsigned int type) 158void __noreturn machine_real_restart(unsigned int type)
161{ 159{
162 void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int))
163 real_mode_header->machine_real_restart_asm;
164
165 local_irq_disable(); 160 local_irq_disable();
166 161
167 /* 162 /*
@@ -181,25 +176,28 @@ void machine_real_restart(unsigned int type)
181 /* 176 /*
182 * Switch back to the initial page table. 177 * Switch back to the initial page table.
183 */ 178 */
179#ifdef CONFIG_X86_32
184 load_cr3(initial_page_table); 180 load_cr3(initial_page_table);
185 181#else
186 /* 182 write_cr3(real_mode_header->trampoline_pgd);
187 * Write 0x1234 to absolute memory location 0x472. The BIOS reads 183#endif
188 * this on booting to tell it to "Bypass memory test (also warm
189 * boot)". This seems like a fairly standard thing that gets set by
190 * REBOOT.COM programs, and the previous reset routine did this
191 * too. */
192 *((unsigned short *)0x472) = reboot_mode;
193 184
194 /* Jump to the identity-mapped low memory code */ 185 /* Jump to the identity-mapped low memory code */
195 restart_lowmem(type); 186#ifdef CONFIG_X86_32
187 asm volatile("jmpl *%0" : :
188 "rm" (real_mode_header->machine_real_restart_asm),
189 "a" (type));
190#else
191 asm volatile("ljmpl *%0" : :
192 "m" (real_mode_header->machine_real_restart_asm),
193 "D" (type));
194#endif
195 unreachable();
196} 196}
197#ifdef CONFIG_APM_MODULE 197#ifdef CONFIG_APM_MODULE
198EXPORT_SYMBOL(machine_real_restart); 198EXPORT_SYMBOL(machine_real_restart);
199#endif 199#endif
200 200
201#endif /* CONFIG_X86_32 */
202
203/* 201/*
204 * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot 202 * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
205 */ 203 */
@@ -207,8 +205,8 @@ static int __init set_pci_reboot(const struct dmi_system_id *d)
207{ 205{
208 if (reboot_type != BOOT_CF9) { 206 if (reboot_type != BOOT_CF9) {
209 reboot_type = BOOT_CF9; 207 reboot_type = BOOT_CF9;
210 printk(KERN_INFO "%s series board detected. " 208 pr_info("%s series board detected. Selecting %s-method for reboots.\n",
211 "Selecting PCI-method for reboots.\n", d->ident); 209 "PCI", d->ident);
212 } 210 }
213 return 0; 211 return 0;
214} 212}
@@ -217,17 +215,16 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
217{ 215{
218 if (reboot_type != BOOT_KBD) { 216 if (reboot_type != BOOT_KBD) {
219 reboot_type = BOOT_KBD; 217 reboot_type = BOOT_KBD;
220 printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident); 218 pr_info("%s series board detected. Selecting %s-method for reboot.\n",
219 "KBD", d->ident);
221 } 220 }
222 return 0; 221 return 0;
223} 222}
224 223
225/* 224/*
226 * This is a single dmi_table handling all reboot quirks. Note that 225 * This is a single dmi_table handling all reboot quirks.
227 * REBOOT_BIOS is only available for 32bit
228 */ 226 */
229static struct dmi_system_id __initdata reboot_dmi_table[] = { 227static struct dmi_system_id __initdata reboot_dmi_table[] = {
230#ifdef CONFIG_X86_32
231 { /* Handle problems with rebooting on Dell E520's */ 228 { /* Handle problems with rebooting on Dell E520's */
232 .callback = set_bios_reboot, 229 .callback = set_bios_reboot,
233 .ident = "Dell E520", 230 .ident = "Dell E520",
@@ -377,7 +374,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
377 DMI_MATCH(DMI_BOARD_NAME, "P4S800"), 374 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
378 }, 375 },
379 }, 376 },
380#endif /* CONFIG_X86_32 */
381 377
382 { /* Handle reboot issue on Acer Aspire one */ 378 { /* Handle reboot issue on Acer Aspire one */
383 .callback = set_kbd_reboot, 379 .callback = set_kbd_reboot,
@@ -584,13 +580,11 @@ static void native_machine_emergency_restart(void)
584 reboot_type = BOOT_KBD; 580 reboot_type = BOOT_KBD;
585 break; 581 break;
586 582
587#ifdef CONFIG_X86_32
588 case BOOT_BIOS: 583 case BOOT_BIOS:
589 machine_real_restart(MRR_BIOS); 584 machine_real_restart(MRR_BIOS);
590 585
591 reboot_type = BOOT_KBD; 586 reboot_type = BOOT_KBD;
592 break; 587 break;
593#endif
594 588
595 case BOOT_ACPI: 589 case BOOT_ACPI:
596 acpi_reboot(); 590 acpi_reboot();
@@ -632,12 +626,10 @@ void native_machine_shutdown(void)
632 /* The boot cpu is always logical cpu 0 */ 626 /* The boot cpu is always logical cpu 0 */
633 int reboot_cpu_id = 0; 627 int reboot_cpu_id = 0;
634 628
635#ifdef CONFIG_X86_32
636 /* See if there has been given a command line override */ 629 /* See if there has been given a command line override */
637 if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) && 630 if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
638 cpu_online(reboot_cpu)) 631 cpu_online(reboot_cpu))
639 reboot_cpu_id = reboot_cpu; 632 reboot_cpu_id = reboot_cpu;
640#endif
641 633
642 /* Make certain the cpu I'm about to reboot on is online */ 634 /* Make certain the cpu I'm about to reboot on is online */
643 if (!cpu_online(reboot_cpu_id)) 635 if (!cpu_online(reboot_cpu_id))
@@ -678,7 +670,7 @@ static void __machine_emergency_restart(int emergency)
678 670
679static void native_machine_restart(char *__unused) 671static void native_machine_restart(char *__unused)
680{ 672{
681 printk("machine restart\n"); 673 pr_notice("machine restart\n");
682 674
683 if (!reboot_force) 675 if (!reboot_force)
684 machine_shutdown(); 676 machine_shutdown();
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 16be6dc14db1..f4b9b80e1b95 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1031,8 +1031,6 @@ void __init setup_arch(char **cmdline_p)
1031 1031
1032 x86_init.timers.wallclock_init(); 1032 x86_init.timers.wallclock_init();
1033 1033
1034 x86_platform.wallclock_init();
1035
1036 mcheck_init(); 1034 mcheck_init();
1037 1035
1038 arch_init_ideal_nops(); 1036 arch_init_ideal_nops();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5a98aa272184..5cdff0357746 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -21,7 +21,7 @@
21#include <asm/cpu.h> 21#include <asm/cpu.h>
22#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
23 23
24DEFINE_PER_CPU(int, cpu_number); 24DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
25EXPORT_PER_CPU_SYMBOL(cpu_number); 25EXPORT_PER_CPU_SYMBOL(cpu_number);
26 26
27#ifdef CONFIG_X86_64 27#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 21af737053aa..b280908a376e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,6 +6,9 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
9#include <linux/sched.h> 12#include <linux/sched.h>
10#include <linux/mm.h> 13#include <linux/mm.h>
11#include <linux/smp.h> 14#include <linux/smp.h>
@@ -814,7 +817,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
814 me->comm, me->pid, where, frame, 817 me->comm, me->pid, where, frame,
815 regs->ip, regs->sp, regs->orig_ax); 818 regs->ip, regs->sp, regs->orig_ax);
816 print_vma_addr(" in ", regs->ip); 819 print_vma_addr(" in ", regs->ip);
817 printk(KERN_CONT "\n"); 820 pr_cont("\n");
818 } 821 }
819 822
820 force_sig(SIGSEGV, me); 823 force_sig(SIGSEGV, me);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7bd8a0823654..7c5a8c314c02 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1,4 +1,4 @@
1/* 1 /*
2 * x86 SMP booting functions 2 * x86 SMP booting functions
3 * 3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> 4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
@@ -39,6 +39,8 @@
39 * Glauber Costa : i386 and x86_64 integration 39 * Glauber Costa : i386 and x86_64 integration
40 */ 40 */
41 41
42#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
43
42#include <linux/init.h> 44#include <linux/init.h>
43#include <linux/smp.h> 45#include <linux/smp.h>
44#include <linux/module.h> 46#include <linux/module.h>
@@ -104,17 +106,17 @@ int smp_num_siblings = 1;
104EXPORT_SYMBOL(smp_num_siblings); 106EXPORT_SYMBOL(smp_num_siblings);
105 107
106/* Last level cache ID of each logical CPU */ 108/* Last level cache ID of each logical CPU */
107DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; 109DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
108 110
109/* representing HT siblings of each logical CPU */ 111/* representing HT siblings of each logical CPU */
110DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); 112DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
111EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); 113EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
112 114
113/* representing HT and core siblings of each logical CPU */ 115/* representing HT and core siblings of each logical CPU */
114DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); 116DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
115EXPORT_PER_CPU_SYMBOL(cpu_core_map); 117EXPORT_PER_CPU_SYMBOL(cpu_core_map);
116 118
117DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); 119DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
118 120
119/* Per CPU bogomips and other parameters */ 121/* Per CPU bogomips and other parameters */
120DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 122DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@ -184,7 +186,7 @@ static void __cpuinit smp_callin(void)
184 * boards) 186 * boards)
185 */ 187 */
186 188
187 pr_debug("CALLIN, before setup_local_APIC().\n"); 189 pr_debug("CALLIN, before setup_local_APIC()\n");
188 if (apic->smp_callin_clear_local_apic) 190 if (apic->smp_callin_clear_local_apic)
189 apic->smp_callin_clear_local_apic(); 191 apic->smp_callin_clear_local_apic();
190 setup_local_APIC(); 192 setup_local_APIC();
@@ -255,22 +257,13 @@ notrace static void __cpuinit start_secondary(void *unused)
255 check_tsc_sync_target(); 257 check_tsc_sync_target();
256 258
257 /* 259 /*
258 * We need to hold call_lock, so there is no inconsistency
259 * between the time smp_call_function() determines number of
260 * IPI recipients, and the time when the determination is made
261 * for which cpus receive the IPI. Holding this
262 * lock helps us to not include this cpu in a currently in progress
263 * smp_call_function().
264 *
265 * We need to hold vector_lock so there the set of online cpus 260 * We need to hold vector_lock so there the set of online cpus
266 * does not change while we are assigning vectors to cpus. Holding 261 * does not change while we are assigning vectors to cpus. Holding
267 * this lock ensures we don't half assign or remove an irq from a cpu. 262 * this lock ensures we don't half assign or remove an irq from a cpu.
268 */ 263 */
269 ipi_call_lock();
270 lock_vector_lock(); 264 lock_vector_lock();
271 set_cpu_online(smp_processor_id(), true); 265 set_cpu_online(smp_processor_id(), true);
272 unlock_vector_lock(); 266 unlock_vector_lock();
273 ipi_call_unlock();
274 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 267 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
275 x86_platform.nmi_init(); 268 x86_platform.nmi_init();
276 269
@@ -432,17 +425,16 @@ static void impress_friends(void)
432 /* 425 /*
433 * Allow the user to impress friends. 426 * Allow the user to impress friends.
434 */ 427 */
435 pr_debug("Before bogomips.\n"); 428 pr_debug("Before bogomips\n");
436 for_each_possible_cpu(cpu) 429 for_each_possible_cpu(cpu)
437 if (cpumask_test_cpu(cpu, cpu_callout_mask)) 430 if (cpumask_test_cpu(cpu, cpu_callout_mask))
438 bogosum += cpu_data(cpu).loops_per_jiffy; 431 bogosum += cpu_data(cpu).loops_per_jiffy;
439 printk(KERN_INFO 432 pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
440 "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
441 num_online_cpus(), 433 num_online_cpus(),
442 bogosum/(500000/HZ), 434 bogosum/(500000/HZ),
443 (bogosum/(5000/HZ))%100); 435 (bogosum/(5000/HZ))%100);
444 436
445 pr_debug("Before bogocount - setting activated=1.\n"); 437 pr_debug("Before bogocount - setting activated=1\n");
446} 438}
447 439
448void __inquire_remote_apic(int apicid) 440void __inquire_remote_apic(int apicid)
@@ -452,18 +444,17 @@ void __inquire_remote_apic(int apicid)
452 int timeout; 444 int timeout;
453 u32 status; 445 u32 status;
454 446
455 printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); 447 pr_info("Inquiring remote APIC 0x%x...\n", apicid);
456 448
457 for (i = 0; i < ARRAY_SIZE(regs); i++) { 449 for (i = 0; i < ARRAY_SIZE(regs); i++) {
458 printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); 450 pr_info("... APIC 0x%x %s: ", apicid, names[i]);
459 451
460 /* 452 /*
461 * Wait for idle. 453 * Wait for idle.
462 */ 454 */
463 status = safe_apic_wait_icr_idle(); 455 status = safe_apic_wait_icr_idle();
464 if (status) 456 if (status)
465 printk(KERN_CONT 457 pr_cont("a previous APIC delivery may have failed\n");
466 "a previous APIC delivery may have failed\n");
467 458
468 apic_icr_write(APIC_DM_REMRD | regs[i], apicid); 459 apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
469 460
@@ -476,10 +467,10 @@ void __inquire_remote_apic(int apicid)
476 switch (status) { 467 switch (status) {
477 case APIC_ICR_RR_VALID: 468 case APIC_ICR_RR_VALID:
478 status = apic_read(APIC_RRR); 469 status = apic_read(APIC_RRR);
479 printk(KERN_CONT "%08x\n", status); 470 pr_cont("%08x\n", status);
480 break; 471 break;
481 default: 472 default:
482 printk(KERN_CONT "failed\n"); 473 pr_cont("failed\n");
483 } 474 }
484 } 475 }
485} 476}
@@ -513,12 +504,12 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
513 apic_write(APIC_ESR, 0); 504 apic_write(APIC_ESR, 0);
514 accept_status = (apic_read(APIC_ESR) & 0xEF); 505 accept_status = (apic_read(APIC_ESR) & 0xEF);
515 } 506 }
516 pr_debug("NMI sent.\n"); 507 pr_debug("NMI sent\n");
517 508
518 if (send_status) 509 if (send_status)
519 printk(KERN_ERR "APIC never delivered???\n"); 510 pr_err("APIC never delivered???\n");
520 if (accept_status) 511 if (accept_status)
521 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); 512 pr_err("APIC delivery error (%lx)\n", accept_status);
522 513
523 return (send_status | accept_status); 514 return (send_status | accept_status);
524} 515}
@@ -540,7 +531,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
540 apic_read(APIC_ESR); 531 apic_read(APIC_ESR);
541 } 532 }
542 533
543 pr_debug("Asserting INIT.\n"); 534 pr_debug("Asserting INIT\n");
544 535
545 /* 536 /*
546 * Turn INIT on target chip 537 * Turn INIT on target chip
@@ -556,7 +547,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
556 547
557 mdelay(10); 548 mdelay(10);
558 549
559 pr_debug("Deasserting INIT.\n"); 550 pr_debug("Deasserting INIT\n");
560 551
561 /* Target chip */ 552 /* Target chip */
562 /* Send IPI */ 553 /* Send IPI */
@@ -589,14 +580,14 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
589 /* 580 /*
590 * Run STARTUP IPI loop. 581 * Run STARTUP IPI loop.
591 */ 582 */
592 pr_debug("#startup loops: %d.\n", num_starts); 583 pr_debug("#startup loops: %d\n", num_starts);
593 584
594 for (j = 1; j <= num_starts; j++) { 585 for (j = 1; j <= num_starts; j++) {
595 pr_debug("Sending STARTUP #%d.\n", j); 586 pr_debug("Sending STARTUP #%d\n", j);
596 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 587 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
597 apic_write(APIC_ESR, 0); 588 apic_write(APIC_ESR, 0);
598 apic_read(APIC_ESR); 589 apic_read(APIC_ESR);
599 pr_debug("After apic_write.\n"); 590 pr_debug("After apic_write\n");
600 591
601 /* 592 /*
602 * STARTUP IPI 593 * STARTUP IPI
@@ -613,7 +604,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
613 */ 604 */
614 udelay(300); 605 udelay(300);
615 606
616 pr_debug("Startup point 1.\n"); 607 pr_debug("Startup point 1\n");
617 608
618 pr_debug("Waiting for send to finish...\n"); 609 pr_debug("Waiting for send to finish...\n");
619 send_status = safe_apic_wait_icr_idle(); 610 send_status = safe_apic_wait_icr_idle();
@@ -628,12 +619,12 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
628 if (send_status || accept_status) 619 if (send_status || accept_status)
629 break; 620 break;
630 } 621 }
631 pr_debug("After Startup.\n"); 622 pr_debug("After Startup\n");
632 623
633 if (send_status) 624 if (send_status)
634 printk(KERN_ERR "APIC never delivered???\n"); 625 pr_err("APIC never delivered???\n");
635 if (accept_status) 626 if (accept_status)
636 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); 627 pr_err("APIC delivery error (%lx)\n", accept_status);
637 628
638 return (send_status | accept_status); 629 return (send_status | accept_status);
639} 630}
@@ -647,11 +638,11 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
647 if (system_state == SYSTEM_BOOTING) { 638 if (system_state == SYSTEM_BOOTING) {
648 if (node != current_node) { 639 if (node != current_node) {
649 if (current_node > (-1)) 640 if (current_node > (-1))
650 pr_cont(" Ok.\n"); 641 pr_cont(" OK\n");
651 current_node = node; 642 current_node = node;
652 pr_info("Booting Node %3d, Processors ", node); 643 pr_info("Booting Node %3d, Processors ", node);
653 } 644 }
654 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); 645 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : "");
655 return; 646 return;
656 } else 647 } else
657 pr_info("Booting Node %d Processor %d APIC 0x%x\n", 648 pr_info("Booting Node %d Processor %d APIC 0x%x\n",
@@ -731,9 +722,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
731 /* 722 /*
732 * allow APs to start initializing. 723 * allow APs to start initializing.
733 */ 724 */
734 pr_debug("Before Callout %d.\n", cpu); 725 pr_debug("Before Callout %d\n", cpu);
735 cpumask_set_cpu(cpu, cpu_callout_mask); 726 cpumask_set_cpu(cpu, cpu_callout_mask);
736 pr_debug("After Callout %d.\n", cpu); 727 pr_debug("After Callout %d\n", cpu);
737 728
738 /* 729 /*
739 * Wait 5s total for a response 730 * Wait 5s total for a response
@@ -761,7 +752,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
761 pr_err("CPU%d: Stuck ??\n", cpu); 752 pr_err("CPU%d: Stuck ??\n", cpu);
762 else 753 else
763 /* trampoline code not run */ 754 /* trampoline code not run */
764 pr_err("CPU%d: Not responding.\n", cpu); 755 pr_err("CPU%d: Not responding\n", cpu);
765 if (apic->inquire_remote_apic) 756 if (apic->inquire_remote_apic)
766 apic->inquire_remote_apic(apicid); 757 apic->inquire_remote_apic(apicid);
767 } 758 }
@@ -806,7 +797,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
806 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 797 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
807 !physid_isset(apicid, phys_cpu_present_map) || 798 !physid_isset(apicid, phys_cpu_present_map) ||
808 !apic->apic_id_valid(apicid)) { 799 !apic->apic_id_valid(apicid)) {
809 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); 800 pr_err("%s: bad cpu %d\n", __func__, cpu);
810 return -EINVAL; 801 return -EINVAL;
811 } 802 }
812 803
@@ -887,9 +878,8 @@ static int __init smp_sanity_check(unsigned max_cpus)
887 unsigned int cpu; 878 unsigned int cpu;
888 unsigned nr; 879 unsigned nr;
889 880
890 printk(KERN_WARNING 881 pr_warn("More than 8 CPUs detected - skipping them\n"
891 "More than 8 CPUs detected - skipping them.\n" 882 "Use CONFIG_X86_BIGSMP\n");
892 "Use CONFIG_X86_BIGSMP.\n");
893 883
894 nr = 0; 884 nr = 0;
895 for_each_present_cpu(cpu) { 885 for_each_present_cpu(cpu) {
@@ -910,8 +900,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
910#endif 900#endif
911 901
912 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 902 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
913 printk(KERN_WARNING 903 pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
914 "weird, boot CPU (#%d) not listed by the BIOS.\n",
915 hard_smp_processor_id()); 904 hard_smp_processor_id());
916 905
917 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 906 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
@@ -923,11 +912,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
923 */ 912 */
924 if (!smp_found_config && !acpi_lapic) { 913 if (!smp_found_config && !acpi_lapic) {
925 preempt_enable(); 914 preempt_enable();
926 printk(KERN_NOTICE "SMP motherboard not detected.\n"); 915 pr_notice("SMP motherboard not detected\n");
927 disable_smp(); 916 disable_smp();
928 if (APIC_init_uniprocessor()) 917 if (APIC_init_uniprocessor())
929 printk(KERN_NOTICE "Local APIC not detected." 918 pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
930 " Using dummy APIC emulation.\n");
931 return -1; 919 return -1;
932 } 920 }
933 921
@@ -936,9 +924,8 @@ static int __init smp_sanity_check(unsigned max_cpus)
936 * CPU too, but we do it for the sake of robustness anyway. 924 * CPU too, but we do it for the sake of robustness anyway.
937 */ 925 */
938 if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { 926 if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
939 printk(KERN_NOTICE 927 pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
940 "weird, boot CPU (#%d) not listed by the BIOS.\n", 928 boot_cpu_physical_apicid);
941 boot_cpu_physical_apicid);
942 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 929 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
943 } 930 }
944 preempt_enable(); 931 preempt_enable();
@@ -951,8 +938,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
951 if (!disable_apic) { 938 if (!disable_apic) {
952 pr_err("BIOS bug, local APIC #%d not detected!...\n", 939 pr_err("BIOS bug, local APIC #%d not detected!...\n",
953 boot_cpu_physical_apicid); 940 boot_cpu_physical_apicid);
954 pr_err("... forcing use of dummy APIC emulation." 941 pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
955 "(tell your hw vendor)\n");
956 } 942 }
957 smpboot_clear_io_apic(); 943 smpboot_clear_io_apic();
958 disable_ioapic_support(); 944 disable_ioapic_support();
@@ -965,7 +951,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
965 * If SMP should be disabled, then really disable it! 951 * If SMP should be disabled, then really disable it!
966 */ 952 */
967 if (!max_cpus) { 953 if (!max_cpus) {
968 printk(KERN_INFO "SMP mode deactivated.\n"); 954 pr_info("SMP mode deactivated\n");
969 smpboot_clear_io_apic(); 955 smpboot_clear_io_apic();
970 956
971 connect_bsp_APIC(); 957 connect_bsp_APIC();
@@ -1017,7 +1003,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1017 1003
1018 1004
1019 if (smp_sanity_check(max_cpus) < 0) { 1005 if (smp_sanity_check(max_cpus) < 0) {
1020 printk(KERN_INFO "SMP disabled\n"); 1006 pr_info("SMP disabled\n");
1021 disable_smp(); 1007 disable_smp();
1022 goto out; 1008 goto out;
1023 } 1009 }
@@ -1055,7 +1041,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1055 * Set up local APIC timer on boot CPU. 1041 * Set up local APIC timer on boot CPU.
1056 */ 1042 */
1057 1043
1058 printk(KERN_INFO "CPU%d: ", 0); 1044 pr_info("CPU%d: ", 0);
1059 print_cpu_info(&cpu_data(0)); 1045 print_cpu_info(&cpu_data(0));
1060 x86_init.timers.setup_percpu_clockev(); 1046 x86_init.timers.setup_percpu_clockev();
1061 1047
@@ -1105,7 +1091,7 @@ void __init native_smp_prepare_boot_cpu(void)
1105 1091
1106void __init native_smp_cpus_done(unsigned int max_cpus) 1092void __init native_smp_cpus_done(unsigned int max_cpus)
1107{ 1093{
1108 pr_debug("Boot done.\n"); 1094 pr_debug("Boot done\n");
1109 1095
1110 nmi_selftest(); 1096 nmi_selftest();
1111 impress_friends(); 1097 impress_friends();
@@ -1166,8 +1152,7 @@ __init void prefill_possible_map(void)
1166 1152
1167 /* nr_cpu_ids could be reduced via nr_cpus= */ 1153 /* nr_cpu_ids could be reduced via nr_cpus= */
1168 if (possible > nr_cpu_ids) { 1154 if (possible > nr_cpu_ids) {
1169 printk(KERN_WARNING 1155 pr_warn("%d Processors exceeds NR_CPUS limit of %d\n",
1170 "%d Processors exceeds NR_CPUS limit of %d\n",
1171 possible, nr_cpu_ids); 1156 possible, nr_cpu_ids);
1172 possible = nr_cpu_ids; 1157 possible = nr_cpu_ids;
1173 } 1158 }
@@ -1176,13 +1161,12 @@ __init void prefill_possible_map(void)
1176 if (!setup_max_cpus) 1161 if (!setup_max_cpus)
1177#endif 1162#endif
1178 if (possible > i) { 1163 if (possible > i) {
1179 printk(KERN_WARNING 1164 pr_warn("%d Processors exceeds max_cpus limit of %u\n",
1180 "%d Processors exceeds max_cpus limit of %u\n",
1181 possible, setup_max_cpus); 1165 possible, setup_max_cpus);
1182 possible = i; 1166 possible = i;
1183 } 1167 }
1184 1168
1185 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1169 pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
1186 possible, max_t(int, possible - num_processors, 0)); 1170 possible, max_t(int, possible - num_processors, 0));
1187 1171
1188 for (i = 0; i < possible; i++) 1172 for (i = 0; i < possible; i++)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 05b31d92f69c..b481341c9369 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -9,6 +9,9 @@
9/* 9/*
10 * Handle hardware traps and faults. 10 * Handle hardware traps and faults.
11 */ 11 */
12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
12#include <linux/interrupt.h> 15#include <linux/interrupt.h>
13#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
14#include <linux/spinlock.h> 17#include <linux/spinlock.h>
@@ -143,12 +146,11 @@ trap_signal:
143#ifdef CONFIG_X86_64 146#ifdef CONFIG_X86_64
144 if (show_unhandled_signals && unhandled_signal(tsk, signr) && 147 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
145 printk_ratelimit()) { 148 printk_ratelimit()) {
146 printk(KERN_INFO 149 pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
147 "%s[%d] trap %s ip:%lx sp:%lx error:%lx", 150 tsk->comm, tsk->pid, str,
148 tsk->comm, tsk->pid, str, 151 regs->ip, regs->sp, error_code);
149 regs->ip, regs->sp, error_code);
150 print_vma_addr(" in ", regs->ip); 152 print_vma_addr(" in ", regs->ip);
151 printk("\n"); 153 pr_cont("\n");
152 } 154 }
153#endif 155#endif
154 156
@@ -269,12 +271,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
269 271
270 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 272 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
271 printk_ratelimit()) { 273 printk_ratelimit()) {
272 printk(KERN_INFO 274 pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
273 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
274 tsk->comm, task_pid_nr(tsk), 275 tsk->comm, task_pid_nr(tsk),
275 regs->ip, regs->sp, error_code); 276 regs->ip, regs->sp, error_code);
276 print_vma_addr(" in ", regs->ip); 277 print_vma_addr(" in ", regs->ip);
277 printk("\n"); 278 pr_cont("\n");
278 } 279 }
279 280
280 force_sig(SIGSEGV, tsk); 281 force_sig(SIGSEGV, tsk);
@@ -570,7 +571,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
570 conditional_sti(regs); 571 conditional_sti(regs);
571#if 0 572#if 0
572 /* No need to warn about this any longer. */ 573 /* No need to warn about this any longer. */
573 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); 574 pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
574#endif 575#endif
575} 576}
576 577
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index fc0a147e3727..cfa5d4f7ca56 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/kernel.h> 3#include <linux/kernel.h>
2#include <linux/sched.h> 4#include <linux/sched.h>
3#include <linux/init.h> 5#include <linux/init.h>
@@ -84,8 +86,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
84#ifdef CONFIG_X86_TSC 86#ifdef CONFIG_X86_TSC
85int __init notsc_setup(char *str) 87int __init notsc_setup(char *str)
86{ 88{
87 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " 89 pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n");
88 "cannot disable TSC completely.\n");
89 tsc_disabled = 1; 90 tsc_disabled = 1;
90 return 1; 91 return 1;
91} 92}
@@ -373,7 +374,7 @@ static unsigned long quick_pit_calibrate(void)
373 goto success; 374 goto success;
374 } 375 }
375 } 376 }
376 printk("Fast TSC calibration failed\n"); 377 pr_err("Fast TSC calibration failed\n");
377 return 0; 378 return 0;
378 379
379success: 380success:
@@ -392,7 +393,7 @@ success:
392 */ 393 */
393 delta *= PIT_TICK_RATE; 394 delta *= PIT_TICK_RATE;
394 do_div(delta, i*256*1000); 395 do_div(delta, i*256*1000);
395 printk("Fast TSC calibration using PIT\n"); 396 pr_info("Fast TSC calibration using PIT\n");
396 return delta; 397 return delta;
397} 398}
398 399
@@ -487,9 +488,8 @@ unsigned long native_calibrate_tsc(void)
487 * use the reference value, as it is more precise. 488 * use the reference value, as it is more precise.
488 */ 489 */
489 if (delta >= 90 && delta <= 110) { 490 if (delta >= 90 && delta <= 110) {
490 printk(KERN_INFO 491 pr_info("PIT calibration matches %s. %d loops\n",
491 "TSC: PIT calibration matches %s. %d loops\n", 492 hpet ? "HPET" : "PMTIMER", i + 1);
492 hpet ? "HPET" : "PMTIMER", i + 1);
493 return tsc_ref_min; 493 return tsc_ref_min;
494 } 494 }
495 495
@@ -511,38 +511,36 @@ unsigned long native_calibrate_tsc(void)
511 */ 511 */
512 if (tsc_pit_min == ULONG_MAX) { 512 if (tsc_pit_min == ULONG_MAX) {
513 /* PIT gave no useful value */ 513 /* PIT gave no useful value */
514 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); 514 pr_warn("Unable to calibrate against PIT\n");
515 515
516 /* We don't have an alternative source, disable TSC */ 516 /* We don't have an alternative source, disable TSC */
517 if (!hpet && !ref1 && !ref2) { 517 if (!hpet && !ref1 && !ref2) {
518 printk("TSC: No reference (HPET/PMTIMER) available\n"); 518 pr_notice("No reference (HPET/PMTIMER) available\n");
519 return 0; 519 return 0;
520 } 520 }
521 521
522 /* The alternative source failed as well, disable TSC */ 522 /* The alternative source failed as well, disable TSC */
523 if (tsc_ref_min == ULONG_MAX) { 523 if (tsc_ref_min == ULONG_MAX) {
524 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " 524 pr_warn("HPET/PMTIMER calibration failed\n");
525 "failed.\n");
526 return 0; 525 return 0;
527 } 526 }
528 527
529 /* Use the alternative source */ 528 /* Use the alternative source */
530 printk(KERN_INFO "TSC: using %s reference calibration\n", 529 pr_info("using %s reference calibration\n",
531 hpet ? "HPET" : "PMTIMER"); 530 hpet ? "HPET" : "PMTIMER");
532 531
533 return tsc_ref_min; 532 return tsc_ref_min;
534 } 533 }
535 534
536 /* We don't have an alternative source, use the PIT calibration value */ 535 /* We don't have an alternative source, use the PIT calibration value */
537 if (!hpet && !ref1 && !ref2) { 536 if (!hpet && !ref1 && !ref2) {
538 printk(KERN_INFO "TSC: Using PIT calibration value\n"); 537 pr_info("Using PIT calibration value\n");
539 return tsc_pit_min; 538 return tsc_pit_min;
540 } 539 }
541 540
542 /* The alternative source failed, use the PIT calibration value */ 541 /* The alternative source failed, use the PIT calibration value */
543 if (tsc_ref_min == ULONG_MAX) { 542 if (tsc_ref_min == ULONG_MAX) {
544 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " 543 pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
545 "Using PIT calibration\n");
546 return tsc_pit_min; 544 return tsc_pit_min;
547 } 545 }
548 546
@@ -551,9 +549,9 @@ unsigned long native_calibrate_tsc(void)
551 * the PIT value as we know that there are PMTIMERs around 549 * the PIT value as we know that there are PMTIMERs around
552 * running at double speed. At least we let the user know: 550 * running at double speed. At least we let the user know:
553 */ 551 */
554 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", 552 pr_warn("PIT calibration deviates from %s: %lu %lu\n",
555 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); 553 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
556 printk(KERN_INFO "TSC: Using PIT calibration value\n"); 554 pr_info("Using PIT calibration value\n");
557 return tsc_pit_min; 555 return tsc_pit_min;
558} 556}
559 557
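The converted messages above trace the fallback chain in native_calibrate_tsc(): a PIT result and an HPET/PMTIMER reference result are weighed against each other, and one of them (or outright failure) is chosen. A compact sketch of that decision chain is below; the names are hypothetical, ULONG_MAX stands for "no usable value", the three reference sources are collapsed into one flag, and the earlier "PIT matches reference" fast path is omitted, so this mirrors only the branches whose printouts appear in this hunk, not the kernel function itself.

#include <limits.h>
#include <stdio.h>

/* Returns 0 when calibration failed, otherwise the chosen kHz value. */
static unsigned long pick_tsc_khz(unsigned long pit_min, unsigned long ref_min,
                                  int have_reference)
{
    if (pit_min == ULONG_MAX) {                /* PIT gave no useful value */
        if (!have_reference) {
            puts("No reference (HPET/PMTIMER) available");
            return 0;
        }
        if (ref_min == ULONG_MAX) {
            puts("HPET/PMTIMER calibration failed");
            return 0;
        }
        puts("using reference calibration");
        return ref_min;
    }
    if (!have_reference || ref_min == ULONG_MAX) {
        puts("Using PIT calibration value");
        return pit_min;
    }
    /* both worked but disagree: warn and trust the PIT */
    puts("PIT calibration deviates from reference; using PIT");
    return pit_min;
}

int main(void)
{
    printf("-> %lu kHz\n", pick_tsc_khz(2000000, ULONG_MAX, 1));
    printf("-> %lu kHz\n", pick_tsc_khz(ULONG_MAX, 2000123, 1));
    return 0;
}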
@@ -785,7 +783,7 @@ void mark_tsc_unstable(char *reason)
785 tsc_unstable = 1; 783 tsc_unstable = 1;
786 sched_clock_stable = 0; 784 sched_clock_stable = 0;
787 disable_sched_clock_irqtime(); 785 disable_sched_clock_irqtime();
788 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 786 pr_info("Marking TSC unstable due to %s\n", reason);
789 /* Change only the rating, when not registered */ 787 /* Change only the rating, when not registered */
790 if (clocksource_tsc.mult) 788 if (clocksource_tsc.mult)
791 clocksource_mark_unstable(&clocksource_tsc); 789 clocksource_mark_unstable(&clocksource_tsc);
@@ -912,9 +910,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)
912 goto out; 910 goto out;
913 911
914 tsc_khz = freq; 912 tsc_khz = freq;
915 printk(KERN_INFO "Refined TSC clocksource calibration: " 913 pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
916 "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, 914 (unsigned long)tsc_khz / 1000,
917 (unsigned long)tsc_khz % 1000); 915 (unsigned long)tsc_khz % 1000);
918 916
919out: 917out:
920 clocksource_register_khz(&clocksource_tsc, tsc_khz); 918 clocksource_register_khz(&clocksource_tsc, tsc_khz);
@@ -970,9 +968,9 @@ void __init tsc_init(void)
970 return; 968 return;
971 } 969 }
972 970
973 printk("Detected %lu.%03lu MHz processor.\n", 971 pr_info("Detected %lu.%03lu MHz processor\n",
974 (unsigned long)cpu_khz / 1000, 972 (unsigned long)cpu_khz / 1000,
975 (unsigned long)cpu_khz % 1000); 973 (unsigned long)cpu_khz % 1000);
976 974
977 /* 975 /*
978 * Secondary CPUs do not run through tsc_init(), so set up 976 * Secondary CPUs do not run through tsc_init(), so set up
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index dc4e910a7d96..36fd42091fa7 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -409,9 +409,10 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,
409 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. 409 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
410 * @mm: the probed address space. 410 * @mm: the probed address space.
411 * @arch_uprobe: the probepoint information. 411 * @arch_uprobe: the probepoint information.
412 * @addr: virtual address at which to install the probepoint
412 * Return 0 on success or a -ve number on error. 413 * Return 0 on success or a -ve number on error.
413 */ 414 */
414int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) 415int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
415{ 416{
416 int ret; 417 int ret;
417 struct insn insn; 418 struct insn insn;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 255f58ae71e8..54abcc0baf23 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -28,6 +28,8 @@
28 * 28 *
29 */ 29 */
30 30
31#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
32
31#include <linux/capability.h> 33#include <linux/capability.h>
32#include <linux/errno.h> 34#include <linux/errno.h>
33#include <linux/interrupt.h> 35#include <linux/interrupt.h>
@@ -137,14 +139,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
137 local_irq_enable(); 139 local_irq_enable();
138 140
139 if (!current->thread.vm86_info) { 141 if (!current->thread.vm86_info) {
140 printk("no vm86_info: BAD\n"); 142 pr_alert("no vm86_info: BAD\n");
141 do_exit(SIGSEGV); 143 do_exit(SIGSEGV);
142 } 144 }
143 set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); 145 set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
144 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs); 146 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
145 tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap); 147 tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
146 if (tmp) { 148 if (tmp) {
147 printk("vm86: could not access userspace vm86_info\n"); 149 pr_alert("could not access userspace vm86_info\n");
148 do_exit(SIGSEGV); 150 do_exit(SIGSEGV);
149 } 151 }
150 152
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 8eeb55a551b4..992f890283e9 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -16,6 +16,7 @@
16#include <linux/pci_ids.h> 16#include <linux/pci_ids.h>
17#include <linux/pci_regs.h> 17#include <linux/pci_regs.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/irq.h>
19 20
20#include <asm/apic.h> 21#include <asm/apic.h>
21#include <asm/pci-direct.h> 22#include <asm/pci-direct.h>
@@ -95,6 +96,18 @@ static void __init set_vsmp_pv_ops(void)
95 ctl = readl(address + 4); 96 ctl = readl(address + 4);
96 printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n", 97 printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n",
97 cap, ctl); 98 cap, ctl);
99
100 /* If possible, let the vSMP foundation route the interrupt optimally */
101#ifdef CONFIG_SMP
102 if (cap & ctl & BIT(8)) {
103 ctl &= ~BIT(8);
104#ifdef CONFIG_PROC_FS
105 /* Don't let users change irq affinity via procfs */
106 no_irq_affinity = 1;
107#endif
108 }
109#endif
110
98 if (cap & ctl & (1 << 4)) { 111 if (cap & ctl & (1 << 4)) {
99 /* Setup irq ops and turn on vSMP IRQ fastpath handling */ 112 /* Setup irq ops and turn on vSMP IRQ fastpath handling */
100 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); 113 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
@@ -102,12 +115,11 @@ static void __init set_vsmp_pv_ops(void)
102 pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); 115 pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
103 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); 116 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
104 pv_init_ops.patch = vsmp_patch; 117 pv_init_ops.patch = vsmp_patch;
105
106 ctl &= ~(1 << 4); 118 ctl &= ~(1 << 4);
107 writel(ctl, address + 4);
108 ctl = readl(address + 4);
109 printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
110 } 119 }
120 writel(ctl, address + 4);
121 ctl = readl(address + 4);
122 pr_info("vSMP CTL: control set to:0x%08x\n", ctl);
111 123
112 early_iounmap(address, 8); 124 early_iounmap(address, 8);
113} 125}
@@ -187,12 +199,36 @@ static void __init vsmp_cap_cpus(void)
187#endif 199#endif
188} 200}
189 201
202static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
203{
204 return hard_smp_processor_id() >> index_msb;
205}
206
207/*
208 * In vSMP, all cpus should be capable of handling interrupts, regardless of
209 * the APIC used.
210 */
211static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask,
212 const struct cpumask *mask)
213{
214 cpumask_setall(retmask);
215}
216
217static void vsmp_apic_post_init(void)
218{
219 /* need to update phys_pkg_id */
220 apic->phys_pkg_id = apicid_phys_pkg_id;
221 apic->vector_allocation_domain = fill_vector_allocation_domain;
222}
223
190void __init vsmp_init(void) 224void __init vsmp_init(void)
191{ 225{
192 detect_vsmp_box(); 226 detect_vsmp_box();
193 if (!is_vsmp_box()) 227 if (!is_vsmp_box())
194 return; 228 return;
195 229
230 x86_platform.apic_post_init = vsmp_apic_post_init;
231
196 vsmp_cap_cpus(); 232 vsmp_cap_cpus();
197 233
198 set_vsmp_pv_ops(); 234 set_vsmp_pv_ops();
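
set_vsmp_pv_ops() treats a feature as usable only when it is both advertised in the capability word and still enabled in the control word (cap & ctl & BIT(n)); when the kernel takes ownership of a feature it clears that control bit, and after the restructure above the control word is written back exactly once. A rough sketch of that handshake with plain variables in place of the ioremapped CTL registers; the bit meanings (8: interrupt routing, 4: IRQ fastpath) come from the hunk, everything else is illustrative:

#include <stdio.h>
#include <stdint.h>

#define BIT(n) (1u << (n))

int main(void)
{
	/* Pretend values read from the vSMP CTL area (offsets 0 and 4). */
	uint32_t cap = BIT(4) | BIT(8);   /* features the foundation offers  */
	uint32_t ctl = BIT(4) | BIT(8);   /* features currently left enabled */

	if (cap & ctl & BIT(8)) {         /* optimal interrupt routing       */
		ctl &= ~BIT(8);           /* taken: foundation routes IRQs   */
		printf("IRQ routing handled by vSMP, affinity locked\n");
	}

	if (cap & ctl & BIT(4)) {         /* IRQ fastpath                    */
		ctl &= ~BIT(4);           /* taken: install pv_irq_ops       */
		printf("vSMP IRQ fastpath enabled\n");
	}

	/* The real code does writel(ctl, address + 4) exactly once, here. */
	printf("control word written back: 0x%08x\n", ctl);
	return 0;
}
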
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 5db36caf4289..8d141b309046 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -18,6 +18,8 @@
18 * use the vDSO. 18 * use the vDSO.
19 */ 19 */
20 20
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22
21#include <linux/time.h> 23#include <linux/time.h>
22#include <linux/init.h> 24#include <linux/init.h>
23#include <linux/kernel.h> 25#include <linux/kernel.h>
@@ -111,18 +113,13 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
111static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, 113static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
112 const char *message) 114 const char *message)
113{ 115{
114 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); 116 if (!show_unhandled_signals)
115 struct task_struct *tsk;
116
117 if (!show_unhandled_signals || !__ratelimit(&rs))
118 return; 117 return;
119 118
120 tsk = current; 119 pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
121 120 level, current->comm, task_pid_nr(current),
122 printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", 121 message, regs->ip, regs->cs,
123 level, tsk->comm, task_pid_nr(tsk), 122 regs->sp, regs->ax, regs->si, regs->di);
124 message, regs->ip, regs->cs,
125 regs->sp, regs->ax, regs->si, regs->di);
126} 123}
127 124
128static int addr_to_vsyscall_nr(unsigned long addr) 125static int addr_to_vsyscall_nr(unsigned long addr)
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 9796c2f3d074..6020f6f5927c 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -28,6 +28,7 @@ EXPORT_SYMBOL(__put_user_8);
28 28
29EXPORT_SYMBOL(copy_user_generic_string); 29EXPORT_SYMBOL(copy_user_generic_string);
30EXPORT_SYMBOL(copy_user_generic_unrolled); 30EXPORT_SYMBOL(copy_user_generic_unrolled);
31EXPORT_SYMBOL(copy_user_enhanced_fast_string);
31EXPORT_SYMBOL(__copy_user_nocache); 32EXPORT_SYMBOL(__copy_user_nocache);
32EXPORT_SYMBOL(_copy_from_user); 33EXPORT_SYMBOL(_copy_from_user);
33EXPORT_SYMBOL(_copy_to_user); 34EXPORT_SYMBOL(_copy_to_user);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 35c5e543f550..9f3167e891ef 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -29,7 +29,6 @@ void __init x86_init_uint_noop(unsigned int unused) { }
29void __init x86_init_pgd_noop(pgd_t *unused) { } 29void __init x86_init_pgd_noop(pgd_t *unused) { }
30int __init iommu_init_noop(void) { return 0; } 30int __init iommu_init_noop(void) { return 0; }
31void iommu_shutdown_noop(void) { } 31void iommu_shutdown_noop(void) { }
32void wallclock_init_noop(void) { }
33 32
34/* 33/*
35 * The platform setup functions are preset with the default functions 34 * The platform setup functions are preset with the default functions
@@ -101,7 +100,6 @@ static int default_i8042_detect(void) { return 1; };
101 100
102struct x86_platform_ops x86_platform = { 101struct x86_platform_ops x86_platform = {
103 .calibrate_tsc = native_calibrate_tsc, 102 .calibrate_tsc = native_calibrate_tsc,
104 .wallclock_init = wallclock_init_noop,
105 .get_wallclock = mach_get_cmos_time, 103 .get_wallclock = mach_get_cmos_time,
106 .set_wallclock = mach_set_rtc_mmss, 104 .set_wallclock = mach_set_rtc_mmss,
107 .iommu_shutdown = iommu_shutdown_noop, 105 .iommu_shutdown = iommu_shutdown_noop,
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index bd18149b2b0f..3d3e20709119 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -3,6 +3,9 @@
3 * 3 *
4 * Author: Suresh Siddha <suresh.b.siddha@intel.com> 4 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
5 */ 5 */
6
7#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8
6#include <linux/bootmem.h> 9#include <linux/bootmem.h>
7#include <linux/compat.h> 10#include <linux/compat.h>
8#include <asm/i387.h> 11#include <asm/i387.h>
@@ -162,7 +165,7 @@ int save_i387_xstate(void __user *buf)
162 BUG_ON(sig_xstate_size < xstate_size); 165 BUG_ON(sig_xstate_size < xstate_size);
163 166
164 if ((unsigned long)buf % 64) 167 if ((unsigned long)buf % 64)
165 printk("save_i387_xstate: bad fpstate %p\n", buf); 168 pr_err("%s: bad fpstate %p\n", __func__, buf);
166 169
167 if (!used_math()) 170 if (!used_math())
168 return 0; 171 return 0;
@@ -422,7 +425,7 @@ static void __init xstate_enable_boot_cpu(void)
422 pcntxt_mask = eax + ((u64)edx << 32); 425 pcntxt_mask = eax + ((u64)edx << 32);
423 426
424 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { 427 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
425 printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n", 428 pr_err("FP/SSE not shown under xsave features 0x%llx\n",
426 pcntxt_mask); 429 pcntxt_mask);
427 BUG(); 430 BUG();
428 } 431 }
@@ -445,9 +448,8 @@ static void __init xstate_enable_boot_cpu(void)
445 448
446 setup_xstate_init(); 449 setup_xstate_init();
447 450
448 printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, " 451 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
449 "cntxt size 0x%x\n", 452 pcntxt_mask, xstate_size);
450 pcntxt_mask, xstate_size);
451} 453}
452 454
453/* 455/*
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7df1c6d839fb..0595f1397b7c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
201 unsigned f_lm = 0; 201 unsigned f_lm = 0;
202#endif 202#endif
203 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; 203 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
204 unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
204 205
205 /* cpuid 1.edx */ 206 /* cpuid 1.edx */
206 const u32 kvm_supported_word0_x86_features = 207 const u32 kvm_supported_word0_x86_features =
@@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
228 0 /* DS-CPL, VMX, SMX, EST */ | 229 0 /* DS-CPL, VMX, SMX, EST */ |
229 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 230 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
230 F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | 231 F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
231 0 /* Reserved, DCA */ | F(XMM4_1) | 232 F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
232 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 233 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
233 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | 234 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
234 F(F16C) | F(RDRAND); 235 F(F16C) | F(RDRAND);
@@ -248,7 +249,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
248 /* cpuid 7.0.ebx */ 249 /* cpuid 7.0.ebx */
249 const u32 kvm_supported_word9_x86_features = 250 const u32 kvm_supported_word9_x86_features =
250 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | 251 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
251 F(BMI2) | F(ERMS) | F(RTM); 252 F(BMI2) | F(ERMS) | f_invpcid | F(RTM);
252 253
253 /* all calls to cpuid_count() should be made on the same cpu */ 254 /* all calls to cpuid_count() should be made on the same cpu */
254 get_cpu(); 255 get_cpu();
@@ -409,6 +410,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
409 (1 << KVM_FEATURE_NOP_IO_DELAY) | 410 (1 << KVM_FEATURE_NOP_IO_DELAY) |
410 (1 << KVM_FEATURE_CLOCKSOURCE2) | 411 (1 << KVM_FEATURE_CLOCKSOURCE2) |
411 (1 << KVM_FEATURE_ASYNC_PF) | 412 (1 << KVM_FEATURE_ASYNC_PF) |
413 (1 << KVM_FEATURE_PV_EOI) |
412 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 414 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
413 415
414 if (sched_info_on()) 416 if (sched_info_on())
@@ -639,33 +641,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
639 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); 641 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
640} 642}
641 643
642void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 644void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
643{ 645{
644 u32 function, index; 646 u32 function = *eax, index = *ecx;
645 struct kvm_cpuid_entry2 *best; 647 struct kvm_cpuid_entry2 *best;
646 648
647 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
648 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
649 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
650 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
651 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
652 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
653 best = kvm_find_cpuid_entry(vcpu, function, index); 649 best = kvm_find_cpuid_entry(vcpu, function, index);
654 650
655 if (!best) 651 if (!best)
656 best = check_cpuid_limit(vcpu, function, index); 652 best = check_cpuid_limit(vcpu, function, index);
657 653
658 if (best) { 654 if (best) {
659 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 655 *eax = best->eax;
660 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 656 *ebx = best->ebx;
661 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 657 *ecx = best->ecx;
662 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 658 *edx = best->edx;
663 } 659 } else
660 *eax = *ebx = *ecx = *edx = 0;
661}
662
663void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
664{
665 u32 function, eax, ebx, ecx, edx;
666
667 function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
668 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
669 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
670 kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
671 kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
672 kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
673 kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
664 kvm_x86_ops->skip_emulated_instruction(vcpu); 674 kvm_x86_ops->skip_emulated_instruction(vcpu);
665 trace_kvm_cpuid(function, 675 trace_kvm_cpuid(function, eax, ebx, ecx, edx);
666 kvm_register_read(vcpu, VCPU_REGS_RAX),
667 kvm_register_read(vcpu, VCPU_REGS_RBX),
668 kvm_register_read(vcpu, VCPU_REGS_RCX),
669 kvm_register_read(vcpu, VCPU_REGS_RDX));
670} 676}
671EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 677EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
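
The new f_invpcid gate and the F(PCID) bit only let KVM advertise to the guest what the host itself reports: PCID is CPUID.01H:ECX bit 17 and INVPCID is CPUID.(EAX=07H,ECX=0):EBX bit 10. A small host-side check of those two leaves using raw CPUID via inline asm (x86 only; the bit positions follow the SDM, the rest of this program is just illustration):

#include <stdio.h>
#include <stdint.h>

static void cpuid(uint32_t leaf, uint32_t subleaf,
		  uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	__asm__ volatile("cpuid"
			 : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
			 : "a"(leaf), "c"(subleaf));
}

int main(void)
{
	uint32_t a, b, c, d;

	cpuid(1, 0, &a, &b, &c, &d);
	printf("PCID    (leaf 1, ECX bit 17): %s\n", (c >> 17) & 1 ? "yes" : "no");

	cpuid(7, 0, &a, &b, &c, &d);
	printf("INVPCID (leaf 7, EBX bit 10): %s\n", (b >> 10) & 1 ? "yes" : "no");
	return 0;
}
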
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 26d1fb437eb5..a10e46016851 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
17int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 17int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
18 struct kvm_cpuid2 *cpuid, 18 struct kvm_cpuid2 *cpuid,
19 struct kvm_cpuid_entry2 __user *entries); 19 struct kvm_cpuid_entry2 __user *entries);
20void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
20 21
21 22
22static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) 23static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
@@ -51,4 +52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
51 return best && (best->ecx & bit(X86_FEATURE_OSVW)); 52 return best && (best->ecx & bit(X86_FEATURE_OSVW));
52} 53}
53 54
55static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu)
56{
57 struct kvm_cpuid_entry2 *best;
58
59 best = kvm_find_cpuid_entry(vcpu, 1, 0);
60 return best && (best->ecx & bit(X86_FEATURE_PCID));
61}
62
54#endif 63#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f95d242ee9f7..97d9a9914ba8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -433,11 +433,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
433 return ctxt->ops->intercept(ctxt, &info, stage); 433 return ctxt->ops->intercept(ctxt, &info, stage);
434} 434}
435 435
436static void assign_masked(ulong *dest, ulong src, ulong mask)
437{
438 *dest = (*dest & ~mask) | (src & mask);
439}
440
436static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) 441static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
437{ 442{
438 return (1UL << (ctxt->ad_bytes << 3)) - 1; 443 return (1UL << (ctxt->ad_bytes << 3)) - 1;
439} 444}
440 445
446static ulong stack_mask(struct x86_emulate_ctxt *ctxt)
447{
448 u16 sel;
449 struct desc_struct ss;
450
451 if (ctxt->mode == X86EMUL_MODE_PROT64)
452 return ~0UL;
453 ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS);
454 return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */
455}
456
457static int stack_size(struct x86_emulate_ctxt *ctxt)
458{
459 return (__fls(stack_mask(ctxt)) + 1) >> 3;
460}
461
441/* Access/update address held in a register, based on addressing mode. */ 462/* Access/update address held in a register, based on addressing mode. */
442static inline unsigned long 463static inline unsigned long
443address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) 464address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
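
The helpers added above reduce "how wide is the stack" to a mask: ~0U >> ((ss.d ^ 1) * 16) gives 0xffff for a 16-bit stack segment and 0xffffffff for a 32-bit one, long mode uses ~0UL, and stack_size() turns the mask back into a slot size in bytes; assign_masked() then updates only the bits the mask covers. A standalone check of the arithmetic on an LP64 host (fls is open-coded here where the kernel uses __fls):

#include <stdio.h>
#include <assert.h>

typedef unsigned long ulong;

static int fls_ulong(ulong x)              /* index of highest set bit */
{
	int i = -1;
	while (x) {
		x >>= 1;
		i++;
	}
	return i;
}

static ulong stack_mask(int long_mode, int ss_d)
{
	if (long_mode)
		return ~0UL;
	return ~0U >> ((ss_d ^ 1) * 16);   /* d=0: 0xffff; d=1: 0xffffffff */
}

static int stack_size(ulong mask)
{
	return (fls_ulong(mask) + 1) >> 3; /* bytes per stack slot */
}

static void assign_masked(ulong *dest, ulong src, ulong mask)
{
	*dest = (*dest & ~mask) | (src & mask);
}

int main(void)
{
	assert(stack_size(stack_mask(0, 0)) == 2);   /* 16-bit SS */
	assert(stack_size(stack_mask(0, 1)) == 4);   /* 32-bit SS */
	assert(stack_size(stack_mask(1, 0)) == 8);   /* 64-bit    */

	ulong rsp = 0xdead0008;
	assign_masked(&rsp, rsp - 16, stack_mask(0, 0)); /* only low 16 bits move */
	printf("rsp after 16-bit push of 16 bytes: 0x%lx\n", rsp); /* 0xdeadfff8 */
	return 0;
}
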
@@ -958,6 +979,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
958 op->orig_val = op->val; 979 op->orig_val = op->val;
959} 980}
960 981
982static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
983{
984 if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP)
985 ctxt->modrm_seg = VCPU_SREG_SS;
986}
987
961static int decode_modrm(struct x86_emulate_ctxt *ctxt, 988static int decode_modrm(struct x86_emulate_ctxt *ctxt,
962 struct operand *op) 989 struct operand *op)
963{ 990{
@@ -1061,15 +1088,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
1061 1088
1062 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) 1089 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
1063 modrm_ea += insn_fetch(s32, ctxt); 1090 modrm_ea += insn_fetch(s32, ctxt);
1064 else 1091 else {
1065 modrm_ea += ctxt->regs[base_reg]; 1092 modrm_ea += ctxt->regs[base_reg];
1093 adjust_modrm_seg(ctxt, base_reg);
1094 }
1066 if (index_reg != 4) 1095 if (index_reg != 4)
1067 modrm_ea += ctxt->regs[index_reg] << scale; 1096 modrm_ea += ctxt->regs[index_reg] << scale;
1068 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { 1097 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
1069 if (ctxt->mode == X86EMUL_MODE_PROT64) 1098 if (ctxt->mode == X86EMUL_MODE_PROT64)
1070 ctxt->rip_relative = 1; 1099 ctxt->rip_relative = 1;
1071 } else 1100 } else {
1072 modrm_ea += ctxt->regs[ctxt->modrm_rm]; 1101 base_reg = ctxt->modrm_rm;
1102 modrm_ea += ctxt->regs[base_reg];
1103 adjust_modrm_seg(ctxt, base_reg);
1104 }
1073 switch (ctxt->modrm_mod) { 1105 switch (ctxt->modrm_mod) {
1074 case 0: 1106 case 0:
1075 if (ctxt->modrm_rm == 5) 1107 if (ctxt->modrm_rm == 5)
@@ -1264,7 +1296,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1264 1296
1265/* allowed just for 8 bytes segments */ 1297/* allowed just for 8 bytes segments */
1266static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1298static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1267 u16 selector, struct desc_struct *desc) 1299 u16 selector, struct desc_struct *desc,
1300 ulong *desc_addr_p)
1268{ 1301{
1269 struct desc_ptr dt; 1302 struct desc_ptr dt;
1270 u16 index = selector >> 3; 1303 u16 index = selector >> 3;
@@ -1275,7 +1308,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1275 if (dt.size < index * 8 + 7) 1308 if (dt.size < index * 8 + 7)
1276 return emulate_gp(ctxt, selector & 0xfffc); 1309 return emulate_gp(ctxt, selector & 0xfffc);
1277 1310
1278 addr = dt.address + index * 8; 1311 *desc_addr_p = addr = dt.address + index * 8;
1279 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, 1312 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
1280 &ctxt->exception); 1313 &ctxt->exception);
1281} 1314}
@@ -1302,11 +1335,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1302static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1335static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1303 u16 selector, int seg) 1336 u16 selector, int seg)
1304{ 1337{
1305 struct desc_struct seg_desc; 1338 struct desc_struct seg_desc, old_desc;
1306 u8 dpl, rpl, cpl; 1339 u8 dpl, rpl, cpl;
1307 unsigned err_vec = GP_VECTOR; 1340 unsigned err_vec = GP_VECTOR;
1308 u32 err_code = 0; 1341 u32 err_code = 0;
1309 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ 1342 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
1343 ulong desc_addr;
1310 int ret; 1344 int ret;
1311 1345
1312 memset(&seg_desc, 0, sizeof seg_desc); 1346 memset(&seg_desc, 0, sizeof seg_desc);
@@ -1324,8 +1358,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1324 goto load; 1358 goto load;
1325 } 1359 }
1326 1360
1327 /* NULL selector is not valid for TR, CS and SS */ 1361 rpl = selector & 3;
1328 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) 1362 cpl = ctxt->ops->cpl(ctxt);
1363
1364 /* NULL selector is not valid for TR, CS and SS (except for long mode) */
1365 if ((seg == VCPU_SREG_CS
1366 || (seg == VCPU_SREG_SS
1367 && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl))
1368 || seg == VCPU_SREG_TR)
1329 && null_selector) 1369 && null_selector)
1330 goto exception; 1370 goto exception;
1331 1371
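
The null-selector test !(selector & ~0x3) works because a selector is index:TI:RPL, with bits 1:0 holding the requested privilege level and bit 2 the table indicator, so values 0 through 3 all name GDT entry 0, the null descriptor; the hunk above now permits such a load into SS in 64-bit mode when RPL equals CPL. A quick decomposition of a few selectors, purely for illustration:

#include <stdio.h>

int main(void)
{
	unsigned short sels[] = { 0x0000, 0x0003, 0x0008, 0x002b };

	for (int i = 0; i < 4; i++) {
		unsigned short sel = sels[i];
		printf("selector 0x%04x: index=%u ti=%u rpl=%u null=%d\n",
		       sel, sel >> 3, (sel >> 2) & 1, sel & 3,
		       !(sel & ~0x3));
	}
	return 0;
}
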
@@ -1336,7 +1376,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1336 if (null_selector) /* for NULL selector skip all following checks */ 1376 if (null_selector) /* for NULL selector skip all following checks */
1337 goto load; 1377 goto load;
1338 1378
1339 ret = read_segment_descriptor(ctxt, selector, &seg_desc); 1379 ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
1340 if (ret != X86EMUL_CONTINUE) 1380 if (ret != X86EMUL_CONTINUE)
1341 return ret; 1381 return ret;
1342 1382
@@ -1352,9 +1392,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1352 goto exception; 1392 goto exception;
1353 } 1393 }
1354 1394
1355 rpl = selector & 3;
1356 dpl = seg_desc.dpl; 1395 dpl = seg_desc.dpl;
1357 cpl = ctxt->ops->cpl(ctxt);
1358 1396
1359 switch (seg) { 1397 switch (seg) {
1360 case VCPU_SREG_SS: 1398 case VCPU_SREG_SS:
@@ -1384,6 +1422,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1384 case VCPU_SREG_TR: 1422 case VCPU_SREG_TR:
1385 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) 1423 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
1386 goto exception; 1424 goto exception;
1425 old_desc = seg_desc;
1426 seg_desc.type |= 2; /* busy */
1427 ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
1428 sizeof(seg_desc), &ctxt->exception);
1429 if (ret != X86EMUL_CONTINUE)
1430 return ret;
1387 break; 1431 break;
1388 case VCPU_SREG_LDTR: 1432 case VCPU_SREG_LDTR:
1389 if (seg_desc.s || seg_desc.type != 2) 1433 if (seg_desc.s || seg_desc.type != 2)
@@ -1474,17 +1518,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1474 return X86EMUL_CONTINUE; 1518 return X86EMUL_CONTINUE;
1475} 1519}
1476 1520
1477static int em_push(struct x86_emulate_ctxt *ctxt) 1521static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
1478{ 1522{
1479 struct segmented_address addr; 1523 struct segmented_address addr;
1480 1524
1481 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); 1525 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes);
1482 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); 1526 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
1483 addr.seg = VCPU_SREG_SS; 1527 addr.seg = VCPU_SREG_SS;
1484 1528
1529 return segmented_write(ctxt, addr, data, bytes);
1530}
1531
1532static int em_push(struct x86_emulate_ctxt *ctxt)
1533{
1485 /* Disable writeback. */ 1534 /* Disable writeback. */
1486 ctxt->dst.type = OP_NONE; 1535 ctxt->dst.type = OP_NONE;
1487 return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); 1536 return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
1488} 1537}
1489 1538
1490static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1539static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1556,6 +1605,33 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)
1556 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); 1605 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
1557} 1606}
1558 1607
1608static int em_enter(struct x86_emulate_ctxt *ctxt)
1609{
1610 int rc;
1611 unsigned frame_size = ctxt->src.val;
1612 unsigned nesting_level = ctxt->src2.val & 31;
1613
1614 if (nesting_level)
1615 return X86EMUL_UNHANDLEABLE;
1616
1617 rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt));
1618 if (rc != X86EMUL_CONTINUE)
1619 return rc;
1620 assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP],
1621 stack_mask(ctxt));
1622 assign_masked(&ctxt->regs[VCPU_REGS_RSP],
1623 ctxt->regs[VCPU_REGS_RSP] - frame_size,
1624 stack_mask(ctxt));
1625 return X86EMUL_CONTINUE;
1626}
1627
1628static int em_leave(struct x86_emulate_ctxt *ctxt)
1629{
1630 assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP],
1631 stack_mask(ctxt));
1632 return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes);
1633}
1634
1559static int em_push_sreg(struct x86_emulate_ctxt *ctxt) 1635static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
1560{ 1636{
1561 int seg = ctxt->src2.val; 1637 int seg = ctxt->src2.val;
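
em_enter() only handles nesting level 0, the common ENTER imm16,0 form: push RBP, copy RSP into RBP, then drop RSP by the frame size, with every update masked to the current stack width; em_leave() is the inverse. A sketch of that register dance on a toy array "stack" with a 32-bit stack mask; the array indexing and slot size are inventions of this sketch, not the emulator's real memory plumbing:

#include <stdio.h>
#include <stdint.h>

#define STACK_MASK 0xffffffffUL           /* 32-bit SS, as stack_mask() would return */

static uint64_t rsp = 0x7000, rbp = 0x1234;
static uint64_t stack[0x10000];           /* toy stack, indexed by rsp/4 */

static void assign_masked(uint64_t *dest, uint64_t src, uint64_t mask)
{
	*dest = (*dest & ~mask) | (src & mask);
}

static void em_enter(uint16_t frame_size)         /* ENTER frame_size, 0 */
{
	assign_masked(&rsp, rsp - 4, STACK_MASK);     /* push rbp (4-byte slots) */
	stack[rsp / 4] = rbp;
	assign_masked(&rbp, rsp, STACK_MASK);         /* rbp = rsp               */
	assign_masked(&rsp, rsp - frame_size, STACK_MASK);
}

static void em_leave(void)                        /* LEAVE */
{
	assign_masked(&rsp, rbp, STACK_MASK);         /* rsp = rbp               */
	rbp = stack[rsp / 4];                         /* pop rbp                 */
	assign_masked(&rsp, rsp + 4, STACK_MASK);
}

int main(void)
{
	em_enter(32);
	printf("after ENTER 32,0: rsp=0x%llx rbp=0x%llx\n",
	       (unsigned long long)rsp, (unsigned long long)rbp);
	em_leave();
	printf("after LEAVE:      rsp=0x%llx rbp=0x%llx\n",
	       (unsigned long long)rsp, (unsigned long long)rbp);  /* back to 0x7000/0x1234 */
	return 0;
}
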
@@ -1993,8 +2069,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
1993 u32 eax, ebx, ecx, edx; 2069 u32 eax, ebx, ecx, edx;
1994 2070
1995 eax = ecx = 0; 2071 eax = ecx = 0;
1996 return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) 2072 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
1997 && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 2073 return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
1998 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 2074 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
1999 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; 2075 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
2000} 2076}
@@ -2013,32 +2089,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
2013 2089
2014 eax = 0x00000000; 2090 eax = 0x00000000;
2015 ecx = 0x00000000; 2091 ecx = 0x00000000;
2016 if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { 2092 ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
2017 /* 2093 /*
2018 * Intel ("GenuineIntel") 2094 * Intel ("GenuineIntel")
2019 * remark: Intel CPUs only support "syscall" in 64bit 2095 * remark: Intel CPUs only support "syscall" in 64bit
2020 * longmode. Also an 64bit guest with a 2096 * longmode. Also an 64bit guest with a
2021 * 32bit compat-app running will #UD !! While this 2097 * 32bit compat-app running will #UD !! While this
2022 * behaviour can be fixed (by emulating) into AMD 2098 * behaviour can be fixed (by emulating) into AMD
2023 * response - CPUs of AMD can't behave like Intel. 2099 * response - CPUs of AMD can't behave like Intel.
2024 */ 2100 */
2025 if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && 2101 if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
2026 ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && 2102 ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
2027 edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) 2103 edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
2028 return false; 2104 return false;
2029 2105
2030 /* AMD ("AuthenticAMD") */ 2106 /* AMD ("AuthenticAMD") */
2031 if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && 2107 if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
2032 ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && 2108 ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
2033 edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) 2109 edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
2034 return true; 2110 return true;
2035 2111
2036 /* AMD ("AMDisbetter!") */ 2112 /* AMD ("AMDisbetter!") */
2037 if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && 2113 if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
2038 ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && 2114 ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
2039 edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) 2115 edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
2040 return true; 2116 return true;
2041 }
2042 2117
2043 /* default: (not Intel, not AMD), apply Intel's stricter rules... */ 2118 /* default: (not Intel, not AMD), apply Intel's stricter rules... */
2044 return false; 2119 return false;
@@ -2547,13 +2622,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2547 ulong old_tss_base = 2622 ulong old_tss_base =
2548 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); 2623 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
2549 u32 desc_limit; 2624 u32 desc_limit;
2625 ulong desc_addr;
2550 2626
2551 /* FIXME: old_tss_base == ~0 ? */ 2627 /* FIXME: old_tss_base == ~0 ? */
2552 2628
2553 ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); 2629 ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);
2554 if (ret != X86EMUL_CONTINUE) 2630 if (ret != X86EMUL_CONTINUE)
2555 return ret; 2631 return ret;
2556 ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); 2632 ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);
2557 if (ret != X86EMUL_CONTINUE) 2633 if (ret != X86EMUL_CONTINUE)
2558 return ret; 2634 return ret;
2559 2635
@@ -2948,6 +3024,24 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
2948 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); 3024 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
2949} 3025}
2950 3026
3027static int em_lldt(struct x86_emulate_ctxt *ctxt)
3028{
3029 u16 sel = ctxt->src.val;
3030
3031 /* Disable writeback. */
3032 ctxt->dst.type = OP_NONE;
3033 return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
3034}
3035
3036static int em_ltr(struct x86_emulate_ctxt *ctxt)
3037{
3038 u16 sel = ctxt->src.val;
3039
3040 /* Disable writeback. */
3041 ctxt->dst.type = OP_NONE;
3042 return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR);
3043}
3044
2951static int em_invlpg(struct x86_emulate_ctxt *ctxt) 3045static int em_invlpg(struct x86_emulate_ctxt *ctxt)
2952{ 3046{
2953 int rc; 3047 int rc;
@@ -2989,11 +3083,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2989 return X86EMUL_CONTINUE; 3083 return X86EMUL_CONTINUE;
2990} 3084}
2991 3085
3086static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
3087 void (*get)(struct x86_emulate_ctxt *ctxt,
3088 struct desc_ptr *ptr))
3089{
3090 struct desc_ptr desc_ptr;
3091
3092 if (ctxt->mode == X86EMUL_MODE_PROT64)
3093 ctxt->op_bytes = 8;
3094 get(ctxt, &desc_ptr);
3095 if (ctxt->op_bytes == 2) {
3096 ctxt->op_bytes = 4;
3097 desc_ptr.address &= 0x00ffffff;
3098 }
3099 /* Disable writeback. */
3100 ctxt->dst.type = OP_NONE;
3101 return segmented_write(ctxt, ctxt->dst.addr.mem,
3102 &desc_ptr, 2 + ctxt->op_bytes);
3103}
3104
3105static int em_sgdt(struct x86_emulate_ctxt *ctxt)
3106{
3107 return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt);
3108}
3109
3110static int em_sidt(struct x86_emulate_ctxt *ctxt)
3111{
3112 return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
3113}
3114
2992static int em_lgdt(struct x86_emulate_ctxt *ctxt) 3115static int em_lgdt(struct x86_emulate_ctxt *ctxt)
2993{ 3116{
2994 struct desc_ptr desc_ptr; 3117 struct desc_ptr desc_ptr;
2995 int rc; 3118 int rc;
2996 3119
3120 if (ctxt->mode == X86EMUL_MODE_PROT64)
3121 ctxt->op_bytes = 8;
2997 rc = read_descriptor(ctxt, ctxt->src.addr.mem, 3122 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
2998 &desc_ptr.size, &desc_ptr.address, 3123 &desc_ptr.size, &desc_ptr.address,
2999 ctxt->op_bytes); 3124 ctxt->op_bytes);
@@ -3021,6 +3146,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
3021 struct desc_ptr desc_ptr; 3146 struct desc_ptr desc_ptr;
3022 int rc; 3147 int rc;
3023 3148
3149 if (ctxt->mode == X86EMUL_MODE_PROT64)
3150 ctxt->op_bytes = 8;
3024 rc = read_descriptor(ctxt, ctxt->src.addr.mem, 3151 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
3025 &desc_ptr.size, &desc_ptr.address, 3152 &desc_ptr.size, &desc_ptr.address,
3026 ctxt->op_bytes); 3153 ctxt->op_bytes);
@@ -3143,6 +3270,42 @@ static int em_bsr(struct x86_emulate_ctxt *ctxt)
3143 return X86EMUL_CONTINUE; 3270 return X86EMUL_CONTINUE;
3144} 3271}
3145 3272
3273static int em_cpuid(struct x86_emulate_ctxt *ctxt)
3274{
3275 u32 eax, ebx, ecx, edx;
3276
3277 eax = ctxt->regs[VCPU_REGS_RAX];
3278 ecx = ctxt->regs[VCPU_REGS_RCX];
3279 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
3280 ctxt->regs[VCPU_REGS_RAX] = eax;
3281 ctxt->regs[VCPU_REGS_RBX] = ebx;
3282 ctxt->regs[VCPU_REGS_RCX] = ecx;
3283 ctxt->regs[VCPU_REGS_RDX] = edx;
3284 return X86EMUL_CONTINUE;
3285}
3286
3287static int em_lahf(struct x86_emulate_ctxt *ctxt)
3288{
3289 ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL;
3290 ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8;
3291 return X86EMUL_CONTINUE;
3292}
3293
3294static int em_bswap(struct x86_emulate_ctxt *ctxt)
3295{
3296 switch (ctxt->op_bytes) {
3297#ifdef CONFIG_X86_64
3298 case 8:
3299 asm("bswap %0" : "+r"(ctxt->dst.val));
3300 break;
3301#endif
3302 default:
3303 asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val));
3304 break;
3305 }
3306 return X86EMUL_CONTINUE;
3307}
3308
3146static bool valid_cr(int nr) 3309static bool valid_cr(int nr)
3147{ 3310{
3148 switch (nr) { 3311 switch (nr) {
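
Two of the new handlers are pure register arithmetic: em_lahf() copies the low byte of EFLAGS into AH without touching the rest of RAX, and em_bswap() byte-swaps either the low 32 bits or the whole 64-bit destination depending on operand size. The same operations in plain C, with compiler builtins standing in for the inline bswap asm:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* LAHF: AH <- the low 8 bits of EFLAGS (SF, ZF, AF, PF, CF and fixed bits). */
	uint64_t rax    = 0xdeadbeef00000042ULL;
	uint64_t eflags = 0x0000000000000246ULL;   /* ZF and PF set */

	rax &= ~0xff00ULL;
	rax |= (eflags & 0xff) << 8;
	printf("RAX after LAHF: 0x%016llx\n", (unsigned long long)rax);

	/* BSWAP with 32-bit vs 64-bit operand size. */
	uint64_t val = 0x1122334455667788ULL;
	printf("bswap32(low half): 0x%08x\n", __builtin_bswap32((uint32_t)val));
	printf("bswap64(whole)   : 0x%016llx\n",
	       (unsigned long long)__builtin_bswap64(val));
	return 0;
}
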
@@ -3424,14 +3587,14 @@ static struct opcode group5[] = {
3424static struct opcode group6[] = { 3587static struct opcode group6[] = {
3425 DI(Prot, sldt), 3588 DI(Prot, sldt),
3426 DI(Prot, str), 3589 DI(Prot, str),
3427 DI(Prot | Priv, lldt), 3590 II(Prot | Priv | SrcMem16, em_lldt, lldt),
3428 DI(Prot | Priv, ltr), 3591 II(Prot | Priv | SrcMem16, em_ltr, ltr),
3429 N, N, N, N, 3592 N, N, N, N,
3430}; 3593};
3431 3594
3432static struct group_dual group7 = { { 3595static struct group_dual group7 = { {
3433 DI(Mov | DstMem | Priv, sgdt), 3596 II(Mov | DstMem | Priv, em_sgdt, sgdt),
3434 DI(Mov | DstMem | Priv, sidt), 3597 II(Mov | DstMem | Priv, em_sidt, sidt),
3435 II(SrcMem | Priv, em_lgdt, lgdt), 3598 II(SrcMem | Priv, em_lgdt, lgdt),
3436 II(SrcMem | Priv, em_lidt, lidt), 3599 II(SrcMem | Priv, em_lidt, lidt),
3437 II(SrcNone | DstMem | Mov, em_smsw, smsw), N, 3600 II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
@@ -3538,7 +3701,7 @@ static struct opcode opcode_table[256] = {
3538 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), 3701 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
3539 I(SrcImmFAddr | No64, em_call_far), N, 3702 I(SrcImmFAddr | No64, em_call_far), N,
3540 II(ImplicitOps | Stack, em_pushf, pushf), 3703 II(ImplicitOps | Stack, em_pushf, pushf),
3541 II(ImplicitOps | Stack, em_popf, popf), N, N, 3704 II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf),
3542 /* 0xA0 - 0xA7 */ 3705 /* 0xA0 - 0xA7 */
3543 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 3706 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
3544 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), 3707 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
@@ -3561,7 +3724,8 @@ static struct opcode opcode_table[256] = {
3561 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), 3724 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
3562 G(ByteOp, group11), G(0, group11), 3725 G(ByteOp, group11), G(0, group11),
3563 /* 0xC8 - 0xCF */ 3726 /* 0xC8 - 0xCF */
3564 N, N, N, I(ImplicitOps | Stack, em_ret_far), 3727 I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
3728 N, I(ImplicitOps | Stack, em_ret_far),
3565 D(ImplicitOps), DI(SrcImmByte, intn), 3729 D(ImplicitOps), DI(SrcImmByte, intn),
3566 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), 3730 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
3567 /* 0xD0 - 0xD7 */ 3731 /* 0xD0 - 0xD7 */
@@ -3635,7 +3799,7 @@ static struct opcode twobyte_table[256] = {
3635 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3799 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
3636 /* 0xA0 - 0xA7 */ 3800 /* 0xA0 - 0xA7 */
3637 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), 3801 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
3638 DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), 3802 II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
3639 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3803 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3640 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 3804 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
3641 /* 0xA8 - 0xAF */ 3805 /* 0xA8 - 0xAF */
@@ -3658,11 +3822,12 @@ static struct opcode twobyte_table[256] = {
3658 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), 3822 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
3659 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), 3823 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
3660 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 3824 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3661 /* 0xC0 - 0xCF */ 3825 /* 0xC0 - 0xC7 */
3662 D2bv(DstMem | SrcReg | ModRM | Lock), 3826 D2bv(DstMem | SrcReg | ModRM | Lock),
3663 N, D(DstMem | SrcReg | ModRM | Mov), 3827 N, D(DstMem | SrcReg | ModRM | Mov),
3664 N, N, N, GD(0, &group9), 3828 N, N, N, GD(0, &group9),
3665 N, N, N, N, N, N, N, N, 3829 /* 0xC8 - 0xCF */
3830 X8(I(DstReg, em_bswap)),
3666 /* 0xD0 - 0xDF */ 3831 /* 0xD0 - 0xDF */
3667 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 3832 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
3668 /* 0xE0 - 0xEF */ 3833 /* 0xE0 - 0xEF */
@@ -4426,12 +4591,12 @@ twobyte_insn:
4426 break; 4591 break;
4427 case 0xb6 ... 0xb7: /* movzx */ 4592 case 0xb6 ... 0xb7: /* movzx */
4428 ctxt->dst.bytes = ctxt->op_bytes; 4593 ctxt->dst.bytes = ctxt->op_bytes;
4429 ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val 4594 ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
4430 : (u16) ctxt->src.val; 4595 : (u16) ctxt->src.val;
4431 break; 4596 break;
4432 case 0xbe ... 0xbf: /* movsx */ 4597 case 0xbe ... 0xbf: /* movsx */
4433 ctxt->dst.bytes = ctxt->op_bytes; 4598 ctxt->dst.bytes = ctxt->op_bytes;
4434 ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : 4599 ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
4435 (s16) ctxt->src.val; 4600 (s16) ctxt->src.val;
4436 break; 4601 break;
4437 case 0xc0 ... 0xc1: /* xadd */ 4602 case 0xc0 ... 0xc1: /* xadd */
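
The movzx/movsx change keys the extension on the decoded source size (ctxt->src.bytes, i.e. SrcMem8 vs SrcMem16 in the opcode table) rather than on the ByteOp flag, presumably because ByteOp is not set for these two-byte forms. What the two extensions produce for 8- and 16-bit sources with the sign bit set, as a quick standalone check:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t src = 0x80;          /* 8-bit source, sign bit set  */

	printf("movzx r32, r/m8 : 0x%08x\n", (uint32_t)(uint8_t)src);
	printf("movsx r32, r/m8 : 0x%08x\n", (uint32_t)(int8_t)src);   /* 0xffffff80 */

	src = 0x8000;                 /* 16-bit source, sign bit set */
	printf("movzx r32, r/m16: 0x%08x\n", (uint32_t)(uint16_t)src);
	printf("movsx r32, r/m16: 0x%08x\n", (uint32_t)(int16_t)src);  /* 0xffff8000 */
	return 0;
}
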
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 81cf4fa4a2be..1df8fb9e1d5d 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s)
188 pic_unlock(s); 188 pic_unlock(s);
189} 189}
190 190
191int kvm_pic_set_irq(void *opaque, int irq, int level) 191int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
192{ 192{
193 struct kvm_pic *s = opaque;
194 int ret = -1; 193 int ret = -1;
195 194
196 pic_lock(s); 195 pic_lock(s);
197 if (irq >= 0 && irq < PIC_NUM_PINS) { 196 if (irq >= 0 && irq < PIC_NUM_PINS) {
198 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 197 int irq_level = __kvm_irq_line_state(&s->irq_states[irq],
198 irq_source_id, level);
199 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
199 pic_update_irq(s); 200 pic_update_irq(s);
200 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 201 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
201 s->pics[irq >> 3].imr, ret == 0); 202 s->pics[irq >> 3].imr, ret == 0);
@@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
205 return ret; 206 return ret;
206} 207}
207 208
209void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
210{
211 int i;
212
213 pic_lock(s);
214 for (i = 0; i < PIC_NUM_PINS; i++)
215 __clear_bit(irq_source_id, &s->irq_states[i]);
216 pic_unlock(s);
217}
218
208/* 219/*
209 * acknowledge interrupt 'irq' 220 * acknowledge interrupt 'irq'
210 */ 221 */
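
kvm_pic_set_irq() now tracks, per pin, a bitmask of which interrupt sources are driving the line; the pin's effective level is the OR of all sources, and kvm_pic_clear_all() drops one source from every pin when that source goes away. A sketch of that bookkeeping, assuming __kvm_irq_line_state() sets or clears the source's bit and returns the resulting level, which is what its use here suggests; the helper itself is not part of this diff:

#include <stdio.h>

#define PIC_NUM_PINS 16

static unsigned long irq_states[PIC_NUM_PINS];   /* one bit per irq source */

static int irq_line_state(unsigned long *state, int src_id, int level)
{
	if (level)
		*state |= 1UL << src_id;
	else
		*state &= ~(1UL << src_id);
	return !!*state;           /* pin is high while any source drives it */
}

int main(void)
{
	int pin = 4;

	printf("src0 raises: level=%d\n", irq_line_state(&irq_states[pin], 0, 1));
	printf("src1 raises: level=%d\n", irq_line_state(&irq_states[pin], 1, 1));
	printf("src0 lowers: level=%d\n", irq_line_state(&irq_states[pin], 0, 0)); /* still 1 */
	printf("src1 lowers: level=%d\n", irq_line_state(&irq_states[pin], 1, 0)); /* now 0  */
	return 0;
}
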
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93c15743f1ee..ce878788a39f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -107,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap)
107 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 107 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
108} 108}
109 109
110static inline int __apic_test_and_set_vector(int vec, void *bitmap)
111{
112 return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
113}
114
115static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
116{
117 return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
118}
119
110static inline int apic_hw_enabled(struct kvm_lapic *apic) 120static inline int apic_hw_enabled(struct kvm_lapic *apic)
111{ 121{
112 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; 122 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
@@ -210,6 +220,16 @@ static int find_highest_vector(void *bitmap)
210 return fls(word[word_offset << 2]) - 1 + (word_offset << 5); 220 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
211} 221}
212 222
223static u8 count_vectors(void *bitmap)
224{
225 u32 *word = bitmap;
226 int word_offset;
227 u8 count = 0;
228 for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset)
229 count += hweight32(word[word_offset << 2]);
230 return count;
231}
232
213static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 233static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
214{ 234{
215 apic->irr_pending = true; 235 apic->irr_pending = true;
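
The local APIC keeps IRR/ISR/TMR as 256-bit bitmaps split across eight 32-bit registers spaced 16 bytes apart in the register page, which is why the code indexes word[word_offset << 2] when treating the page as a u32 array; count_vectors() simply popcounts those eight words. A standalone version of that walk (MAX_APIC_VECTOR is 256 in the real code; the test values are made up):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define MAX_APIC_VECTOR 256

static uint8_t count_vectors(void *bitmap)
{
	uint32_t *word = bitmap;
	uint8_t count = 0;

	/* Registers are 16 bytes apart, so step by 4 u32s and read the first. */
	for (int off = 0; off < MAX_APIC_VECTOR >> 5; ++off)
		count += __builtin_popcount(word[off << 2]);
	return count;
}

int main(void)
{
	uint32_t isr[32];                 /* 8 registers x 16 bytes = 128 bytes */

	memset(isr, 0, sizeof(isr));
	isr[0 << 2] = 1u << 3;            /* vector 3   */
	isr[1 << 2] = 1u << 0;            /* vector 32  */
	isr[7 << 2] = 1u << 31;           /* vector 255 */

	printf("vectors set in ISR: %u\n", count_vectors(isr));   /* 3 */
	return 0;
}
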
@@ -242,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
242 apic->irr_pending = true; 262 apic->irr_pending = true;
243} 263}
244 264
265static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
266{
267 if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
268 ++apic->isr_count;
269 BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
270 /*
271 * ISR (in service register) bit is set when injecting an interrupt.
272 * The highest vector is injected. Thus the latest bit set matches
273 * the highest bit in ISR.
274 */
275 apic->highest_isr_cache = vec;
276}
277
278static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
279{
280 if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
281 --apic->isr_count;
282 BUG_ON(apic->isr_count < 0);
283 apic->highest_isr_cache = -1;
284}
285
245int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 286int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
246{ 287{
247 struct kvm_lapic *apic = vcpu->arch.apic; 288 struct kvm_lapic *apic = vcpu->arch.apic;
@@ -270,9 +311,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
270 irq->level, irq->trig_mode); 311 irq->level, irq->trig_mode);
271} 312}
272 313
314static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
315{
316
317 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
318 sizeof(val));
319}
320
321static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
322{
323
324 return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
325 sizeof(*val));
326}
327
328static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
329{
330 return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
331}
332
333static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
334{
335 u8 val;
336 if (pv_eoi_get_user(vcpu, &val) < 0)
337 apic_debug("Can't read EOI MSR value: 0x%llx\n",
338 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
339 return val & 0x1;
340}
341
342static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
343{
344 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
345 apic_debug("Can't set EOI MSR value: 0x%llx\n",
346 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
347 return;
348 }
349 __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
350}
351
352static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
353{
354 if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
355 apic_debug("Can't clear EOI MSR value: 0x%llx\n",
356 (unsigned long long)vcpu->arch.pv_eoi.msr_val);
357 return;
358 }
359 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
360}
361
273static inline int apic_find_highest_isr(struct kvm_lapic *apic) 362static inline int apic_find_highest_isr(struct kvm_lapic *apic)
274{ 363{
275 int result; 364 int result;
365 if (!apic->isr_count)
366 return -1;
367 if (likely(apic->highest_isr_cache != -1))
368 return apic->highest_isr_cache;
276 369
277 result = find_highest_vector(apic->regs + APIC_ISR); 370 result = find_highest_vector(apic->regs + APIC_ISR);
278 ASSERT(result == -1 || result >= 16); 371 ASSERT(result == -1 || result >= 16);
@@ -482,17 +575,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
482 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; 575 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
483} 576}
484 577
485static void apic_set_eoi(struct kvm_lapic *apic) 578static int apic_set_eoi(struct kvm_lapic *apic)
486{ 579{
487 int vector = apic_find_highest_isr(apic); 580 int vector = apic_find_highest_isr(apic);
581
582 trace_kvm_eoi(apic, vector);
583
488 /* 584 /*
489 * Not every write EOI will has corresponding ISR, 585 * Not every write EOI will has corresponding ISR,
490 * one example is when Kernel check timer on setup_IO_APIC 586 * one example is when Kernel check timer on setup_IO_APIC
491 */ 587 */
492 if (vector == -1) 588 if (vector == -1)
493 return; 589 return vector;
494 590
495 apic_clear_vector(vector, apic->regs + APIC_ISR); 591 apic_clear_isr(vector, apic);
496 apic_update_ppr(apic); 592 apic_update_ppr(apic);
497 593
498 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && 594 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
@@ -505,6 +601,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
505 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 601 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
506 } 602 }
507 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 603 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
604 return vector;
508} 605}
509 606
510static void apic_send_ipi(struct kvm_lapic *apic) 607static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1081,10 +1178,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1081 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 1178 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
1082 } 1179 }
1083 apic->irr_pending = false; 1180 apic->irr_pending = false;
1181 apic->isr_count = 0;
1182 apic->highest_isr_cache = -1;
1084 update_divide_count(apic); 1183 update_divide_count(apic);
1085 atomic_set(&apic->lapic_timer.pending, 0); 1184 atomic_set(&apic->lapic_timer.pending, 0);
1086 if (kvm_vcpu_is_bsp(vcpu)) 1185 if (kvm_vcpu_is_bsp(vcpu))
1087 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 1186 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
1187 vcpu->arch.pv_eoi.msr_val = 0;
1088 apic_update_ppr(apic); 1188 apic_update_ppr(apic);
1089 1189
1090 vcpu->arch.apic_arb_prio = 0; 1190 vcpu->arch.apic_arb_prio = 0;
@@ -1248,7 +1348,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1248 if (vector == -1) 1348 if (vector == -1)
1249 return -1; 1349 return -1;
1250 1350
1251 apic_set_vector(vector, apic->regs + APIC_ISR); 1351 apic_set_isr(vector, apic);
1252 apic_update_ppr(apic); 1352 apic_update_ppr(apic);
1253 apic_clear_irr(vector, apic); 1353 apic_clear_irr(vector, apic);
1254 return vector; 1354 return vector;
@@ -1267,6 +1367,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1267 update_divide_count(apic); 1367 update_divide_count(apic);
1268 start_apic_timer(apic); 1368 start_apic_timer(apic);
1269 apic->irr_pending = true; 1369 apic->irr_pending = true;
1370 apic->isr_count = count_vectors(apic->regs + APIC_ISR);
1371 apic->highest_isr_cache = -1;
1270 kvm_make_request(KVM_REQ_EVENT, vcpu); 1372 kvm_make_request(KVM_REQ_EVENT, vcpu);
1271} 1373}
1272 1374
@@ -1283,11 +1385,51 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1283 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 1385 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1284} 1386}
1285 1387
1388/*
1389 * apic_sync_pv_eoi_from_guest - called on vmexit or interrupt cancellation
1390 *
1391 * Detect whether the guest triggered PV EOI since the
1392 * last entry. If yes, set EOI on the guest's behalf.
1393 * Clear PV EOI in guest memory in any case.
1394 */
1395static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
1396 struct kvm_lapic *apic)
1397{
1398 bool pending;
1399 int vector;
1400 /*
1401 * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
1402 * and KVM_PV_EOI_ENABLED in guest memory as follows:
1403 *
1404 * KVM_APIC_PV_EOI_PENDING is unset:
1405 * -> host disabled PV EOI.
1406 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
1407 * -> host enabled PV EOI, guest did not execute EOI yet.
1408 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
1409 * -> host enabled PV EOI, guest executed EOI.
1410 */
1411 BUG_ON(!pv_eoi_enabled(vcpu));
1412 pending = pv_eoi_get_pending(vcpu);
1413 /*
1414 * Clear pending bit in any case: it will be set again on vmentry.
1415 * While this might not be ideal from a performance point of view,
1416 * this makes sure pv eoi is only enabled when we know it's safe.
1417 */
1418 pv_eoi_clr_pending(vcpu);
1419 if (pending)
1420 return;
1421 vector = apic_set_eoi(apic);
1422 trace_kvm_pv_eoi(apic, vector);
1423}
1424
1286void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) 1425void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1287{ 1426{
1288 u32 data; 1427 u32 data;
1289 void *vapic; 1428 void *vapic;
1290 1429
1430 if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
1431 apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
1432
1291 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 1433 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1292 return; 1434 return;
1293 1435
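
The state machine in apic_sync_pv_eoi_from_guest() boils down to: the host arms the flag in guest memory before entry and records that in KVM_APIC_PV_EOI_PENDING; on exit, if the guest has cleared the flag it performed a paravirtual EOI and the host must complete it, otherwise nothing happened; either way the flag is disarmed and only rearmed on the next entry. A compressed sketch of that decision, with plain bools standing in for the MSR-backed guest word and the attention bit:

#include <stdio.h>
#include <stdbool.h>

/* Returns true when the host must perform the EOI on the guest's behalf. */
static bool sync_pv_eoi_from_guest(bool host_armed, bool *guest_flag)
{
	if (!host_armed)                /* PV EOI was never offered this entry  */
		return false;

	bool still_set = *guest_flag;   /* did the guest consume it?            */
	*guest_flag = false;            /* always disarm; rearmed on next entry */

	return !still_set;              /* cleared by guest => guest EOI'd      */
}

int main(void)
{
	bool flag;

	flag = true;                    /* guest did not EOI */
	printf("guest idle : inject EOI? %d\n", sync_pv_eoi_from_guest(true, &flag));

	flag = false;                   /* guest cleared the flag via PV EOI */
	printf("guest EOI'd: inject EOI? %d\n", sync_pv_eoi_from_guest(true, &flag));
	return 0;
}
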
@@ -1298,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1298 apic_set_tpr(vcpu->arch.apic, data & 0xff); 1440 apic_set_tpr(vcpu->arch.apic, data & 0xff);
1299} 1441}
1300 1442
1443/*
1444 * apic_sync_pv_eoi_to_guest - called before vmentry
1445 *
1446 * Detect whether it's safe to enable PV EOI and
1447 * if yes do so.
1448 */
1449static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
1450 struct kvm_lapic *apic)
1451{
1452 if (!pv_eoi_enabled(vcpu) ||
1453 /* IRR set or many bits in ISR: could be nested. */
1454 apic->irr_pending ||
1455 /* Cache not set: could be safe but we don't bother. */
1456 apic->highest_isr_cache == -1 ||
1457 /* Need EOI to update ioapic. */
1458 kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
1459 /*
1460 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
1461 * so we need not do anything here.
1462 */
1463 return;
1464 }
1465
1466 pv_eoi_set_pending(apic->vcpu);
1467}
1468
1301void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) 1469void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1302{ 1470{
1303 u32 data, tpr; 1471 u32 data, tpr;
1304 int max_irr, max_isr; 1472 int max_irr, max_isr;
1305 struct kvm_lapic *apic; 1473 struct kvm_lapic *apic = vcpu->arch.apic;
1306 void *vapic; 1474 void *vapic;
1307 1475
1476 apic_sync_pv_eoi_to_guest(vcpu, apic);
1477
1308 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 1478 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1309 return; 1479 return;
1310 1480
1311 apic = vcpu->arch.apic;
1312 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; 1481 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
1313 max_irr = apic_find_highest_irr(apic); 1482 max_irr = apic_find_highest_irr(apic);
1314 if (max_irr < 0) 1483 if (max_irr < 0)
@@ -1394,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
1394 1563
1395 return 0; 1564 return 0;
1396} 1565}
1566
1567int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
1568{
1569 u64 addr = data & ~KVM_MSR_ENABLED;
1570 if (!IS_ALIGNED(addr, 4))
1571 return 1;
1572
1573 vcpu->arch.pv_eoi.msr_val = data;
1574 if (!pv_eoi_enabled(vcpu))
1575 return 0;
1576 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
1577 addr);
1578}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 6f4ce2575d09..4af5405ae1e2 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,6 +13,15 @@ struct kvm_lapic {
13 u32 divide_count; 13 u32 divide_count;
14 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
15 bool irr_pending; 15 bool irr_pending;
16 /* Number of bits set in ISR. */
17 s16 isr_count;
18 /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
19 int highest_isr_cache;
20 /**
21 * APIC register page. The layout matches the register layout seen by
22 * the guest 1:1, because it is accessed by the vmx microcode.
23 * Note: Only one register, the TPR, is used by the microcode.
24 */
16 void *regs; 25 void *regs;
17 gpa_t vapic_addr; 26 gpa_t vapic_addr;
18 struct page *vapic_page; 27 struct page *vapic_page;
@@ -60,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
60{ 69{
61 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; 70 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
62} 71}
72
73int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
63#endif 74#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 57e168e27b5b..01ca00423938 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -90,7 +90,7 @@ module_param(dbg, bool, 0644);
90 90
91#define PTE_PREFETCH_NUM 8 91#define PTE_PREFETCH_NUM 8
92 92
93#define PT_FIRST_AVAIL_BITS_SHIFT 9 93#define PT_FIRST_AVAIL_BITS_SHIFT 10
94#define PT64_SECOND_AVAIL_BITS_SHIFT 52 94#define PT64_SECOND_AVAIL_BITS_SHIFT 52
95 95
96#define PT64_LEVEL_BITS 9 96#define PT64_LEVEL_BITS 9
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
145#define CREATE_TRACE_POINTS 145#define CREATE_TRACE_POINTS
146#include "mmutrace.h" 146#include "mmutrace.h"
147 147
148#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) 148#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
149#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
149 150
150#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 151#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
151 152
@@ -188,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask;
188static u64 __read_mostly shadow_mmio_mask; 189static u64 __read_mostly shadow_mmio_mask;
189 190
190static void mmu_spte_set(u64 *sptep, u64 spte); 191static void mmu_spte_set(u64 *sptep, u64 spte);
192static void mmu_free_roots(struct kvm_vcpu *vcpu);
191 193
192void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) 194void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
193{ 195{
@@ -444,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
444} 446}
445#endif 447#endif
446 448
449static bool spte_is_locklessly_modifiable(u64 spte)
450{
451 return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
452}
453
447static bool spte_has_volatile_bits(u64 spte) 454static bool spte_has_volatile_bits(u64 spte)
448{ 455{
456 /*
457 * Always atomically update the spte if it can be updated
458 * out of mmu-lock: this ensures the dirty bit is not lost,
459 * and it also gives us a stable is_writable_pte()
460 * so that a needed TLB flush is not missed.
461 */
462 if (spte_is_locklessly_modifiable(spte))
463 return true;
464
449 if (!shadow_accessed_mask) 465 if (!shadow_accessed_mask)
450 return false; 466 return false;
451 467
@@ -478,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
478 494
479/* Rules for using mmu_spte_update: 495/* Rules for using mmu_spte_update:
480 * Update the state bits, it means the mapped pfn is not changged. 496 * Update the state bits, it means the mapped pfn is not changged.
497 *
498 * Whenever we overwrite a writable spte with a read-only one we
499 * should flush remote TLBs. Otherwise rmap_write_protect
500 * will find a read-only spte, even though the writable spte
501 * might be cached on a CPU's TLB, the return value indicates this
502 * case.
481 */ 503 */
482static void mmu_spte_update(u64 *sptep, u64 new_spte) 504static bool mmu_spte_update(u64 *sptep, u64 new_spte)
483{ 505{
484 u64 mask, old_spte = *sptep; 506 u64 old_spte = *sptep;
507 bool ret = false;
485 508
486 WARN_ON(!is_rmap_spte(new_spte)); 509 WARN_ON(!is_rmap_spte(new_spte));
487 510
488 if (!is_shadow_present_pte(old_spte)) 511 if (!is_shadow_present_pte(old_spte)) {
489 return mmu_spte_set(sptep, new_spte); 512 mmu_spte_set(sptep, new_spte);
490 513 return ret;
491 new_spte |= old_spte & shadow_dirty_mask; 514 }
492
493 mask = shadow_accessed_mask;
494 if (is_writable_pte(old_spte))
495 mask |= shadow_dirty_mask;
496 515
497 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) 516 if (!spte_has_volatile_bits(old_spte))
498 __update_clear_spte_fast(sptep, new_spte); 517 __update_clear_spte_fast(sptep, new_spte);
499 else 518 else
500 old_spte = __update_clear_spte_slow(sptep, new_spte); 519 old_spte = __update_clear_spte_slow(sptep, new_spte);
501 520
521 /*
522 * Updating the spte out of mmu-lock is safe, since
523 * we always atomically update it; see the comments in
524 * spte_has_volatile_bits().
525 */
526 if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
527 ret = true;
528
502 if (!shadow_accessed_mask) 529 if (!shadow_accessed_mask)
503 return; 530 return ret;
504 531
505 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) 532 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
506 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 533 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
507 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) 534 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
508 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 535 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
536
537 return ret;
509} 538}
510 539
511/* 540/*
@@ -652,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
652 mmu_page_header_cache); 681 mmu_page_header_cache);
653} 682}
654 683
655static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, 684static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
656 size_t size)
657{ 685{
658 void *p; 686 void *p;
659 687
@@ -664,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
664 692
665static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) 693static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
666{ 694{
667 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, 695 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
668 sizeof(struct pte_list_desc));
669} 696}
670 697
671static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) 698static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
@@ -1051,35 +1078,82 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
1051 rmap_remove(kvm, sptep); 1078 rmap_remove(kvm, sptep);
1052} 1079}
1053 1080
1054static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) 1081
1082static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1083{
1084 if (is_large_pte(*sptep)) {
1085 WARN_ON(page_header(__pa(sptep))->role.level ==
1086 PT_PAGE_TABLE_LEVEL);
1087 drop_spte(kvm, sptep);
1088 --kvm->stat.lpages;
1089 return true;
1090 }
1091
1092 return false;
1093}
1094
1095static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1096{
1097 if (__drop_large_spte(vcpu->kvm, sptep))
1098 kvm_flush_remote_tlbs(vcpu->kvm);
1099}
1100
1101/*
1102 * Write-protect the specified @sptep; @pt_protect indicates whether the
1103 * spte write protection is caused by protecting the shadow page table.
1104 * @flush indicates whether the TLB needs to be flushed.
1105 *
1106 * Note: write protection differs between dirty logging and spte
1107 * protection:
1108 * - for dirty logging, the spte can be made writable at any time if
1109 * its dirty bitmap is properly set.
1110 * - for spte protection, the spte can be made writable only after
1111 * unsync-ing the shadow page.
1112 *
1113 * Return true if the spte is dropped.
1114 */
1115static bool
1116spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
1117{
1118 u64 spte = *sptep;
1119
1120 if (!is_writable_pte(spte) &&
1121 !(pt_protect && spte_is_locklessly_modifiable(spte)))
1122 return false;
1123
1124 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1125
1126 if (__drop_large_spte(kvm, sptep)) {
1127 *flush |= true;
1128 return true;
1129 }
1130
1131 if (pt_protect)
1132 spte &= ~SPTE_MMU_WRITEABLE;
1133 spte = spte & ~PT_WRITABLE_MASK;
1134
1135 *flush |= mmu_spte_update(sptep, spte);
1136 return false;
1137}
1138
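spte_write_protect() clears the hardware W bit and, when the protection is for the shadow page table rather than dirty logging, also the software "locklessly modifiable" marker, accumulating a flush hint for the caller. A simplified sketch of that bit handling (it omits the large-spte drop path; names and bit positions are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_WRITABLE    (1ULL << 1)	/* illustrative hardware W bit */
#define MMU_WRITEABLE  (1ULL << 10)	/* illustrative software bit: may be set locklessly */

static bool write_protect(uint64_t *sptep, bool *flush, bool pt_protect)
{
	uint64_t spte = *sptep;

	/* Nothing to do if the entry is neither writable nor lockless-writable. */
	if (!(spte & PT_WRITABLE) && !(pt_protect && (spte & MMU_WRITEABLE)))
		return false;

	*flush |= (spte & PT_WRITABLE) != 0;	/* writable -> read-only needs a flush */

	if (pt_protect)
		spte &= ~MMU_WRITEABLE;		/* also forbid the lockless fast path */
	spte &= ~PT_WRITABLE;

	*sptep = spte;
	return true;				/* here: "entry was changed" */
}

int main(void)
{
	uint64_t spte = PT_WRITABLE | MMU_WRITEABLE;
	bool flush = false;

	write_protect(&spte, &flush, true);
	printf("spte=%#llx flush=%d\n", (unsigned long long)spte, flush);
	return 0;
}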
1139static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1140 int level, bool pt_protect)
1055{ 1141{
1056 u64 *sptep; 1142 u64 *sptep;
1057 struct rmap_iterator iter; 1143 struct rmap_iterator iter;
1058 int write_protected = 0; 1144 bool flush = false;
1059 1145
1060 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { 1146 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1061 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1147 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1062 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); 1148 if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
1063
1064 if (!is_writable_pte(*sptep)) {
1065 sptep = rmap_get_next(&iter);
1066 continue;
1067 }
1068
1069 if (level == PT_PAGE_TABLE_LEVEL) {
1070 mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
1071 sptep = rmap_get_next(&iter);
1072 } else {
1073 BUG_ON(!is_large_pte(*sptep));
1074 drop_spte(kvm, sptep);
1075 --kvm->stat.lpages;
1076 sptep = rmap_get_first(*rmapp, &iter); 1149 sptep = rmap_get_first(*rmapp, &iter);
1150 continue;
1077 } 1151 }
1078 1152
1079 write_protected = 1; 1153 sptep = rmap_get_next(&iter);
1080 } 1154 }
1081 1155
1082 return write_protected; 1156 return flush;
1083} 1157}
1084 1158
1085/** 1159/**
@@ -1100,26 +1174,26 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1100 1174
1101 while (mask) { 1175 while (mask) {
1102 rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; 1176 rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
1103 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); 1177 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
1104 1178
1105 /* clear the first set bit */ 1179 /* clear the first set bit */
1106 mask &= mask - 1; 1180 mask &= mask - 1;
1107 } 1181 }
1108} 1182}
1109 1183
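kvm_mmu_write_protect_pt_masked() walks the set bits of a dirty-bitmap word with __ffs() and the mask &= mask - 1 idiom. The same iteration pattern in a standalone sketch, using a compiler builtin in place of __ffs():

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mask = 0x8442;		/* example dirty-bitmap word */

	while (mask) {
		unsigned bit = __builtin_ctzll(mask);	/* like __ffs(): lowest set bit */
		printf("write-protect gfn offset %u\n", bit);
		mask &= mask - 1;			/* clear the lowest set bit */
	}
	return 0;
}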
1110static int rmap_write_protect(struct kvm *kvm, u64 gfn) 1184static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1111{ 1185{
1112 struct kvm_memory_slot *slot; 1186 struct kvm_memory_slot *slot;
1113 unsigned long *rmapp; 1187 unsigned long *rmapp;
1114 int i; 1188 int i;
1115 int write_protected = 0; 1189 bool write_protected = false;
1116 1190
1117 slot = gfn_to_memslot(kvm, gfn); 1191 slot = gfn_to_memslot(kvm, gfn);
1118 1192
1119 for (i = PT_PAGE_TABLE_LEVEL; 1193 for (i = PT_PAGE_TABLE_LEVEL;
1120 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 1194 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
1121 rmapp = __gfn_to_rmap(gfn, i, slot); 1195 rmapp = __gfn_to_rmap(gfn, i, slot);
1122 write_protected |= __rmap_write_protect(kvm, rmapp, i); 1196 write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
1123 } 1197 }
1124 1198
1125 return write_protected; 1199 return write_protected;
@@ -1238,11 +1312,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1238 unsigned long data) 1312 unsigned long data)
1239{ 1313{
1240 u64 *sptep; 1314 u64 *sptep;
1241 struct rmap_iterator iter; 1315 struct rmap_iterator uninitialized_var(iter);
1242 int young = 0; 1316 int young = 0;
1243 1317
1244 /* 1318 /*
1245 * Emulate the accessed bit for EPT, by checking if this page has 1319 * If EPT Accessed and Dirty bits are not supported,
1320 * emulate the accessed bit for EPT by checking if this page has
1246 * an EPT mapping, and clearing it if it does. On the next access, 1321 * an EPT mapping, and clearing it if it does. On the next access,
1247 * a new EPT mapping will be established. 1322 * a new EPT mapping will be established.
1248 * This has some overhead, but not as much as the cost of swapping 1323 * This has some overhead, but not as much as the cost of swapping
@@ -1253,11 +1328,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1253 1328
1254 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1329 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1255 sptep = rmap_get_next(&iter)) { 1330 sptep = rmap_get_next(&iter)) {
1256 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1331 BUG_ON(!is_shadow_present_pte(*sptep));
1257 1332
1258 if (*sptep & PT_ACCESSED_MASK) { 1333 if (*sptep & shadow_accessed_mask) {
1259 young = 1; 1334 young = 1;
1260 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); 1335 clear_bit((ffs(shadow_accessed_mask) - 1),
1336 (unsigned long *)sptep);
1261 } 1337 }
1262 } 1338 }
1263 1339
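The aging path no longer hard-codes PT_ACCESSED_SHIFT; it derives the bit to clear from shadow_accessed_mask with ffs() - 1, so the same code works whichever bit the mask selects. A sketch of the idiom, with a hypothetical mask value:

#include <stdint.h>
#include <stdio.h>

#define ACCESSED_MASK (1ULL << 8)	/* hypothetical position; shadow and EPT differ */

int main(void)
{
	uint64_t spte = 0x1000 | ACCESSED_MASK;
	int young = 0;

	if (spte & ACCESSED_MASK) {
		young = 1;
		/* ffs(mask) - 1 gives the bit number to clear, wherever the mask puts it */
		spte &= ~(1ULL << (__builtin_ffsll(ACCESSED_MASK) - 1));
	}
	printf("young=%d spte=%#llx\n", young, (unsigned long long)spte);
	return 0;
}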
@@ -1281,9 +1357,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1281 1357
1282 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1358 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1283 sptep = rmap_get_next(&iter)) { 1359 sptep = rmap_get_next(&iter)) {
1284 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1360 BUG_ON(!is_shadow_present_pte(*sptep));
1285 1361
1286 if (*sptep & PT_ACCESSED_MASK) { 1362 if (*sptep & shadow_accessed_mask) {
1287 young = 1; 1363 young = 1;
1288 break; 1364 break;
1289 } 1365 }
@@ -1401,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1401 u64 *parent_pte, int direct) 1477 u64 *parent_pte, int direct)
1402{ 1478{
1403 struct kvm_mmu_page *sp; 1479 struct kvm_mmu_page *sp;
1404 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, 1480 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
1405 sizeof *sp); 1481 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1406 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1407 if (!direct) 1482 if (!direct)
1408 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, 1483 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1409 PAGE_SIZE);
1410 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 1484 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1411 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 1485 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1412 bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); 1486 bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
@@ -1701,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1701 1775
1702 kvm_mmu_pages_init(parent, &parents, &pages); 1776 kvm_mmu_pages_init(parent, &parents, &pages);
1703 while (mmu_unsync_walk(parent, &pages)) { 1777 while (mmu_unsync_walk(parent, &pages)) {
1704 int protected = 0; 1778 bool protected = false;
1705 1779
1706 for_each_sp(pages, sp, parents, i) 1780 for_each_sp(pages, sp, parents, i)
1707 protected |= rmap_write_protect(vcpu->kvm, sp->gfn); 1781 protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
@@ -1866,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1866 mmu_spte_set(sptep, spte); 1940 mmu_spte_set(sptep, spte);
1867} 1941}
1868 1942
1869static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1870{
1871 if (is_large_pte(*sptep)) {
1872 drop_spte(vcpu->kvm, sptep);
1873 --vcpu->kvm->stat.lpages;
1874 kvm_flush_remote_tlbs(vcpu->kvm);
1875 }
1876}
1877
1878static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, 1943static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1879 unsigned direct_access) 1944 unsigned direct_access)
1880{ 1945{
@@ -2243,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2243 gfn_t gfn, pfn_t pfn, bool speculative, 2308 gfn_t gfn, pfn_t pfn, bool speculative,
2244 bool can_unsync, bool host_writable) 2309 bool can_unsync, bool host_writable)
2245{ 2310{
2246 u64 spte, entry = *sptep; 2311 u64 spte;
2247 int ret = 0; 2312 int ret = 0;
2248 2313
2249 if (set_mmio_spte(sptep, gfn, pfn, pte_access)) 2314 if (set_mmio_spte(sptep, gfn, pfn, pte_access))
@@ -2257,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2257 spte |= shadow_x_mask; 2322 spte |= shadow_x_mask;
2258 else 2323 else
2259 spte |= shadow_nx_mask; 2324 spte |= shadow_nx_mask;
2325
2260 if (pte_access & ACC_USER_MASK) 2326 if (pte_access & ACC_USER_MASK)
2261 spte |= shadow_user_mask; 2327 spte |= shadow_user_mask;
2328
2262 if (level > PT_PAGE_TABLE_LEVEL) 2329 if (level > PT_PAGE_TABLE_LEVEL)
2263 spte |= PT_PAGE_SIZE_MASK; 2330 spte |= PT_PAGE_SIZE_MASK;
2264 if (tdp_enabled) 2331 if (tdp_enabled)
@@ -2283,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2283 goto done; 2350 goto done;
2284 } 2351 }
2285 2352
2286 spte |= PT_WRITABLE_MASK; 2353 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2287 2354
2288 if (!vcpu->arch.mmu.direct_map 2355 if (!vcpu->arch.mmu.direct_map
2289 && !(pte_access & ACC_WRITE_MASK)) { 2356 && !(pte_access & ACC_WRITE_MASK)) {
@@ -2312,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2312 __func__, gfn); 2379 __func__, gfn);
2313 ret = 1; 2380 ret = 1;
2314 pte_access &= ~ACC_WRITE_MASK; 2381 pte_access &= ~ACC_WRITE_MASK;
2315 if (is_writable_pte(spte)) 2382 spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
2316 spte &= ~PT_WRITABLE_MASK;
2317 } 2383 }
2318 } 2384 }
2319 2385
@@ -2321,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2321 mark_page_dirty(vcpu->kvm, gfn); 2387 mark_page_dirty(vcpu->kvm, gfn);
2322 2388
2323set_pte: 2389set_pte:
2324 mmu_spte_update(sptep, spte); 2390 if (mmu_spte_update(sptep, spte))
2325 /*
2326 * If we overwrite a writable spte with a read-only one we
2327 * should flush remote TLBs. Otherwise rmap_write_protect
2328 * will find a read-only spte, even though the writable spte
2329 * might be cached on a CPU's TLB.
2330 */
2331 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2332 kvm_flush_remote_tlbs(vcpu->kvm); 2391 kvm_flush_remote_tlbs(vcpu->kvm);
2333done: 2392done:
2334 return ret; 2393 return ret;
@@ -2403,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2403 2462
2404static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 2463static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2405{ 2464{
2465 mmu_free_roots(vcpu);
2406} 2466}
2407 2467
2408static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2468static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2625,18 +2685,116 @@ exit:
2625 return ret; 2685 return ret;
2626} 2686}
2627 2687
2688static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
2689{
2690 /*
2691 * #PF can be fast only if the shadow page table is present and it
2692 * is caused by write protection; in that case we only need to change the
2693 * W bit of the spte, which can be done out of mmu-lock.
2694 */
2695 if (!(error_code & PFERR_PRESENT_MASK) ||
2696 !(error_code & PFERR_WRITE_MASK))
2697 return false;
2698
2699 return true;
2700}
2701
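page_fault_can_be_fast() admits a fault to the lockless path only when the error code says the mapping is present and the access is a write. A sketch of that gate, using the standard x86 page-fault error-code bit positions (P = bit 0, W/R = bit 1):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PFERR_PRESENT (1u << 0)
#define PFERR_WRITE   (1u << 1)

/* The fault can be handled locklessly only if it is a write to a present mapping. */
static bool fault_can_be_fast(uint32_t error_code)
{
	return (error_code & PFERR_PRESENT) && (error_code & PFERR_WRITE);
}

int main(void)
{
	printf("%d %d\n", fault_can_be_fast(PFERR_PRESENT | PFERR_WRITE),
			  fault_can_be_fast(PFERR_WRITE));	/* not present: slow path */
	return 0;
}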
2702static bool
2703fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
2704{
2705 struct kvm_mmu_page *sp = page_header(__pa(sptep));
2706 gfn_t gfn;
2707
2708 WARN_ON(!sp->role.direct);
2709
2710 /*
2711 * The gfn of a direct spte is stable since it is derived
2712 * from sp->gfn.
2713 */
2714 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
2715
2716 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
2717 mark_page_dirty(vcpu->kvm, gfn);
2718
2719 return true;
2720}
2721
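fast_pf_fix_direct_spte() relies on cmpxchg64() so the W bit is set only if the spte still holds the value that was read without mmu-lock; otherwise the vcpu simply retries the fault. A userspace sketch of the same pattern with C11 atomics (bit position illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_WRITABLE (1ULL << 1)

/* Set the W bit only if *sptep still equals the value read without the lock. */
static bool fix_spte(_Atomic uint64_t *sptep, uint64_t expected)
{
	uint64_t desired = expected | PT_WRITABLE;

	return atomic_compare_exchange_strong(sptep, &expected, desired);
}

int main(void)
{
	_Atomic uint64_t spte = 0x1000;

	if (fix_spte(&spte, 0x1000))	/* on success the caller marks the page dirty */
		printf("fixed: %#llx\n", (unsigned long long)atomic_load(&spte));
	return 0;
}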
2722/*
2723 * Return value:
2724 * - true: let the vcpu access the same address again.
2725 * - false: let the real page fault path fix it.
2726 */
2727static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2728 u32 error_code)
2729{
2730 struct kvm_shadow_walk_iterator iterator;
2731 bool ret = false;
2732 u64 spte = 0ull;
2733
2734 if (!page_fault_can_be_fast(vcpu, error_code))
2735 return false;
2736
2737 walk_shadow_page_lockless_begin(vcpu);
2738 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
2739 if (!is_shadow_present_pte(spte) || iterator.level < level)
2740 break;
2741
2742 /*
2743 * If the mapping has been changed, let the vcpu fault on the
2744 * same address again.
2745 */
2746 if (!is_rmap_spte(spte)) {
2747 ret = true;
2748 goto exit;
2749 }
2750
2751 if (!is_last_spte(spte, level))
2752 goto exit;
2753
2754 /*
2755 * Check whether this is a spurious fault caused by a lazily flushed TLB.
2756 *
2757 * There is no need to check the access bits of upper-level table entries
2758 * since they are always ACC_ALL.
2759 */
2760 if (is_writable_pte(spte)) {
2761 ret = true;
2762 goto exit;
2763 }
2764
2765 /*
2766 * Currently, to simplify the code, only a spte write-protected
2767 * by dirty logging can be fixed on the fast path.
2768 */
2769 if (!spte_is_locklessly_modifiable(spte))
2770 goto exit;
2771
2772 /*
2773 * Currently, fast page fault only works for direct mappings since
2774 * the gfn is not stable for indirect shadow pages.
2775 * See Documentation/virtual/kvm/locking.txt for more detail.
2776 */
2777 ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
2778exit:
2779 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
2780 spte, ret);
2781 walk_shadow_page_lockless_end(vcpu);
2782
2783 return ret;
2784}
2785
2628static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2786static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2629 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2787 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2630 2788
2631static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, 2789static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2632 bool prefault) 2790 gfn_t gfn, bool prefault)
2633{ 2791{
2634 int r; 2792 int r;
2635 int level; 2793 int level;
2636 int force_pt_level; 2794 int force_pt_level;
2637 pfn_t pfn; 2795 pfn_t pfn;
2638 unsigned long mmu_seq; 2796 unsigned long mmu_seq;
2639 bool map_writable; 2797 bool map_writable, write = error_code & PFERR_WRITE_MASK;
2640 2798
2641 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); 2799 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2642 if (likely(!force_pt_level)) { 2800 if (likely(!force_pt_level)) {
@@ -2653,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2653 } else 2811 } else
2654 level = PT_PAGE_TABLE_LEVEL; 2812 level = PT_PAGE_TABLE_LEVEL;
2655 2813
2814 if (fast_page_fault(vcpu, v, level, error_code))
2815 return 0;
2816
2656 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2817 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2657 smp_rmb(); 2818 smp_rmb();
2658 2819
@@ -3041,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3041 gfn = gva >> PAGE_SHIFT; 3202 gfn = gva >> PAGE_SHIFT;
3042 3203
3043 return nonpaging_map(vcpu, gva & PAGE_MASK, 3204 return nonpaging_map(vcpu, gva & PAGE_MASK,
3044 error_code & PFERR_WRITE_MASK, gfn, prefault); 3205 error_code, gfn, prefault);
3045} 3206}
3046 3207
3047static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) 3208static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3121,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3121 } else 3282 } else
3122 level = PT_PAGE_TABLE_LEVEL; 3283 level = PT_PAGE_TABLE_LEVEL;
3123 3284
3285 if (fast_page_fault(vcpu, gpa, level, error_code))
3286 return 0;
3287
3124 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3288 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3125 smp_rmb(); 3289 smp_rmb();
3126 3290
@@ -3885,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3885void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 4049void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3886{ 4050{
3887 struct kvm_mmu_page *sp; 4051 struct kvm_mmu_page *sp;
4052 bool flush = false;
3888 4053
3889 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 4054 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
3890 int i; 4055 int i;
@@ -3899,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3899 !is_last_spte(pt[i], sp->role.level)) 4064 !is_last_spte(pt[i], sp->role.level))
3900 continue; 4065 continue;
3901 4066
3902 if (is_large_pte(pt[i])) { 4067 spte_write_protect(kvm, &pt[i], &flush, false);
3903 drop_spte(kvm, &pt[i]);
3904 --kvm->stat.lpages;
3905 continue;
3906 }
3907
3908 /* avoid RMW */
3909 if (is_writable_pte(pt[i]))
3910 mmu_spte_update(&pt[i],
3911 pt[i] & ~PT_WRITABLE_MASK);
3912 } 4068 }
3913 } 4069 }
3914 kvm_flush_remote_tlbs(kvm); 4070 kvm_flush_remote_tlbs(kvm);
@@ -3945,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3945static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) 4101static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3946{ 4102{
3947 struct kvm *kvm; 4103 struct kvm *kvm;
3948 struct kvm *kvm_freed = NULL;
3949 int nr_to_scan = sc->nr_to_scan; 4104 int nr_to_scan = sc->nr_to_scan;
3950 4105
3951 if (nr_to_scan == 0) 4106 if (nr_to_scan == 0)
@@ -3957,22 +4112,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3957 int idx; 4112 int idx;
3958 LIST_HEAD(invalid_list); 4113 LIST_HEAD(invalid_list);
3959 4114
4115 /*
4116 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
4117 * here. We may skip a VM instance erroneously, but we do not
4118 * want to shrink a VM that has only started to populate its MMU
4119 * anyway.
4120 */
4121 if (kvm->arch.n_used_mmu_pages > 0) {
4122 if (!nr_to_scan--)
4123 break;
4124 continue;
4125 }
4126
3960 idx = srcu_read_lock(&kvm->srcu); 4127 idx = srcu_read_lock(&kvm->srcu);
3961 spin_lock(&kvm->mmu_lock); 4128 spin_lock(&kvm->mmu_lock);
3962 if (!kvm_freed && nr_to_scan > 0 &&
3963 kvm->arch.n_used_mmu_pages > 0) {
3964 kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3965 &invalid_list);
3966 kvm_freed = kvm;
3967 }
3968 nr_to_scan--;
3969 4129
4130 kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
3970 kvm_mmu_commit_zap_page(kvm, &invalid_list); 4131 kvm_mmu_commit_zap_page(kvm, &invalid_list);
4132
3971 spin_unlock(&kvm->mmu_lock); 4133 spin_unlock(&kvm->mmu_lock);
3972 srcu_read_unlock(&kvm->srcu, idx); 4134 srcu_read_unlock(&kvm->srcu, idx);
4135
4136 list_move_tail(&kvm->vm_list, &vm_list);
4137 break;
3973 } 4138 }
3974 if (kvm_freed)
3975 list_move_tail(&kvm_freed->vm_list, &vm_list);
3976 4139
3977 raw_spin_unlock(&kvm_lock); 4140 raw_spin_unlock(&kvm_lock);
3978 4141
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 89fb0e81322a..cd6e98333ba3 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -54,8 +54,8 @@
54 */ 54 */
55TRACE_EVENT( 55TRACE_EVENT(
56 kvm_mmu_pagetable_walk, 56 kvm_mmu_pagetable_walk,
57 TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), 57 TP_PROTO(u64 addr, u32 pferr),
58 TP_ARGS(addr, write_fault, user_fault, fetch_fault), 58 TP_ARGS(addr, pferr),
59 59
60 TP_STRUCT__entry( 60 TP_STRUCT__entry(
61 __field(__u64, addr) 61 __field(__u64, addr)
@@ -64,8 +64,7 @@ TRACE_EVENT(
64 64
65 TP_fast_assign( 65 TP_fast_assign(
66 __entry->addr = addr; 66 __entry->addr = addr;
67 __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) 67 __entry->pferr = pferr;
68 | (!!fetch_fault << 4);
69 ), 68 ),
70 69
71 TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, 70 TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
@@ -243,6 +242,44 @@ TRACE_EVENT(
243 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, 242 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
244 __entry->access) 243 __entry->access)
245); 244);
245
246#define __spte_satisfied(__spte) \
247 (__entry->retry && is_writable_pte(__entry->__spte))
248
249TRACE_EVENT(
250 fast_page_fault,
251 TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
252 u64 *sptep, u64 old_spte, bool retry),
253 TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
254
255 TP_STRUCT__entry(
256 __field(int, vcpu_id)
257 __field(gva_t, gva)
258 __field(u32, error_code)
259 __field(u64 *, sptep)
260 __field(u64, old_spte)
261 __field(u64, new_spte)
262 __field(bool, retry)
263 ),
264
265 TP_fast_assign(
266 __entry->vcpu_id = vcpu->vcpu_id;
267 __entry->gva = gva;
268 __entry->error_code = error_code;
269 __entry->sptep = sptep;
270 __entry->old_spte = old_spte;
271 __entry->new_spte = *sptep;
272 __entry->retry = retry;
273 ),
274
275 TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
276 " new %llx spurious %d fixed %d", __entry->vcpu_id,
277 __entry->gva, __print_flags(__entry->error_code, "|",
278 kvm_mmu_trace_pferr_flags), __entry->sptep,
279 __entry->old_spte, __entry->new_spte,
280 __spte_satisfied(old_spte), __spte_satisfied(new_spte)
281 )
282);
246#endif /* _TRACE_KVMMMU_H */ 283#endif /* _TRACE_KVMMMU_H */
247 284
248#undef TRACE_INCLUDE_PATH 285#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 34f970937ef1..bb7cf01cae76 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
154 const int fetch_fault = access & PFERR_FETCH_MASK; 154 const int fetch_fault = access & PFERR_FETCH_MASK;
155 u16 errcode = 0; 155 u16 errcode = 0;
156 156
157 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 157 trace_kvm_mmu_pagetable_walk(addr, access);
158 fetch_fault);
159retry_walk: 158retry_walk:
160 eperm = false; 159 eperm = false;
161 walker->level = mmu->root_level; 160 walker->level = mmu->root_level;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 2e88438ffd83..9b7ec1150ab0 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -80,10 +80,10 @@ static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
80 80
81static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) 81static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
82{ 82{
83 if (idx < X86_PMC_IDX_FIXED) 83 if (idx < INTEL_PMC_IDX_FIXED)
84 return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); 84 return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
85 else 85 else
86 return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); 86 return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED);
87} 87}
88 88
89void kvm_deliver_pmi(struct kvm_vcpu *vcpu) 89void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
@@ -291,7 +291,7 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx)
291 if (pmc_is_gp(pmc)) 291 if (pmc_is_gp(pmc))
292 reprogram_gp_counter(pmc, pmc->eventsel); 292 reprogram_gp_counter(pmc, pmc->eventsel);
293 else { 293 else {
294 int fidx = idx - X86_PMC_IDX_FIXED; 294 int fidx = idx - INTEL_PMC_IDX_FIXED;
295 reprogram_fixed_counter(pmc, 295 reprogram_fixed_counter(pmc,
296 fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); 296 fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
297 } 297 }
@@ -452,7 +452,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
452 return; 452 return;
453 453
454 pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, 454 pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
455 X86_PMC_MAX_GENERIC); 455 INTEL_PMC_MAX_GENERIC);
456 pmu->counter_bitmask[KVM_PMC_GP] = 456 pmu->counter_bitmask[KVM_PMC_GP] =
457 ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; 457 ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
458 bitmap_len = (entry->eax >> 24) & 0xff; 458 bitmap_len = (entry->eax >> 24) & 0xff;
@@ -462,13 +462,13 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
462 pmu->nr_arch_fixed_counters = 0; 462 pmu->nr_arch_fixed_counters = 0;
463 } else { 463 } else {
464 pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), 464 pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
465 X86_PMC_MAX_FIXED); 465 INTEL_PMC_MAX_FIXED);
466 pmu->counter_bitmask[KVM_PMC_FIXED] = 466 pmu->counter_bitmask[KVM_PMC_FIXED] =
467 ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; 467 ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
468 } 468 }
469 469
470 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | 470 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
471 (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); 471 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
472 pmu->global_ctrl_mask = ~pmu->global_ctrl; 472 pmu->global_ctrl_mask = ~pmu->global_ctrl;
473} 473}
474 474
@@ -478,15 +478,15 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
478 struct kvm_pmu *pmu = &vcpu->arch.pmu; 478 struct kvm_pmu *pmu = &vcpu->arch.pmu;
479 479
480 memset(pmu, 0, sizeof(*pmu)); 480 memset(pmu, 0, sizeof(*pmu));
481 for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { 481 for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
482 pmu->gp_counters[i].type = KVM_PMC_GP; 482 pmu->gp_counters[i].type = KVM_PMC_GP;
483 pmu->gp_counters[i].vcpu = vcpu; 483 pmu->gp_counters[i].vcpu = vcpu;
484 pmu->gp_counters[i].idx = i; 484 pmu->gp_counters[i].idx = i;
485 } 485 }
486 for (i = 0; i < X86_PMC_MAX_FIXED; i++) { 486 for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
487 pmu->fixed_counters[i].type = KVM_PMC_FIXED; 487 pmu->fixed_counters[i].type = KVM_PMC_FIXED;
488 pmu->fixed_counters[i].vcpu = vcpu; 488 pmu->fixed_counters[i].vcpu = vcpu;
489 pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; 489 pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
490 } 490 }
491 init_irq_work(&pmu->irq_work, trigger_pmi); 491 init_irq_work(&pmu->irq_work, trigger_pmi);
492 kvm_pmu_cpuid_update(vcpu); 492 kvm_pmu_cpuid_update(vcpu);
@@ -498,13 +498,13 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)
498 int i; 498 int i;
499 499
500 irq_work_sync(&pmu->irq_work); 500 irq_work_sync(&pmu->irq_work);
501 for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { 501 for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
502 struct kvm_pmc *pmc = &pmu->gp_counters[i]; 502 struct kvm_pmc *pmc = &pmu->gp_counters[i];
503 stop_counter(pmc); 503 stop_counter(pmc);
504 pmc->counter = pmc->eventsel = 0; 504 pmc->counter = pmc->eventsel = 0;
505 } 505 }
506 506
507 for (i = 0; i < X86_PMC_MAX_FIXED; i++) 507 for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
508 stop_counter(&pmu->fixed_counters[i]); 508 stop_counter(&pmu->fixed_counters[i]);
509 509
510 pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 510 pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f75af406b268..baead950d6c8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3185,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3185 break; 3185 break;
3186 case MSR_IA32_DEBUGCTLMSR: 3186 case MSR_IA32_DEBUGCTLMSR:
3187 if (!boot_cpu_has(X86_FEATURE_LBRV)) { 3187 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3188 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 3188 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3189 __func__, data); 3189 __func__, data);
3190 break; 3190 break;
3191 } 3191 }
3192 if (data & DEBUGCTL_RESERVED_BITS) 3192 if (data & DEBUGCTL_RESERVED_BITS)
@@ -3205,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3205 case MSR_VM_CR: 3205 case MSR_VM_CR:
3206 return svm_set_vm_cr(vcpu, data); 3206 return svm_set_vm_cr(vcpu, data);
3207 case MSR_VM_IGNNE: 3207 case MSR_VM_IGNNE:
3208 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3208 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3209 break; 3209 break;
3210 default: 3210 default:
3211 return kvm_set_msr_common(vcpu, ecx, data); 3211 return kvm_set_msr_common(vcpu, ecx, data);
@@ -4044,6 +4044,11 @@ static bool svm_rdtscp_supported(void)
4044 return false; 4044 return false;
4045} 4045}
4046 4046
4047static bool svm_invpcid_supported(void)
4048{
4049 return false;
4050}
4051
4047static bool svm_has_wbinvd_exit(void) 4052static bool svm_has_wbinvd_exit(void)
4048{ 4053{
4049 return true; 4054 return true;
@@ -4312,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4312 .cpuid_update = svm_cpuid_update, 4317 .cpuid_update = svm_cpuid_update,
4313 4318
4314 .rdtscp_supported = svm_rdtscp_supported, 4319 .rdtscp_supported = svm_rdtscp_supported,
4320 .invpcid_supported = svm_invpcid_supported,
4315 4321
4316 .set_supported_cpuid = svm_set_supported_cpuid, 4322 .set_supported_cpuid = svm_set_supported_cpuid,
4317 4323
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 911d2641f14c..a71faf727ff3 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq,
517 __entry->coalesced ? " (coalesced)" : "") 517 __entry->coalesced ? " (coalesced)" : "")
518); 518);
519 519
520TRACE_EVENT(kvm_eoi,
521 TP_PROTO(struct kvm_lapic *apic, int vector),
522 TP_ARGS(apic, vector),
523
524 TP_STRUCT__entry(
525 __field( __u32, apicid )
526 __field( int, vector )
527 ),
528
529 TP_fast_assign(
530 __entry->apicid = apic->vcpu->vcpu_id;
531 __entry->vector = vector;
532 ),
533
534 TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
535);
536
537TRACE_EVENT(kvm_pv_eoi,
538 TP_PROTO(struct kvm_lapic *apic, int vector),
539 TP_ARGS(apic, vector),
540
541 TP_STRUCT__entry(
542 __field( __u32, apicid )
543 __field( int, vector )
544 ),
545
546 TP_fast_assign(
547 __entry->apicid = apic->vcpu->vcpu_id;
548 __entry->vector = vector;
549 ),
550
551 TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
552);
553
520/* 554/*
521 * Tracepoint for nested VMRUN 555 * Tracepoint for nested VMRUN
522 */ 556 */
@@ -710,16 +744,6 @@ TRACE_EVENT(kvm_skinit,
710 __entry->rip, __entry->slb) 744 __entry->rip, __entry->slb)
711); 745);
712 746
713#define __print_insn(insn, ilen) ({ \
714 int i; \
715 const char *ret = p->buffer + p->len; \
716 \
717 for (i = 0; i < ilen; ++i) \
718 trace_seq_printf(p, " %02x", insn[i]); \
719 trace_seq_printf(p, "%c", 0); \
720 ret; \
721 })
722
723#define KVM_EMUL_INSN_F_CR0_PE (1 << 0) 747#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
724#define KVM_EMUL_INSN_F_EFL_VM (1 << 1) 748#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
725#define KVM_EMUL_INSN_F_CS_D (1 << 2) 749#define KVM_EMUL_INSN_F_CS_D (1 << 2)
@@ -786,7 +810,7 @@ TRACE_EVENT(kvm_emulate_insn,
786 810
787 TP_printk("%x:%llx:%s (%s)%s", 811 TP_printk("%x:%llx:%s (%s)%s",
788 __entry->csbase, __entry->rip, 812 __entry->csbase, __entry->rip,
789 __print_insn(__entry->insn, __entry->len), 813 __print_hex(__entry->insn, __entry->len),
790 __print_symbolic(__entry->flags, 814 __print_symbolic(__entry->flags,
791 kvm_trace_symbol_emul_flags), 815 kvm_trace_symbol_emul_flags),
792 __entry->failed ? " failed" : "" 816 __entry->failed ? " failed" : ""
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 32eb58866292..c39b60707e02 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,7 +71,10 @@ static bool __read_mostly enable_unrestricted_guest = 1;
71module_param_named(unrestricted_guest, 71module_param_named(unrestricted_guest,
72 enable_unrestricted_guest, bool, S_IRUGO); 72 enable_unrestricted_guest, bool, S_IRUGO);
73 73
74static bool __read_mostly emulate_invalid_guest_state = 0; 74static bool __read_mostly enable_ept_ad_bits = 1;
75module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
76
77static bool __read_mostly emulate_invalid_guest_state = true;
75module_param(emulate_invalid_guest_state, bool, S_IRUGO); 78module_param(emulate_invalid_guest_state, bool, S_IRUGO);
76 79
77static bool __read_mostly vmm_exclusive = 1; 80static bool __read_mostly vmm_exclusive = 1;
@@ -615,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr);
615static void kvm_cpu_vmxoff(void); 618static void kvm_cpu_vmxoff(void);
616static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 619static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
617static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 620static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
621static void vmx_set_segment(struct kvm_vcpu *vcpu,
622 struct kvm_segment *var, int seg);
623static void vmx_get_segment(struct kvm_vcpu *vcpu,
624 struct kvm_segment *var, int seg);
618 625
619static DEFINE_PER_CPU(struct vmcs *, vmxarea); 626static DEFINE_PER_CPU(struct vmcs *, vmxarea);
620static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 627static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -789,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)
789 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; 796 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
790} 797}
791 798
799static inline bool cpu_has_vmx_ept_ad_bits(void)
800{
801 return vmx_capability.ept & VMX_EPT_AD_BIT;
802}
803
792static inline bool cpu_has_vmx_invept_individual_addr(void) 804static inline bool cpu_has_vmx_invept_individual_addr(void)
793{ 805{
794 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; 806 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -849,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void)
849 SECONDARY_EXEC_RDTSCP; 861 SECONDARY_EXEC_RDTSCP;
850} 862}
851 863
864static inline bool cpu_has_vmx_invpcid(void)
865{
866 return vmcs_config.cpu_based_2nd_exec_ctrl &
867 SECONDARY_EXEC_ENABLE_INVPCID;
868}
869
852static inline bool cpu_has_virtual_nmis(void) 870static inline bool cpu_has_virtual_nmis(void)
853{ 871{
854 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 872 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -1739,6 +1757,11 @@ static bool vmx_rdtscp_supported(void)
1739 return cpu_has_vmx_rdtscp(); 1757 return cpu_has_vmx_rdtscp();
1740} 1758}
1741 1759
1760static bool vmx_invpcid_supported(void)
1761{
1762 return cpu_has_vmx_invpcid() && enable_ept;
1763}
1764
1742/* 1765/*
1743 * Swap MSR entry in host/guest MSR entry array. 1766 * Swap MSR entry in host/guest MSR entry array.
1744 */ 1767 */
@@ -2458,7 +2481,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2458 SECONDARY_EXEC_ENABLE_EPT | 2481 SECONDARY_EXEC_ENABLE_EPT |
2459 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2482 SECONDARY_EXEC_UNRESTRICTED_GUEST |
2460 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 2483 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2461 SECONDARY_EXEC_RDTSCP; 2484 SECONDARY_EXEC_RDTSCP |
2485 SECONDARY_EXEC_ENABLE_INVPCID;
2462 if (adjust_vmx_controls(min2, opt2, 2486 if (adjust_vmx_controls(min2, opt2,
2463 MSR_IA32_VMX_PROCBASED_CTLS2, 2487 MSR_IA32_VMX_PROCBASED_CTLS2,
2464 &_cpu_based_2nd_exec_control) < 0) 2488 &_cpu_based_2nd_exec_control) < 0)
@@ -2645,8 +2669,12 @@ static __init int hardware_setup(void)
2645 !cpu_has_vmx_ept_4levels()) { 2669 !cpu_has_vmx_ept_4levels()) {
2646 enable_ept = 0; 2670 enable_ept = 0;
2647 enable_unrestricted_guest = 0; 2671 enable_unrestricted_guest = 0;
2672 enable_ept_ad_bits = 0;
2648 } 2673 }
2649 2674
2675 if (!cpu_has_vmx_ept_ad_bits())
2676 enable_ept_ad_bits = 0;
2677
2650 if (!cpu_has_vmx_unrestricted_guest()) 2678 if (!cpu_has_vmx_unrestricted_guest())
2651 enable_unrestricted_guest = 0; 2679 enable_unrestricted_guest = 0;
2652 2680
@@ -2770,6 +2798,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2770{ 2798{
2771 unsigned long flags; 2799 unsigned long flags;
2772 struct vcpu_vmx *vmx = to_vmx(vcpu); 2800 struct vcpu_vmx *vmx = to_vmx(vcpu);
2801 struct kvm_segment var;
2773 2802
2774 if (enable_unrestricted_guest) 2803 if (enable_unrestricted_guest)
2775 return; 2804 return;
@@ -2813,20 +2842,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2813 if (emulate_invalid_guest_state) 2842 if (emulate_invalid_guest_state)
2814 goto continue_rmode; 2843 goto continue_rmode;
2815 2844
2816 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); 2845 vmx_get_segment(vcpu, &var, VCPU_SREG_SS);
2817 vmcs_write32(GUEST_SS_LIMIT, 0xffff); 2846 vmx_set_segment(vcpu, &var, VCPU_SREG_SS);
2818 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); 2847
2848 vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
2849 vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
2850
2851 vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
2852 vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
2853
2854 vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
2855 vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
2819 2856
2820 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); 2857 vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
2821 vmcs_write32(GUEST_CS_LIMIT, 0xffff); 2858 vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
2822 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
2823 vmcs_writel(GUEST_CS_BASE, 0xf0000);
2824 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
2825 2859
2826 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); 2860 vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
2827 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); 2861 vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
2828 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
2829 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
2830 2862
2831continue_rmode: 2863continue_rmode:
2832 kvm_mmu_reset_context(vcpu); 2864 kvm_mmu_reset_context(vcpu);
@@ -3027,6 +3059,8 @@ static u64 construct_eptp(unsigned long root_hpa)
3027 /* TODO write the value reading from MSR */ 3059 /* TODO write the value reading from MSR */
3028 eptp = VMX_EPT_DEFAULT_MT | 3060 eptp = VMX_EPT_DEFAULT_MT |
3029 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; 3061 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
3062 if (enable_ept_ad_bits)
3063 eptp |= VMX_EPT_AD_ENABLE_BIT;
3030 eptp |= (root_hpa & PAGE_MASK); 3064 eptp |= (root_hpa & PAGE_MASK);
3031 3065
3032 return eptp; 3066 return eptp;
@@ -3153,11 +3187,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
3153 3187
3154static int vmx_get_cpl(struct kvm_vcpu *vcpu) 3188static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3155{ 3189{
3190 struct vcpu_vmx *vmx = to_vmx(vcpu);
3191
3192 /*
3193 * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
3194 * fail; use the cache instead.
3195 */
3196 if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
3197 return vmx->cpl;
3198 }
3199
3156 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { 3200 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
3157 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3201 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3158 to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); 3202 vmx->cpl = __vmx_get_cpl(vcpu);
3159 } 3203 }
3160 return to_vmx(vcpu)->cpl; 3204
3205 return vmx->cpl;
3161} 3206}
3162 3207
3163 3208
@@ -3165,7 +3210,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
3165{ 3210{
3166 u32 ar; 3211 u32 ar;
3167 3212
3168 if (var->unusable) 3213 if (var->unusable || !var->present)
3169 ar = 1 << 16; 3214 ar = 1 << 16;
3170 else { 3215 else {
3171 ar = var->type & 15; 3216 ar = var->type & 15;
@@ -3177,8 +3222,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
3177 ar |= (var->db & 1) << 14; 3222 ar |= (var->db & 1) << 14;
3178 ar |= (var->g & 1) << 15; 3223 ar |= (var->g & 1) << 15;
3179 } 3224 }
3180 if (ar == 0) /* a 0 value means unusable */
3181 ar = AR_UNUSABLE_MASK;
3182 3225
3183 return ar; 3226 return ar;
3184} 3227}
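vmx_segment_access_rights() now treats a not-present segment the same as an unusable one instead of special-casing ar == 0. A standalone sketch of packing segment attributes into the access-rights word; the field positions are assumed from the usual VMX layout, not from this patch:

#include <stdint.h>
#include <stdio.h>

struct seg {
	unsigned type:4, s:1, dpl:2, present:1, avl:1, l:1, db:1, g:1, unusable:1;
};

static uint32_t access_rights(const struct seg *var)
{
	uint32_t ar;

	if (var->unusable || !var->present)
		return 1u << 16;		/* "segment unusable" encoding */

	ar  = var->type & 15;
	ar |= (var->s & 1) << 4;
	ar |= (var->dpl & 3) << 5;
	ar |= (var->present & 1) << 7;
	ar |= (var->avl & 1) << 12;
	ar |= (var->l & 1) << 13;
	ar |= (var->db & 1) << 14;
	ar |= (var->g & 1) << 15;
	return ar;
}

int main(void)
{
	struct seg cs = { .type = 11, .s = 1, .dpl = 0, .present = 1, .db = 1, .g = 1 };

	printf("ar=%#x\n", access_rights(&cs));
	return 0;
}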
@@ -3229,6 +3272,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3229 3272
3230 vmcs_write32(sf->ar_bytes, ar); 3273 vmcs_write32(sf->ar_bytes, ar);
3231 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3274 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3275
3276 /*
3277 * Fix segments for a real-mode guest on hosts that don't have
3278 * "unrestricted guest" support, or where it is disabled.
3279 * This is done to allow migration of guests from hosts with
3280 * unrestricted guest support, like Westmere, to older hosts that
3281 * lack it, like Nehalem.
3282 */
3283 if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
3284 switch (seg) {
3285 case VCPU_SREG_CS:
3286 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
3287 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
3288 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
3289 vmcs_writel(GUEST_CS_BASE, 0xf0000);
3290 vmcs_write16(GUEST_CS_SELECTOR,
3291 vmcs_readl(GUEST_CS_BASE) >> 4);
3292 break;
3293 case VCPU_SREG_ES:
3294 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
3295 break;
3296 case VCPU_SREG_DS:
3297 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
3298 break;
3299 case VCPU_SREG_GS:
3300 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
3301 break;
3302 case VCPU_SREG_FS:
3303 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
3304 break;
3305 case VCPU_SREG_SS:
3306 vmcs_write16(GUEST_SS_SELECTOR,
3307 vmcs_readl(GUEST_SS_BASE) >> 4);
3308 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
3309 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
3310 break;
3311 }
3312 }
3232} 3313}
3233 3314
3234static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3315static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3731,6 +3812,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3731 if (!enable_ept) { 3812 if (!enable_ept) {
3732 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 3813 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
3733 enable_unrestricted_guest = 0; 3814 enable_unrestricted_guest = 0;
3815 /* Enabling INVPCID for non-EPT guests may cause a performance regression. */
3816 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
3734 } 3817 }
3735 if (!enable_unrestricted_guest) 3818 if (!enable_unrestricted_guest)
3736 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 3819 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -4489,7 +4572,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
4489 break; 4572 break;
4490 } 4573 }
4491 vcpu->run->exit_reason = 0; 4574 vcpu->run->exit_reason = 0;
4492 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 4575 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
4493 (int)(exit_qualification >> 4) & 3, cr); 4576 (int)(exit_qualification >> 4) & 3, cr);
4494 return 0; 4577 return 0;
4495} 4578}
@@ -4769,6 +4852,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
4769{ 4852{
4770 unsigned long exit_qualification; 4853 unsigned long exit_qualification;
4771 gpa_t gpa; 4854 gpa_t gpa;
4855 u32 error_code;
4772 int gla_validity; 4856 int gla_validity;
4773 4857
4774 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4858 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4793,7 +4877,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
4793 4877
4794 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 4878 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
4795 trace_kvm_page_fault(gpa, exit_qualification); 4879 trace_kvm_page_fault(gpa, exit_qualification);
4796 return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); 4880
4881 /* Is it a write fault? */
4882 error_code = exit_qualification & (1U << 1);
4883 /* Is the EPT page-table entry present? */
4884 error_code |= (exit_qualification >> 3) & 0x1;
4885
4886 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
4797} 4887}
4798 4888
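handle_ept_violation() now folds two exit-qualification bits into a page-fault-style error code: bit 1 (write access) lands on the W/R bit and bit 3 (the translation was present) lands on the P bit, which is what the fast page fault path keys on. The decode in isolation:

#include <stdint.h>
#include <stdio.h>

static uint32_t ept_error_code(uint64_t exit_qualification)
{
	uint32_t error_code;

	/* qualification bit 1 (write access) maps onto PFERR_WRITE (bit 1) */
	error_code = exit_qualification & (1u << 1);
	/* qualification bit 3 (translation was present) maps onto PFERR_PRESENT (bit 0) */
	error_code |= (exit_qualification >> 3) & 0x1;
	return error_code;
}

int main(void)
{
	printf("%#x\n", ept_error_code((1u << 1) | (1u << 3)));	/* write to present page */
	return 0;
}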
4799static u64 ept_rsvd_mask(u64 spte, int level) 4889static u64 ept_rsvd_mask(u64 spte, int level)
@@ -4908,15 +4998,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4908 int ret = 1; 4998 int ret = 1;
4909 u32 cpu_exec_ctrl; 4999 u32 cpu_exec_ctrl;
4910 bool intr_window_requested; 5000 bool intr_window_requested;
5001 unsigned count = 130;
4911 5002
4912 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5003 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4913 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; 5004 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
4914 5005
4915 while (!guest_state_valid(vcpu)) { 5006 while (!guest_state_valid(vcpu) && count-- != 0) {
4916 if (intr_window_requested 5007 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
4917 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
4918 return handle_interrupt_window(&vmx->vcpu); 5008 return handle_interrupt_window(&vmx->vcpu);
4919 5009
5010 if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
5011 return 1;
5012
4920 err = emulate_instruction(vcpu, 0); 5013 err = emulate_instruction(vcpu, 0);
4921 5014
4922 if (err == EMULATE_DO_MMIO) { 5015 if (err == EMULATE_DO_MMIO) {
@@ -4924,8 +5017,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4924 goto out; 5017 goto out;
4925 } 5018 }
4926 5019
4927 if (err != EMULATE_DONE) 5020 if (err != EMULATE_DONE) {
5021 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5022 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5023 vcpu->run->internal.ndata = 0;
4928 return 0; 5024 return 0;
5025 }
4929 5026
4930 if (signal_pending(current)) 5027 if (signal_pending(current))
4931 goto out; 5028 goto out;
@@ -4933,7 +5030,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
4933 schedule(); 5030 schedule();
4934 } 5031 }
4935 5032
4936 vmx->emulation_required = 0; 5033 vmx->emulation_required = !guest_state_valid(vcpu);
4937out: 5034out:
4938 return ret; 5035 return ret;
4939} 5036}
@@ -6467,6 +6564,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
6467 } 6564 }
6468 } 6565 }
6469 } 6566 }
6567
6568 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6569 /* Exposing INVPCID only when PCID is exposed */
6570 best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
6571 if (vmx_invpcid_supported() &&
6572 best && (best->ecx & bit(X86_FEATURE_INVPCID)) &&
6573 guest_cpuid_has_pcid(vcpu)) {
6574 exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
6575 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6576 exec_control);
6577 } else {
6578 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
6579 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6580 exec_control);
6581 if (best)
6582 best->ecx &= ~bit(X86_FEATURE_INVPCID);
6583 }
6470} 6584}
6471 6585
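The INVPCID handling in vmx_cpuid_update() follows a common pattern: turn a secondary execution control on only when the host supports the feature and the guest's CPUID exposes its prerequisites, otherwise clear both the control and the guest-visible CPUID bit. A sketch of that decision (bit position illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EXEC_ENABLE_INVPCID (1u << 12)	/* illustrative control-bit position */

static uint32_t update_ctrl(uint32_t exec_control, bool host_has_invpcid,
			    bool guest_has_invpcid, bool guest_has_pcid)
{
	if (host_has_invpcid && guest_has_invpcid && guest_has_pcid)
		exec_control |= EXEC_ENABLE_INVPCID;
	else
		exec_control &= ~EXEC_ENABLE_INVPCID;	/* the CPUID bit is hidden, too */
	return exec_control;
}

int main(void)
{
	printf("%#x\n", update_ctrl(0, true, true, false));	/* PCID hidden: keep it off */
	return 0;
}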
6472static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 6586static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -7201,6 +7315,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7201 .cpuid_update = vmx_cpuid_update, 7315 .cpuid_update = vmx_cpuid_update,
7202 7316
7203 .rdtscp_supported = vmx_rdtscp_supported, 7317 .rdtscp_supported = vmx_rdtscp_supported,
7318 .invpcid_supported = vmx_invpcid_supported,
7204 7319
7205 .set_supported_cpuid = vmx_set_supported_cpuid, 7320 .set_supported_cpuid = vmx_set_supported_cpuid,
7206 7321
@@ -7230,23 +7345,21 @@ static int __init vmx_init(void)
7230 if (!vmx_io_bitmap_a) 7345 if (!vmx_io_bitmap_a)
7231 return -ENOMEM; 7346 return -ENOMEM;
7232 7347
7348 r = -ENOMEM;
7349
7233 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); 7350 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
7234 if (!vmx_io_bitmap_b) { 7351 if (!vmx_io_bitmap_b)
7235 r = -ENOMEM;
7236 goto out; 7352 goto out;
7237 }
7238 7353
7239 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); 7354 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
7240 if (!vmx_msr_bitmap_legacy) { 7355 if (!vmx_msr_bitmap_legacy)
7241 r = -ENOMEM;
7242 goto out1; 7356 goto out1;
7243 } 7357
7244 7358
7245 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 7359 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
7246 if (!vmx_msr_bitmap_longmode) { 7360 if (!vmx_msr_bitmap_longmode)
7247 r = -ENOMEM;
7248 goto out2; 7361 goto out2;
7249 } 7362
7250 7363
7251 /* 7364 /*
7252 * Allow direct access to the PC debug port (it is often used for I/O 7365 * Allow direct access to the PC debug port (it is often used for I/O
@@ -7275,8 +7388,10 @@ static int __init vmx_init(void)
7275 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7388 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
7276 7389
7277 if (enable_ept) { 7390 if (enable_ept) {
7278 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 7391 kvm_mmu_set_mask_ptes(0ull,
7279 VMX_EPT_EXECUTABLE_MASK); 7392 (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
7393 (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
7394 0ull, VMX_EPT_EXECUTABLE_MASK);
7280 ept_set_mmio_spte_mask(); 7395 ept_set_mmio_spte_mask();
7281 kvm_enable_tdp(); 7396 kvm_enable_tdp();
7282 } else 7397 } else
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index be6d54929fa7..59b59508ff07 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
528 return 1; 528 return 1;
529 } 529 }
530 530
531 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
532 return 1;
533
531 kvm_x86_ops->set_cr0(vcpu, cr0); 534 kvm_x86_ops->set_cr0(vcpu, cr0);
532 535
533 if ((cr0 ^ old_cr0) & X86_CR0_PG) { 536 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
@@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
604 kvm_read_cr3(vcpu))) 607 kvm_read_cr3(vcpu)))
605 return 1; 608 return 1;
606 609
610 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
611 if (!guest_cpuid_has_pcid(vcpu))
612 return 1;
613
614 /* PCID cannot be enabled when CR3[11:0] != 000H or EFER.LMA = 0 */
615 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
616 return 1;
617 }
618
607 if (kvm_x86_ops->set_cr4(vcpu, cr4)) 619 if (kvm_x86_ops->set_cr4(vcpu, cr4))
608 return 1; 620 return 1;
609 621
610 if ((cr4 ^ old_cr4) & pdptr_bits) 622 if (((cr4 ^ old_cr4) & pdptr_bits) ||
623 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
611 kvm_mmu_reset_context(vcpu); 624 kvm_mmu_reset_context(vcpu);
612 625
613 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) 626 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
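The new CR4 checks refuse to set PCIDE unless the guest's CPUID advertises PCID, CR3[11:0] is zero, and the vcpu is in long mode. A compact sketch of those conditions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CR4_PCIDE      (1ul << 17)
#define CR3_PCID_MASK  0xfffull

static bool cr4_pcide_ok(uint64_t cr4, uint64_t old_cr4, uint64_t cr3,
			 bool cpuid_has_pcid, bool long_mode)
{
	if ((cr4 & CR4_PCIDE) && !(old_cr4 & CR4_PCIDE)) {
		if (!cpuid_has_pcid)
			return false;
		/* PCIDE can only be set with CR3[11:0] == 0 and long mode active */
		if ((cr3 & CR3_PCID_MASK) || !long_mode)
			return false;
	}
	return true;
}

int main(void)
{
	printf("%d\n", cr4_pcide_ok(CR4_PCIDE, 0, 0x1000, true, true));	/* 1: allowed */
	return 0;
}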
@@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
626 } 639 }
627 640
628 if (is_long_mode(vcpu)) { 641 if (is_long_mode(vcpu)) {
629 if (cr3 & CR3_L_MODE_RESERVED_BITS) 642 if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
630 return 1; 643 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
644 return 1;
645 } else
646 if (cr3 & CR3_L_MODE_RESERVED_BITS)
647 return 1;
631 } else { 648 } else {
632 if (is_pae(vcpu)) { 649 if (is_pae(vcpu)) {
633 if (cr3 & CR3_PAE_RESERVED_BITS) 650 if (cr3 & CR3_PAE_RESERVED_BITS)
@@ -795,6 +812,7 @@ static u32 msrs_to_save[] = {
795 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 812 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
796 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 813 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
797 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 814 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
815 MSR_KVM_PV_EOI_EN,
798 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 816 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
799 MSR_STAR, 817 MSR_STAR,
800#ifdef CONFIG_X86_64 818#ifdef CONFIG_X86_64
@@ -1437,8 +1455,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1437 break; 1455 break;
1438 } 1456 }
1439 default: 1457 default:
1440 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1458 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1441 "data 0x%llx\n", msr, data); 1459 "data 0x%llx\n", msr, data);
1442 return 1; 1460 return 1;
1443 } 1461 }
1444 return 0; 1462 return 0;
@@ -1470,8 +1488,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1470 case HV_X64_MSR_TPR: 1488 case HV_X64_MSR_TPR:
1471 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); 1489 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1472 default: 1490 default:
1473 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1491 vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1474 "data 0x%llx\n", msr, data); 1492 "data 0x%llx\n", msr, data);
1475 return 1; 1493 return 1;
1476 } 1494 }
1477 1495
@@ -1551,15 +1569,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1551 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 1569 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1552 data &= ~(u64)0x8; /* ignore TLB cache disable */ 1570 data &= ~(u64)0x8; /* ignore TLB cache disable */
1553 if (data != 0) { 1571 if (data != 0) {
1554 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1572 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1555 data); 1573 data);
1556 return 1; 1574 return 1;
1557 } 1575 }
1558 break; 1576 break;
1559 case MSR_FAM10H_MMIO_CONF_BASE: 1577 case MSR_FAM10H_MMIO_CONF_BASE:
1560 if (data != 0) { 1578 if (data != 0) {
1561 pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 1579 vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
1562 "0x%llx\n", data); 1580 "0x%llx\n", data);
1563 return 1; 1581 return 1;
1564 } 1582 }
1565 break; 1583 break;
@@ -1574,8 +1592,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1574 thus reserved and should throw a #GP */ 1592 thus reserved and should throw a #GP */
1575 return 1; 1593 return 1;
1576 } 1594 }
1577 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 1595 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
1578 __func__, data); 1596 __func__, data);
1579 break; 1597 break;
1580 case MSR_IA32_UCODE_REV: 1598 case MSR_IA32_UCODE_REV:
1581 case MSR_IA32_UCODE_WRITE: 1599 case MSR_IA32_UCODE_WRITE:
@@ -1653,6 +1671,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1653 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); 1671 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
1654 1672
1655 break; 1673 break;
1674 case MSR_KVM_PV_EOI_EN:
1675 if (kvm_lapic_enable_pv_eoi(vcpu, data))
1676 return 1;
1677 break;
1656 1678
1657 case MSR_IA32_MCG_CTL: 1679 case MSR_IA32_MCG_CTL:
1658 case MSR_IA32_MCG_STATUS: 1680 case MSR_IA32_MCG_STATUS:
@@ -1671,8 +1693,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1671 case MSR_K7_EVNTSEL2: 1693 case MSR_K7_EVNTSEL2:
1672 case MSR_K7_EVNTSEL3: 1694 case MSR_K7_EVNTSEL3:
1673 if (data != 0) 1695 if (data != 0)
1674 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1696 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1675 "0x%x data 0x%llx\n", msr, data); 1697 "0x%x data 0x%llx\n", msr, data);
1676 break; 1698 break;
1677 /* at least RHEL 4 unconditionally writes to the perfctr registers, 1699 /* at least RHEL 4 unconditionally writes to the perfctr registers,
1678 * so we ignore writes to make it happy. 1700 * so we ignore writes to make it happy.
@@ -1681,8 +1703,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1681 case MSR_K7_PERFCTR1: 1703 case MSR_K7_PERFCTR1:
1682 case MSR_K7_PERFCTR2: 1704 case MSR_K7_PERFCTR2:
1683 case MSR_K7_PERFCTR3: 1705 case MSR_K7_PERFCTR3:
1684 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1706 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1685 "0x%x data 0x%llx\n", msr, data); 1707 "0x%x data 0x%llx\n", msr, data);
1686 break; 1708 break;
1687 case MSR_P6_PERFCTR0: 1709 case MSR_P6_PERFCTR0:
1688 case MSR_P6_PERFCTR1: 1710 case MSR_P6_PERFCTR1:
@@ -1693,8 +1715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1693 return kvm_pmu_set_msr(vcpu, msr, data); 1715 return kvm_pmu_set_msr(vcpu, msr, data);
1694 1716
1695 if (pr || data != 0) 1717 if (pr || data != 0)
1696 pr_unimpl(vcpu, "disabled perfctr wrmsr: " 1718 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
1697 "0x%x data 0x%llx\n", msr, data); 1719 "0x%x data 0x%llx\n", msr, data);
1698 break; 1720 break;
1699 case MSR_K7_CLK_CTL: 1721 case MSR_K7_CLK_CTL:
1700 /* 1722 /*
@@ -1720,7 +1742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1720 /* Drop writes to this legacy MSR -- see rdmsr 1742 /* Drop writes to this legacy MSR -- see rdmsr
1721 * counterpart for further detail. 1743 * counterpart for further detail.
1722 */ 1744 */
1723 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); 1745 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1724 break; 1746 break;
1725 case MSR_AMD64_OSVW_ID_LENGTH: 1747 case MSR_AMD64_OSVW_ID_LENGTH:
1726 if (!guest_cpuid_has_osvw(vcpu)) 1748 if (!guest_cpuid_has_osvw(vcpu))
@@ -1738,12 +1760,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1738 if (kvm_pmu_msr(vcpu, msr)) 1760 if (kvm_pmu_msr(vcpu, msr))
1739 return kvm_pmu_set_msr(vcpu, msr, data); 1761 return kvm_pmu_set_msr(vcpu, msr, data);
1740 if (!ignore_msrs) { 1762 if (!ignore_msrs) {
1741 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1763 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
1742 msr, data); 1764 msr, data);
1743 return 1; 1765 return 1;
1744 } else { 1766 } else {
1745 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", 1767 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
1746 msr, data); 1768 msr, data);
1747 break; 1769 break;
1748 } 1770 }
1749 } 1771 }
@@ -1846,7 +1868,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1846 data = kvm->arch.hv_hypercall; 1868 data = kvm->arch.hv_hypercall;
1847 break; 1869 break;
1848 default: 1870 default:
1849 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1871 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1850 return 1; 1872 return 1;
1851 } 1873 }
1852 1874
@@ -1877,7 +1899,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1877 data = vcpu->arch.hv_vapic; 1899 data = vcpu->arch.hv_vapic;
1878 break; 1900 break;
1879 default: 1901 default:
1880 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1902 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1881 return 1; 1903 return 1;
1882 } 1904 }
1883 *pdata = data; 1905 *pdata = data;
@@ -2030,10 +2052,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2030 if (kvm_pmu_msr(vcpu, msr)) 2052 if (kvm_pmu_msr(vcpu, msr))
2031 return kvm_pmu_get_msr(vcpu, msr, pdata); 2053 return kvm_pmu_get_msr(vcpu, msr, pdata);
2032 if (!ignore_msrs) { 2054 if (!ignore_msrs) {
2033 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 2055 vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
2034 return 1; 2056 return 1;
2035 } else { 2057 } else {
2036 pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 2058 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
2037 data = 0; 2059 data = 0;
2038 } 2060 }
2039 break; 2061 break;
@@ -4116,7 +4138,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
4116 value = kvm_get_cr8(vcpu); 4138 value = kvm_get_cr8(vcpu);
4117 break; 4139 break;
4118 default: 4140 default:
4119 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4141 kvm_err("%s: unexpected cr %u\n", __func__, cr);
4120 return 0; 4142 return 0;
4121 } 4143 }
4122 4144
@@ -4145,7 +4167,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4145 res = kvm_set_cr8(vcpu, val); 4167 res = kvm_set_cr8(vcpu, val);
4146 break; 4168 break;
4147 default: 4169 default:
4148 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4170 kvm_err("%s: unexpected cr %u\n", __func__, cr);
4149 res = -1; 4171 res = -1;
4150 } 4172 }
4151 4173
@@ -4297,26 +4319,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
4297 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); 4319 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
4298} 4320}
4299 4321
4300static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, 4322static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
4301 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) 4323 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
4302{ 4324{
4303 struct kvm_cpuid_entry2 *cpuid = NULL; 4325 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
4304
4305 if (eax && ecx)
4306 cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
4307 *eax, *ecx);
4308
4309 if (cpuid) {
4310 *eax = cpuid->eax;
4311 *ecx = cpuid->ecx;
4312 if (ebx)
4313 *ebx = cpuid->ebx;
4314 if (edx)
4315 *edx = cpuid->edx;
4316 return true;
4317 }
4318
4319 return false;
4320} 4326}
4321 4327
4322static struct x86_emulate_ops emulate_ops = { 4328static struct x86_emulate_ops emulate_ops = {
@@ -5296,8 +5302,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5296 5302
5297 r = kvm_mmu_reload(vcpu); 5303 r = kvm_mmu_reload(vcpu);
5298 if (unlikely(r)) { 5304 if (unlikely(r)) {
5299 kvm_x86_ops->cancel_injection(vcpu); 5305 goto cancel_injection;
5300 goto out;
5301 } 5306 }
5302 5307
5303 preempt_disable(); 5308 preempt_disable();
@@ -5322,9 +5327,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5322 smp_wmb(); 5327 smp_wmb();
5323 local_irq_enable(); 5328 local_irq_enable();
5324 preempt_enable(); 5329 preempt_enable();
5325 kvm_x86_ops->cancel_injection(vcpu);
5326 r = 1; 5330 r = 1;
5327 goto out; 5331 goto cancel_injection;
5328 } 5332 }
5329 5333
5330 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5334 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -5388,9 +5392,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5388 if (unlikely(vcpu->arch.tsc_always_catchup)) 5392 if (unlikely(vcpu->arch.tsc_always_catchup))
5389 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 5393 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5390 5394
5391 kvm_lapic_sync_from_vapic(vcpu); 5395 if (vcpu->arch.apic_attention)
5396 kvm_lapic_sync_from_vapic(vcpu);
5392 5397
5393 r = kvm_x86_ops->handle_exit(vcpu); 5398 r = kvm_x86_ops->handle_exit(vcpu);
5399 return r;
5400
5401cancel_injection:
5402 kvm_x86_ops->cancel_injection(vcpu);
5403 if (unlikely(vcpu->arch.apic_attention))
5404 kvm_lapic_sync_from_vapic(vcpu);
5394out: 5405out:
5395 return r; 5406 return r;
5396} 5407}
@@ -6304,7 +6315,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
6304 6315
6305 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6316 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6306 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { 6317 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
6307 vfree(free->arch.lpage_info[i]); 6318 kvm_kvfree(free->arch.lpage_info[i]);
6308 free->arch.lpage_info[i] = NULL; 6319 free->arch.lpage_info[i] = NULL;
6309 } 6320 }
6310 } 6321 }
@@ -6323,7 +6334,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6323 slot->base_gfn, level) + 1; 6334 slot->base_gfn, level) + 1;
6324 6335
6325 slot->arch.lpage_info[i] = 6336 slot->arch.lpage_info[i] =
6326 vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); 6337 kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
6327 if (!slot->arch.lpage_info[i]) 6338 if (!slot->arch.lpage_info[i])
6328 goto out_free; 6339 goto out_free;
6329 6340
@@ -6350,7 +6361,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6350 6361
6351out_free: 6362out_free:
6352 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6363 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6353 vfree(slot->arch.lpage_info[i]); 6364 kvm_kvfree(slot->arch.lpage_info[i]);
6354 slot->arch.lpage_info[i] = NULL; 6365 slot->arch.lpage_info[i] = NULL;
6355 } 6366 }
6356 return -ENOMEM; 6367 return -ENOMEM;
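The vcpu_enter_guest() hunks above replace two inline kvm_x86_ops->cancel_injection() calls with a single cancel_injection label, which also resyncs the vAPIC when apic_attention is set. A minimal userspace sketch of that consolidated error path; the hook names below are hypothetical stand-ins, not the real KVM callbacks:

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-ins for the KVM hooks touched in vcpu_enter_guest(). */
static bool reload_mmu(void)            { return true;  }
static bool exit_request_pending(void)  { return false; }
static void cancel_injection_hook(void) { puts("cancel_injection"); }
static void sync_from_vapic(void)       { puts("sync_from_vapic"); }

static int enter_guest_sketch(bool apic_attention)
{
        int r = 0;

        if (!reload_mmu()) {
                r = -1;
                goto cancel_injection;  /* one label instead of two inline calls */
        }
        if (exit_request_pending()) {
                r = 1;
                goto cancel_injection;
        }

        /* ... guest runs here ... */
        if (apic_attention)
                sync_from_vapic();      /* only when the vAPIC page is mapped */
        return r;

cancel_injection:
        cancel_injection_hook();
        if (apic_attention)
                sync_from_vapic();      /* keep the vAPIC consistent on bail-out */
        return r;
}

int main(void)
{
        return enter_guest_sketch(true) ? 1 : 0;
}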
diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c
index a311cc59b65d..8d6ef78b5d01 100644
--- a/arch/x86/lib/msr-reg-export.c
+++ b/arch/x86/lib/msr-reg-export.c
@@ -1,5 +1,5 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <asm/msr.h> 2#include <asm/msr.h>
3 3
4EXPORT_SYMBOL(native_rdmsr_safe_regs); 4EXPORT_SYMBOL(rdmsr_safe_regs);
5EXPORT_SYMBOL(native_wrmsr_safe_regs); 5EXPORT_SYMBOL(wrmsr_safe_regs);
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index 69fa10623f21..f6d13eefad10 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -6,13 +6,13 @@
6 6
7#ifdef CONFIG_X86_64 7#ifdef CONFIG_X86_64
8/* 8/*
9 * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]); 9 * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]);
10 * 10 *
11 * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] 11 * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi]
12 * 12 *
13 */ 13 */
14.macro op_safe_regs op 14.macro op_safe_regs op
15ENTRY(native_\op\()_safe_regs) 15ENTRY(\op\()_safe_regs)
16 CFI_STARTPROC 16 CFI_STARTPROC
17 pushq_cfi %rbx 17 pushq_cfi %rbx
18 pushq_cfi %rbp 18 pushq_cfi %rbp
@@ -45,13 +45,13 @@ ENTRY(native_\op\()_safe_regs)
45 45
46 _ASM_EXTABLE(1b, 3b) 46 _ASM_EXTABLE(1b, 3b)
47 CFI_ENDPROC 47 CFI_ENDPROC
48ENDPROC(native_\op\()_safe_regs) 48ENDPROC(\op\()_safe_regs)
49.endm 49.endm
50 50
51#else /* X86_32 */ 51#else /* X86_32 */
52 52
53.macro op_safe_regs op 53.macro op_safe_regs op
54ENTRY(native_\op\()_safe_regs) 54ENTRY(\op\()_safe_regs)
55 CFI_STARTPROC 55 CFI_STARTPROC
56 pushl_cfi %ebx 56 pushl_cfi %ebx
57 pushl_cfi %ebp 57 pushl_cfi %ebp
@@ -92,7 +92,7 @@ ENTRY(native_\op\()_safe_regs)
92 92
93 _ASM_EXTABLE(1b, 3b) 93 _ASM_EXTABLE(1b, 3b)
94 CFI_ENDPROC 94 CFI_ENDPROC
95ENDPROC(native_\op\()_safe_regs) 95ENDPROC(\op\()_safe_regs)
96.endm 96.endm
97 97
98#endif 98#endif
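The rename above drops the native_ prefix and exports the _safe_regs helpers under their plain names; the calling convention is the one documented in the comment, a u32 gprs[8] array laid out as { eax, ecx, edx, ebx, esp, ebp, esi, edi }. A userspace sketch of how a caller packs and unpacks that array, with the MSR access itself mocked out:

#include <stdint.h>
#include <stdio.h>

/*
 * Userspace mock of the register-array convention: the real
 * rdmsr_safe_regs() is kernel code, this stub just fakes a value.
 */
static int rdmsr_safe_regs_mock(uint32_t gprs[8])
{
        gprs[0] = 0x00000034;   /* eax: low 32 bits of the MSR  */
        gprs[2] = 0x00000012;   /* edx: high 32 bits of the MSR */
        return 0;               /* 0 == no fault, as with the *_safe variants */
}

int main(void)
{
        uint32_t gprs[8] = { 0 };
        uint64_t val;

        gprs[1] = 0x8b;         /* ecx: MSR index (illustrative) */
        if (rdmsr_safe_regs_mock(gprs))
                return 1;
        val = ((uint64_t)gprs[2] << 32) | gprs[0];
        printf("MSR 0x%x -> 0x%llx\n", gprs[1], (unsigned long long)val);
        return 0;
}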
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index bc4e9d84157f..e0e6990723e9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -385,7 +385,7 @@ void free_initmem(void)
385} 385}
386 386
387#ifdef CONFIG_BLK_DEV_INITRD 387#ifdef CONFIG_BLK_DEV_INITRD
388void free_initrd_mem(unsigned long start, unsigned long end) 388void __init free_initrd_mem(unsigned long start, unsigned long end)
389{ 389{
390 /* 390 /*
391 * end may not be aligned, and we cannot align it, 391 * end may not be aligned, and we cannot align it,
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a718e0d23503..931930a96160 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -919,11 +919,13 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
919 919
920 /* 920 /*
921 * On success we use clflush, when the CPU supports it to 921 * On success we use clflush, when the CPU supports it to
922 * avoid the wbinvd. If the CPU does not support it and in the 922 * avoid the wbinvd. If the CPU does not support it, in the
923 * error case we fall back to cpa_flush_all (which uses 923 * error case, and during early boot (for EFI) we fall back
924 * wbinvd): 924 * to cpa_flush_all (which uses wbinvd):
925 */ 925 */
926 if (!ret && cpu_has_clflush) { 926 if (early_boot_irqs_disabled)
927 __cpa_flush_all((void *)(long)cache);
928 else if (!ret && cpu_has_clflush) {
927 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { 929 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
928 cpa_flush_array(addr, numpages, cache, 930 cpa_flush_array(addr, numpages, cache,
929 cpa.flags, pages); 931 cpa.flags, pages);
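The pageattr.c hunk adds an early-boot branch: when early_boot_irqs_disabled is set (EFI calls made before interrupts are enabled), the flush is done by calling __cpa_flush_all() directly on the local CPU, avoiding the cross-CPU IPIs that the normal paths rely on. A small sketch of the resulting strategy selection, with the flush primitives reduced to stubs:

#include <stdbool.h>
#include <stdio.h>

/*
 * Stand-ins for the flush strategies picked at the end of
 * change_page_attr_set_clr(); the ordering mirrors the hunk above.
 */
static void flush_all_local(void)      { puts("__cpa_flush_all: local CPU, no IPIs"); }
static void flush_all_everywhere(void) { puts("full flush on every CPU"); }
static void flush_range_clflush(void)  { puts("clflush the changed range"); }

static void cpa_flush_sketch(bool early_boot_irqs_disabled,
                             bool change_succeeded, bool cpu_has_clflush)
{
        if (early_boot_irqs_disabled)
                flush_all_local();      /* early boot (EFI): can't send IPIs yet */
        else if (change_succeeded && cpu_has_clflush)
                flush_range_clflush();  /* success and clflush available */
        else
                flush_all_everywhere(); /* error path, or no clflush */
}

int main(void)
{
        cpa_flush_sketch(true,  true,  true);
        cpa_flush_sketch(false, true,  true);
        cpa_flush_sketch(false, false, true);
        return 0;
}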
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5e57e113b72c..613cd83e8c0c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
12#include <asm/cache.h> 12#include <asm/cache.h>
13#include <asm/apic.h> 13#include <asm/apic.h>
14#include <asm/uv/uv.h> 14#include <asm/uv/uv.h>
15#include <linux/debugfs.h>
15 16
16DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) 17DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
17 = { &init_mm, 0, }; 18 = { &init_mm, 0, };
@@ -27,33 +28,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
27 * 28 *
28 * More scalable flush, from Andi Kleen 29 * More scalable flush, from Andi Kleen
29 * 30 *
30 * To avoid global state use 8 different call vectors. 31 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
31 * Each CPU uses a specific vector to trigger flushes on other
32 * CPUs. Depending on the received vector the target CPUs look into
33 * the right array slot for the flush data.
34 *
35 * With more than 8 CPUs they are hashed to the 8 available
36 * vectors. The limited global vector space forces us to this right now.
37 * In future when interrupts are split into per CPU domains this could be
38 * fixed, at the cost of triggering multiple IPIs in some cases.
39 */ 32 */
40 33
41union smp_flush_state { 34struct flush_tlb_info {
42 struct { 35 struct mm_struct *flush_mm;
43 struct mm_struct *flush_mm; 36 unsigned long flush_start;
44 unsigned long flush_va; 37 unsigned long flush_end;
45 raw_spinlock_t tlbstate_lock; 38};
46 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
47 };
48 char pad[INTERNODE_CACHE_BYTES];
49} ____cacheline_internodealigned_in_smp;
50
51/* State is put into the per CPU data section, but padded
52 to a full cache line because other CPUs can access it and we don't
53 want false sharing in the per cpu data segment. */
54static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
55
56static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
57 39
58/* 40/*
59 * We cannot call mmdrop() because we are in interrupt context, 41 * We cannot call mmdrop() because we are in interrupt context,
@@ -72,28 +54,25 @@ void leave_mm(int cpu)
72EXPORT_SYMBOL_GPL(leave_mm); 54EXPORT_SYMBOL_GPL(leave_mm);
73 55
74/* 56/*
75 *
76 * The flush IPI assumes that a thread switch happens in this order: 57 * The flush IPI assumes that a thread switch happens in this order:
77 * [cpu0: the cpu that switches] 58 * [cpu0: the cpu that switches]
78 * 1) switch_mm() either 1a) or 1b) 59 * 1) switch_mm() either 1a) or 1b)
79 * 1a) thread switch to a different mm 60 * 1a) thread switch to a different mm
80 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); 61 * 1a1) set cpu_tlbstate to TLBSTATE_OK
81 * Stop ipi delivery for the old mm. This is not synchronized with 62 * Now the tlb flush IPI handler flush_tlb_func won't call leave_mm
82 * the other cpus, but smp_invalidate_interrupt ignore flush ipis 63 * if cpu0 was in lazy tlb mode.
83 * for the wrong mm, and in the worst case we perform a superfluous 64 * 1a2) update cpu active_mm
84 * tlb flush.
85 * 1a2) set cpu mmu_state to TLBSTATE_OK
86 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
87 * was in lazy tlb mode.
88 * 1a3) update cpu active_mm
89 * Now cpu0 accepts tlb flushes for the new mm. 65 * Now cpu0 accepts tlb flushes for the new mm.
90 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); 66 * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
91 * Now the other cpus will send tlb flush ipis. 67 * Now the other cpus will send tlb flush ipis.
92 * 1a4) change cr3. 68 * 1a4) change cr3.
69 * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
70 * Stop ipi delivery for the old mm. This is not synchronized with
71 * the other cpus, but flush_tlb_func ignore flush ipis for the wrong
72 * mm, and in the worst case we perform a superfluous tlb flush.
93 * 1b) thread switch without mm change 73 * 1b) thread switch without mm change
94 * cpu active_mm is correct, cpu0 already handles 74 * cpu active_mm is correct, cpu0 already handles flush ipis.
95 * flush ipis. 75 * 1b1) set cpu_tlbstate to TLBSTATE_OK
96 * 1b1) set cpu mmu_state to TLBSTATE_OK
97 * 1b2) test_and_set the cpu bit in cpu_vm_mask. 76 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
98 * Atomically set the bit [other cpus will start sending flush ipis], 77 * Atomically set the bit [other cpus will start sending flush ipis],
99 * and test the bit. 78 * and test the bit.
@@ -106,174 +85,62 @@ EXPORT_SYMBOL_GPL(leave_mm);
106 * runs in kernel space, the cpu could load tlb entries for user space 85 * runs in kernel space, the cpu could load tlb entries for user space
107 * pages. 86 * pages.
108 * 87 *
109 * The good news is that cpu mmu_state is local to each cpu, no 88 * The good news is that cpu_tlbstate is local to each cpu, no
110 * write/read ordering problems. 89 * write/read ordering problems.
111 */ 90 */
112 91
113/* 92/*
114 * TLB flush IPI: 93 * TLB flush function:
115 *
116 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. 94 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
117 * 2) Leave the mm if we are in the lazy tlb mode. 95 * 2) Leave the mm if we are in the lazy tlb mode.
118 *
119 * Interrupts are disabled.
120 */
121
122/*
123 * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop
124 * but still used for documentation purpose but the usage is slightly
125 * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
126 * entry calls in with the first parameter in %eax. Maybe define
127 * intrlinkage?
128 */ 96 */
129#ifdef CONFIG_X86_64 97static void flush_tlb_func(void *info)
130asmlinkage
131#endif
132void smp_invalidate_interrupt(struct pt_regs *regs)
133{ 98{
134 unsigned int cpu; 99 struct flush_tlb_info *f = info;
135 unsigned int sender;
136 union smp_flush_state *f;
137
138 cpu = smp_processor_id();
139 /*
140 * orig_rax contains the negated interrupt vector.
141 * Use that to determine where the sender put the data.
142 */
143 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
144 f = &flush_state[sender];
145
146 if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
147 goto out;
148 /*
149 * This was a BUG() but until someone can quote me the
150 * line from the intel manual that guarantees an IPI to
151 * multiple CPUs is retried _only_ on the erroring CPUs
152 * its staying as a return
153 *
154 * BUG();
155 */
156
157 if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) {
158 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
159 if (f->flush_va == TLB_FLUSH_ALL)
160 local_flush_tlb();
161 else
162 __flush_tlb_one(f->flush_va);
163 } else
164 leave_mm(cpu);
165 }
166out:
167 ack_APIC_irq();
168 smp_mb__before_clear_bit();
169 cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
170 smp_mb__after_clear_bit();
171 inc_irq_stat(irq_tlb_count);
172}
173 100
174static void flush_tlb_others_ipi(const struct cpumask *cpumask, 101 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
175 struct mm_struct *mm, unsigned long va) 102 return;
176{ 103
177 unsigned int sender; 104 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
178 union smp_flush_state *f; 105 if (f->flush_end == TLB_FLUSH_ALL || !cpu_has_invlpg)
179 106 local_flush_tlb();
180 /* Caller has disabled preemption */ 107 else if (!f->flush_end)
181 sender = this_cpu_read(tlb_vector_offset); 108 __flush_tlb_single(f->flush_start);
182 f = &flush_state[sender]; 109 else {
183 110 unsigned long addr;
184 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) 111 addr = f->flush_start;
185 raw_spin_lock(&f->tlbstate_lock); 112 while (addr < f->flush_end) {
186 113 __flush_tlb_single(addr);
187 f->flush_mm = mm; 114 addr += PAGE_SIZE;
188 f->flush_va = va; 115 }
189 if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { 116 }
190 /* 117 } else
191 * We have to send the IPI only to 118 leave_mm(smp_processor_id());
192 * CPUs affected.
193 */
194 apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
195 INVALIDATE_TLB_VECTOR_START + sender);
196
197 while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
198 cpu_relax();
199 }
200 119
201 f->flush_mm = NULL;
202 f->flush_va = 0;
203 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
204 raw_spin_unlock(&f->tlbstate_lock);
205} 120}
206 121
207void native_flush_tlb_others(const struct cpumask *cpumask, 122void native_flush_tlb_others(const struct cpumask *cpumask,
208 struct mm_struct *mm, unsigned long va) 123 struct mm_struct *mm, unsigned long start,
124 unsigned long end)
209{ 125{
126 struct flush_tlb_info info;
127 info.flush_mm = mm;
128 info.flush_start = start;
129 info.flush_end = end;
130
210 if (is_uv_system()) { 131 if (is_uv_system()) {
211 unsigned int cpu; 132 unsigned int cpu;
212 133
213 cpu = smp_processor_id(); 134 cpu = smp_processor_id();
214 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); 135 cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
215 if (cpumask) 136 if (cpumask)
216 flush_tlb_others_ipi(cpumask, mm, va); 137 smp_call_function_many(cpumask, flush_tlb_func,
138 &info, 1);
217 return; 139 return;
218 } 140 }
219 flush_tlb_others_ipi(cpumask, mm, va); 141 smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
220} 142}
221 143
222static void __cpuinit calculate_tlb_offset(void)
223{
224 int cpu, node, nr_node_vecs, idx = 0;
225 /*
226 * we are changing tlb_vector_offset for each CPU in runtime, but this
227 * will not cause inconsistency, as the write is atomic under X86. we
228 * might see more lock contentions in a short time, but after all CPU's
229 * tlb_vector_offset are changed, everything should go normal
230 *
231 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
232 * waste some vectors.
233 **/
234 if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
235 nr_node_vecs = 1;
236 else
237 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
238
239 for_each_online_node(node) {
240 int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
241 nr_node_vecs;
242 int cpu_offset = 0;
243 for_each_cpu(cpu, cpumask_of_node(node)) {
244 per_cpu(tlb_vector_offset, cpu) = node_offset +
245 cpu_offset;
246 cpu_offset++;
247 cpu_offset = cpu_offset % nr_node_vecs;
248 }
249 idx++;
250 }
251}
252
253static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
254 unsigned long action, void *hcpu)
255{
256 switch (action & 0xf) {
257 case CPU_ONLINE:
258 case CPU_DEAD:
259 calculate_tlb_offset();
260 }
261 return NOTIFY_OK;
262}
263
264static int __cpuinit init_smp_flush(void)
265{
266 int i;
267
268 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
269 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
270
271 calculate_tlb_offset();
272 hotcpu_notifier(tlb_cpuhp_notify, 0);
273 return 0;
274}
275core_initcall(init_smp_flush);
276
277void flush_tlb_current_task(void) 144void flush_tlb_current_task(void)
278{ 145{
279 struct mm_struct *mm = current->mm; 146 struct mm_struct *mm = current->mm;
@@ -282,27 +149,91 @@ void flush_tlb_current_task(void)
282 149
283 local_flush_tlb(); 150 local_flush_tlb();
284 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 151 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
285 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); 152 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
286 preempt_enable(); 153 preempt_enable();
287} 154}
288 155
289void flush_tlb_mm(struct mm_struct *mm) 156/*
157 * It can find out the THP large page, or
158 * HUGETLB page in tlb_flush when THP disabled
159 */
160static inline unsigned long has_large_page(struct mm_struct *mm,
161 unsigned long start, unsigned long end)
162{
163 pgd_t *pgd;
164 pud_t *pud;
165 pmd_t *pmd;
166 unsigned long addr = ALIGN(start, HPAGE_SIZE);
167 for (; addr < end; addr += HPAGE_SIZE) {
168 pgd = pgd_offset(mm, addr);
169 if (likely(!pgd_none(*pgd))) {
170 pud = pud_offset(pgd, addr);
171 if (likely(!pud_none(*pud))) {
172 pmd = pmd_offset(pud, addr);
173 if (likely(!pmd_none(*pmd)))
174 if (pmd_large(*pmd))
175 return addr;
176 }
177 }
178 }
179 return 0;
180}
181
182void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
183 unsigned long end, unsigned long vmflag)
290{ 184{
185 unsigned long addr;
186 unsigned act_entries, tlb_entries = 0;
187
291 preempt_disable(); 188 preempt_disable();
189 if (current->active_mm != mm)
190 goto flush_all;
292 191
293 if (current->active_mm == mm) { 192 if (!current->mm) {
294 if (current->mm) 193 leave_mm(smp_processor_id());
194 goto flush_all;
195 }
196
197 if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
198 || vmflag == VM_HUGETLB) {
199 local_flush_tlb();
200 goto flush_all;
201 }
202
203 /* On modern CPUs the last-level TLB is shared by data and instructions */
204 if (vmflag & VM_EXEC)
205 tlb_entries = tlb_lli_4k[ENTRIES];
206 else
207 tlb_entries = tlb_lld_4k[ENTRIES];
208 /* Assume all TLB entries are occupied by this task */
209 act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
210
211 /* tlb_flushall_shift is the balance point; details in the commit log */
212 if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
213 local_flush_tlb();
214 else {
215 if (has_large_page(mm, start, end)) {
295 local_flush_tlb(); 216 local_flush_tlb();
296 else 217 goto flush_all;
297 leave_mm(smp_processor_id()); 218 }
219 /* flush the range one page at a time with 'invlpg' */
220 for (addr = start; addr < end; addr += PAGE_SIZE)
221 __flush_tlb_single(addr);
222
223 if (cpumask_any_but(mm_cpumask(mm),
224 smp_processor_id()) < nr_cpu_ids)
225 flush_tlb_others(mm_cpumask(mm), mm, start, end);
226 preempt_enable();
227 return;
298 } 228 }
299 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
300 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
301 229
230flush_all:
231 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
232 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
302 preempt_enable(); 233 preempt_enable();
303} 234}
304 235
305void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) 236void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
306{ 237{
307 struct mm_struct *mm = vma->vm_mm; 238 struct mm_struct *mm = vma->vm_mm;
308 239
@@ -310,13 +241,13 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
310 241
311 if (current->active_mm == mm) { 242 if (current->active_mm == mm) {
312 if (current->mm) 243 if (current->mm)
313 __flush_tlb_one(va); 244 __flush_tlb_one(start);
314 else 245 else
315 leave_mm(smp_processor_id()); 246 leave_mm(smp_processor_id());
316 } 247 }
317 248
318 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 249 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
319 flush_tlb_others(mm_cpumask(mm), mm, va); 250 flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
320 251
321 preempt_enable(); 252 preempt_enable();
322} 253}
@@ -332,3 +263,83 @@ void flush_tlb_all(void)
332{ 263{
333 on_each_cpu(do_flush_tlb_all, NULL, 1); 264 on_each_cpu(do_flush_tlb_all, NULL, 1);
334} 265}
266
267static void do_kernel_range_flush(void *info)
268{
269 struct flush_tlb_info *f = info;
270 unsigned long addr;
271
272 /* flush the range one page at a time with 'invlpg' */
273 for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
274 __flush_tlb_single(addr);
275}
276
277void flush_tlb_kernel_range(unsigned long start, unsigned long end)
278{
279 unsigned act_entries;
280 struct flush_tlb_info info;
281
282 /* On modern CPUs the last-level TLB is shared by data and instructions */
283 act_entries = tlb_lld_4k[ENTRIES];
284
285 /* Balance like a user-space task's flush; a bit conservative */
286 if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
287 (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
288
289 on_each_cpu(do_flush_tlb_all, NULL, 1);
290 else {
291 info.flush_start = start;
292 info.flush_end = end;
293 on_each_cpu(do_kernel_range_flush, &info, 1);
294 }
295}
296
297#ifdef CONFIG_DEBUG_TLBFLUSH
298static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
299 size_t count, loff_t *ppos)
300{
301 char buf[32];
302 unsigned int len;
303
304 len = sprintf(buf, "%hd\n", tlb_flushall_shift);
305 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
306}
307
308static ssize_t tlbflush_write_file(struct file *file,
309 const char __user *user_buf, size_t count, loff_t *ppos)
310{
311 char buf[32];
312 ssize_t len;
313 s8 shift;
314
315 len = min(count, sizeof(buf) - 1);
316 if (copy_from_user(buf, user_buf, len))
317 return -EFAULT;
318
319 buf[len] = '\0';
320 if (kstrtos8(buf, 0, &shift))
321 return -EINVAL;
322
323 if (shift > 64)
324 return -EINVAL;
325
326 tlb_flushall_shift = shift;
327 return count;
328}
329
330static const struct file_operations fops_tlbflush = {
331 .read = tlbflush_read_file,
332 .write = tlbflush_write_file,
333 .llseek = default_llseek,
334};
335
336static int __cpuinit create_tlb_flushall_shift(void)
337{
338 if (cpu_has_invlpg) {
339 debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
340 arch_debugfs_dir, NULL, &fops_tlbflush);
341 }
342 return 0;
343}
344late_initcall(create_tlb_flushall_shift);
345#endif
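flush_tlb_mm_range() above chooses between a full TLB flush and per-page invlpg by comparing the number of pages in the range against act_entries >> tlb_flushall_shift, where tlb_flushall_shift is the balance point tunable exposed through the debugfs file added in the same hunk. A standalone sketch of that comparison with made-up TLB sizes:

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SHIFT 12

/*
 * act_entries models the last-level TLB size (capped by the task's
 * total_vm in the real code); tlb_flushall_shift is the tunable.
 */
static bool use_full_flush(unsigned long start, unsigned long end,
                           unsigned long act_entries, int tlb_flushall_shift)
{
        if (tlb_flushall_shift == -1)           /* heuristic disabled */
                return true;
        return ((end - start) >> PAGE_SHIFT) >
               (act_entries >> tlb_flushall_shift);
}

int main(void)
{
        unsigned long act_entries = 512;        /* e.g. tlb_lld_4k[ENTRIES] */
        int shift = 5;                          /* 512 >> 5 = 16-page threshold */

        /* 8 pages: cheaper to invlpg each page */
        printf("8 pages  -> %s\n",
               use_full_flush(0, 8UL << PAGE_SHIFT, act_entries, shift)
               ? "full flush" : "per-page invlpg");
        /* 64 pages: cheaper to flush the whole TLB */
        printf("64 pages -> %s\n",
               use_full_flush(0, 64UL << PAGE_SHIFT, act_entries, shift)
               ? "full flush" : "per-page invlpg");
        return 0;
}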
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 0597f95b6da6..33643a8bcbbb 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -309,6 +309,10 @@ void bpf_jit_compile(struct sk_filter *fp)
309 else 309 else
310 EMIT1_off32(0x0d, K); /* or imm32,%eax */ 310 EMIT1_off32(0x0d, K); /* or imm32,%eax */
311 break; 311 break;
312 case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */
313 seen |= SEEN_XREG;
314 EMIT2(0x31, 0xd8); /* xor %ebx,%eax */
315 break;
312 case BPF_S_ALU_LSH_X: /* A <<= X; */ 316 case BPF_S_ALU_LSH_X: /* A <<= X; */
313 seen |= SEEN_XREG; 317 seen |= SEEN_XREG;
314 EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */ 318 EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */
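The new BPF_S_ANC_ALU_XOR_X case emits the two-byte instruction 0x31 0xd8, i.e. xor %ebx,%eax, since this JIT keeps A in %eax and X in %ebx (as the neighbouring shift case shows). A small sketch of how the ModRM byte 0xd8 is formed:

#include <stdio.h>
#include <stdint.h>

/*
 * 0x31 is XOR r/m32, r32; 0xd8 is ModRM with mod=11 (register direct),
 * reg=011 (EBX, the source), rm=000 (EAX, the destination).
 */
static uint8_t modrm(uint8_t mod, uint8_t reg, uint8_t rm)
{
        return (uint8_t)((mod << 6) | (reg << 3) | rm);
}

int main(void)
{
        enum { EAX = 0, EBX = 3 };
        printf("xor %%ebx,%%eax -> 0x31 0x%02x\n", modrm(3, EBX, EAX));
        return 0;
}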
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 303f08637826..b2b94438ff05 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -312,7 +312,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
312 goto fail; 312 goto fail;
313 } 313 }
314 /* both registers must be reserved */ 314 /* both registers must be reserved */
315 if (num_counters == AMD64_NUM_COUNTERS_F15H) { 315 if (num_counters == AMD64_NUM_COUNTERS_CORE) {
316 msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); 316 msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
317 msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); 317 msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
318 } else { 318 } else {
@@ -514,7 +514,7 @@ static int op_amd_init(struct oprofile_operations *ops)
514 ops->create_files = setup_ibs_files; 514 ops->create_files = setup_ibs_files;
515 515
516 if (boot_cpu_data.x86 == 0x15) { 516 if (boot_cpu_data.x86 == 0x15) {
517 num_counters = AMD64_NUM_COUNTERS_F15H; 517 num_counters = AMD64_NUM_COUNTERS_CORE;
518 } else { 518 } else {
519 num_counters = AMD64_NUM_COUNTERS; 519 num_counters = AMD64_NUM_COUNTERS;
520 } 520 }
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index fc09c2754e08..505acdd6d600 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -12,8 +12,13 @@ struct pci_root_info {
12 char name[16]; 12 char name[16];
13 unsigned int res_num; 13 unsigned int res_num;
14 struct resource *res; 14 struct resource *res;
15 int busnum;
16 struct pci_sysdata sd; 15 struct pci_sysdata sd;
16#ifdef CONFIG_PCI_MMCONFIG
17 bool mcfg_added;
18 u16 segment;
19 u8 start_bus;
20 u8 end_bus;
21#endif
17}; 22};
18 23
19static bool pci_use_crs = true; 24static bool pci_use_crs = true;
@@ -120,6 +125,81 @@ void __init pci_acpi_crs_quirks(void)
120 pci_use_crs ? "nocrs" : "use_crs"); 125 pci_use_crs ? "nocrs" : "use_crs");
121} 126}
122 127
128#ifdef CONFIG_PCI_MMCONFIG
129static int __devinit check_segment(u16 seg, struct device *dev, char *estr)
130{
131 if (seg) {
132 dev_err(dev,
133 "%s can't access PCI configuration "
134 "space under this host bridge.\n",
135 estr);
136 return -EIO;
137 }
138
139 /*
140 * Failure in adding MMCFG information is not fatal,
141 * just can't access extended configuration space of
142 * devices under this host bridge.
143 */
144 dev_warn(dev,
145 "%s can't access extended PCI configuration "
146 "space under this bridge.\n",
147 estr);
148
149 return 0;
150}
151
152static int __devinit setup_mcfg_map(struct pci_root_info *info,
153 u16 seg, u8 start, u8 end,
154 phys_addr_t addr)
155{
156 int result;
157 struct device *dev = &info->bridge->dev;
158
159 info->start_bus = start;
160 info->end_bus = end;
161 info->mcfg_added = false;
162
163 /* return success if MMCFG is not in use */
164 if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg)
165 return 0;
166
167 if (!(pci_probe & PCI_PROBE_MMCONF))
168 return check_segment(seg, dev, "MMCONFIG is disabled,");
169
170 result = pci_mmconfig_insert(dev, seg, start, end, addr);
171 if (result == 0) {
172 /* enable MMCFG if it hasn't been enabled yet */
173 if (raw_pci_ext_ops == NULL)
174 raw_pci_ext_ops = &pci_mmcfg;
175 info->mcfg_added = true;
176 } else if (result != -EEXIST)
177 return check_segment(seg, dev,
178 "fail to add MMCONFIG information,");
179
180 return 0;
181}
182
183static void teardown_mcfg_map(struct pci_root_info *info)
184{
185 if (info->mcfg_added) {
186 pci_mmconfig_delete(info->segment, info->start_bus,
187 info->end_bus);
188 info->mcfg_added = false;
189 }
190}
191#else
192static int __devinit setup_mcfg_map(struct pci_root_info *info,
193 u16 seg, u8 start, u8 end,
194 phys_addr_t addr)
195{
196 return 0;
197}
198static void teardown_mcfg_map(struct pci_root_info *info)
199{
200}
201#endif
202
123static acpi_status 203static acpi_status
124resource_to_addr(struct acpi_resource *resource, 204resource_to_addr(struct acpi_resource *resource,
125 struct acpi_resource_address64 *addr) 205 struct acpi_resource_address64 *addr)
@@ -234,13 +314,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
234 } 314 }
235 315
236 info->res_num++; 316 info->res_num++;
237 if (addr.translation_offset)
238 dev_info(&info->bridge->dev, "host bridge window %pR "
239 "(PCI address [%#llx-%#llx])\n",
240 res, res->start - addr.translation_offset,
241 res->end - addr.translation_offset);
242 else
243 dev_info(&info->bridge->dev, "host bridge window %pR\n", res);
244 317
245 return AE_OK; 318 return AE_OK;
246} 319}
@@ -332,8 +405,11 @@ static void __release_pci_root_info(struct pci_root_info *info)
332 405
333 free_pci_root_info_res(info); 406 free_pci_root_info_res(info);
334 407
408 teardown_mcfg_map(info);
409
335 kfree(info); 410 kfree(info);
336} 411}
412
337static void release_pci_root_info(struct pci_host_bridge *bridge) 413static void release_pci_root_info(struct pci_host_bridge *bridge)
338{ 414{
339 struct pci_root_info *info = bridge->release_data; 415 struct pci_root_info *info = bridge->release_data;
@@ -347,7 +423,9 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,
347{ 423{
348 size_t size; 424 size_t size;
349 425
426 sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
350 info->bridge = device; 427 info->bridge = device;
428
351 info->res_num = 0; 429 info->res_num = 0;
352 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, 430 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
353 info); 431 info);
@@ -360,8 +438,6 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,
360 if (!info->res) 438 if (!info->res)
361 return; 439 return;
362 440
363 sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
364
365 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, 441 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
366 info); 442 info);
367} 443}
@@ -373,7 +449,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
373 int domain = root->segment; 449 int domain = root->segment;
374 int busnum = root->secondary.start; 450 int busnum = root->secondary.start;
375 LIST_HEAD(resources); 451 LIST_HEAD(resources);
376 struct pci_bus *bus; 452 struct pci_bus *bus = NULL;
377 struct pci_sysdata *sd; 453 struct pci_sysdata *sd;
378 int node; 454 int node;
379#ifdef CONFIG_ACPI_NUMA 455#ifdef CONFIG_ACPI_NUMA
@@ -426,6 +502,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
426 } else { 502 } else {
427 probe_pci_root_info(info, device, busnum, domain); 503 probe_pci_root_info(info, device, busnum, domain);
428 504
505 /* insert busn res at first */
506 pci_add_resource(&resources, &root->secondary);
429 /* 507 /*
430 * _CRS with no apertures is normal, so only fall back to 508 * _CRS with no apertures is normal, so only fall back to
431 * defaults or native bridge info if we're ignoring _CRS. 509 * defaults or native bridge info if we're ignoring _CRS.
@@ -437,10 +515,13 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
437 x86_pci_root_bus_resources(busnum, &resources); 515 x86_pci_root_bus_resources(busnum, &resources);
438 } 516 }
439 517
440 bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, 518 if (!setup_mcfg_map(info, domain, (u8)root->secondary.start,
441 &resources); 519 (u8)root->secondary.end, root->mcfg_addr))
520 bus = pci_create_root_bus(NULL, busnum, &pci_root_ops,
521 sd, &resources);
522
442 if (bus) { 523 if (bus) {
443 bus->subordinate = pci_scan_child_bus(bus); 524 pci_scan_child_bus(bus);
444 pci_set_host_bridge_release( 525 pci_set_host_bridge_release(
445 to_pci_host_bridge(bus->bridge), 526 to_pci_host_bridge(bus->bridge),
446 release_pci_root_info, info); 527 release_pci_root_info, info);
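setup_mcfg_map() above tries to register per-host-bridge MMCONFIG via pci_mmconfig_insert(); when that is not possible, check_segment() decides whether the bridge can live without it: segment 0 can still be reached through legacy config cycles, any other segment cannot. A userspace sketch of that decision, with the surrounding state reduced to booleans:

#include <stdio.h>
#include <stdbool.h>

#define EIO 5

static int mcfg_setup_sketch(unsigned int segment, bool mmconf_usable)
{
        if (mmconf_usable)
                return 0;               /* ECAM mapped: full config space */
        if (segment != 0) {
                fprintf(stderr, "seg %u: no way to reach config space\n", segment);
                return -EIO;            /* fatal for this host bridge */
        }
        fprintf(stderr, "seg 0: extended config space unavailable, continuing\n");
        return 0;                       /* degraded but workable */
}

int main(void)
{
        mcfg_setup_sketch(0, false);
        return mcfg_setup_sketch(1, false) == -EIO ? 0 : 1;
}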
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 5aed49bff058..e9e6ed5cdf94 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -121,7 +121,6 @@ static int __init early_fill_mp_bus_info(void)
121 link = (reg >> 8) & 0x03; 121 link = (reg >> 8) & 0x03;
122 122
123 info = alloc_pci_root_info(min_bus, max_bus, node, link); 123 info = alloc_pci_root_info(min_bus, max_bus, node, link);
124 sprintf(info->name, "PCI Bus #%02x", min_bus);
125 } 124 }
126 125
127 /* get the default node and link for left over res */ 126 /* get the default node and link for left over res */
@@ -300,9 +299,9 @@ static int __init early_fill_mp_bus_info(void)
300 int busnum; 299 int busnum;
301 struct pci_root_res *root_res; 300 struct pci_root_res *root_res;
302 301
303 busnum = info->bus_min; 302 busnum = info->busn.start;
304 printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n", 303 printk(KERN_DEBUG "bus: %pR on node %x link %x\n",
305 info->bus_min, info->bus_max, info->node, info->link); 304 &info->busn, info->node, info->link);
306 list_for_each_entry(root_res, &info->resources, list) 305 list_for_each_entry(root_res, &info->resources, list)
307 printk(KERN_DEBUG "bus: %02x %pR\n", 306 printk(KERN_DEBUG "bus: %02x %pR\n",
308 busnum, &root_res->res); 307 busnum, &root_res->res);
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 306579f7d0fd..d37e2fec97e5 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -14,7 +14,7 @@ static struct pci_root_info *x86_find_pci_root_info(int bus)
14 return NULL; 14 return NULL;
15 15
16 list_for_each_entry(info, &pci_root_infos, list) 16 list_for_each_entry(info, &pci_root_infos, list)
17 if (info->bus_min == bus) 17 if (info->busn.start == bus)
18 return info; 18 return info;
19 19
20 return NULL; 20 return NULL;
@@ -24,6 +24,8 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
24{ 24{
25 struct pci_root_info *info = x86_find_pci_root_info(bus); 25 struct pci_root_info *info = x86_find_pci_root_info(bus);
26 struct pci_root_res *root_res; 26 struct pci_root_res *root_res;
27 struct pci_host_bridge_window *window;
28 bool found = false;
27 29
28 if (!info) 30 if (!info)
29 goto default_resources; 31 goto default_resources;
@@ -31,6 +33,16 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
31 printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n", 33 printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",
32 bus); 34 bus);
33 35
36 /* already added by acpi ? */
37 list_for_each_entry(window, resources, list)
38 if (window->res->flags & IORESOURCE_BUS) {
39 found = true;
40 break;
41 }
42
43 if (!found)
44 pci_add_resource(resources, &info->busn);
45
34 list_for_each_entry(root_res, &info->resources, list) { 46 list_for_each_entry(root_res, &info->resources, list) {
35 struct resource *res; 47 struct resource *res;
36 struct resource *root; 48 struct resource *root;
@@ -66,9 +78,13 @@ struct pci_root_info __init *alloc_pci_root_info(int bus_min, int bus_max,
66 if (!info) 78 if (!info)
67 return info; 79 return info;
68 80
81 sprintf(info->name, "PCI Bus #%02x", bus_min);
82
69 INIT_LIST_HEAD(&info->resources); 83 INIT_LIST_HEAD(&info->resources);
70 info->bus_min = bus_min; 84 info->busn.name = info->name;
71 info->bus_max = bus_max; 85 info->busn.start = bus_min;
86 info->busn.end = bus_max;
87 info->busn.flags = IORESOURCE_BUS;
72 info->node = node; 88 info->node = node;
73 info->link = link; 89 info->link = link;
74 90
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
index 226a466b2b2b..ff8f65b04574 100644
--- a/arch/x86/pci/bus_numa.h
+++ b/arch/x86/pci/bus_numa.h
@@ -13,8 +13,7 @@ struct pci_root_info {
13 struct list_head list; 13 struct list_head list;
14 char name[12]; 14 char name[12];
15 struct list_head resources; 15 struct list_head resources;
16 int bus_min; 16 struct resource busn;
17 int bus_max;
18 int node; 17 int node;
19 int link; 18 int link;
20}; 19};
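bus_numa now stores the bus range as a struct resource flagged IORESOURCE_BUS instead of separate bus_min/bus_max integers, so the range prints via %pR and can sit on the host bridge's resource list next to the MEM/IO windows. A simplified model of that representation; the struct layout and flag value below are illustrative, not the kernel's:

#include <stdio.h>

#define IORESOURCE_BUS 0x00001000       /* illustrative flag value */

struct res_sketch {
        const char   *name;
        unsigned long start, end, flags;
};

int main(void)
{
        struct res_sketch busn = {
                .name  = "PCI Bus #00",
                .start = 0x00,
                .end   = 0x3f,
                .flags = IORESOURCE_BUS,
        };

        /* stands in for printk("bus: %pR on node ...", &info->busn) */
        printf("bus: [bus %02lx-%02lx] (%s)\n", busn.start, busn.end, busn.name);
        return 0;
}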
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 0ad990a20d4a..720e973fc34a 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -494,7 +494,7 @@ int __init pcibios_init(void)
494 return 0; 494 return 0;
495} 495}
496 496
497char * __devinit pcibios_setup(char *str) 497char * __init pcibios_setup(char *str)
498{ 498{
499 if (!strcmp(str, "off")) { 499 if (!strcmp(str, "off")) {
500 pci_probe = 0; 500 pci_probe = 0;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 301e325992f6..937bcece7006 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -17,6 +17,8 @@
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/dmi.h> 18#include <linux/dmi.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/mutex.h>
21#include <linux/rculist.h>
20#include <asm/e820.h> 22#include <asm/e820.h>
21#include <asm/pci_x86.h> 23#include <asm/pci_x86.h>
22#include <asm/acpi.h> 24#include <asm/acpi.h>
@@ -24,7 +26,9 @@
24#define PREFIX "PCI: " 26#define PREFIX "PCI: "
25 27
26/* Indicate if the mmcfg resources have been placed into the resource table. */ 28/* Indicate if the mmcfg resources have been placed into the resource table. */
27static int __initdata pci_mmcfg_resources_inserted; 29static bool pci_mmcfg_running_state;
30static bool pci_mmcfg_arch_init_failed;
31static DEFINE_MUTEX(pci_mmcfg_lock);
28 32
29LIST_HEAD(pci_mmcfg_list); 33LIST_HEAD(pci_mmcfg_list);
30 34
@@ -45,24 +49,25 @@ static __init void free_all_mmcfg(void)
45 pci_mmconfig_remove(cfg); 49 pci_mmconfig_remove(cfg);
46} 50}
47 51
48static __init void list_add_sorted(struct pci_mmcfg_region *new) 52static __devinit void list_add_sorted(struct pci_mmcfg_region *new)
49{ 53{
50 struct pci_mmcfg_region *cfg; 54 struct pci_mmcfg_region *cfg;
51 55
52 /* keep list sorted by segment and starting bus number */ 56 /* keep list sorted by segment and starting bus number */
53 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 57 list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {
54 if (cfg->segment > new->segment || 58 if (cfg->segment > new->segment ||
55 (cfg->segment == new->segment && 59 (cfg->segment == new->segment &&
56 cfg->start_bus >= new->start_bus)) { 60 cfg->start_bus >= new->start_bus)) {
57 list_add_tail(&new->list, &cfg->list); 61 list_add_tail_rcu(&new->list, &cfg->list);
58 return; 62 return;
59 } 63 }
60 } 64 }
61 list_add_tail(&new->list, &pci_mmcfg_list); 65 list_add_tail_rcu(&new->list, &pci_mmcfg_list);
62} 66}
63 67
64static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, 68static __devinit struct pci_mmcfg_region *pci_mmconfig_alloc(int segment,
65 int end, u64 addr) 69 int start,
70 int end, u64 addr)
66{ 71{
67 struct pci_mmcfg_region *new; 72 struct pci_mmcfg_region *new;
68 struct resource *res; 73 struct resource *res;
@@ -79,8 +84,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
79 new->start_bus = start; 84 new->start_bus = start;
80 new->end_bus = end; 85 new->end_bus = end;
81 86
82 list_add_sorted(new);
83
84 res = &new->res; 87 res = &new->res;
85 res->start = addr + PCI_MMCFG_BUS_OFFSET(start); 88 res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
86 res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1; 89 res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
@@ -89,9 +92,25 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
89 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); 92 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
90 res->name = new->name; 93 res->name = new->name;
91 94
92 printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at " 95 return new;
93 "%pR (base %#lx)\n", segment, start, end, &new->res, 96}
94 (unsigned long) addr); 97
98static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
99 int end, u64 addr)
100{
101 struct pci_mmcfg_region *new;
102
103 new = pci_mmconfig_alloc(segment, start, end, addr);
104 if (new) {
105 mutex_lock(&pci_mmcfg_lock);
106 list_add_sorted(new);
107 mutex_unlock(&pci_mmcfg_lock);
108
109 pr_info(PREFIX
110 "MMCONFIG for domain %04x [bus %02x-%02x] at %pR "
111 "(base %#lx)\n",
112 segment, start, end, &new->res, (unsigned long)addr);
113 }
95 114
96 return new; 115 return new;
97} 116}
@@ -100,7 +119,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
100{ 119{
101 struct pci_mmcfg_region *cfg; 120 struct pci_mmcfg_region *cfg;
102 121
103 list_for_each_entry(cfg, &pci_mmcfg_list, list) 122 list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
104 if (cfg->segment == segment && 123 if (cfg->segment == segment &&
105 cfg->start_bus <= bus && bus <= cfg->end_bus) 124 cfg->start_bus <= bus && bus <= cfg->end_bus)
106 return cfg; 125 return cfg;
@@ -343,8 +362,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
343 name = pci_mmcfg_probes[i].probe(); 362 name = pci_mmcfg_probes[i].probe();
344 363
345 if (name) 364 if (name)
346 printk(KERN_INFO PREFIX "%s with MMCONFIG support\n", 365 pr_info(PREFIX "%s with MMCONFIG support\n", name);
347 name);
348 } 366 }
349 367
350 /* some end_bus_number is crazy, fix it */ 368 /* some end_bus_number is crazy, fix it */
@@ -353,19 +371,8 @@ static int __init pci_mmcfg_check_hostbridge(void)
353 return !list_empty(&pci_mmcfg_list); 371 return !list_empty(&pci_mmcfg_list);
354} 372}
355 373
356static void __init pci_mmcfg_insert_resources(void) 374static acpi_status __devinit check_mcfg_resource(struct acpi_resource *res,
357{ 375 void *data)
358 struct pci_mmcfg_region *cfg;
359
360 list_for_each_entry(cfg, &pci_mmcfg_list, list)
361 insert_resource(&iomem_resource, &cfg->res);
362
363 /* Mark that the resources have been inserted. */
364 pci_mmcfg_resources_inserted = 1;
365}
366
367static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
368 void *data)
369{ 376{
370 struct resource *mcfg_res = data; 377 struct resource *mcfg_res = data;
371 struct acpi_resource_address64 address; 378 struct acpi_resource_address64 address;
@@ -401,8 +408,8 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
401 return AE_OK; 408 return AE_OK;
402} 409}
403 410
404static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl, 411static acpi_status __devinit find_mboard_resource(acpi_handle handle, u32 lvl,
405 void *context, void **rv) 412 void *context, void **rv)
406{ 413{
407 struct resource *mcfg_res = context; 414 struct resource *mcfg_res = context;
408 415
@@ -415,7 +422,7 @@ static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl,
415 return AE_OK; 422 return AE_OK;
416} 423}
417 424
418static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) 425static int __devinit is_acpi_reserved(u64 start, u64 end, unsigned not_used)
419{ 426{
420 struct resource mcfg_res; 427 struct resource mcfg_res;
421 428
@@ -434,13 +441,15 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
434 441
435typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); 442typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
436 443
437static int __init is_mmconf_reserved(check_reserved_t is_reserved, 444static int __ref is_mmconf_reserved(check_reserved_t is_reserved,
438 struct pci_mmcfg_region *cfg, int with_e820) 445 struct pci_mmcfg_region *cfg,
446 struct device *dev, int with_e820)
439{ 447{
440 u64 addr = cfg->res.start; 448 u64 addr = cfg->res.start;
441 u64 size = resource_size(&cfg->res); 449 u64 size = resource_size(&cfg->res);
442 u64 old_size = size; 450 u64 old_size = size;
443 int valid = 0, num_buses; 451 int num_buses;
452 char *method = with_e820 ? "E820" : "ACPI motherboard resources";
444 453
445 while (!is_reserved(addr, addr + size, E820_RESERVED)) { 454 while (!is_reserved(addr, addr + size, E820_RESERVED)) {
446 size >>= 1; 455 size >>= 1;
@@ -448,30 +457,76 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
448 break; 457 break;
449 } 458 }
450 459
451 if (size >= (16UL<<20) || size == old_size) { 460 if (size < (16UL<<20) && size != old_size)
452 printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n", 461 return 0;
453 &cfg->res, 462
454 with_e820 ? "E820" : "ACPI motherboard resources"); 463 if (dev)
455 valid = 1; 464 dev_info(dev, "MMCONFIG at %pR reserved in %s\n",
456 465 &cfg->res, method);
457 if (old_size != size) { 466 else
458 /* update end_bus */ 467 pr_info(PREFIX "MMCONFIG at %pR reserved in %s\n",
459 cfg->end_bus = cfg->start_bus + ((size>>20) - 1); 468 &cfg->res, method);
460 num_buses = cfg->end_bus - cfg->start_bus + 1; 469
461 cfg->res.end = cfg->res.start + 470 if (old_size != size) {
462 PCI_MMCFG_BUS_OFFSET(num_buses) - 1; 471 /* update end_bus */
463 snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN, 472 cfg->end_bus = cfg->start_bus + ((size>>20) - 1);
464 "PCI MMCONFIG %04x [bus %02x-%02x]", 473 num_buses = cfg->end_bus - cfg->start_bus + 1;
465 cfg->segment, cfg->start_bus, cfg->end_bus); 474 cfg->res.end = cfg->res.start +
466 printk(KERN_INFO PREFIX 475 PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
467 "MMCONFIG for %04x [bus%02x-%02x] " 476 snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN,
468 "at %pR (base %#lx) (size reduced!)\n", 477 "PCI MMCONFIG %04x [bus %02x-%02x]",
469 cfg->segment, cfg->start_bus, cfg->end_bus, 478 cfg->segment, cfg->start_bus, cfg->end_bus);
470 &cfg->res, (unsigned long) cfg->address); 479
471 } 480 if (dev)
481 dev_info(dev,
482 "MMCONFIG "
483 "at %pR (base %#lx) (size reduced!)\n",
484 &cfg->res, (unsigned long) cfg->address);
485 else
486 pr_info(PREFIX
487 "MMCONFIG for %04x [bus%02x-%02x] "
488 "at %pR (base %#lx) (size reduced!)\n",
489 cfg->segment, cfg->start_bus, cfg->end_bus,
490 &cfg->res, (unsigned long) cfg->address);
472 } 491 }
473 492
474 return valid; 493 return 1;
494}
495
496static int __ref pci_mmcfg_check_reserved(struct device *dev,
497 struct pci_mmcfg_region *cfg, int early)
498{
499 if (!early && !acpi_disabled) {
500 if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0))
501 return 1;
502
503 if (dev)
504 dev_info(dev, FW_INFO
505 "MMCONFIG at %pR not reserved in "
506 "ACPI motherboard resources\n",
507 &cfg->res);
508 else
509 pr_info(FW_INFO PREFIX
510 "MMCONFIG at %pR not reserved in "
511 "ACPI motherboard resources\n",
512 &cfg->res);
513 }
514
515 /*
516 * e820_all_mapped() is marked as __init.
517 * All entries from ACPI MCFG table have been checked at boot time.
518 * For MCFG information constructed from hotpluggable host bridge's
519 * _CBA method, just assume it's reserved.
520 */
521 if (pci_mmcfg_running_state)
522 return 1;
523
524 /* Don't try to do this check unless configuration
525 type 1 is available. how about type 2 ?*/
526 if (raw_pci_ops)
527 return is_mmconf_reserved(e820_all_mapped, cfg, dev, 1);
528
529 return 0;
475} 530}
476 531
477static void __init pci_mmcfg_reject_broken(int early) 532static void __init pci_mmcfg_reject_broken(int early)
@@ -479,38 +534,14 @@ static void __init pci_mmcfg_reject_broken(int early)
479 struct pci_mmcfg_region *cfg; 534 struct pci_mmcfg_region *cfg;
480 535
481 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 536 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
482 int valid = 0; 537 if (pci_mmcfg_check_reserved(NULL, cfg, early) == 0) {
483 538 pr_info(PREFIX "not using MMCONFIG\n");
484 if (!early && !acpi_disabled) { 539 free_all_mmcfg();
485 valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0); 540 return;
486
487 if (valid)
488 continue;
489 else
490 printk(KERN_ERR FW_BUG PREFIX
491 "MMCONFIG at %pR not reserved in "
492 "ACPI motherboard resources\n",
493 &cfg->res);
494 } 541 }
495
496 /* Don't try to do this check unless configuration
497 type 1 is available. how about type 2 ?*/
498 if (raw_pci_ops)
499 valid = is_mmconf_reserved(e820_all_mapped, cfg, 1);
500
501 if (!valid)
502 goto reject;
503 } 542 }
504
505 return;
506
507reject:
508 printk(KERN_INFO PREFIX "not using MMCONFIG\n");
509 free_all_mmcfg();
510} 543}
511 544
512static int __initdata known_bridge;
513
514static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, 545static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
515 struct acpi_mcfg_allocation *cfg) 546 struct acpi_mcfg_allocation *cfg)
516{ 547{
@@ -529,7 +560,7 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
529 return 0; 560 return 0;
530 } 561 }
531 562
532 printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx " 563 pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
533 "is above 4GB, ignored\n", cfg->pci_segment, 564 "is above 4GB, ignored\n", cfg->pci_segment,
534 cfg->start_bus_number, cfg->end_bus_number, cfg->address); 565 cfg->start_bus_number, cfg->end_bus_number, cfg->address);
535 return -EINVAL; 566 return -EINVAL;
@@ -556,7 +587,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
556 i -= sizeof(struct acpi_mcfg_allocation); 587 i -= sizeof(struct acpi_mcfg_allocation);
557 }; 588 };
558 if (entries == 0) { 589 if (entries == 0) {
559 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); 590 pr_err(PREFIX "MMCONFIG has no entries\n");
560 return -ENODEV; 591 return -ENODEV;
561 } 592 }
562 593
@@ -570,8 +601,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
570 601
571 if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number, 602 if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
572 cfg->end_bus_number, cfg->address) == NULL) { 603 cfg->end_bus_number, cfg->address) == NULL) {
573 printk(KERN_WARNING PREFIX 604 pr_warn(PREFIX "no memory for MCFG entries\n");
574 "no memory for MCFG entries\n");
575 free_all_mmcfg(); 605 free_all_mmcfg();
576 return -ENOMEM; 606 return -ENOMEM;
577 } 607 }
@@ -582,28 +612,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
582 612
583static void __init __pci_mmcfg_init(int early) 613static void __init __pci_mmcfg_init(int early)
584{ 614{
585 /* MMCONFIG disabled */
586 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
587 return;
588
589 /* MMCONFIG already enabled */
590 if (!early && !(pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF))
591 return;
592
593 /* for late to exit */
594 if (known_bridge)
595 return;
596
597 if (early) {
598 if (pci_mmcfg_check_hostbridge())
599 known_bridge = 1;
600 }
601
602 if (!known_bridge)
603 acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
604
605 pci_mmcfg_reject_broken(early); 615 pci_mmcfg_reject_broken(early);
606
607 if (list_empty(&pci_mmcfg_list)) 616 if (list_empty(&pci_mmcfg_list))
608 return; 617 return;
609 618
@@ -620,33 +629,48 @@ static void __init __pci_mmcfg_init(int early)
620 if (pci_mmcfg_arch_init()) 629 if (pci_mmcfg_arch_init())
621 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 630 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
622 else { 631 else {
623 /* 632 free_all_mmcfg();
624 * Signal not to attempt to insert mmcfg resources because 633 pci_mmcfg_arch_init_failed = true;
625 * the architecture mmcfg setup could not initialize.
626 */
627 pci_mmcfg_resources_inserted = 1;
628 } 634 }
629} 635}
630 636
637static int __initdata known_bridge;
638
631void __init pci_mmcfg_early_init(void) 639void __init pci_mmcfg_early_init(void)
632{ 640{
633 __pci_mmcfg_init(1); 641 if (pci_probe & PCI_PROBE_MMCONF) {
642 if (pci_mmcfg_check_hostbridge())
643 known_bridge = 1;
644 else
645 acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
646 __pci_mmcfg_init(1);
647 }
634} 648}
635 649
636void __init pci_mmcfg_late_init(void) 650void __init pci_mmcfg_late_init(void)
637{ 651{
638 __pci_mmcfg_init(0); 652 /* MMCONFIG disabled */
653 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
654 return;
655
656 if (known_bridge)
657 return;
658
659 /* MMCONFIG hasn't been enabled yet, try again */
660 if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) {
661 acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
662 __pci_mmcfg_init(0);
663 }
639} 664}
640 665
641static int __init pci_mmcfg_late_insert_resources(void) 666static int __init pci_mmcfg_late_insert_resources(void)
642{ 667{
643 /* 668 struct pci_mmcfg_region *cfg;
644 * If resources are already inserted or we are not using MMCONFIG, 669
645 * don't insert the resources. 670 pci_mmcfg_running_state = true;
646 */ 671
647 if ((pci_mmcfg_resources_inserted == 1) || 672 /* If we are not using MMCONFIG, don't insert the resources. */
648 (pci_probe & PCI_PROBE_MMCONF) == 0 || 673 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
649 list_empty(&pci_mmcfg_list))
650 return 1; 674 return 1;
651 675
652 /* 676 /*
@@ -654,7 +678,9 @@ static int __init pci_mmcfg_late_insert_resources(void)
654 * marked so it won't cause request errors when __request_region is 678 * marked so it won't cause request errors when __request_region is
655 * called. 679 * called.
656 */ 680 */
657 pci_mmcfg_insert_resources(); 681 list_for_each_entry(cfg, &pci_mmcfg_list, list)
682 if (!cfg->res.parent)
683 insert_resource(&iomem_resource, &cfg->res);
658 684
659 return 0; 685 return 0;
660} 686}
@@ -665,3 +691,101 @@ static int __init pci_mmcfg_late_insert_resources(void)
665 * with other system resources. 691 * with other system resources.
666 */ 692 */
667late_initcall(pci_mmcfg_late_insert_resources); 693late_initcall(pci_mmcfg_late_insert_resources);
694
695/* Add MMCFG information for host bridges */
696int __devinit pci_mmconfig_insert(struct device *dev,
697 u16 seg, u8 start, u8 end,
698 phys_addr_t addr)
699{
700 int rc;
701 struct resource *tmp = NULL;
702 struct pci_mmcfg_region *cfg;
703
704 if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed)
705 return -ENODEV;
706
707 if (start > end)
708 return -EINVAL;
709
710 mutex_lock(&pci_mmcfg_lock);
711 cfg = pci_mmconfig_lookup(seg, start);
712 if (cfg) {
713 if (cfg->end_bus < end)
714 dev_info(dev, FW_INFO
715 "MMCONFIG for "
716 "domain %04x [bus %02x-%02x] "
717 "only partially covers this bridge\n",
718 cfg->segment, cfg->start_bus, cfg->end_bus);
719 mutex_unlock(&pci_mmcfg_lock);
720 return -EEXIST;
721 }
722
723 if (!addr) {
724 mutex_unlock(&pci_mmcfg_lock);
725 return -EINVAL;
726 }
727
728 rc = -EBUSY;
729 cfg = pci_mmconfig_alloc(seg, start, end, addr);
730 if (cfg == NULL) {
731 dev_warn(dev, "fail to add MMCONFIG (out of memory)\n");
732 rc = -ENOMEM;
733 } else if (!pci_mmcfg_check_reserved(dev, cfg, 0)) {
734 dev_warn(dev, FW_BUG "MMCONFIG %pR isn't reserved\n",
735 &cfg->res);
736 } else {
737 /* Insert resource if it's not in boot stage */
738 if (pci_mmcfg_running_state)
739 tmp = insert_resource_conflict(&iomem_resource,
740 &cfg->res);
741
742 if (tmp) {
743 dev_warn(dev,
744 "MMCONFIG %pR conflicts with "
745 "%s %pR\n",
746 &cfg->res, tmp->name, tmp);
747 } else if (pci_mmcfg_arch_map(cfg)) {
748 dev_warn(dev, "fail to map MMCONFIG %pR.\n",
749 &cfg->res);
750 } else {
751 list_add_sorted(cfg);
752 dev_info(dev, "MMCONFIG at %pR (base %#lx)\n",
753 &cfg->res, (unsigned long)addr);
754 cfg = NULL;
755 rc = 0;
756 }
757 }
758
759 if (cfg) {
760 if (cfg->res.parent)
761 release_resource(&cfg->res);
762 kfree(cfg);
763 }
764
765 mutex_unlock(&pci_mmcfg_lock);
766
767 return rc;
768}
769
770/* Delete MMCFG information for host bridges */
771int pci_mmconfig_delete(u16 seg, u8 start, u8 end)
772{
773 struct pci_mmcfg_region *cfg;
774
775 mutex_lock(&pci_mmcfg_lock);
776 list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
777 if (cfg->segment == seg && cfg->start_bus == start &&
778 cfg->end_bus == end) {
779 list_del_rcu(&cfg->list);
780 synchronize_rcu();
781 pci_mmcfg_arch_unmap(cfg);
782 if (cfg->res.parent)
783 release_resource(&cfg->res);
784 mutex_unlock(&pci_mmcfg_lock);
785 kfree(cfg);
786 return 0;
787 }
788 mutex_unlock(&pci_mmcfg_lock);
789
790 return -ENOENT;
791}
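The two functions above give host-bridge hot-plug a run-time way to register and tear down MMCONFIG regions, serialized by pci_mmcfg_lock and made safe against in-flight config accesses by the RCU list handling. A minimal sketch of how a hot-add/remove path might drive them, using only the signatures visible in this hunk; the wrapper functions and their error policy are illustrative, not the actual ACPI caller.

/*
 * Illustrative only: routing a hot-added bridge's ECAM window through
 * the helpers introduced above.
 */
static int example_setup_bridge_ecam(struct device *dev, u16 seg,
                                     u8 bus_start, u8 bus_end,
                                     phys_addr_t ecam_addr)
{
        int rc;

        /* Checks the firmware reservation, maps the window and
         * publishes it on the RCU-protected pci_mmcfg_list. */
        rc = pci_mmconfig_insert(dev, seg, bus_start, bus_end, ecam_addr);
        if (rc == -EEXIST)
                return 0;       /* a region already covers the start bus */
        return rc;
}

static void example_remove_bridge_ecam(u16 seg, u8 bus_start, u8 bus_end)
{
        /* Waits for readers (synchronize_rcu) before unmapping/freeing. */
        pci_mmconfig_delete(seg, bus_start, bus_end);
}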
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 5372e86834c0..db63ac23e3d9 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/pci.h> 12#include <linux/pci.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/rcupdate.h>
14#include <asm/e820.h> 15#include <asm/e820.h>
15#include <asm/pci_x86.h> 16#include <asm/pci_x86.h>
16#include <acpi/acpi.h> 17#include <acpi/acpi.h>
@@ -60,9 +61,12 @@ err: *value = -1;
60 return -EINVAL; 61 return -EINVAL;
61 } 62 }
62 63
64 rcu_read_lock();
63 base = get_base_addr(seg, bus, devfn); 65 base = get_base_addr(seg, bus, devfn);
64 if (!base) 66 if (!base) {
67 rcu_read_unlock();
65 goto err; 68 goto err;
69 }
66 70
67 raw_spin_lock_irqsave(&pci_config_lock, flags); 71 raw_spin_lock_irqsave(&pci_config_lock, flags);
68 72
@@ -80,6 +84,7 @@ err: *value = -1;
80 break; 84 break;
81 } 85 }
82 raw_spin_unlock_irqrestore(&pci_config_lock, flags); 86 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
87 rcu_read_unlock();
83 88
84 return 0; 89 return 0;
85} 90}
@@ -93,9 +98,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
93 if ((bus > 255) || (devfn > 255) || (reg > 4095)) 98 if ((bus > 255) || (devfn > 255) || (reg > 4095))
94 return -EINVAL; 99 return -EINVAL;
95 100
101 rcu_read_lock();
96 base = get_base_addr(seg, bus, devfn); 102 base = get_base_addr(seg, bus, devfn);
97 if (!base) 103 if (!base) {
104 rcu_read_unlock();
98 return -EINVAL; 105 return -EINVAL;
106 }
99 107
100 raw_spin_lock_irqsave(&pci_config_lock, flags); 108 raw_spin_lock_irqsave(&pci_config_lock, flags);
101 109
@@ -113,11 +121,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
113 break; 121 break;
114 } 122 }
115 raw_spin_unlock_irqrestore(&pci_config_lock, flags); 123 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
124 rcu_read_unlock();
116 125
117 return 0; 126 return 0;
118} 127}
119 128
120static const struct pci_raw_ops pci_mmcfg = { 129const struct pci_raw_ops pci_mmcfg = {
121 .read = pci_mmcfg_read, 130 .read = pci_mmcfg_read,
122 .write = pci_mmcfg_write, 131 .write = pci_mmcfg_write,
123}; 132};
@@ -132,3 +141,18 @@ int __init pci_mmcfg_arch_init(void)
132void __init pci_mmcfg_arch_free(void) 141void __init pci_mmcfg_arch_free(void)
133{ 142{
134} 143}
144
145int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg)
146{
147 return 0;
148}
149
150void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg)
151{
152 unsigned long flags;
153
154 /* Invalidate the cached mmcfg map entry. */
155 raw_spin_lock_irqsave(&pci_config_lock, flags);
156 mmcfg_last_accessed_device = 0;
157 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
158}
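The rcu_read_lock()/rcu_read_unlock() pairs added above go with the run-time delete path: pci_mmconfig_delete() frees a region only after synchronize_rcu(), so a reader that stays inside the RCU read-side section for both the lookup and the access cannot see the entry disappear underneath it. A condensed sketch of that read-side pattern, with pci_mmconfig_lookup() from the shared code standing in for get_base_addr(); the wrapper itself is illustrative.

/* Sketch of the RCU read-side pattern used by the accessors above. */
static u64 example_region_base(int seg, int bus)
{
        struct pci_mmcfg_region *cfg;
        u64 addr = 0;

        rcu_read_lock();                /* entry cannot be freed while held */
        cfg = pci_mmconfig_lookup(seg, bus);
        if (cfg)
                addr = cfg->address;    /* safe to read under RCU */
        rcu_read_unlock();              /* after this, deletion may free cfg */

        return addr;
}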
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 915a493502cb..d4ebd07c306d 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -9,6 +9,7 @@
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/acpi.h> 10#include <linux/acpi.h>
11#include <linux/bitmap.h> 11#include <linux/bitmap.h>
12#include <linux/rcupdate.h>
12#include <asm/e820.h> 13#include <asm/e820.h>
13#include <asm/pci_x86.h> 14#include <asm/pci_x86.h>
14 15
@@ -34,9 +35,12 @@ err: *value = -1;
34 return -EINVAL; 35 return -EINVAL;
35 } 36 }
36 37
38 rcu_read_lock();
37 addr = pci_dev_base(seg, bus, devfn); 39 addr = pci_dev_base(seg, bus, devfn);
38 if (!addr) 40 if (!addr) {
41 rcu_read_unlock();
39 goto err; 42 goto err;
43 }
40 44
41 switch (len) { 45 switch (len) {
42 case 1: 46 case 1:
@@ -49,6 +53,7 @@ err: *value = -1;
49 *value = mmio_config_readl(addr + reg); 53 *value = mmio_config_readl(addr + reg);
50 break; 54 break;
51 } 55 }
56 rcu_read_unlock();
52 57
53 return 0; 58 return 0;
54} 59}
@@ -62,9 +67,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
62 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) 67 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
63 return -EINVAL; 68 return -EINVAL;
64 69
70 rcu_read_lock();
65 addr = pci_dev_base(seg, bus, devfn); 71 addr = pci_dev_base(seg, bus, devfn);
66 if (!addr) 72 if (!addr) {
73 rcu_read_unlock();
67 return -EINVAL; 74 return -EINVAL;
75 }
68 76
69 switch (len) { 77 switch (len) {
70 case 1: 78 case 1:
@@ -77,16 +85,17 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
77 mmio_config_writel(addr + reg, value); 85 mmio_config_writel(addr + reg, value);
78 break; 86 break;
79 } 87 }
88 rcu_read_unlock();
80 89
81 return 0; 90 return 0;
82} 91}
83 92
84static const struct pci_raw_ops pci_mmcfg = { 93const struct pci_raw_ops pci_mmcfg = {
85 .read = pci_mmcfg_read, 94 .read = pci_mmcfg_read,
86 .write = pci_mmcfg_write, 95 .write = pci_mmcfg_write,
87}; 96};
88 97
89static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg) 98static void __iomem * __devinit mcfg_ioremap(struct pci_mmcfg_region *cfg)
90{ 99{
91 void __iomem *addr; 100 void __iomem *addr;
92 u64 start, size; 101 u64 start, size;
@@ -105,16 +114,14 @@ int __init pci_mmcfg_arch_init(void)
105{ 114{
106 struct pci_mmcfg_region *cfg; 115 struct pci_mmcfg_region *cfg;
107 116
108 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 117 list_for_each_entry(cfg, &pci_mmcfg_list, list)
109 cfg->virt = mcfg_ioremap(cfg); 118 if (pci_mmcfg_arch_map(cfg)) {
110 if (!cfg->virt) {
111 printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n",
112 &cfg->res);
113 pci_mmcfg_arch_free(); 119 pci_mmcfg_arch_free();
114 return 0; 120 return 0;
115 } 121 }
116 } 122
117 raw_pci_ext_ops = &pci_mmcfg; 123 raw_pci_ext_ops = &pci_mmcfg;
124
118 return 1; 125 return 1;
119} 126}
120 127
@@ -122,10 +129,25 @@ void __init pci_mmcfg_arch_free(void)
122{ 129{
123 struct pci_mmcfg_region *cfg; 130 struct pci_mmcfg_region *cfg;
124 131
125 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 132 list_for_each_entry(cfg, &pci_mmcfg_list, list)
126 if (cfg->virt) { 133 pci_mmcfg_arch_unmap(cfg);
127 iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus)); 134}
128 cfg->virt = NULL; 135
129 } 136int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg)
137{
138 cfg->virt = mcfg_ioremap(cfg);
139 if (!cfg->virt) {
140 pr_err(PREFIX "can't map MMCONFIG at %pR\n", &cfg->res);
141 return -ENOMEM;
142 }
143
144 return 0;
145}
146
147void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg)
148{
149 if (cfg && cfg->virt) {
150 iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
151 cfg->virt = NULL;
130 } 152 }
131} 153}
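pci_mmcfg_arch_init() and pci_mmcfg_arch_free() are reduced to loops over the new per-region pci_mmcfg_arch_map()/pci_mmcfg_arch_unmap() pair, so a region added after boot can be mapped on its own rather than only at init time. A small sketch of one region's lifecycle under that split on the 64-bit side; the standalone function is illustrative and assumes the region came from pci_mmconfig_alloc() in the shared code.

/* Sketch: lifecycle of a single region with the per-region helpers above. */
static int example_region_lifecycle(struct pci_mmcfg_region *cfg)
{
        if (pci_mmcfg_arch_map(cfg))    /* ioremaps cfg->res, sets cfg->virt */
                return -ENOMEM;

        /* ... config space for cfg->start_bus..cfg->end_bus usable here ... */

        pci_mmcfg_arch_unmap(cfg);      /* iounmap()s and clears cfg->virt */
        return 0;
}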
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 140942f66b31..e14a2ff708b5 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -264,7 +264,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup);
264 264
265static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev) 265static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev)
266{ 266{
267 pci_set_power_state(dev, PCI_D3cold); 267 pci_set_power_state(dev, PCI_D3hot);
268} 268}
269DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev); 269DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev);
270DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev); 270DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 92660edaa1e7..2dc29f51e75a 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -234,22 +234,7 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
234 return status; 234 return status;
235} 235}
236 236
237static efi_status_t __init phys_efi_get_time(efi_time_t *tm, 237static int efi_set_rtc_mmss(unsigned long nowtime)
238 efi_time_cap_t *tc)
239{
240 unsigned long flags;
241 efi_status_t status;
242
243 spin_lock_irqsave(&rtc_lock, flags);
244 efi_call_phys_prelog();
245 status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm),
246 virt_to_phys(tc));
247 efi_call_phys_epilog();
248 spin_unlock_irqrestore(&rtc_lock, flags);
249 return status;
250}
251
252int efi_set_rtc_mmss(unsigned long nowtime)
253{ 238{
254 int real_seconds, real_minutes; 239 int real_seconds, real_minutes;
255 efi_status_t status; 240 efi_status_t status;
@@ -278,7 +263,7 @@ int efi_set_rtc_mmss(unsigned long nowtime)
278 return 0; 263 return 0;
279} 264}
280 265
281unsigned long efi_get_time(void) 266static unsigned long efi_get_time(void)
282{ 267{
283 efi_status_t status; 268 efi_status_t status;
284 efi_time_t eft; 269 efi_time_t eft;
@@ -621,18 +606,13 @@ static int __init efi_runtime_init(void)
621 } 606 }
622 /* 607 /*
623 * We will only need *early* access to the following 608 * We will only need *early* access to the following
624 * two EFI runtime services before set_virtual_address_map 609 * EFI runtime service before set_virtual_address_map
625 * is invoked. 610 * is invoked.
626 */ 611 */
627 efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
628 efi_phys.set_virtual_address_map = 612 efi_phys.set_virtual_address_map =
629 (efi_set_virtual_address_map_t *) 613 (efi_set_virtual_address_map_t *)
630 runtime->set_virtual_address_map; 614 runtime->set_virtual_address_map;
631 /* 615
632 * Make efi_get_time can be called before entering
633 * virtual mode.
634 */
635 efi.get_time = phys_efi_get_time;
636 early_iounmap(runtime, sizeof(efi_runtime_services_t)); 616 early_iounmap(runtime, sizeof(efi_runtime_services_t));
637 617
638 return 0; 618 return 0;
@@ -720,12 +700,10 @@ void __init efi_init(void)
720 efi_enabled = 0; 700 efi_enabled = 0;
721 return; 701 return;
722 } 702 }
723#ifdef CONFIG_X86_32
724 if (efi_native) { 703 if (efi_native) {
725 x86_platform.get_wallclock = efi_get_time; 704 x86_platform.get_wallclock = efi_get_time;
726 x86_platform.set_wallclock = efi_set_rtc_mmss; 705 x86_platform.set_wallclock = efi_set_rtc_mmss;
727 } 706 }
728#endif
729 707
730#if EFI_DEBUG 708#if EFI_DEBUG
731 print_efi_memmap(); 709 print_efi_memmap();
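With phys_efi_get_time() removed, efi_get_time() and efi_set_rtc_mmss() become static helpers that are wired up for any native EFI boot rather than only under CONFIG_X86_32, and set_virtual_address_map stays the single runtime service captured for early physical-mode use. A sketch of the wallclock hook shapes this relies on, written as if it lived inside efi.c; the struct excerpt mirrors the x86_platform_ops fields of this era and should be read as an assumption, not a verbatim copy.

/* Sketch of the hook types behind x86_platform.get/set_wallclock. */
struct example_wallclock_hooks {
        unsigned long (*get_wallclock)(void);           /* seconds since epoch */
        int (*set_wallclock)(unsigned long nowtime);    /* 0 on success */
};

static void example_install_efi_wallclock(struct example_wallclock_hooks *h)
{
        /* Same assignments as the hunk above, no longer under #ifdef. */
        h->get_wallclock = efi_get_time;
        h->set_wallclock = efi_set_rtc_mmss;
}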
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 23e5b9d7977b..599be499fdf7 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -203,7 +203,7 @@ static int xo15_sci_remove(struct acpi_device *device, int type)
203 return 0; 203 return 0;
204} 204}
205 205
206static int xo15_sci_resume(struct acpi_device *device) 206static int xo15_sci_resume(struct device *dev)
207{ 207{
208 /* Enable all EC events */ 208 /* Enable all EC events */
209 olpc_ec_mask_write(EC_SCI_SRC_ALL); 209 olpc_ec_mask_write(EC_SCI_SRC_ALL);
@@ -215,6 +215,8 @@ static int xo15_sci_resume(struct acpi_device *device)
215 return 0; 215 return 0;
216} 216}
217 217
218static SIMPLE_DEV_PM_OPS(xo15_sci_pm, NULL, xo15_sci_resume);
219
218static const struct acpi_device_id xo15_sci_device_ids[] = { 220static const struct acpi_device_id xo15_sci_device_ids[] = {
219 {"XO15EC", 0}, 221 {"XO15EC", 0},
220 {"", 0}, 222 {"", 0},
@@ -227,8 +229,8 @@ static struct acpi_driver xo15_sci_drv = {
227 .ops = { 229 .ops = {
228 .add = xo15_sci_add, 230 .add = xo15_sci_add,
229 .remove = xo15_sci_remove, 231 .remove = xo15_sci_remove,
230 .resume = xo15_sci_resume,
231 }, 232 },
233 .drv.pm = &xo15_sci_pm,
232}; 234};
233 235
234static int __init xo15_sci_init(void) 236static int __init xo15_sci_init(void)
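The resume hook moves from the ACPI-specific .ops.resume callback to generic dev_pm_ops: SIMPLE_DEV_PM_OPS() builds a struct dev_pm_ops from a suspend/resume pair (NULL suspend here, since only resume work is needed) and the driver points .drv.pm at it. A sketch of the same conversion for a hypothetical driver that needs both callbacks; every name below is invented for illustration.

/* Hypothetical driver showing the dev_pm_ops conversion used above. */
static int example_dev_suspend(struct device *dev)
{
        /* quiesce the device, mask wake sources, etc. */
        return 0;
}

static int example_dev_resume(struct device *dev)
{
        /* re-arm events, as xo15_sci_resume() does with olpc_ec_mask_write() */
        return 0;
}

static SIMPLE_DEV_PM_OPS(example_dev_pm, example_dev_suspend,
                         example_dev_resume);

static struct acpi_driver example_drv = {
        .name   = "example",
        .ops    = {
                /* no .suspend/.resume here any more */
        },
        .drv.pm = &example_dev_pm,
};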
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 59880afa851f..b8b3a37c80cd 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * SGI UltraViolet TLB flush routines. 2 * SGI UltraViolet TLB flush routines.
3 * 3 *
4 * (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI. 4 * (c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.
5 * 5 *
6 * This code is released under the GNU General Public License version 2 or 6 * This code is released under the GNU General Public License version 2 or
7 * later. 7 * later.
@@ -38,8 +38,7 @@ static int timeout_base_ns[] = {
38 38
39static int timeout_us; 39static int timeout_us;
40static int nobau; 40static int nobau;
41static int baudisabled; 41static int nobau_perm;
42static spinlock_t disable_lock;
43static cycles_t congested_cycles; 42static cycles_t congested_cycles;
44 43
45/* tunables: */ 44/* tunables: */
@@ -47,12 +46,13 @@ static int max_concurr = MAX_BAU_CONCURRENT;
47static int max_concurr_const = MAX_BAU_CONCURRENT; 46static int max_concurr_const = MAX_BAU_CONCURRENT;
48static int plugged_delay = PLUGGED_DELAY; 47static int plugged_delay = PLUGGED_DELAY;
49static int plugsb4reset = PLUGSB4RESET; 48static int plugsb4reset = PLUGSB4RESET;
49static int giveup_limit = GIVEUP_LIMIT;
50static int timeoutsb4reset = TIMEOUTSB4RESET; 50static int timeoutsb4reset = TIMEOUTSB4RESET;
51static int ipi_reset_limit = IPI_RESET_LIMIT; 51static int ipi_reset_limit = IPI_RESET_LIMIT;
52static int complete_threshold = COMPLETE_THRESHOLD; 52static int complete_threshold = COMPLETE_THRESHOLD;
53static int congested_respns_us = CONGESTED_RESPONSE_US; 53static int congested_respns_us = CONGESTED_RESPONSE_US;
54static int congested_reps = CONGESTED_REPS; 54static int congested_reps = CONGESTED_REPS;
55static int congested_period = CONGESTED_PERIOD; 55static int disabled_period = DISABLED_PERIOD;
56 56
57static struct tunables tunables[] = { 57static struct tunables tunables[] = {
58 {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ 58 {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
@@ -63,7 +63,8 @@ static struct tunables tunables[] = {
63 {&complete_threshold, COMPLETE_THRESHOLD}, 63 {&complete_threshold, COMPLETE_THRESHOLD},
64 {&congested_respns_us, CONGESTED_RESPONSE_US}, 64 {&congested_respns_us, CONGESTED_RESPONSE_US},
65 {&congested_reps, CONGESTED_REPS}, 65 {&congested_reps, CONGESTED_REPS},
66 {&congested_period, CONGESTED_PERIOD} 66 {&disabled_period, DISABLED_PERIOD},
67 {&giveup_limit, GIVEUP_LIMIT}
67}; 68};
68 69
69static struct dentry *tunables_dir; 70static struct dentry *tunables_dir;
@@ -120,6 +121,40 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
120static DEFINE_PER_CPU(struct bau_control, bau_control); 121static DEFINE_PER_CPU(struct bau_control, bau_control);
121static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); 122static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
122 123
124static void
125set_bau_on(void)
126{
127 int cpu;
128 struct bau_control *bcp;
129
130 if (nobau_perm) {
131 pr_info("BAU not initialized; cannot be turned on\n");
132 return;
133 }
134 nobau = 0;
135 for_each_present_cpu(cpu) {
136 bcp = &per_cpu(bau_control, cpu);
137 bcp->nobau = 0;
138 }
139 pr_info("BAU turned on\n");
140 return;
141}
142
143static void
144set_bau_off(void)
145{
146 int cpu;
147 struct bau_control *bcp;
148
149 nobau = 1;
150 for_each_present_cpu(cpu) {
151 bcp = &per_cpu(bau_control, cpu);
152 bcp->nobau = 1;
153 }
154 pr_info("BAU turned off\n");
155 return;
156}
157
123/* 158/*
124 * Determine the first node on a uvhub. 'Nodes' are used for kernel 159 * Determine the first node on a uvhub. 'Nodes' are used for kernel
125 * memory allocation. 160 * memory allocation.
@@ -278,7 +313,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
278 * Both sockets dump their completed count total into 313 * Both sockets dump their completed count total into
279 * the message's count. 314 * the message's count.
280 */ 315 */
281 smaster->socket_acknowledge_count[mdp->msg_slot] = 0; 316 *sp = 0;
282 asp = (struct atomic_short *)&msg->acknowledge_count; 317 asp = (struct atomic_short *)&msg->acknowledge_count;
283 msg_ack_count = atom_asr(socket_ack_count, asp); 318 msg_ack_count = atom_asr(socket_ack_count, asp);
284 319
@@ -491,16 +526,15 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
491} 526}
492 527
493/* 528/*
494 * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. 529 * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
530 * But not currently used.
495 */ 531 */
496static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) 532static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
497{ 533{
498 unsigned long descriptor_status; 534 unsigned long descriptor_status;
499 unsigned long descriptor_status2;
500 535
501 descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); 536 descriptor_status =
502 descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; 537 ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
503 descriptor_status = (descriptor_status << 1) | descriptor_status2;
504 return descriptor_status; 538 return descriptor_status;
505} 539}
506 540
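Because the ACTIVATION_STATUS_2 extension bit is no longer consulted (it is also masked off in enable_timeouts() further down), the two hardware status bits are simply shifted left by one so the result still lines up with the 3-bit UV2H_DESC_* values. A small worked sketch of that extraction for the low 32 descriptors; the bit widths are taken as this era's UV_ACT_STATUS_SIZE/UV_ACT_STATUS_MASK definitions.

/*
 * Sketch of the status extraction above for descriptors 0..31 (higher
 * descriptors use ACTIVATION_STATUS_1 with an adjusted shift, as in
 * wait_completion()).
 */
static unsigned long example_uv2_status(unsigned long mmr, int desc)
{
        int rshft = desc * UV_ACT_STATUS_SIZE;  /* 2 status bits per descriptor */

        /* Shifting left by one keeps the value comparable with the
         * 3-bit UV2H_DESC_IDLE/BUSY/... constants, with the unused
         * extended bit always reading as 0. */
        return ((mmr >> rshft) & UV_ACT_STATUS_MASK) << 1;
}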
@@ -531,87 +565,11 @@ int normal_busy(struct bau_control *bcp)
531 */ 565 */
532int handle_uv2_busy(struct bau_control *bcp) 566int handle_uv2_busy(struct bau_control *bcp)
533{ 567{
534 int busy_one = bcp->using_desc;
535 int normal = bcp->uvhub_cpu;
536 int selected = -1;
537 int i;
538 unsigned long descriptor_status;
539 unsigned long status;
540 int mmr_offset;
541 struct bau_desc *bau_desc_old;
542 struct bau_desc *bau_desc_new;
543 struct bau_control *hmaster = bcp->uvhub_master;
544 struct ptc_stats *stat = bcp->statp; 568 struct ptc_stats *stat = bcp->statp;
545 cycles_t ttm;
546 569
547 stat->s_uv2_wars++; 570 stat->s_uv2_wars++;
548 spin_lock(&hmaster->uvhub_lock); 571 bcp->busy = 1;
549 /* try for the original first */ 572 return FLUSH_GIVEUP;
550 if (busy_one != normal) {
551 if (!normal_busy(bcp))
552 selected = normal;
553 }
554 if (selected < 0) {
555 /* can't use the normal, select an alternate */
556 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
557 descriptor_status = read_lmmr(mmr_offset);
558
559 /* scan available descriptors 32-63 */
560 for (i = 0; i < UV_CPUS_PER_AS; i++) {
561 if ((hmaster->inuse_map & (1 << i)) == 0) {
562 status = ((descriptor_status >>
563 (i * UV_ACT_STATUS_SIZE)) &
564 UV_ACT_STATUS_MASK) << 1;
565 if (status != UV2H_DESC_BUSY) {
566 selected = i + UV_CPUS_PER_AS;
567 break;
568 }
569 }
570 }
571 }
572
573 if (busy_one != normal)
574 /* mark the busy alternate as not in-use */
575 hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS));
576
577 if (selected >= 0) {
578 /* switch to the selected descriptor */
579 if (selected != normal) {
580 /* set the selected alternate as in-use */
581 hmaster->inuse_map |=
582 (1 << (selected - UV_CPUS_PER_AS));
583 if (selected > stat->s_uv2_wars_hw)
584 stat->s_uv2_wars_hw = selected;
585 }
586 bau_desc_old = bcp->descriptor_base;
587 bau_desc_old += (ITEMS_PER_DESC * busy_one);
588 bcp->using_desc = selected;
589 bau_desc_new = bcp->descriptor_base;
590 bau_desc_new += (ITEMS_PER_DESC * selected);
591 *bau_desc_new = *bau_desc_old;
592 } else {
593 /*
594 * All are busy. Wait for the normal one for this cpu to
595 * free up.
596 */
597 stat->s_uv2_war_waits++;
598 spin_unlock(&hmaster->uvhub_lock);
599 ttm = get_cycles();
600 do {
601 cpu_relax();
602 } while (normal_busy(bcp));
603 spin_lock(&hmaster->uvhub_lock);
604 /* switch to the original descriptor */
605 bcp->using_desc = normal;
606 bau_desc_old = bcp->descriptor_base;
607 bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc);
608 bcp->using_desc = (ITEMS_PER_DESC * normal);
609 bau_desc_new = bcp->descriptor_base;
610 bau_desc_new += (ITEMS_PER_DESC * normal);
611 *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */
612 }
613 spin_unlock(&hmaster->uvhub_lock);
614 return FLUSH_RETRY_BUSYBUG;
615} 573}
616 574
617static int uv2_wait_completion(struct bau_desc *bau_desc, 575static int uv2_wait_completion(struct bau_desc *bau_desc,
@@ -620,7 +578,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
620{ 578{
621 unsigned long descriptor_stat; 579 unsigned long descriptor_stat;
622 cycles_t ttm; 580 cycles_t ttm;
623 int desc = bcp->using_desc; 581 int desc = bcp->uvhub_cpu;
624 long busy_reps = 0; 582 long busy_reps = 0;
625 struct ptc_stats *stat = bcp->statp; 583 struct ptc_stats *stat = bcp->statp;
626 584
@@ -628,24 +586,38 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
628 586
629 /* spin on the status MMR, waiting for it to go idle */ 587 /* spin on the status MMR, waiting for it to go idle */
630 while (descriptor_stat != UV2H_DESC_IDLE) { 588 while (descriptor_stat != UV2H_DESC_IDLE) {
631 /* 589 if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) {
632 * Our software ack messages may be blocked because 590 /*
633 * there are no swack resources available. As long 591 * A h/w bug on the destination side may
634 * as none of them has timed out hardware will NACK 592 * have prevented the message being marked
635 * our message and its state will stay IDLE. 593 * pending, thus it doesn't get replied to
636 */ 594 * and gets continually nacked until it times
637 if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || 595 * out with a SOURCE_TIMEOUT.
638 (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { 596 */
639 stat->s_stimeout++; 597 stat->s_stimeout++;
640 return FLUSH_GIVEUP; 598 return FLUSH_GIVEUP;
641 } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) {
642 stat->s_strongnacks++;
643 bcp->conseccompletes = 0;
644 return FLUSH_GIVEUP;
645 } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { 599 } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
600 ttm = get_cycles();
601
602 /*
603 * Our retries may be blocked by all destination
604 * swack resources being consumed, and a timeout
605 * pending. In that case hardware returns the
606 * ERROR that looks like a destination timeout.
607 * Without using the extended status we have to
608 * deduce from the short time that this was a
609 * strong nack.
610 */
611 if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
612 bcp->conseccompletes = 0;
613 stat->s_plugged++;
614 /* FLUSH_RETRY_PLUGGED causes hang on boot */
615 return FLUSH_GIVEUP;
616 }
646 stat->s_dtimeout++; 617 stat->s_dtimeout++;
647 bcp->conseccompletes = 0; 618 bcp->conseccompletes = 0;
648 return FLUSH_RETRY_TIMEOUT; 619 /* FLUSH_RETRY_TIMEOUT causes hang on boot */
620 return FLUSH_GIVEUP;
649 } else { 621 } else {
650 busy_reps++; 622 busy_reps++;
651 if (busy_reps > 1000000) { 623 if (busy_reps > 1000000) {
@@ -653,9 +625,8 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
653 busy_reps = 0; 625 busy_reps = 0;
654 ttm = get_cycles(); 626 ttm = get_cycles();
655 if ((ttm - bcp->send_message) > 627 if ((ttm - bcp->send_message) >
656 (bcp->clocks_per_100_usec)) { 628 bcp->timeout_interval)
657 return handle_uv2_busy(bcp); 629 return handle_uv2_busy(bcp);
658 }
659 } 630 }
660 /* 631 /*
661 * descriptor_stat is still BUSY 632 * descriptor_stat is still BUSY
@@ -679,7 +650,7 @@ static int wait_completion(struct bau_desc *bau_desc,
679{ 650{
680 int right_shift; 651 int right_shift;
681 unsigned long mmr_offset; 652 unsigned long mmr_offset;
682 int desc = bcp->using_desc; 653 int desc = bcp->uvhub_cpu;
683 654
684 if (desc < UV_CPUS_PER_AS) { 655 if (desc < UV_CPUS_PER_AS) {
685 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; 656 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -758,33 +729,31 @@ static void destination_timeout(struct bau_desc *bau_desc,
758} 729}
759 730
760/* 731/*
761 * Completions are taking a very long time due to a congested numalink 732 * Stop all cpus on a uvhub from using the BAU for a period of time.
762 * network. 733 * This is reversed by check_enable.
763 */ 734 */
764static void disable_for_congestion(struct bau_control *bcp, 735static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
765 struct ptc_stats *stat)
766{ 736{
767 /* let only one cpu do this disabling */ 737 int tcpu;
768 spin_lock(&disable_lock); 738 struct bau_control *tbcp;
769 739 struct bau_control *hmaster;
770 if (!baudisabled && bcp->period_requests && 740 cycles_t tm1;
771 ((bcp->period_time / bcp->period_requests) > congested_cycles)) { 741
772 int tcpu; 742 hmaster = bcp->uvhub_master;
773 struct bau_control *tbcp; 743 spin_lock(&hmaster->disable_lock);
774 /* it becomes this cpu's job to turn on the use of the 744 if (!bcp->baudisabled) {
775 BAU again */
776 baudisabled = 1;
777 bcp->set_bau_off = 1;
778 bcp->set_bau_on_time = get_cycles();
779 bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
780 stat->s_bau_disabled++; 745 stat->s_bau_disabled++;
746 tm1 = get_cycles();
781 for_each_present_cpu(tcpu) { 747 for_each_present_cpu(tcpu) {
782 tbcp = &per_cpu(bau_control, tcpu); 748 tbcp = &per_cpu(bau_control, tcpu);
783 tbcp->baudisabled = 1; 749 if (tbcp->uvhub_master == hmaster) {
750 tbcp->baudisabled = 1;
751 tbcp->set_bau_on_time =
752 tm1 + bcp->disabled_period;
753 }
784 } 754 }
785 } 755 }
786 756 spin_unlock(&hmaster->disable_lock);
787 spin_unlock(&disable_lock);
788} 757}
789 758
790static void count_max_concurr(int stat, struct bau_control *bcp, 759static void count_max_concurr(int stat, struct bau_control *bcp,
@@ -815,16 +784,30 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
815 bcp->period_requests++; 784 bcp->period_requests++;
816 bcp->period_time += elapsed; 785 bcp->period_time += elapsed;
817 if ((elapsed > congested_cycles) && 786 if ((elapsed > congested_cycles) &&
818 (bcp->period_requests > bcp->cong_reps)) 787 (bcp->period_requests > bcp->cong_reps) &&
819 disable_for_congestion(bcp, stat); 788 ((bcp->period_time / bcp->period_requests) >
789 congested_cycles)) {
790 stat->s_congested++;
791 disable_for_period(bcp, stat);
792 }
820 } 793 }
821 } else 794 } else
822 stat->s_requestor--; 795 stat->s_requestor--;
823 796
824 if (completion_status == FLUSH_COMPLETE && try > 1) 797 if (completion_status == FLUSH_COMPLETE && try > 1)
825 stat->s_retriesok++; 798 stat->s_retriesok++;
826 else if (completion_status == FLUSH_GIVEUP) 799 else if (completion_status == FLUSH_GIVEUP) {
827 stat->s_giveup++; 800 stat->s_giveup++;
801 if (get_cycles() > bcp->period_end)
802 bcp->period_giveups = 0;
803 bcp->period_giveups++;
804 if (bcp->period_giveups == 1)
805 bcp->period_end = get_cycles() + bcp->disabled_period;
806 if (bcp->period_giveups > bcp->giveup_limit) {
807 disable_for_period(bcp, stat);
808 stat->s_giveuplimit++;
809 }
810 }
828} 811}
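The giveup accounting added above opens a per-cpu window on the first FLUSH_GIVEUP and disables the whole uvhub once period_giveups exceeds giveup_limit within that window. A compressed sketch of that throttle on its own; the bau_control fields and disable_for_period() are the ones in this patch, only the standalone function is illustrative.

/* Sketch of the giveup-rate throttle added to record_send_stats(). */
static void example_account_giveup(struct bau_control *bcp,
                                   struct ptc_stats *stat)
{
        stat->s_giveup++;

        if (get_cycles() > bcp->period_end)     /* previous window expired */
                bcp->period_giveups = 0;

        bcp->period_giveups++;
        if (bcp->period_giveups == 1)           /* first giveup opens a window */
                bcp->period_end = get_cycles() + bcp->disabled_period;

        if (bcp->period_giveups > bcp->giveup_limit) {
                disable_for_period(bcp, stat);  /* turn the BAU off hub-wide */
                stat->s_giveuplimit++;
        }
}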
829 812
830/* 813/*
@@ -868,7 +851,8 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
868 * Returns 1 if it gives up entirely and the original cpu mask is to be 851 * Returns 1 if it gives up entirely and the original cpu mask is to be
869 * returned to the kernel. 852 * returned to the kernel.
870 */ 853 */
871int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) 854int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
855 struct bau_desc *bau_desc)
872{ 856{
873 int seq_number = 0; 857 int seq_number = 0;
874 int completion_stat = 0; 858 int completion_stat = 0;
@@ -881,24 +865,23 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
881 struct bau_control *hmaster = bcp->uvhub_master; 865 struct bau_control *hmaster = bcp->uvhub_master;
882 struct uv1_bau_msg_header *uv1_hdr = NULL; 866 struct uv1_bau_msg_header *uv1_hdr = NULL;
883 struct uv2_bau_msg_header *uv2_hdr = NULL; 867 struct uv2_bau_msg_header *uv2_hdr = NULL;
884 struct bau_desc *bau_desc;
885 868
886 if (bcp->uvhub_version == 1) 869 if (bcp->uvhub_version == 1) {
870 uv1 = 1;
887 uv1_throttle(hmaster, stat); 871 uv1_throttle(hmaster, stat);
872 }
888 873
889 while (hmaster->uvhub_quiesce) 874 while (hmaster->uvhub_quiesce)
890 cpu_relax(); 875 cpu_relax();
891 876
892 time1 = get_cycles(); 877 time1 = get_cycles();
878 if (uv1)
879 uv1_hdr = &bau_desc->header.uv1_hdr;
880 else
881 uv2_hdr = &bau_desc->header.uv2_hdr;
882
893 do { 883 do {
894 bau_desc = bcp->descriptor_base; 884 if (try == 0) {
895 bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
896 if (bcp->uvhub_version == 1) {
897 uv1 = 1;
898 uv1_hdr = &bau_desc->header.uv1_hdr;
899 } else
900 uv2_hdr = &bau_desc->header.uv2_hdr;
901 if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) {
902 if (uv1) 885 if (uv1)
903 uv1_hdr->msg_type = MSG_REGULAR; 886 uv1_hdr->msg_type = MSG_REGULAR;
904 else 887 else
@@ -916,25 +899,24 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
916 uv1_hdr->sequence = seq_number; 899 uv1_hdr->sequence = seq_number;
917 else 900 else
918 uv2_hdr->sequence = seq_number; 901 uv2_hdr->sequence = seq_number;
919 index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; 902 index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
920 bcp->send_message = get_cycles(); 903 bcp->send_message = get_cycles();
921 904
922 write_mmr_activation(index); 905 write_mmr_activation(index);
923 906
924 try++; 907 try++;
925 completion_stat = wait_completion(bau_desc, bcp, try); 908 completion_stat = wait_completion(bau_desc, bcp, try);
926 /* UV2: wait_completion() may change the bcp->using_desc */
927 909
928 handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); 910 handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
929 911
930 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { 912 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
931 bcp->ipi_attempts = 0; 913 bcp->ipi_attempts = 0;
914 stat->s_overipilimit++;
932 completion_stat = FLUSH_GIVEUP; 915 completion_stat = FLUSH_GIVEUP;
933 break; 916 break;
934 } 917 }
935 cpu_relax(); 918 cpu_relax();
936 } while ((completion_stat == FLUSH_RETRY_PLUGGED) || 919 } while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
937 (completion_stat == FLUSH_RETRY_BUSYBUG) ||
938 (completion_stat == FLUSH_RETRY_TIMEOUT)); 920 (completion_stat == FLUSH_RETRY_TIMEOUT));
939 921
940 time2 = get_cycles(); 922 time2 = get_cycles();
@@ -955,28 +937,33 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
955} 937}
956 938
957/* 939/*
958 * The BAU is disabled. When the disabled time period has expired, the cpu 940 * The BAU is disabled for this uvhub. When the disabled time period has
959 * that disabled it must re-enable it. 941 * expired re-enable it.
960 * Return 0 if it is re-enabled for all cpus. 942 * Return 0 if it is re-enabled for all cpus on this uvhub.
961 */ 943 */
962static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) 944static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
963{ 945{
964 int tcpu; 946 int tcpu;
965 struct bau_control *tbcp; 947 struct bau_control *tbcp;
948 struct bau_control *hmaster;
966 949
967 if (bcp->set_bau_off) { 950 hmaster = bcp->uvhub_master;
968 if (get_cycles() >= bcp->set_bau_on_time) { 951 spin_lock(&hmaster->disable_lock);
969 stat->s_bau_reenabled++; 952 if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
970 baudisabled = 0; 953 stat->s_bau_reenabled++;
971 for_each_present_cpu(tcpu) { 954 for_each_present_cpu(tcpu) {
972 tbcp = &per_cpu(bau_control, tcpu); 955 tbcp = &per_cpu(bau_control, tcpu);
956 if (tbcp->uvhub_master == hmaster) {
973 tbcp->baudisabled = 0; 957 tbcp->baudisabled = 0;
974 tbcp->period_requests = 0; 958 tbcp->period_requests = 0;
975 tbcp->period_time = 0; 959 tbcp->period_time = 0;
960 tbcp->period_giveups = 0;
976 } 961 }
977 return 0;
978 } 962 }
963 spin_unlock(&hmaster->disable_lock);
964 return 0;
979 } 965 }
966 spin_unlock(&hmaster->disable_lock);
980 return -1; 967 return -1;
981} 968}
982 969
@@ -1068,8 +1055,8 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
1068 * done. The returned pointer is valid till preemption is re-enabled. 1055 * done. The returned pointer is valid till preemption is re-enabled.
1069 */ 1056 */
1070const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 1057const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1071 struct mm_struct *mm, unsigned long va, 1058 struct mm_struct *mm, unsigned long start,
1072 unsigned int cpu) 1059 unsigned end, unsigned int cpu)
1073{ 1060{
1074 int locals = 0; 1061 int locals = 0;
1075 int remotes = 0; 1062 int remotes = 0;
@@ -1078,18 +1065,32 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1078 struct cpumask *flush_mask; 1065 struct cpumask *flush_mask;
1079 struct ptc_stats *stat; 1066 struct ptc_stats *stat;
1080 struct bau_control *bcp; 1067 struct bau_control *bcp;
1081 1068 unsigned long descriptor_status;
1082 /* kernel was booted 'nobau' */ 1069 unsigned long status;
1083 if (nobau)
1084 return cpumask;
1085 1070
1086 bcp = &per_cpu(bau_control, cpu); 1071 bcp = &per_cpu(bau_control, cpu);
1087 stat = bcp->statp; 1072 stat = bcp->statp;
1073 stat->s_enters++;
1074
1075 if (bcp->nobau)
1076 return cpumask;
1077
1078 if (bcp->busy) {
1079 descriptor_status =
1080 read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0);
1081 status = ((descriptor_status >> (bcp->uvhub_cpu *
1082 UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1;
1083 if (status == UV2H_DESC_BUSY)
1084 return cpumask;
1085 bcp->busy = 0;
1086 }
1088 1087
1089 /* bau was disabled due to slow response */ 1088 /* bau was disabled due to slow response */
1090 if (bcp->baudisabled) { 1089 if (bcp->baudisabled) {
1091 if (check_enable(bcp, stat)) 1090 if (check_enable(bcp, stat)) {
1091 stat->s_ipifordisabled++;
1092 return cpumask; 1092 return cpumask;
1093 }
1093 } 1094 }
1094 1095
1095 /* 1096 /*
@@ -1105,38 +1106,40 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1105 stat->s_ntargself++; 1106 stat->s_ntargself++;
1106 1107
1107 bau_desc = bcp->descriptor_base; 1108 bau_desc = bcp->descriptor_base;
1108 bau_desc += (ITEMS_PER_DESC * bcp->using_desc); 1109 bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
1109 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 1110 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
1110 if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) 1111 if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
1111 return NULL; 1112 return NULL;
1112 1113
1113 record_send_statistics(stat, locals, hubs, remotes, bau_desc); 1114 record_send_statistics(stat, locals, hubs, remotes, bau_desc);
1114 1115
1115 bau_desc->payload.address = va; 1116 bau_desc->payload.address = start;
1116 bau_desc->payload.sending_cpu = cpu; 1117 bau_desc->payload.sending_cpu = cpu;
1117 /* 1118 /*
1118 * uv_flush_send_and_wait returns 0 if all cpu's were messaged, 1119 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
1119 * or 1 if it gave up and the original cpumask should be returned. 1120 * or 1 if it gave up and the original cpumask should be returned.
1120 */ 1121 */
1121 if (!uv_flush_send_and_wait(flush_mask, bcp)) 1122 if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc))
1122 return NULL; 1123 return NULL;
1123 else 1124 else
1124 return cpumask; 1125 return cpumask;
1125} 1126}
1126 1127
1127/* 1128/*
1128 * Search the message queue for any 'other' message with the same software 1129 * Search the message queue for any 'other' unprocessed message with the
1129 * acknowledge resource bit vector. 1130 * same software acknowledge resource bit vector as the 'msg' message.
1130 */ 1131 */
1131struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, 1132struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
1132 struct bau_control *bcp, unsigned char swack_vec) 1133 struct bau_control *bcp)
1133{ 1134{
1134 struct bau_pq_entry *msg_next = msg + 1; 1135 struct bau_pq_entry *msg_next = msg + 1;
1136 unsigned char swack_vec = msg->swack_vec;
1135 1137
1136 if (msg_next > bcp->queue_last) 1138 if (msg_next > bcp->queue_last)
1137 msg_next = bcp->queue_first; 1139 msg_next = bcp->queue_first;
1138 while ((msg_next->swack_vec != 0) && (msg_next != msg)) { 1140 while (msg_next != msg) {
1139 if (msg_next->swack_vec == swack_vec) 1141 if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) &&
1142 (msg_next->swack_vec == swack_vec))
1140 return msg_next; 1143 return msg_next;
1141 msg_next++; 1144 msg_next++;
1142 if (msg_next > bcp->queue_last) 1145 if (msg_next > bcp->queue_last)
@@ -1165,32 +1168,30 @@ void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
1165 * This message was assigned a swack resource, but no 1168 * This message was assigned a swack resource, but no
1166 * reserved acknowlegment is pending. 1169 * reserved acknowlegment is pending.
1167 * The bug has prevented this message from setting the MMR. 1170 * The bug has prevented this message from setting the MMR.
1168 * And no other message has used the same sw_ack resource.
1169 * Do the requested shootdown but do not reply to the msg.
1170 * (the 0 means make no acknowledge)
1171 */ 1171 */
1172 bau_process_message(mdp, bcp, 0);
1173 return;
1174 }
1175
1176 /*
1177 * Some message has set the MMR 'pending' bit; it might have been
1178 * another message. Look for that message.
1179 */
1180 other_msg = find_another_by_swack(msg, bcp, msg->swack_vec);
1181 if (other_msg) {
1182 /* There is another. Do not ack the current one. */
1183 bau_process_message(mdp, bcp, 0);
1184 /* 1172 /*
1185 * Let the natural processing of that message acknowledge 1173 * Some message has set the MMR 'pending' bit; it might have
1186 * it. Don't get the processing of sw_ack's out of order. 1174 * been another message. Look for that message.
1187 */ 1175 */
1188 return; 1176 other_msg = find_another_by_swack(msg, bcp);
1177 if (other_msg) {
1178 /*
1179 * There is another. Process this one but do not
1180 * ack it.
1181 */
1182 bau_process_message(mdp, bcp, 0);
1183 /*
1184 * Let the natural processing of that other message
1185 * acknowledge it. Don't get the processing of sw_ack's
1186 * out of order.
1187 */
1188 return;
1189 }
1189 } 1190 }
1190 1191
1191 /* 1192 /*
1192 * There is no other message using this sw_ack, so it is safe to 1193 * Either the MMR shows this one pending a reply or there is no
1193 * acknowledge it. 1194 * other message using this sw_ack, so it is safe to acknowledge it.
1194 */ 1195 */
1195 bau_process_message(mdp, bcp, 1); 1196 bau_process_message(mdp, bcp, 1);
1196 1197
@@ -1295,7 +1296,8 @@ static void __init enable_timeouts(void)
1295 */ 1296 */
1296 mmr_image |= (1L << SOFTACK_MSHIFT); 1297 mmr_image |= (1L << SOFTACK_MSHIFT);
1297 if (is_uv2_hub()) { 1298 if (is_uv2_hub()) {
1298 mmr_image |= (1L << UV2_EXT_SHFT); 1299 /* hw bug workaround; do not use extended status */
1300 mmr_image &= ~(1L << UV2_EXT_SHFT);
1299 } 1301 }
1300 write_mmr_misc_control(pnode, mmr_image); 1302 write_mmr_misc_control(pnode, mmr_image);
1301 } 1303 }
@@ -1338,29 +1340,34 @@ static inline unsigned long long usec_2_cycles(unsigned long microsec)
1338static int ptc_seq_show(struct seq_file *file, void *data) 1340static int ptc_seq_show(struct seq_file *file, void *data)
1339{ 1341{
1340 struct ptc_stats *stat; 1342 struct ptc_stats *stat;
1343 struct bau_control *bcp;
1341 int cpu; 1344 int cpu;
1342 1345
1343 cpu = *(loff_t *)data; 1346 cpu = *(loff_t *)data;
1344 if (!cpu) { 1347 if (!cpu) {
1345 seq_printf(file, 1348 seq_printf(file,
1346 "# cpu sent stime self locals remotes ncpus localhub "); 1349 "# cpu bauoff sent stime self locals remotes ncpus localhub ");
1347 seq_printf(file, 1350 seq_printf(file,
1348 "remotehub numuvhubs numuvhubs16 numuvhubs8 "); 1351 "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
1349 seq_printf(file, 1352 seq_printf(file,
1350 "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); 1353 "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries ");
1354 seq_printf(file,
1355 "rok resetp resett giveup sto bz throt disable ");
1351 seq_printf(file, 1356 seq_printf(file,
1352 "resetp resett giveup sto bz throt swack recv rtime "); 1357 "enable wars warshw warwaits enters ipidis plugged ");
1353 seq_printf(file, 1358 seq_printf(file,
1354 "all one mult none retry canc nocan reset rcan "); 1359 "ipiover glim cong swack recv rtime all one mult ");
1355 seq_printf(file, 1360 seq_printf(file,
1356 "disable enable wars warshw warwaits\n"); 1361 "none retry canc nocan reset rcan\n");
1357 } 1362 }
1358 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 1363 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
1359 stat = &per_cpu(ptcstats, cpu); 1364 bcp = &per_cpu(bau_control, cpu);
1365 stat = bcp->statp;
1360 /* source side statistics */ 1366 /* source side statistics */
1361 seq_printf(file, 1367 seq_printf(file,
1362 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 1368 "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
1363 cpu, stat->s_requestor, cycles_2_us(stat->s_time), 1369 cpu, bcp->nobau, stat->s_requestor,
1370 cycles_2_us(stat->s_time),
1364 stat->s_ntargself, stat->s_ntarglocals, 1371 stat->s_ntargself, stat->s_ntarglocals,
1365 stat->s_ntargremotes, stat->s_ntargcpu, 1372 stat->s_ntargremotes, stat->s_ntargcpu,
1366 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, 1373 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
@@ -1374,20 +1381,23 @@ static int ptc_seq_show(struct seq_file *file, void *data)
1374 stat->s_resets_plug, stat->s_resets_timeout, 1381 stat->s_resets_plug, stat->s_resets_timeout,
1375 stat->s_giveup, stat->s_stimeout, 1382 stat->s_giveup, stat->s_stimeout,
1376 stat->s_busy, stat->s_throttles); 1383 stat->s_busy, stat->s_throttles);
1384 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
1385 stat->s_bau_disabled, stat->s_bau_reenabled,
1386 stat->s_uv2_wars, stat->s_uv2_wars_hw,
1387 stat->s_uv2_war_waits, stat->s_enters,
1388 stat->s_ipifordisabled, stat->s_plugged,
1389 stat->s_overipilimit, stat->s_giveuplimit,
1390 stat->s_congested);
1377 1391
1378 /* destination side statistics */ 1392 /* destination side statistics */
1379 seq_printf(file, 1393 seq_printf(file,
1380 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 1394 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
1381 read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)), 1395 read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
1382 stat->d_requestee, cycles_2_us(stat->d_time), 1396 stat->d_requestee, cycles_2_us(stat->d_time),
1383 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, 1397 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
1384 stat->d_nomsg, stat->d_retries, stat->d_canceled, 1398 stat->d_nomsg, stat->d_retries, stat->d_canceled,
1385 stat->d_nocanceled, stat->d_resets, 1399 stat->d_nocanceled, stat->d_resets,
1386 stat->d_rcanceled); 1400 stat->d_rcanceled);
1387 seq_printf(file, "%ld %ld %ld %ld %ld\n",
1388 stat->s_bau_disabled, stat->s_bau_reenabled,
1389 stat->s_uv2_wars, stat->s_uv2_wars_hw,
1390 stat->s_uv2_war_waits);
1391 } 1401 }
1392 return 0; 1402 return 0;
1393} 1403}
@@ -1401,13 +1411,14 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf,
1401 char *buf; 1411 char *buf;
1402 int ret; 1412 int ret;
1403 1413
1404 buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", 1414 buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n",
1405 "max_concur plugged_delay plugsb4reset", 1415 "max_concur plugged_delay plugsb4reset timeoutsb4reset",
1406 "timeoutsb4reset ipi_reset_limit complete_threshold", 1416 "ipi_reset_limit complete_threshold congested_response_us",
1407 "congested_response_us congested_reps congested_period", 1417 "congested_reps disabled_period giveup_limit",
1408 max_concurr, plugged_delay, plugsb4reset, 1418 max_concurr, plugged_delay, plugsb4reset,
1409 timeoutsb4reset, ipi_reset_limit, complete_threshold, 1419 timeoutsb4reset, ipi_reset_limit, complete_threshold,
1410 congested_respns_us, congested_reps, congested_period); 1420 congested_respns_us, congested_reps, disabled_period,
1421 giveup_limit);
1411 1422
1412 if (!buf) 1423 if (!buf)
1413 return -ENOMEM; 1424 return -ENOMEM;
@@ -1438,6 +1449,14 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user,
1438 return -EFAULT; 1449 return -EFAULT;
1439 optstr[count - 1] = '\0'; 1450 optstr[count - 1] = '\0';
1440 1451
1452 if (!strcmp(optstr, "on")) {
1453 set_bau_on();
1454 return count;
1455 } else if (!strcmp(optstr, "off")) {
1456 set_bau_off();
1457 return count;
1458 }
1459
1441 if (strict_strtol(optstr, 10, &input_arg) < 0) { 1460 if (strict_strtol(optstr, 10, &input_arg) < 0) {
1442 printk(KERN_DEBUG "%s is invalid\n", optstr); 1461 printk(KERN_DEBUG "%s is invalid\n", optstr);
1443 return -EINVAL; 1462 return -EINVAL;
@@ -1570,7 +1589,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
1570 bcp->complete_threshold = complete_threshold; 1589 bcp->complete_threshold = complete_threshold;
1571 bcp->cong_response_us = congested_respns_us; 1590 bcp->cong_response_us = congested_respns_us;
1572 bcp->cong_reps = congested_reps; 1591 bcp->cong_reps = congested_reps;
1573 bcp->cong_period = congested_period; 1592 bcp->disabled_period = sec_2_cycles(disabled_period);
1593 bcp->giveup_limit = giveup_limit;
1574 } 1594 }
1575 return count; 1595 return count;
1576} 1596}
@@ -1699,6 +1719,10 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
1699 * fairness chaining multilevel count replied_to 1719 * fairness chaining multilevel count replied_to
1700 */ 1720 */
1701 } else { 1721 } else {
1722 /*
1723 * BIOS uses legacy mode, but UV2 hardware always
1724 * uses native mode for selective broadcasts.
1725 */
1702 uv2_hdr = &bd2->header.uv2_hdr; 1726 uv2_hdr = &bd2->header.uv2_hdr;
1703 uv2_hdr->swack_flag = 1; 1727 uv2_hdr->swack_flag = 1;
1704 uv2_hdr->base_dest_nasid = 1728 uv2_hdr->base_dest_nasid =
@@ -1811,8 +1835,8 @@ static int calculate_destination_timeout(void)
1811 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; 1835 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1812 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); 1836 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1813 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; 1837 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1814 base = timeout_base_ns[index]; 1838 ts_ns = timeout_base_ns[index];
1815 ts_ns = base * mult1 * mult2; 1839 ts_ns *= (mult1 * mult2);
1816 ret = ts_ns / 1000; 1840 ret = ts_ns / 1000;
1817 } else { 1841 } else {
1818 /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ 1842 /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */
@@ -1836,6 +1860,8 @@ static void __init init_per_cpu_tunables(void)
1836 for_each_present_cpu(cpu) { 1860 for_each_present_cpu(cpu) {
1837 bcp = &per_cpu(bau_control, cpu); 1861 bcp = &per_cpu(bau_control, cpu);
1838 bcp->baudisabled = 0; 1862 bcp->baudisabled = 0;
1863 if (nobau)
1864 bcp->nobau = 1;
1839 bcp->statp = &per_cpu(ptcstats, cpu); 1865 bcp->statp = &per_cpu(ptcstats, cpu);
1840 /* time interval to catch a hardware stay-busy bug */ 1866 /* time interval to catch a hardware stay-busy bug */
1841 bcp->timeout_interval = usec_2_cycles(2*timeout_us); 1867 bcp->timeout_interval = usec_2_cycles(2*timeout_us);
@@ -1848,10 +1874,11 @@ static void __init init_per_cpu_tunables(void)
1848 bcp->complete_threshold = complete_threshold; 1874 bcp->complete_threshold = complete_threshold;
1849 bcp->cong_response_us = congested_respns_us; 1875 bcp->cong_response_us = congested_respns_us;
1850 bcp->cong_reps = congested_reps; 1876 bcp->cong_reps = congested_reps;
1851 bcp->cong_period = congested_period; 1877 bcp->disabled_period = sec_2_cycles(disabled_period);
1852 bcp->clocks_per_100_usec = usec_2_cycles(100); 1878 bcp->giveup_limit = giveup_limit;
1853 spin_lock_init(&bcp->queue_lock); 1879 spin_lock_init(&bcp->queue_lock);
1854 spin_lock_init(&bcp->uvhub_lock); 1880 spin_lock_init(&bcp->uvhub_lock);
1881 spin_lock_init(&bcp->disable_lock);
1855 } 1882 }
1856} 1883}
1857 1884
@@ -1972,7 +1999,6 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
1972 } 1999 }
1973 bcp->uvhub_master = *hmasterp; 2000 bcp->uvhub_master = *hmasterp;
1974 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; 2001 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
1975 bcp->using_desc = bcp->uvhub_cpu;
1976 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { 2002 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
1977 printk(KERN_EMERG "%d cpus per uvhub invalid\n", 2003 printk(KERN_EMERG "%d cpus per uvhub invalid\n",
1978 bcp->uvhub_cpu); 2004 bcp->uvhub_cpu);
@@ -2069,16 +2095,12 @@ static int __init uv_bau_init(void)
2069 if (!is_uv_system()) 2095 if (!is_uv_system())
2070 return 0; 2096 return 0;
2071 2097
2072 if (nobau)
2073 return 0;
2074
2075 for_each_possible_cpu(cur_cpu) { 2098 for_each_possible_cpu(cur_cpu) {
2076 mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); 2099 mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
2077 zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); 2100 zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
2078 } 2101 }
2079 2102
2080 nuvhubs = uv_num_possible_blades(); 2103 nuvhubs = uv_num_possible_blades();
2081 spin_lock_init(&disable_lock);
2082 congested_cycles = usec_2_cycles(congested_respns_us); 2104 congested_cycles = usec_2_cycles(congested_respns_us);
2083 2105
2084 uv_base_pnode = 0x7fffffff; 2106 uv_base_pnode = 0x7fffffff;
@@ -2091,7 +2113,8 @@ static int __init uv_bau_init(void)
2091 enable_timeouts(); 2113 enable_timeouts();
2092 2114
2093 if (init_per_cpu(nuvhubs, uv_base_pnode)) { 2115 if (init_per_cpu(nuvhubs, uv_base_pnode)) {
2094 nobau = 1; 2116 set_bau_off();
2117 nobau_perm = 1;
2095 return 0; 2118 return 0;
2096 } 2119 }
2097 2120
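Several hunks above cooperate on the UV2 stay-busy workaround: handle_uv2_busy() now only flags the cpu's descriptor (bcp->busy) and gives up, and uv_flush_tlb_others() re-reads ACTIVATION_STATUS_0 on the next flush to decide whether that descriptor has drained before touching the BAU again. A sketch of that recovery check pulled out on its own; the constants and fields are the ones used in the diff, the helper function is illustrative.

/*
 * Sketch of the stay-busy recovery test now done at the top of
 * uv_flush_tlb_others(); returns true if this flush should fall back
 * to IPIs because the cpu's descriptor is still busy.
 */
static bool example_descriptor_still_busy(struct bau_control *bcp)
{
        unsigned long descriptor_status, status;

        if (!bcp->busy)
                return false;

        descriptor_status = read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0);
        status = ((descriptor_status >>
                   (bcp->uvhub_cpu * UV_ACT_STATUS_SIZE)) &
                  UV_ACT_STATUS_MASK) << 1;
        if (status == UV2H_DESC_BUSY)
                return true;            /* still stuck; skip the BAU this time */

        bcp->busy = 0;                  /* drained; the BAU is usable again */
        return false;
}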
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index f25c2765a5c9..acf7752da952 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -135,6 +135,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
135 unsigned long mmr_value; 135 unsigned long mmr_value;
136 struct uv_IO_APIC_route_entry *entry; 136 struct uv_IO_APIC_route_entry *entry;
137 int mmr_pnode, err; 137 int mmr_pnode, err;
138 unsigned int dest;
138 139
139 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != 140 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
140 sizeof(unsigned long)); 141 sizeof(unsigned long));
@@ -143,6 +144,10 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
143 if (err != 0) 144 if (err != 0)
144 return err; 145 return err;
145 146
147 err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest);
148 if (err != 0)
149 return err;
150
146 if (limit == UV_AFFINITY_CPU) 151 if (limit == UV_AFFINITY_CPU)
147 irq_set_status_flags(irq, IRQ_NO_BALANCING); 152 irq_set_status_flags(irq, IRQ_NO_BALANCING);
148 else 153 else
@@ -159,7 +164,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
159 entry->polarity = 0; 164 entry->polarity = 0;
160 entry->trigger = 0; 165 entry->trigger = 0;
161 entry->mask = 0; 166 entry->mask = 0;
162 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); 167 entry->dest = dest;
163 168
164 mmr_pnode = uv_blade_to_pnode(mmr_blade); 169 mmr_pnode = uv_blade_to_pnode(mmr_blade);
165 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 170 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -222,7 +227,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
222 if (cfg->move_in_progress) 227 if (cfg->move_in_progress)
223 send_cleanup_vector(cfg); 228 send_cleanup_vector(cfg);
224 229
225 return 0; 230 return IRQ_SET_MASK_OK_NOCOPY;
226} 231}
227 232
228/* 233/*
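cpu_mask_to_apicid() is replaced by cpu_mask_to_apicid_and(), which can fail and hands the destination APIC ID back through a pointer, so the lookup is done and checked before the MMR value is built. A sketch of that call pattern in isolation; the helper is illustrative, while the apic method and its three-argument form are as used in the hunk above.

/* Sketch of the fallible destination lookup used in arch_enable_uv_irq(). */
static int example_pick_dest(const struct cpumask *eligible_cpu,
                             unsigned int *dest)
{
        int err;

        /* May fail (e.g. no suitable cpu in the mask); propagate the error
         * instead of programming the MMR with a bogus destination. */
        err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, dest);
        if (err)
                return err;

        return 0;
}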
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 5b84a2d30888..b2d534cab25f 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -22,7 +22,7 @@ wakeup-objs += video-bios.o
22realmode-y += header.o 22realmode-y += header.o
23realmode-y += trampoline_$(BITS).o 23realmode-y += trampoline_$(BITS).o
24realmode-y += stack.o 24realmode-y += stack.o
25realmode-$(CONFIG_X86_32) += reboot_32.o 25realmode-y += reboot.o
26realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs) 26realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs)
27 27
28targets += $(realmode-y) 28targets += $(realmode-y)
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index fadf48378ada..a28221d94e69 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -6,6 +6,7 @@
6 6
7#include <linux/linkage.h> 7#include <linux/linkage.h>
8#include <asm/page_types.h> 8#include <asm/page_types.h>
9#include <asm/segment.h>
9 10
10#include "realmode.h" 11#include "realmode.h"
11 12
@@ -28,8 +29,9 @@ GLOBAL(real_mode_header)
28 .long pa_wakeup_header 29 .long pa_wakeup_header
29#endif 30#endif
30 /* APM/BIOS reboot */ 31 /* APM/BIOS reboot */
31#ifdef CONFIG_X86_32
32 .long pa_machine_real_restart_asm 32 .long pa_machine_real_restart_asm
33#ifdef CONFIG_X86_64
34 .long __KERNEL32_CS
33#endif 35#endif
34END(real_mode_header) 36END(real_mode_header)
35 37
diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot.S
index 114044876b3d..f932ea61d1c8 100644
--- a/arch/x86/realmode/rm/reboot_32.S
+++ b/arch/x86/realmode/rm/reboot.S
@@ -2,6 +2,8 @@
2#include <linux/init.h> 2#include <linux/init.h>
3#include <asm/segment.h> 3#include <asm/segment.h>
4#include <asm/page_types.h> 4#include <asm/page_types.h>
5#include <asm/processor-flags.h>
6#include <asm/msr-index.h>
5#include "realmode.h" 7#include "realmode.h"
6 8
7/* 9/*
@@ -12,13 +14,35 @@
12 * doesn't work with at least one type of 486 motherboard. It is easy 14 * doesn't work with at least one type of 486 motherboard. It is easy
13 * to stop this code working; hence the copious comments. 15 * to stop this code working; hence the copious comments.
14 * 16 *
15 * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. 17 * This code is called with the restart type (0 = BIOS, 1 = APM) in
18 * the primary argument register (%eax for 32 bit, %edi for 64 bit).
16 */ 19 */
17 .section ".text32", "ax" 20 .section ".text32", "ax"
18 .code32 21 .code32
19
20 .balign 16
21ENTRY(machine_real_restart_asm) 22ENTRY(machine_real_restart_asm)
23
24#ifdef CONFIG_X86_64
25 /* Switch to trampoline GDT as it is guaranteed < 4 GiB */
26 movl $__KERNEL_DS, %eax
27 movl %eax, %ds
28 lgdtl pa_tr_gdt
29
30 /* Disable paging to drop us out of long mode */
31 movl %cr0, %eax
32 andl $~X86_CR0_PG, %eax
33 movl %eax, %cr0
34 ljmpl $__KERNEL32_CS, $pa_machine_real_restart_paging_off
35
36GLOBAL(machine_real_restart_paging_off)
37 xorl %eax, %eax
38 xorl %edx, %edx
39 movl $MSR_EFER, %ecx
40 wrmsr
41
42 movl %edi, %eax
43
44#endif /* CONFIG_X86_64 */
45
22 /* Set up the IDT for real mode. */ 46 /* Set up the IDT for real mode. */
23 lidtl pa_machine_real_restart_idt 47 lidtl pa_machine_real_restart_idt
24 48
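The 64-bit entry path added above has to drop out of long mode before the 16-bit reboot code can run. A C-flavoured rendering of the same sequence, illustrative only (it cannot literally be executed from C, and the trampoline GDT load plus the far jump to __KERNEL32_CS done by the assembly are omitted):

#include <asm/processor-flags.h>	/* X86_CR0_PG */
#include <asm/msr-index.h>		/* MSR_EFER */
#include <asm/special_insns.h>		/* read_cr0()/write_cr0() */
#include <asm/msr.h>			/* wrmsrl() */

/* Mirrors the assembly above step by step, for explanation only. */
static void leave_long_mode_sketch(void)
{
	/* 1. Paging off: the CPU drops from long mode back to protected mode. */
	write_cr0(read_cr0() & ~X86_CR0_PG);

	/* 2. Clear EFER (drops LME) so re-enabling paging stays 32-bit. */
	wrmsrl(MSR_EFER, 0);

	/* 3. The assembly then copies the restart type from %edi to %eax. */
}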
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 66e6d9359826..0faad646f5fd 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -205,9 +205,9 @@ void syscall32_cpu_init(void)
205{ 205{
206 /* Load these always in case some future AMD CPU supports 206 /* Load these always in case some future AMD CPU supports
207 SYSENTER from compat mode too. */ 207 SYSENTER from compat mode too. */
208 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); 208 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
209 checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); 209 wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
210 checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); 210 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
211 211
212 wrmsrl(MSR_CSTAR, ia32_cstar_target); 212 wrmsrl(MSR_CSTAR, ia32_cstar_target);
213} 213}
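The hunk above replaces checking_wrmsrl() with wrmsrl_safe(); the behaviour is unchanged, only the name is. A small sketch of how the _safe variant is typically used when the return value is actually wanted (the probe helper is hypothetical):

#include <linux/types.h>
#include <linux/printk.h>
#include <asm/msr.h>

/* Hypothetical probe: wrmsrl_safe() catches the #GP a bad WRMSR would
 * raise and returns non-zero instead of faulting the kernel. */
static void probe_msr_write(unsigned int msr, u64 val)
{
	if (wrmsrl_safe(msr, val))
		pr_warn("MSR 0x%x rejected the write\n", msr);
}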
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ff962d4b821e..bf4bda6d3e9a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
31#include <linux/pci.h> 31#include <linux/pci.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/memblock.h> 33#include <linux/memblock.h>
34#include <linux/syscore_ops.h>
34 35
35#include <xen/xen.h> 36#include <xen/xen.h>
36#include <xen/interface/xen.h> 37#include <xen/interface/xen.h>
@@ -38,6 +39,7 @@
38#include <xen/interface/physdev.h> 39#include <xen/interface/physdev.h>
39#include <xen/interface/vcpu.h> 40#include <xen/interface/vcpu.h>
40#include <xen/interface/memory.h> 41#include <xen/interface/memory.h>
42#include <xen/interface/xen-mca.h>
41#include <xen/features.h> 43#include <xen/features.h>
42#include <xen/page.h> 44#include <xen/page.h>
43#include <xen/hvm.h> 45#include <xen/hvm.h>
@@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
107 * Point at some empty memory to start with. We map the real shared_info 109 * Point at some empty memory to start with. We map the real shared_info
108 * page as soon as fixmap is up and running. 110 * page as soon as fixmap is up and running.
109 */ 111 */
110struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; 112struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
111 113
112/* 114/*
113 * Flag to determine whether vcpu info placement is available on all 115 * Flag to determine whether vcpu info placement is available on all
@@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
124 */ 126 */
125static int have_vcpu_info_placement = 1; 127static int have_vcpu_info_placement = 1;
126 128
129struct tls_descs {
130 struct desc_struct desc[3];
131};
132
133/*
134 * Updating the 3 TLS descriptors in the GDT on every task switch is
135 * surprisingly expensive, so we avoid updating them if they haven't
136 * changed. Since Xen writes different descriptors than the ones
137 * passed in the update_descriptor hypercall, we keep shadow copies to
138 * compare against.
139 */
140static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
141
127static void clamp_max_cpus(void) 142static void clamp_max_cpus(void)
128{ 143{
129#ifdef CONFIG_SMP 144#ifdef CONFIG_SMP
@@ -341,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
341 unsigned int xsave_mask; 356 unsigned int xsave_mask;
342 357
343 cpuid_leaf1_edx_mask = 358 cpuid_leaf1_edx_mask =
344 ~((1 << X86_FEATURE_MCE) | /* disable MCE */ 359 ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
345 (1 << X86_FEATURE_MCA) | /* disable MCA */
346 (1 << X86_FEATURE_MTRR) | /* disable MTRR */
347 (1 << X86_FEATURE_ACC)); /* thermal monitoring */ 360 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
348 361
349 if (!xen_initial_domain()) 362 if (!xen_initial_domain())
@@ -540,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
540 BUG(); 553 BUG();
541} 554}
542 555
556static inline bool desc_equal(const struct desc_struct *d1,
557 const struct desc_struct *d2)
558{
559 return d1->a == d2->a && d1->b == d2->b;
560}
561
543static void load_TLS_descriptor(struct thread_struct *t, 562static void load_TLS_descriptor(struct thread_struct *t,
544 unsigned int cpu, unsigned int i) 563 unsigned int cpu, unsigned int i)
545{ 564{
546 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 565 struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
547 xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); 566 struct desc_struct *gdt;
548 struct multicall_space mc = __xen_mc_entry(0); 567 xmaddr_t maddr;
568 struct multicall_space mc;
569
570 if (desc_equal(shadow, &t->tls_array[i]))
571 return;
572
573 *shadow = t->tls_array[i];
574
575 gdt = get_cpu_gdt_table(cpu);
576 maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
577 mc = __xen_mc_entry(0);
549 578
550 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); 579 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
551} 580}
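load_TLS_descriptor() is the only part of the TLS path touched here; for context, a simplified sketch of how its caller in this file batches the three descriptors. Details are reconstructed from memory (lazy segment handling omitted), so treat them as an assumption:

/* Simplified sketch of the caller (xen_load_tls() in this file). */
static void xen_load_tls_sketch(struct thread_struct *t, unsigned int cpu)
{
	xen_mc_batch();				/* start a multicall batch      */
	load_TLS_descriptor(t, cpu, 0);		/* each call is now a no-op if  */
	load_TLS_descriptor(t, cpu, 1);		/* the shadow copy already      */
	load_TLS_descriptor(t, cpu, 2);		/* matches t->tls_array[i]      */
	xen_mc_issue(PARAVIRT_LAZY_CPU);	/* flush the batch              */
}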
@@ -627,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
627 /* 656 /*
628 * Look for known traps using IST, and substitute them 657 * Look for known traps using IST, and substitute them
629 * appropriately. The debugger ones are the only ones we care 658 * appropriately. The debugger ones are the only ones we care
630 * about. Xen will handle faults like double_fault and 659 * about. Xen will handle faults like double_fault,
631 * machine_check, so we should never see them. Warn if 660 * so we should never see them. Warn if
632 * there's an unexpected IST-using fault handler. 661 * there's an unexpected IST-using fault handler.
633 */ 662 */
634 if (addr == (unsigned long)debug) 663 if (addr == (unsigned long)debug)
@@ -643,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
643 return 0; 672 return 0;
644#ifdef CONFIG_X86_MCE 673#ifdef CONFIG_X86_MCE
645 } else if (addr == (unsigned long)machine_check) { 674 } else if (addr == (unsigned long)machine_check) {
646 return 0; 675 /*
676 * When the Xen hypervisor injects a vMCE into the guest,
677 * use the native MCE handler to handle it.
678 */
679 ;
647#endif 680#endif
648 } else { 681 } else {
649 /* Some other trap using IST? */ 682 /* Some other trap using IST? */
@@ -1124,9 +1157,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
1124 .wbinvd = native_wbinvd, 1157 .wbinvd = native_wbinvd,
1125 1158
1126 .read_msr = native_read_msr_safe, 1159 .read_msr = native_read_msr_safe,
1127 .rdmsr_regs = native_rdmsr_safe_regs,
1128 .write_msr = xen_write_msr_safe, 1160 .write_msr = xen_write_msr_safe,
1129 .wrmsr_regs = native_wrmsr_safe_regs,
1130 1161
1131 .read_tsc = native_read_tsc, 1162 .read_tsc = native_read_tsc,
1132 .read_pmc = native_read_pmc, 1163 .read_pmc = native_read_pmc,
@@ -1439,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void)
1439#endif 1470#endif
1440} 1471}
1441 1472
1442static int init_hvm_pv_info(int *major, int *minor) 1473#ifdef CONFIG_XEN_PVHVM
1443{ 1474/*
1444 uint32_t eax, ebx, ecx, edx, pages, msr, base; 1475 * The pfn containing the shared_info is located somewhere in RAM. This
1445 u64 pfn; 1476 * will cause trouble if the current kernel is doing a kexec boot into a
1446 1477 * new kernel. The new kernel (and its startup code) cannot know where
1447 base = xen_cpuid_base(); 1478 * the pfn is, so it cannot reserve the page. The hypervisor will
1448 cpuid(base + 1, &eax, &ebx, &ecx, &edx); 1479 * continue to update the pfn, and as a result memory corruption occurs
1449 1480 * in the new kernel.
1450 *major = eax >> 16; 1481 *
1451 *minor = eax & 0xffff; 1482 * One way to work around this issue is to allocate a page in the
1452 printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); 1483 * xen-platform pci device's BAR memory range. But pci init is done very
1453 1484 * late and the shared_info page is already in use very early to read
1454 cpuid(base + 2, &pages, &msr, &ecx, &edx); 1485 * the pvclock. So moving the pfn from RAM to MMIO is racy because some
1455 1486 * code paths on other vcpus could access the pfn during the small
1456 pfn = __pa(hypercall_page); 1487 * window when the old pfn is moved to the new pfn. There is even a
1457 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); 1488 * small window where the old pfn is not backed by an mfn, and during that
1458 1489 * time all reads return -1.
1459 xen_setup_features(); 1490 *
1460 1491 * Because it is not known upfront where the MMIO region is located, it
1461 pv_info.name = "Xen HVM"; 1492 * cannot be used right from the start in xen_hvm_init_shared_info.
1462 1493 *
1463 xen_domain_type = XEN_HVM_DOMAIN; 1494 * To minimise trouble, the move of the pfn is done shortly before kexec.
1495 * This does not eliminate the race because all vcpus are still online
1496 * when the syscore_ops are called. But hopefully there is no work
1497 * pending at this point in time. Also, the syscore_op is run last, which
1498 * reduces the risk further.
1499 */
1464 1500
1465 return 0; 1501static struct shared_info *xen_hvm_shared_info;
1466}
1467 1502
1468void __ref xen_hvm_init_shared_info(void) 1503static void xen_hvm_connect_shared_info(unsigned long pfn)
1469{ 1504{
1470 int cpu;
1471 struct xen_add_to_physmap xatp; 1505 struct xen_add_to_physmap xatp;
1472 static struct shared_info *shared_info_page = 0;
1473 1506
1474 if (!shared_info_page)
1475 shared_info_page = (struct shared_info *)
1476 extend_brk(PAGE_SIZE, PAGE_SIZE);
1477 xatp.domid = DOMID_SELF; 1507 xatp.domid = DOMID_SELF;
1478 xatp.idx = 0; 1508 xatp.idx = 0;
1479 xatp.space = XENMAPSPACE_shared_info; 1509 xatp.space = XENMAPSPACE_shared_info;
1480 xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; 1510 xatp.gpfn = pfn;
1481 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) 1511 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
1482 BUG(); 1512 BUG();
1483 1513
1484 HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; 1514}
1515static void xen_hvm_set_shared_info(struct shared_info *sip)
1516{
1517 int cpu;
1518
1519 HYPERVISOR_shared_info = sip;
1485 1520
1486 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info 1521 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
1487 * page, we use it in the event channel upcall and in some pvclock 1522 * page, we use it in the event channel upcall and in some pvclock
1488 * related functions. We don't need the vcpu_info placement 1523 * related functions. We don't need the vcpu_info placement
1489 * optimizations because we don't use any pv_mmu or pv_irq op on 1524 * optimizations because we don't use any pv_mmu or pv_irq op on
1490 * HVM. 1525 * HVM.
1491 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is 1526 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
1492 * online but xen_hvm_init_shared_info is run at resume time too and 1527 * online but xen_hvm_set_shared_info is run at resume time too and
1493 * in that case multiple vcpus might be online. */ 1528 * in that case multiple vcpus might be online. */
1494 for_each_online_cpu(cpu) { 1529 for_each_online_cpu(cpu) {
1495 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1530 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1496 } 1531 }
1497} 1532}
1498 1533
1499#ifdef CONFIG_XEN_PVHVM 1534/* Reconnect the shared_info pfn to a mfn */
1535void xen_hvm_resume_shared_info(void)
1536{
1537 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1538}
1539
1540#ifdef CONFIG_KEXEC
1541static struct shared_info *xen_hvm_shared_info_kexec;
1542static unsigned long xen_hvm_shared_info_pfn_kexec;
1543
1544/* Remember a pfn in MMIO space for kexec reboot */
1545void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
1546{
1547 xen_hvm_shared_info_kexec = sip;
1548 xen_hvm_shared_info_pfn_kexec = pfn;
1549}
1550
1551static void xen_hvm_syscore_shutdown(void)
1552{
1553 struct xen_memory_reservation reservation = {
1554 .domid = DOMID_SELF,
1555 .nr_extents = 1,
1556 };
1557 unsigned long prev_pfn;
1558 int rc;
1559
1560 if (!xen_hvm_shared_info_kexec)
1561 return;
1562
1563 prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
1564 set_xen_guest_handle(reservation.extent_start, &prev_pfn);
1565
1566 /* Move the pfn to MMIO; this disconnects the previous pfn from its mfn */
1567 xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
1568
1569 /* Update pointers; the following hypercall is also a memory barrier */
1570 xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
1571
1572 /* Allocate new mfn for previous pfn */
1573 do {
1574 rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
1575 if (rc == 0)
1576 msleep(123);
1577 } while (rc == 0);
1578
1579 /* Make sure the previous pfn is really connected to a (new) mfn */
1580 BUG_ON(rc != 1);
1581}
1582
1583static struct syscore_ops xen_hvm_syscore_ops = {
1584 .shutdown = xen_hvm_syscore_shutdown,
1585};
1586#endif
1587
1588/* Use a pfn in RAM; it may move to MMIO before kexec. */
1589static void __init xen_hvm_init_shared_info(void)
1590{
1591 /* Remember pointer for resume */
1592 xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
1593 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1594 xen_hvm_set_shared_info(xen_hvm_shared_info);
1595}
1596
1597static void __init init_hvm_pv_info(void)
1598{
1599 int major, minor;
1600 uint32_t eax, ebx, ecx, edx, pages, msr, base;
1601 u64 pfn;
1602
1603 base = xen_cpuid_base();
1604 cpuid(base + 1, &eax, &ebx, &ecx, &edx);
1605
1606 major = eax >> 16;
1607 minor = eax & 0xffff;
1608 printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
1609
1610 cpuid(base + 2, &pages, &msr, &ecx, &edx);
1611
1612 pfn = __pa(hypercall_page);
1613 wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
1614
1615 xen_setup_features();
1616
1617 pv_info.name = "Xen HVM";
1618
1619 xen_domain_type = XEN_HVM_DOMAIN;
1620}
1621
1500static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, 1622static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1501 unsigned long action, void *hcpu) 1623 unsigned long action, void *hcpu)
1502{ 1624{
@@ -1519,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
1519 1641
1520static void __init xen_hvm_guest_init(void) 1642static void __init xen_hvm_guest_init(void)
1521{ 1643{
1522 int r; 1644 init_hvm_pv_info();
1523 int major, minor;
1524
1525 r = init_hvm_pv_info(&major, &minor);
1526 if (r < 0)
1527 return;
1528 1645
1529 xen_hvm_init_shared_info(); 1646 xen_hvm_init_shared_info();
1647#ifdef CONFIG_KEXEC
1648 register_syscore_ops(&xen_hvm_syscore_ops);
1649#endif
1530 1650
1531 if (xen_feature(XENFEAT_hvm_callback_vector)) 1651 if (xen_feature(XENFEAT_hvm_callback_vector))
1532 xen_have_vector_callback = 1; 1652 xen_have_vector_callback = 1;
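The kexec handling above hangs off a syscore shutdown hook so that the pfn move happens as late as possible in the reboot path. A minimal, generic sketch of the syscore_ops pattern being relied on (all names below are illustrative, not from this diff):

#include <linux/init.h>
#include <linux/syscore_ops.h>

/* .shutdown runs from syscore_shutdown(), i.e. very late in
 * reboot/kexec, after normal device shutdown has completed. */
static void example_late_shutdown(void)
{
	/* last-chance fixups before the new kernel takes over */
}

static struct syscore_ops example_syscore_ops = {
	.shutdown = example_late_shutdown,
};

static int __init example_register(void)
{
	register_syscore_ops(&example_syscore_ops);
	return 0;
}
late_initcall(example_register);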
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3a73785631ce..b65a76133f4f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
308 308
309static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) 309static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
310{ 310{
311 if (!xen_batched_set_pte(ptep, pteval)) 311 if (!xen_batched_set_pte(ptep, pteval)) {
312 native_set_pte(ptep, pteval); 312 /*
313 * Could call native_set_pte() here and trap and
314 * emulate the PTE write, but with 32-bit guests this
315 * needs two traps (one for each of the two 32-bit
316 * words in the PTE), so do one hypercall directly
317 * instead.
318 */
319 struct mmu_update u;
320
321 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
322 u.val = pte_val_ma(pteval);
323 HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
324 }
313} 325}
314 326
315static void xen_set_pte(pte_t *ptep, pte_t pteval) 327static void xen_set_pte(pte_t *ptep, pte_t pteval)
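The comment added above alludes to the 32-bit PAE case, where a trapped native_set_pte() would fault twice. A sketch of why, mirroring the usual 3-level layout (cf. native_set_pte() in arch/x86/include/asm/pgtable-3level.h); this is an explanatory restatement, not code from this diff:

/* On PAE a 64-bit PTE is stored as two 32-bit halves, so a
 * trap-and-emulate path would take one fault per store. */
static inline void pae_set_pte_sketch(pte_t *ptep, pte_t pte)
{
	ptep->pte_high = pte.pte_high;	/* first trapped write     */
	smp_wmb();			/* keep the halves ordered */
	ptep->pte_low = pte.pte_low;	/* second trapped write    */
}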
@@ -1244,7 +1256,8 @@ static void xen_flush_tlb_single(unsigned long addr)
1244} 1256}
1245 1257
1246static void xen_flush_tlb_others(const struct cpumask *cpus, 1258static void xen_flush_tlb_others(const struct cpumask *cpus,
1247 struct mm_struct *mm, unsigned long va) 1259 struct mm_struct *mm, unsigned long start,
1260 unsigned long end)
1248{ 1261{
1249 struct { 1262 struct {
1250 struct mmuext_op op; 1263 struct mmuext_op op;
@@ -1256,7 +1269,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1256 } *args; 1269 } *args;
1257 struct multicall_space mcs; 1270 struct multicall_space mcs;
1258 1271
1259 trace_xen_mmu_flush_tlb_others(cpus, mm, va); 1272 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1260 1273
1261 if (cpumask_empty(cpus)) 1274 if (cpumask_empty(cpus))
1262 return; /* nothing to do */ 1275 return; /* nothing to do */
@@ -1269,11 +1282,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1269 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); 1282 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1270 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1283 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1271 1284
1272 if (va == TLB_FLUSH_ALL) { 1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1273 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1286 if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1274 } else {
1275 args->op.cmd = MMUEXT_INVLPG_MULTI; 1287 args->op.cmd = MMUEXT_INVLPG_MULTI;
1276 args->op.arg1.linear_addr = va; 1288 args->op.arg1.linear_addr = start;
1277 } 1289 }
1278 1290
1279 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 1291 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
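With the new range interface, the hunk above picks the flush operation from (start, end) instead of a single va. Restating that decision in isolation as a sketch (TLB_FLUSH_ALL comes from asm/tlbflush.h, PAGE_SIZE from asm/page.h, the MMUEXT_* commands from the Xen interface headers; the helper name is illustrative):

#include <asm/tlbflush.h>
#include <asm/page.h>
#include <xen/interface/xen.h>

/* Same start/end test as in xen_flush_tlb_others() above. */
static unsigned int pick_flush_cmd(unsigned long start, unsigned long end)
{
	if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE)
		return MMUEXT_INVLPG_MULTI;	/* at most one page: invlpg  */
	return MMUEXT_TLB_FLUSH_MULTI;		/* otherwise: full TLB flush */
}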
@@ -1416,13 +1428,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1416} 1428}
1417#endif /* CONFIG_X86_64 */ 1429#endif /* CONFIG_X86_64 */
1418 1430
1419/* Init-time set_pte while constructing initial pagetables, which 1431/*
1420 doesn't allow RO pagetable pages to be remapped RW */ 1432 * Init-time set_pte while constructing initial pagetables, which
1433 * doesn't allow RO page table pages to be remapped RW.
1434 *
1435 * If there is no MFN for this PFN then this page is initially
1436 * ballooned out so clear the PTE (as in decrease_reservation() in
1437 * drivers/xen/balloon.c).
1438 *
1439 * Many of these PTE updates are done on unpinned and writable pages
1440 * and doing a hypercall for these is unnecessary and expensive. At
1441 * this point it is not possible to tell if a page is pinned or not,
1442 * so always write the PTE directly and rely on Xen trapping and
1443 * emulating any updates as necessary.
1444 */
1421static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) 1445static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1422{ 1446{
1423 pte = mask_rw_pte(ptep, pte); 1447 if (pte_mfn(pte) != INVALID_P2M_ENTRY)
1448 pte = mask_rw_pte(ptep, pte);
1449 else
1450 pte = __pte_ma(0);
1424 1451
1425 xen_set_pte(ptep, pte); 1452 native_set_pte(ptep, pte);
1426} 1453}
1427 1454
1428static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1455static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a4790bf22c59..ead85576d54a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -157,25 +157,24 @@ static unsigned long __init xen_populate_chunk(
157 unsigned long dest_pfn; 157 unsigned long dest_pfn;
158 158
159 for (i = 0, entry = list; i < map_size; i++, entry++) { 159 for (i = 0, entry = list; i < map_size; i++, entry++) {
160 unsigned long credits = credits_left;
161 unsigned long s_pfn; 160 unsigned long s_pfn;
162 unsigned long e_pfn; 161 unsigned long e_pfn;
163 unsigned long pfns; 162 unsigned long pfns;
164 long capacity; 163 long capacity;
165 164
166 if (credits <= 0) 165 if (credits_left <= 0)
167 break; 166 break;
168 167
169 if (entry->type != E820_RAM) 168 if (entry->type != E820_RAM)
170 continue; 169 continue;
171 170
172 e_pfn = PFN_UP(entry->addr + entry->size); 171 e_pfn = PFN_DOWN(entry->addr + entry->size);
173 172
174 /* We only care about E820 after the xen_start_info->nr_pages */ 173 /* We only care about E820 after the xen_start_info->nr_pages */
175 if (e_pfn <= max_pfn) 174 if (e_pfn <= max_pfn)
176 continue; 175 continue;
177 176
178 s_pfn = PFN_DOWN(entry->addr); 177 s_pfn = PFN_UP(entry->addr);
179 /* If the E820 falls within the nr_pages, we want to start 178 /* If the E820 falls within the nr_pages, we want to start
180 * at the nr_pages PFN. 179 * at the nr_pages PFN.
181 * If that would mean going past the E820 entry, skip it 180 * If that would mean going past the E820 entry, skip it
@@ -184,23 +183,19 @@ static unsigned long __init xen_populate_chunk(
184 capacity = e_pfn - max_pfn; 183 capacity = e_pfn - max_pfn;
185 dest_pfn = max_pfn; 184 dest_pfn = max_pfn;
186 } else { 185 } else {
187 /* last_pfn MUST be within E820_RAM regions */
188 if (*last_pfn && e_pfn >= *last_pfn)
189 s_pfn = *last_pfn;
190 capacity = e_pfn - s_pfn; 186 capacity = e_pfn - s_pfn;
191 dest_pfn = s_pfn; 187 dest_pfn = s_pfn;
192 } 188 }
193 /* If we had filled this E820_RAM entry, go to the next one. */
194 if (capacity <= 0)
195 continue;
196 189
197 if (credits > capacity) 190 if (credits_left < capacity)
198 credits = capacity; 191 capacity = credits_left;
199 192
200 pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false); 193 pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
201 done += pfns; 194 done += pfns;
202 credits_left -= pfns;
203 *last_pfn = (dest_pfn + pfns); 195 *last_pfn = (dest_pfn + pfns);
196 if (pfns < capacity)
197 break;
198 credits_left -= pfns;
204 } 199 }
205 return done; 200 return done;
206} 201}
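The rounding swap above (PFN_UP for the start of an E820_RAM entry, PFN_DOWN for its end) keeps only whole pages that lie inside the range. A small sketch of that arithmetic using the helpers from linux/pfn.h (the function name is illustrative):

#include <linux/types.h>
#include <linux/pfn.h>	/* PFN_UP(): round up, PFN_DOWN(): round down */

/* The usable page frames of a RAM range are
 * [PFN_UP(addr), PFN_DOWN(addr + size)) -- partial pages at either edge
 * are dropped, which is what the corrected rounding above implements. */
static unsigned long usable_pfns(u64 addr, u64 size)
{
	unsigned long s_pfn = PFN_UP(addr);
	unsigned long e_pfn = PFN_DOWN(addr + size);

	return e_pfn > s_pfn ? e_pfn - s_pfn : 0;
}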
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index afb250d22a6b..f58dca7a6e52 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -80,9 +80,7 @@ static void __cpuinit cpu_bringup(void)
80 80
81 notify_cpu_starting(cpu); 81 notify_cpu_starting(cpu);
82 82
83 ipi_call_lock();
84 set_cpu_online(cpu, true); 83 set_cpu_online(cpu, true);
85 ipi_call_unlock();
86 84
87 this_cpu_write(cpu_state, CPU_ONLINE); 85 this_cpu_write(cpu_state, CPU_ONLINE);
88 86
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 45329c8c226e..ae8a00c39de4 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
30{ 30{
31#ifdef CONFIG_XEN_PVHVM 31#ifdef CONFIG_XEN_PVHVM
32 int cpu; 32 int cpu;
33 xen_hvm_init_shared_info(); 33 xen_hvm_resume_shared_info();
34 xen_callback_vector(); 34 xen_callback_vector();
35 xen_unplug_emulated_devices(); 35 xen_unplug_emulated_devices();
36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) { 36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 202d4c150154..1e4329e04e0f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -41,7 +41,7 @@ void xen_enable_syscall(void);
41void xen_vcpu_restore(void); 41void xen_vcpu_restore(void);
42 42
43void xen_callback_vector(void); 43void xen_callback_vector(void);
44void xen_hvm_init_shared_info(void); 44void xen_hvm_resume_shared_info(void);
45void xen_unplug_emulated_devices(void); 45void xen_unplug_emulated_devices(void);
46 46
47void __init xen_build_dynamic_phys_to_machine(void); 47void __init xen_build_dynamic_phys_to_machine(void);