Diffstat (limited to 'arch/x86')
260 files changed, 9747 insertions, 5373 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca55187..6049d587599e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -87,7 +87,7 @@ config X86 | |||
87 | select HAVE_ARCH_KMEMCHECK | 87 | select HAVE_ARCH_KMEMCHECK |
88 | select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP | 88 | select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP |
89 | select HAVE_USER_RETURN_NOTIFIER | 89 | select HAVE_USER_RETURN_NOTIFIER |
90 | select ARCH_BINFMT_ELF_RANDOMIZE_PIE | 90 | select ARCH_HAS_ELF_RANDOMIZE |
91 | select HAVE_ARCH_JUMP_LABEL | 91 | select HAVE_ARCH_JUMP_LABEL |
92 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE | 92 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE |
93 | select SPARSE_IRQ | 93 | select SPARSE_IRQ |
@@ -99,6 +99,7 @@ config X86 | |||
99 | select IRQ_FORCED_THREADING | 99 | select IRQ_FORCED_THREADING |
100 | select HAVE_BPF_JIT if X86_64 | 100 | select HAVE_BPF_JIT if X86_64 |
101 | select HAVE_ARCH_TRANSPARENT_HUGEPAGE | 101 | select HAVE_ARCH_TRANSPARENT_HUGEPAGE |
102 | select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE) | ||
102 | select ARCH_HAS_SG_CHAIN | 103 | select ARCH_HAS_SG_CHAIN |
103 | select CLKEVT_I8253 | 104 | select CLKEVT_I8253 |
104 | select ARCH_HAVE_NMI_SAFE_CMPXCHG | 105 | select ARCH_HAVE_NMI_SAFE_CMPXCHG |
@@ -177,7 +178,7 @@ config SBUS | |||
177 | 178 | ||
178 | config NEED_DMA_MAP_STATE | 179 | config NEED_DMA_MAP_STATE |
179 | def_bool y | 180 | def_bool y |
180 | depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG | 181 | depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB |
181 | 182 | ||
182 | config NEED_SG_DMA_LENGTH | 183 | config NEED_SG_DMA_LENGTH |
183 | def_bool y | 184 | def_bool y |
@@ -235,12 +236,10 @@ config ARCH_WANT_GENERAL_HUGETLB | |||
235 | def_bool y | 236 | def_bool y |
236 | 237 | ||
237 | config ZONE_DMA32 | 238 | config ZONE_DMA32 |
238 | bool | 239 | def_bool y if X86_64 |
239 | default X86_64 | ||
240 | 240 | ||
241 | config AUDIT_ARCH | 241 | config AUDIT_ARCH |
242 | bool | 242 | def_bool y if X86_64 |
243 | default X86_64 | ||
244 | 243 | ||
245 | config ARCH_SUPPORTS_OPTIMIZED_INLINING | 244 | config ARCH_SUPPORTS_OPTIMIZED_INLINING |
246 | def_bool y | 245 | def_bool y |
@@ -279,6 +278,12 @@ config ARCH_SUPPORTS_UPROBES | |||
279 | config FIX_EARLYCON_MEM | 278 | config FIX_EARLYCON_MEM |
280 | def_bool y | 279 | def_bool y |
281 | 280 | ||
281 | config PGTABLE_LEVELS | ||
282 | int | ||
283 | default 4 if X86_64 | ||
284 | default 3 if X86_PAE | ||
285 | default 2 | ||
286 | |||
282 | source "init/Kconfig" | 287 | source "init/Kconfig" |
283 | source "kernel/Kconfig.freezer" | 288 | source "kernel/Kconfig.freezer" |
284 | 289 | ||
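The hunk above introduces a single PGTABLE_LEVELS integer (4 on X86_64, 3 on X86_32 with PAE, otherwise 2) in place of scattered per-configuration assumptions. As a rough illustration of how such a build-time knob is consumed, here is a minimal standalone C sketch; PT_LEVELS and the whole program are stand-ins for demonstration, not kernel code.

/* Standalone sketch: consuming a "page-table levels" build knob.
 * PT_LEVELS stands in for the Kconfig-generated CONFIG_PGTABLE_LEVELS
 * value; the 4/3/2 mapping mirrors the hunk above. */
#include <stdio.h>

#ifndef PT_LEVELS
#define PT_LEVELS 4            /* assumed default: an x86_64-style build */
#endif

int main(void)
{
#if PT_LEVELS == 4
    puts("4-level paging (X86_64)");
#elif PT_LEVELS == 3
    puts("3-level paging (X86_32 + X86_PAE)");
#else
    puts("2-level paging (plain X86_32)");
#endif
    return 0;
}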
@@ -716,17 +721,6 @@ endif #HYPERVISOR_GUEST | |||
716 | config NO_BOOTMEM | 721 | config NO_BOOTMEM |
717 | def_bool y | 722 | def_bool y |
718 | 723 | ||
719 | config MEMTEST | ||
720 | bool "Memtest" | ||
721 | ---help--- | ||
722 | This option adds a kernel parameter 'memtest', which allows memtest | ||
723 | to be set. | ||
724 | memtest=0, mean disabled; -- default | ||
725 | memtest=1, mean do 1 test pattern; | ||
726 | ... | ||
727 | memtest=4, mean do 4 test patterns. | ||
728 | If you are unsure how to answer this question, answer N. | ||
729 | |||
730 | source "arch/x86/Kconfig.cpu" | 724 | source "arch/x86/Kconfig.cpu" |
731 | 725 | ||
732 | config HPET_TIMER | 726 | config HPET_TIMER |
@@ -891,7 +885,8 @@ config UP_LATE_INIT | |||
891 | depends on !SMP && X86_LOCAL_APIC | 885 | depends on !SMP && X86_LOCAL_APIC |
892 | 886 | ||
893 | config X86_UP_APIC | 887 | config X86_UP_APIC |
894 | bool "Local APIC support on uniprocessors" | 888 | bool "Local APIC support on uniprocessors" if !PCI_MSI |
889 | default PCI_MSI | ||
895 | depends on X86_32 && !SMP && !X86_32_NON_STANDARD | 890 | depends on X86_32 && !SMP && !X86_32_NON_STANDARD |
896 | ---help--- | 891 | ---help--- |
897 | A local APIC (Advanced Programmable Interrupt Controller) is an | 892 | A local APIC (Advanced Programmable Interrupt Controller) is an |
@@ -903,10 +898,6 @@ config X86_UP_APIC | |||
903 | performance counters), and the NMI watchdog which detects hard | 898 | performance counters), and the NMI watchdog which detects hard |
904 | lockups. | 899 | lockups. |
905 | 900 | ||
906 | config X86_UP_APIC_MSI | ||
907 | def_bool y | ||
908 | select X86_UP_APIC if X86_32 && !SMP && !X86_32_NON_STANDARD && PCI_MSI | ||
909 | |||
910 | config X86_UP_IOAPIC | 901 | config X86_UP_IOAPIC |
911 | bool "IO-APIC support on uniprocessors" | 902 | bool "IO-APIC support on uniprocessors" |
912 | depends on X86_UP_APIC | 903 | depends on X86_UP_APIC |
@@ -925,8 +916,8 @@ config X86_LOCAL_APIC | |||
925 | select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ | 916 | select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ |
926 | 917 | ||
927 | config X86_IO_APIC | 918 | config X86_IO_APIC |
928 | def_bool X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC | 919 | def_bool y |
929 | depends on X86_LOCAL_APIC | 920 | depends on X86_LOCAL_APIC || X86_UP_IOAPIC |
930 | select IRQ_DOMAIN | 921 | select IRQ_DOMAIN |
931 | 922 | ||
932 | config X86_REROUTE_FOR_BROKEN_BOOT_IRQS | 923 | config X86_REROUTE_FOR_BROKEN_BOOT_IRQS |
@@ -1145,10 +1136,10 @@ config MICROCODE_OLD_INTERFACE | |||
1145 | depends on MICROCODE | 1136 | depends on MICROCODE |
1146 | 1137 | ||
1147 | config MICROCODE_INTEL_EARLY | 1138 | config MICROCODE_INTEL_EARLY |
1148 | def_bool n | 1139 | bool |
1149 | 1140 | ||
1150 | config MICROCODE_AMD_EARLY | 1141 | config MICROCODE_AMD_EARLY |
1151 | def_bool n | 1142 | bool |
1152 | 1143 | ||
1153 | config MICROCODE_EARLY | 1144 | config MICROCODE_EARLY |
1154 | bool "Early load microcode" | 1145 | bool "Early load microcode" |
@@ -1300,14 +1291,14 @@ config ARCH_DMA_ADDR_T_64BIT | |||
1300 | def_bool y | 1291 | def_bool y |
1301 | depends on X86_64 || HIGHMEM64G | 1292 | depends on X86_64 || HIGHMEM64G |
1302 | 1293 | ||
1303 | config DIRECT_GBPAGES | 1294 | config X86_DIRECT_GBPAGES |
1304 | bool "Enable 1GB pages for kernel pagetables" if EXPERT | 1295 | def_bool y |
1305 | default y | 1296 | depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK |
1306 | depends on X86_64 | ||
1307 | ---help--- | 1297 | ---help--- |
1308 | Allow the kernel linear mapping to use 1GB pages on CPUs that | 1298 | Certain kernel features effectively disable kernel |
1309 | support it. This can improve the kernel's performance a tiny bit by | 1299 | linear 1 GB mappings (even if the CPU otherwise |
1310 | reducing TLB pressure. If in doubt, say "Y". | 1300 | supports them), so don't confuse the user by printing |
1301 | that we have them enabled. | ||
1311 | 1302 | ||
1312 | # Common NUMA Features | 1303 | # Common NUMA Features |
1313 | config NUMA | 1304 | config NUMA |
@@ -1430,6 +1421,16 @@ config ILLEGAL_POINTER_VALUE | |||
1430 | 1421 | ||
1431 | source "mm/Kconfig" | 1422 | source "mm/Kconfig" |
1432 | 1423 | ||
1424 | config X86_PMEM_LEGACY | ||
1425 | bool "Support non-standard NVDIMMs and ADR protected memory" | ||
1426 | help | ||
1427 | Treat memory marked using the non-standard e820 type of 12 as used | ||
1428 | by the Intel Sandy Bridge-EP reference BIOS as protected memory. | ||
1429 | The kernel will offer these regions to the 'pmem' driver so | ||
1430 | they can be used for persistent storage. | ||
1431 | |||
1432 | Say Y if unsure. | ||
1433 | |||
1433 | config HIGHPTE | 1434 | config HIGHPTE |
1434 | bool "Allocate 3rd-level pagetables from highmem" | 1435 | bool "Allocate 3rd-level pagetables from highmem" |
1435 | depends on HIGHMEM | 1436 | depends on HIGHMEM |
@@ -1747,14 +1748,11 @@ config KEXEC_VERIFY_SIG | |||
1747 | depends on KEXEC_FILE | 1748 | depends on KEXEC_FILE |
1748 | ---help--- | 1749 | ---help--- |
1749 | This option makes kernel signature verification mandatory for | 1750 | This option makes kernel signature verification mandatory for |
1750 | kexec_file_load() syscall. If kernel is signature can not be | 1751 | the kexec_file_load() syscall. |
1751 | verified, kexec_file_load() will fail. | 1752 | |
1752 | 1753 | In addition to that option, you need to enable signature | |
1753 | This option enforces signature verification at generic level. | 1754 | verification for the corresponding kernel image type being |
1754 | One needs to enable signature verification for type of kernel | 1755 | loaded in order for this to work. |
1755 | image being loaded to make sure it works. For example, enable | ||
1756 | bzImage signature verification option to be able to load and | ||
1757 | verify signatures of bzImage. Otherwise kernel loading will fail. | ||
1758 | 1756 | ||
1759 | config KEXEC_BZIMAGE_VERIFY_SIG | 1757 | config KEXEC_BZIMAGE_VERIFY_SIG |
1760 | bool "Enable bzImage signature verification support" | 1758 | bool "Enable bzImage signature verification support" |
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5ba2d9ce82dc..2fda005bb334 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -63,7 +63,7 @@ ifeq ($(CONFIG_X86_32),y) | |||
63 | $(call cc-option,-fno-unit-at-a-time)) | 63 | $(call cc-option,-fno-unit-at-a-time)) |
64 | 64 | ||
65 | # CPU-specific tuning. Anything which can be shared with UML should go here. | 65 | # CPU-specific tuning. Anything which can be shared with UML should go here. |
66 | include $(srctree)/arch/x86/Makefile_32.cpu | 66 | include arch/x86/Makefile_32.cpu |
67 | KBUILD_CFLAGS += $(cflags-y) | 67 | KBUILD_CFLAGS += $(cflags-y) |
68 | 68 | ||
69 | # temporary until string.h is fixed | 69 | # temporary until string.h is fixed |
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 95eba554baf9..5b7e898ffd9a 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -18,7 +18,7 @@ LDS_EXTRA := -Ui386 | |||
18 | export LDS_EXTRA | 18 | export LDS_EXTRA |
19 | 19 | ||
20 | # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y. | 20 | # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y. |
21 | include $(srctree)/arch/x86/Makefile_32.cpu | 21 | include arch/x86/Makefile_32.cpu |
22 | 22 | ||
23 | # prevent gcc from keeping the stack 16 byte aligned. Taken from i386. | 23 | # prevent gcc from keeping the stack 16 byte aligned. Taken from i386. |
24 | cflags-y += $(call cc-option,-mpreferred-stack-boundary=2) | 24 | cflags-y += $(call cc-option,-mpreferred-stack-boundary=2) |
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index bb1376381985..d7b1f655b3ef 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -295,7 +295,8 @@ static unsigned long find_random_addr(unsigned long minimum, | |||
295 | return slots_fetch_random(); | 295 | return slots_fetch_random(); |
296 | } | 296 | } |
297 | 297 | ||
298 | unsigned char *choose_kernel_location(unsigned char *input, | 298 | unsigned char *choose_kernel_location(struct boot_params *boot_params, |
299 | unsigned char *input, | ||
299 | unsigned long input_size, | 300 | unsigned long input_size, |
300 | unsigned char *output, | 301 | unsigned char *output, |
301 | unsigned long output_size) | 302 | unsigned long output_size) |
@@ -315,6 +316,8 @@ unsigned char *choose_kernel_location(unsigned char *input, | |||
315 | } | 316 | } |
316 | #endif | 317 | #endif |
317 | 318 | ||
319 | boot_params->hdr.loadflags |= KASLR_FLAG; | ||
320 | |||
318 | /* Record the various known unsafe memory ranges. */ | 321 | /* Record the various known unsafe memory ranges. */ |
319 | mem_avoid_init((unsigned long)input, input_size, | 322 | mem_avoid_init((unsigned long)input, input_size, |
320 | (unsigned long)output, output_size); | 323 | (unsigned long)output, output_size); |
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1d7fbbcc196d..8ef964ddc18e 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -29,6 +29,7 @@ | |||
29 | #include <asm/page_types.h> | 29 | #include <asm/page_types.h> |
30 | #include <asm/boot.h> | 30 | #include <asm/boot.h> |
31 | #include <asm/asm-offsets.h> | 31 | #include <asm/asm-offsets.h> |
32 | #include <asm/bootparam.h> | ||
32 | 33 | ||
33 | __HEAD | 34 | __HEAD |
34 | ENTRY(startup_32) | 35 | ENTRY(startup_32) |
@@ -102,7 +103,7 @@ preferred_addr: | |||
102 | * Test KEEP_SEGMENTS flag to see if the bootloader is asking | 103 | * Test KEEP_SEGMENTS flag to see if the bootloader is asking |
103 | * us to not reload segments | 104 | * us to not reload segments |
104 | */ | 105 | */ |
105 | testb $(1<<6), BP_loadflags(%esi) | 106 | testb $KEEP_SEGMENTS, BP_loadflags(%esi) |
106 | jnz 1f | 107 | jnz 1f |
107 | 108 | ||
108 | cli | 109 | cli |
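This hunk (and the identical one in head_64.S below) replaces the magic "1<<6" with the KEEP_SEGMENTS name now pulled in via <asm/bootparam.h>; the bit being tested is unchanged. A minimal standalone C sketch of the same flag test follows; the sample loadflags value and the helper function are illustrative only.

/* Standalone sketch of the flag test above: KEEP_SEGMENTS is bit 6 of
 * boot_params.hdr.loadflags (the hunk swaps the literal 1<<6 for the
 * named constant). The sample loadflags value is made up for the demo. */
#include <stdio.h>
#include <stdint.h>

#define KEEP_SEGMENTS (1 << 6)  /* same bit the old "testb $(1<<6)" checked */

static int keep_segments(uint8_t loadflags)
{
    return (loadflags & KEEP_SEGMENTS) != 0;
}

int main(void)
{
    uint8_t loadflags = 0x41;   /* hypothetical bootloader-provided value */

    printf("KEEP_SEGMENTS %s\n",
           keep_segments(loadflags) ? "set: skip segment reload"
                                    : "clear: reload segments");
    return 0;
}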
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 6b1766c6c082..b0c0d16ef58d 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -31,6 +31,7 @@ | |||
31 | #include <asm/msr.h> | 31 | #include <asm/msr.h> |
32 | #include <asm/processor-flags.h> | 32 | #include <asm/processor-flags.h> |
33 | #include <asm/asm-offsets.h> | 33 | #include <asm/asm-offsets.h> |
34 | #include <asm/bootparam.h> | ||
34 | 35 | ||
35 | __HEAD | 36 | __HEAD |
36 | .code32 | 37 | .code32 |
@@ -46,7 +47,7 @@ ENTRY(startup_32) | |||
46 | * Test KEEP_SEGMENTS flag to see if the bootloader is asking | 47 | * Test KEEP_SEGMENTS flag to see if the bootloader is asking |
47 | * us to not reload segments | 48 | * us to not reload segments |
48 | */ | 49 | */ |
49 | testb $(1<<6), BP_loadflags(%esi) | 50 | testb $KEEP_SEGMENTS, BP_loadflags(%esi) |
50 | jnz 1f | 51 | jnz 1f |
51 | 52 | ||
52 | cli | 53 | cli |
@@ -164,7 +165,7 @@ ENTRY(startup_32) | |||
164 | /* After gdt is loaded */ | 165 | /* After gdt is loaded */ |
165 | xorl %eax, %eax | 166 | xorl %eax, %eax |
166 | lldt %ax | 167 | lldt %ax |
167 | movl $0x20, %eax | 168 | movl $__BOOT_TSS, %eax |
168 | ltr %ax | 169 | ltr %ax |
169 | 170 | ||
170 | /* | 171 | /* |
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a950864a64da..a107b935e22f 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -377,6 +377,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, | |||
377 | 377 | ||
378 | real_mode = rmode; | 378 | real_mode = rmode; |
379 | 379 | ||
380 | /* Clear it for solely in-kernel use */ | ||
381 | real_mode->hdr.loadflags &= ~KASLR_FLAG; | ||
382 | |||
380 | sanitize_boot_params(real_mode); | 383 | sanitize_boot_params(real_mode); |
381 | 384 | ||
382 | if (real_mode->screen_info.orig_video_mode == 7) { | 385 | if (real_mode->screen_info.orig_video_mode == 7) { |
@@ -401,7 +404,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, | |||
401 | * the entire decompressed kernel plus relocation table, or the | 404 | * the entire decompressed kernel plus relocation table, or the |
402 | * entire decompressed kernel plus .bss and .brk sections. | 405 | * entire decompressed kernel plus .bss and .brk sections. |
403 | */ | 406 | */ |
404 | output = choose_kernel_location(input_data, input_len, output, | 407 | output = choose_kernel_location(real_mode, input_data, input_len, output, |
405 | output_len > run_size ? output_len | 408 | output_len > run_size ? output_len |
406 | : run_size); | 409 | : run_size); |
407 | 410 | ||
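Taken together with the aslr.c hunk earlier, this forms a small handshake: decompress_kernel() first clears KASLR_FLAG in the boot_params copy, and choose_kernel_location(), which now receives that boot_params pointer, sets the flag only when it actually randomizes the load address, so later consumers see an accurate bit. The standalone C sketch below models that clear-then-set pattern; the structures, the bit position and the kaslr_requested argument are illustrative, not taken from the patch.

/* Standalone sketch of the KASLR_FLAG handshake visible in the aslr.c and
 * misc.c hunks: clear the flag up front, set it only when randomization is
 * actually applied, so the bit can be trusted afterwards. */
#include <stdio.h>
#include <stdint.h>

#define KASLR_FLAG (1 << 1)     /* illustrative bit position, not from the patch */

struct fake_setup_header { uint8_t loadflags; };
struct fake_boot_params { struct fake_setup_header hdr; };

static void choose_kernel_location(struct fake_boot_params *bp, int kaslr_requested)
{
    if (kaslr_requested)
        bp->hdr.loadflags |= KASLR_FLAG;    /* record that we moved the kernel */
}

int main(void)
{
    struct fake_boot_params bp = { .hdr = { .loadflags = KASLR_FLAG } };

    bp.hdr.loadflags &= ~KASLR_FLAG;        /* clear stale state, as misc.c does */
    choose_kernel_location(&bp, 1);
    printf("KASLR %s\n", (bp.hdr.loadflags & KASLR_FLAG) ? "applied" : "not applied");
    return 0;
}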
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 04477d68403f..89dd0d78013a 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -57,7 +57,8 @@ int cmdline_find_option_bool(const char *option); | |||
57 | 57 | ||
58 | #if CONFIG_RANDOMIZE_BASE | 58 | #if CONFIG_RANDOMIZE_BASE |
59 | /* aslr.c */ | 59 | /* aslr.c */ |
60 | unsigned char *choose_kernel_location(unsigned char *input, | 60 | unsigned char *choose_kernel_location(struct boot_params *boot_params, |
61 | unsigned char *input, | ||
61 | unsigned long input_size, | 62 | unsigned long input_size, |
62 | unsigned char *output, | 63 | unsigned char *output, |
63 | unsigned long output_size); | 64 | unsigned long output_size); |
@@ -65,7 +66,8 @@ unsigned char *choose_kernel_location(unsigned char *input, | |||
65 | bool has_cpuflag(int flag); | 66 | bool has_cpuflag(int flag); |
66 | #else | 67 | #else |
67 | static inline | 68 | static inline |
68 | unsigned char *choose_kernel_location(unsigned char *input, | 69 | unsigned char *choose_kernel_location(struct boot_params *boot_params, |
70 | unsigned char *input, | ||
69 | unsigned long input_size, | 71 | unsigned long input_size, |
70 | unsigned char *output, | 72 | unsigned char *output, |
71 | unsigned long output_size) | 73 | unsigned long output_size) |
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 493f3fd9f139..318b8465d302 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -30,7 +30,7 @@ int strcmp(const char *str1, const char *str2) | |||
30 | int delta = 0; | 30 | int delta = 0; |
31 | 31 | ||
32 | while (*s1 || *s2) { | 32 | while (*s1 || *s2) { |
33 | delta = *s2 - *s1; | 33 | delta = *s1 - *s2; |
34 | if (delta) | 34 | if (delta) |
35 | return delta; | 35 | return delta; |
36 | s1++; | 36 | s1++; |
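The one-line change above restores the conventional strcmp() sign: a negative result when str1 sorts before str2, positive when after. A minimal standalone C sketch of the corrected comparison, using a simplified local my_strcmp() rather than the boot-code version:

/* Standalone sketch of the sign convention this hunk restores. */
#include <stdio.h>

static int my_strcmp(const char *s1, const char *s2)
{
    while (*s1 || *s2) {
        int delta = *s1 - *s2;  /* fixed operand order: *s1 - *s2 */

        if (delta)
            return delta;
        s1++;
        s2++;
    }
    return 0;
}

int main(void)
{
    printf("%d\n", my_strcmp("abc", "abd") < 0);    /* 1: "abc" < "abd" */
    printf("%d\n", my_strcmp("abc", "abc") == 0);   /* 1: equal strings */
    printf("%d\n", my_strcmp("abd", "abc") > 0);    /* 1: "abd" > "abc" */
    return 0;
}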
diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c
index 748e8d06290a..aa8a96b052e3 100644
--- a/arch/x86/boot/video-mode.c
+++ b/arch/x86/boot/video-mode.c
@@ -22,10 +22,8 @@ | |||
22 | /* | 22 | /* |
23 | * Common variables | 23 | * Common variables |
24 | */ | 24 | */ |
25 | int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */ | 25 | int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */ |
26 | u16 video_segment; | ||
27 | int force_x, force_y; /* Don't query the BIOS for cols/rows */ | 26 | int force_x, force_y; /* Don't query the BIOS for cols/rows */ |
28 | |||
29 | int do_restore; /* Screen contents changed during mode flip */ | 27 | int do_restore; /* Screen contents changed during mode flip */ |
30 | int graphic_mode; /* Graphic mode with linear frame buffer */ | 28 | int graphic_mode; /* Graphic mode with linear frame buffer */ |
31 | 29 | ||
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 43eda284d27f..05111bb8d018 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -17,6 +17,8 @@ | |||
17 | #include "video.h" | 17 | #include "video.h" |
18 | #include "vesa.h" | 18 | #include "vesa.h" |
19 | 19 | ||
20 | static u16 video_segment; | ||
21 | |||
20 | static void store_cursor_position(void) | 22 | static void store_cursor_position(void) |
21 | { | 23 | { |
22 | struct biosregs ireg, oreg; | 24 | struct biosregs ireg, oreg; |
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index 0bb25491262d..b54e0328c449 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -91,7 +91,6 @@ int mode_defined(u16 mode); /* video.c */ | |||
91 | #define ADAPTER_VGA 2 | 91 | #define ADAPTER_VGA 2 |
92 | 92 | ||
93 | extern int adapter; | 93 | extern int adapter; |
94 | extern u16 video_segment; | ||
95 | extern int force_x, force_y; /* Don't query the BIOS for cols/rows */ | 94 | extern int force_x, force_y; /* Don't query the BIOS for cols/rows */ |
96 | extern int do_restore; /* Restore screen contents */ | 95 | extern int do_restore; /* Restore screen contents */ |
97 | extern int graphic_mode; /* Graphics mode with linear frame buffer */ | 96 | extern int graphic_mode; /* Graphics mode with linear frame buffer */ |
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 419819d6dab3..aaa1118bf01e 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -248,7 +248,7 @@ CONFIG_USB=y | |||
248 | CONFIG_USB_ANNOUNCE_NEW_DEVICES=y | 248 | CONFIG_USB_ANNOUNCE_NEW_DEVICES=y |
249 | CONFIG_USB_MON=y | 249 | CONFIG_USB_MON=y |
250 | CONFIG_USB_EHCI_HCD=y | 250 | CONFIG_USB_EHCI_HCD=y |
251 | # CONFIG_USB_EHCI_TT_NEWSCHED is not set | 251 | CONFIG_USB_EHCI_TT_NEWSCHED=y |
252 | CONFIG_USB_OHCI_HCD=y | 252 | CONFIG_USB_OHCI_HCD=y |
253 | CONFIG_USB_UHCI_HCD=y | 253 | CONFIG_USB_UHCI_HCD=y |
254 | CONFIG_USB_PRINTER=y | 254 | CONFIG_USB_PRINTER=y |
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 4c311ddd973b..315b86106572 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -243,7 +243,7 @@ CONFIG_USB=y | |||
243 | CONFIG_USB_ANNOUNCE_NEW_DEVICES=y | 243 | CONFIG_USB_ANNOUNCE_NEW_DEVICES=y |
244 | CONFIG_USB_MON=y | 244 | CONFIG_USB_MON=y |
245 | CONFIG_USB_EHCI_HCD=y | 245 | CONFIG_USB_EHCI_HCD=y |
246 | # CONFIG_USB_EHCI_TT_NEWSCHED is not set | 246 | CONFIG_USB_EHCI_TT_NEWSCHED=y |
247 | CONFIG_USB_OHCI_HCD=y | 247 | CONFIG_USB_OHCI_HCD=y |
248 | CONFIG_USB_UHCI_HCD=y | 248 | CONFIG_USB_UHCI_HCD=y |
249 | CONFIG_USB_PRINTER=y | 249 | CONFIG_USB_PRINTER=y |
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 54f60ab41c63..112cefacf2af 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -797,7 +797,9 @@ static int rfc4106_init(struct crypto_tfm *tfm) | |||
797 | PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); | 797 | PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); |
798 | struct crypto_aead *cryptd_child; | 798 | struct crypto_aead *cryptd_child; |
799 | struct aesni_rfc4106_gcm_ctx *child_ctx; | 799 | struct aesni_rfc4106_gcm_ctx *child_ctx; |
800 | cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0); | 800 | cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", |
801 | CRYPTO_ALG_INTERNAL, | ||
802 | CRYPTO_ALG_INTERNAL); | ||
801 | if (IS_ERR(cryptd_tfm)) | 803 | if (IS_ERR(cryptd_tfm)) |
802 | return PTR_ERR(cryptd_tfm); | 804 | return PTR_ERR(cryptd_tfm); |
803 | 805 | ||
@@ -890,15 +892,12 @@ out_free_ablkcipher: | |||
890 | return ret; | 892 | return ret; |
891 | } | 893 | } |
892 | 894 | ||
893 | static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, | 895 | static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, |
894 | unsigned int key_len) | 896 | unsigned int key_len) |
895 | { | 897 | { |
896 | int ret = 0; | 898 | int ret = 0; |
897 | struct crypto_tfm *tfm = crypto_aead_tfm(parent); | 899 | struct crypto_tfm *tfm = crypto_aead_tfm(aead); |
898 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); | 900 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); |
899 | struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); | ||
900 | struct aesni_rfc4106_gcm_ctx *child_ctx = | ||
901 | aesni_rfc4106_gcm_ctx_get(cryptd_child); | ||
902 | u8 *new_key_align, *new_key_mem = NULL; | 901 | u8 *new_key_align, *new_key_mem = NULL; |
903 | 902 | ||
904 | if (key_len < 4) { | 903 | if (key_len < 4) { |
@@ -943,20 +942,31 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, | |||
943 | goto exit; | 942 | goto exit; |
944 | } | 943 | } |
945 | ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); | 944 | ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); |
946 | memcpy(child_ctx, ctx, sizeof(*ctx)); | ||
947 | exit: | 945 | exit: |
948 | kfree(new_key_mem); | 946 | kfree(new_key_mem); |
949 | return ret; | 947 | return ret; |
950 | } | 948 | } |
951 | 949 | ||
952 | /* This is the Integrity Check Value (aka the authentication tag length and can | 950 | static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, |
953 | * be 8, 12 or 16 bytes long. */ | 951 | unsigned int key_len) |
954 | static int rfc4106_set_authsize(struct crypto_aead *parent, | ||
955 | unsigned int authsize) | ||
956 | { | 952 | { |
957 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); | 953 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); |
958 | struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); | 954 | struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm); |
955 | struct aesni_rfc4106_gcm_ctx *c_ctx = aesni_rfc4106_gcm_ctx_get(child); | ||
956 | struct cryptd_aead *cryptd_tfm = ctx->cryptd_tfm; | ||
957 | int ret; | ||
959 | 958 | ||
959 | ret = crypto_aead_setkey(child, key, key_len); | ||
960 | if (!ret) { | ||
961 | memcpy(ctx, c_ctx, sizeof(*ctx)); | ||
962 | ctx->cryptd_tfm = cryptd_tfm; | ||
963 | } | ||
964 | return ret; | ||
965 | } | ||
966 | |||
967 | static int common_rfc4106_set_authsize(struct crypto_aead *aead, | ||
968 | unsigned int authsize) | ||
969 | { | ||
960 | switch (authsize) { | 970 | switch (authsize) { |
961 | case 8: | 971 | case 8: |
962 | case 12: | 972 | case 12: |
@@ -965,51 +975,23 @@ static int rfc4106_set_authsize(struct crypto_aead *parent, | |||
965 | default: | 975 | default: |
966 | return -EINVAL; | 976 | return -EINVAL; |
967 | } | 977 | } |
968 | crypto_aead_crt(parent)->authsize = authsize; | 978 | crypto_aead_crt(aead)->authsize = authsize; |
969 | crypto_aead_crt(cryptd_child)->authsize = authsize; | ||
970 | return 0; | 979 | return 0; |
971 | } | 980 | } |
972 | 981 | ||
973 | static int rfc4106_encrypt(struct aead_request *req) | 982 | /* This is the Integrity Check Value (aka the authentication tag length and can |
974 | { | 983 | * be 8, 12 or 16 bytes long. */ |
975 | int ret; | 984 | static int rfc4106_set_authsize(struct crypto_aead *parent, |
976 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | 985 | unsigned int authsize) |
977 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); | ||
978 | |||
979 | if (!irq_fpu_usable()) { | ||
980 | struct aead_request *cryptd_req = | ||
981 | (struct aead_request *) aead_request_ctx(req); | ||
982 | memcpy(cryptd_req, req, sizeof(*req)); | ||
983 | aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); | ||
984 | return crypto_aead_encrypt(cryptd_req); | ||
985 | } else { | ||
986 | struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); | ||
987 | kernel_fpu_begin(); | ||
988 | ret = cryptd_child->base.crt_aead.encrypt(req); | ||
989 | kernel_fpu_end(); | ||
990 | return ret; | ||
991 | } | ||
992 | } | ||
993 | |||
994 | static int rfc4106_decrypt(struct aead_request *req) | ||
995 | { | 986 | { |
987 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); | ||
988 | struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm); | ||
996 | int ret; | 989 | int ret; |
997 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | ||
998 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); | ||
999 | 990 | ||
1000 | if (!irq_fpu_usable()) { | 991 | ret = crypto_aead_setauthsize(child, authsize); |
1001 | struct aead_request *cryptd_req = | 992 | if (!ret) |
1002 | (struct aead_request *) aead_request_ctx(req); | 993 | crypto_aead_crt(parent)->authsize = authsize; |
1003 | memcpy(cryptd_req, req, sizeof(*req)); | 994 | return ret; |
1004 | aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); | ||
1005 | return crypto_aead_decrypt(cryptd_req); | ||
1006 | } else { | ||
1007 | struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); | ||
1008 | kernel_fpu_begin(); | ||
1009 | ret = cryptd_child->base.crt_aead.decrypt(req); | ||
1010 | kernel_fpu_end(); | ||
1011 | return ret; | ||
1012 | } | ||
1013 | } | 995 | } |
1014 | 996 | ||
1015 | static int __driver_rfc4106_encrypt(struct aead_request *req) | 997 | static int __driver_rfc4106_encrypt(struct aead_request *req) |
@@ -1185,6 +1167,78 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) | |||
1185 | } | 1167 | } |
1186 | return retval; | 1168 | return retval; |
1187 | } | 1169 | } |
1170 | |||
1171 | static int rfc4106_encrypt(struct aead_request *req) | ||
1172 | { | ||
1173 | int ret; | ||
1174 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | ||
1175 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); | ||
1176 | |||
1177 | if (!irq_fpu_usable()) { | ||
1178 | struct aead_request *cryptd_req = | ||
1179 | (struct aead_request *) aead_request_ctx(req); | ||
1180 | |||
1181 | memcpy(cryptd_req, req, sizeof(*req)); | ||
1182 | aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); | ||
1183 | ret = crypto_aead_encrypt(cryptd_req); | ||
1184 | } else { | ||
1185 | kernel_fpu_begin(); | ||
1186 | ret = __driver_rfc4106_encrypt(req); | ||
1187 | kernel_fpu_end(); | ||
1188 | } | ||
1189 | return ret; | ||
1190 | } | ||
1191 | |||
1192 | static int rfc4106_decrypt(struct aead_request *req) | ||
1193 | { | ||
1194 | int ret; | ||
1195 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | ||
1196 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); | ||
1197 | |||
1198 | if (!irq_fpu_usable()) { | ||
1199 | struct aead_request *cryptd_req = | ||
1200 | (struct aead_request *) aead_request_ctx(req); | ||
1201 | |||
1202 | memcpy(cryptd_req, req, sizeof(*req)); | ||
1203 | aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); | ||
1204 | ret = crypto_aead_decrypt(cryptd_req); | ||
1205 | } else { | ||
1206 | kernel_fpu_begin(); | ||
1207 | ret = __driver_rfc4106_decrypt(req); | ||
1208 | kernel_fpu_end(); | ||
1209 | } | ||
1210 | return ret; | ||
1211 | } | ||
1212 | |||
1213 | static int helper_rfc4106_encrypt(struct aead_request *req) | ||
1214 | { | ||
1215 | int ret; | ||
1216 | |||
1217 | if (unlikely(!irq_fpu_usable())) { | ||
1218 | WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context"); | ||
1219 | ret = -EINVAL; | ||
1220 | } else { | ||
1221 | kernel_fpu_begin(); | ||
1222 | ret = __driver_rfc4106_encrypt(req); | ||
1223 | kernel_fpu_end(); | ||
1224 | } | ||
1225 | return ret; | ||
1226 | } | ||
1227 | |||
1228 | static int helper_rfc4106_decrypt(struct aead_request *req) | ||
1229 | { | ||
1230 | int ret; | ||
1231 | |||
1232 | if (unlikely(!irq_fpu_usable())) { | ||
1233 | WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context"); | ||
1234 | ret = -EINVAL; | ||
1235 | } else { | ||
1236 | kernel_fpu_begin(); | ||
1237 | ret = __driver_rfc4106_decrypt(req); | ||
1238 | kernel_fpu_end(); | ||
1239 | } | ||
1240 | return ret; | ||
1241 | } | ||
1188 | #endif | 1242 | #endif |
1189 | 1243 | ||
1190 | static struct crypto_alg aesni_algs[] = { { | 1244 | static struct crypto_alg aesni_algs[] = { { |
@@ -1210,7 +1264,7 @@ static struct crypto_alg aesni_algs[] = { { | |||
1210 | .cra_name = "__aes-aesni", | 1264 | .cra_name = "__aes-aesni", |
1211 | .cra_driver_name = "__driver-aes-aesni", | 1265 | .cra_driver_name = "__driver-aes-aesni", |
1212 | .cra_priority = 0, | 1266 | .cra_priority = 0, |
1213 | .cra_flags = CRYPTO_ALG_TYPE_CIPHER, | 1267 | .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL, |
1214 | .cra_blocksize = AES_BLOCK_SIZE, | 1268 | .cra_blocksize = AES_BLOCK_SIZE, |
1215 | .cra_ctxsize = sizeof(struct crypto_aes_ctx) + | 1269 | .cra_ctxsize = sizeof(struct crypto_aes_ctx) + |
1216 | AESNI_ALIGN - 1, | 1270 | AESNI_ALIGN - 1, |
@@ -1229,7 +1283,8 @@ static struct crypto_alg aesni_algs[] = { { | |||
1229 | .cra_name = "__ecb-aes-aesni", | 1283 | .cra_name = "__ecb-aes-aesni", |
1230 | .cra_driver_name = "__driver-ecb-aes-aesni", | 1284 | .cra_driver_name = "__driver-ecb-aes-aesni", |
1231 | .cra_priority = 0, | 1285 | .cra_priority = 0, |
1232 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 1286 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
1287 | CRYPTO_ALG_INTERNAL, | ||
1233 | .cra_blocksize = AES_BLOCK_SIZE, | 1288 | .cra_blocksize = AES_BLOCK_SIZE, |
1234 | .cra_ctxsize = sizeof(struct crypto_aes_ctx) + | 1289 | .cra_ctxsize = sizeof(struct crypto_aes_ctx) + |
1235 | AESNI_ALIGN - 1, | 1290 | AESNI_ALIGN - 1, |
@@ -1249,7 +1304,8 @@ static struct crypto_alg aesni_algs[] = { { | |||
1249 | .cra_name = "__cbc-aes-aesni", | 1304 | .cra_name = "__cbc-aes-aesni", |
1250 | .cra_driver_name = "__driver-cbc-aes-aesni", | 1305 | .cra_driver_name = "__driver-cbc-aes-aesni", |
1251 | .cra_priority = 0, | 1306 | .cra_priority = 0, |
1252 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 1307 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
1308 | CRYPTO_ALG_INTERNAL, | ||
1253 | .cra_blocksize = AES_BLOCK_SIZE, | 1309 | .cra_blocksize = AES_BLOCK_SIZE, |
1254 | .cra_ctxsize = sizeof(struct crypto_aes_ctx) + | 1310 | .cra_ctxsize = sizeof(struct crypto_aes_ctx) + |
1255 | AESNI_ALIGN - 1, | 1311 | AESNI_ALIGN - 1, |
@@ -1313,7 +1369,8 @@ static struct crypto_alg aesni_algs[] = { { | |||
1313 | .cra_name = "__ctr-aes-aesni", | 1369 | .cra_name = "__ctr-aes-aesni", |
1314 | .cra_driver_name = "__driver-ctr-aes-aesni", | 1370 | .cra_driver_name = "__driver-ctr-aes-aesni", |
1315 | .cra_priority = 0, | 1371 | .cra_priority = 0, |
1316 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 1372 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
1373 | CRYPTO_ALG_INTERNAL, | ||
1317 | .cra_blocksize = 1, | 1374 | .cra_blocksize = 1, |
1318 | .cra_ctxsize = sizeof(struct crypto_aes_ctx) + | 1375 | .cra_ctxsize = sizeof(struct crypto_aes_ctx) + |
1319 | AESNI_ALIGN - 1, | 1376 | AESNI_ALIGN - 1, |
@@ -1357,7 +1414,7 @@ static struct crypto_alg aesni_algs[] = { { | |||
1357 | .cra_name = "__gcm-aes-aesni", | 1414 | .cra_name = "__gcm-aes-aesni", |
1358 | .cra_driver_name = "__driver-gcm-aes-aesni", | 1415 | .cra_driver_name = "__driver-gcm-aes-aesni", |
1359 | .cra_priority = 0, | 1416 | .cra_priority = 0, |
1360 | .cra_flags = CRYPTO_ALG_TYPE_AEAD, | 1417 | .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_INTERNAL, |
1361 | .cra_blocksize = 1, | 1418 | .cra_blocksize = 1, |
1362 | .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + | 1419 | .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + |
1363 | AESNI_ALIGN, | 1420 | AESNI_ALIGN, |
@@ -1366,8 +1423,12 @@ static struct crypto_alg aesni_algs[] = { { | |||
1366 | .cra_module = THIS_MODULE, | 1423 | .cra_module = THIS_MODULE, |
1367 | .cra_u = { | 1424 | .cra_u = { |
1368 | .aead = { | 1425 | .aead = { |
1369 | .encrypt = __driver_rfc4106_encrypt, | 1426 | .setkey = common_rfc4106_set_key, |
1370 | .decrypt = __driver_rfc4106_decrypt, | 1427 | .setauthsize = common_rfc4106_set_authsize, |
1428 | .encrypt = helper_rfc4106_encrypt, | ||
1429 | .decrypt = helper_rfc4106_decrypt, | ||
1430 | .ivsize = 8, | ||
1431 | .maxauthsize = 16, | ||
1371 | }, | 1432 | }, |
1372 | }, | 1433 | }, |
1373 | }, { | 1434 | }, { |
@@ -1423,7 +1484,8 @@ static struct crypto_alg aesni_algs[] = { { | |||
1423 | .cra_name = "__lrw-aes-aesni", | 1484 | .cra_name = "__lrw-aes-aesni", |
1424 | .cra_driver_name = "__driver-lrw-aes-aesni", | 1485 | .cra_driver_name = "__driver-lrw-aes-aesni", |
1425 | .cra_priority = 0, | 1486 | .cra_priority = 0, |
1426 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 1487 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
1488 | CRYPTO_ALG_INTERNAL, | ||
1427 | .cra_blocksize = AES_BLOCK_SIZE, | 1489 | .cra_blocksize = AES_BLOCK_SIZE, |
1428 | .cra_ctxsize = sizeof(struct aesni_lrw_ctx), | 1490 | .cra_ctxsize = sizeof(struct aesni_lrw_ctx), |
1429 | .cra_alignmask = 0, | 1491 | .cra_alignmask = 0, |
@@ -1444,7 +1506,8 @@ static struct crypto_alg aesni_algs[] = { { | |||
1444 | .cra_name = "__xts-aes-aesni", | 1506 | .cra_name = "__xts-aes-aesni", |
1445 | .cra_driver_name = "__driver-xts-aes-aesni", | 1507 | .cra_driver_name = "__driver-xts-aes-aesni", |
1446 | .cra_priority = 0, | 1508 | .cra_priority = 0, |
1447 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 1509 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
1510 | CRYPTO_ALG_INTERNAL, | ||
1448 | .cra_blocksize = AES_BLOCK_SIZE, | 1511 | .cra_blocksize = AES_BLOCK_SIZE, |
1449 | .cra_ctxsize = sizeof(struct aesni_xts_ctx), | 1512 | .cra_ctxsize = sizeof(struct aesni_xts_ctx), |
1450 | .cra_alignmask = 0, | 1513 | .cra_alignmask = 0, |
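Across this series the bare "__driver-*" and "__ghash*" helper algorithms gain CRYPTO_ALG_INTERNAL in cra_flags, and the wrappers request them with both type and mask set to CRYPTO_ALG_INTERNAL (see the cryptd_alloc_aead() call above), so only callers that explicitly ask for internal implementations can resolve them. The standalone C sketch below models that type/mask filtering; the match rule and flag value are a simplified illustration, not the crypto API's actual lookup code.

/* Standalone sketch of type/mask flag filtering as used by the
 * CRYPTO_ALG_INTERNAL changes in this series: a lookup matches only when
 * the bits selected by 'mask' in the algorithm's flags equal 'type'. */
#include <stdio.h>
#include <stdint.h>

#define ALG_INTERNAL (1u << 0)  /* stand-in for CRYPTO_ALG_INTERNAL */

static int flags_match(uint32_t alg_flags, uint32_t type, uint32_t mask)
{
    return ((alg_flags ^ type) & mask) == 0;
}

int main(void)
{
    uint32_t internal_alg = ALG_INTERNAL;   /* e.g. "__driver-gcm-aes-aesni" */
    uint32_t public_alg = 0;                /* e.g. the rfc4106 wrapper */

    /* Caller that explicitly asks for internal algs (type = mask = INTERNAL): */
    printf("%d\n", flags_match(internal_alg, ALG_INTERNAL, ALG_INTERNAL)); /* 1 */
    /* Ordinary caller that masks the internal bit but does not request it:   */
    printf("%d\n", flags_match(internal_alg, 0, ALG_INTERNAL));            /* 0 */
    printf("%d\n", flags_match(public_alg, 0, ALG_INTERNAL));              /* 1 */
    return 0;
}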
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 9a07fafe3831..baf0ac21ace5 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -343,7 +343,8 @@ static struct crypto_alg cmll_algs[10] = { { | |||
343 | .cra_name = "__ecb-camellia-aesni-avx2", | 343 | .cra_name = "__ecb-camellia-aesni-avx2", |
344 | .cra_driver_name = "__driver-ecb-camellia-aesni-avx2", | 344 | .cra_driver_name = "__driver-ecb-camellia-aesni-avx2", |
345 | .cra_priority = 0, | 345 | .cra_priority = 0, |
346 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 346 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
347 | CRYPTO_ALG_INTERNAL, | ||
347 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | 348 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, |
348 | .cra_ctxsize = sizeof(struct camellia_ctx), | 349 | .cra_ctxsize = sizeof(struct camellia_ctx), |
349 | .cra_alignmask = 0, | 350 | .cra_alignmask = 0, |
@@ -362,7 +363,8 @@ static struct crypto_alg cmll_algs[10] = { { | |||
362 | .cra_name = "__cbc-camellia-aesni-avx2", | 363 | .cra_name = "__cbc-camellia-aesni-avx2", |
363 | .cra_driver_name = "__driver-cbc-camellia-aesni-avx2", | 364 | .cra_driver_name = "__driver-cbc-camellia-aesni-avx2", |
364 | .cra_priority = 0, | 365 | .cra_priority = 0, |
365 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 366 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
367 | CRYPTO_ALG_INTERNAL, | ||
366 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | 368 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, |
367 | .cra_ctxsize = sizeof(struct camellia_ctx), | 369 | .cra_ctxsize = sizeof(struct camellia_ctx), |
368 | .cra_alignmask = 0, | 370 | .cra_alignmask = 0, |
@@ -381,7 +383,8 @@ static struct crypto_alg cmll_algs[10] = { { | |||
381 | .cra_name = "__ctr-camellia-aesni-avx2", | 383 | .cra_name = "__ctr-camellia-aesni-avx2", |
382 | .cra_driver_name = "__driver-ctr-camellia-aesni-avx2", | 384 | .cra_driver_name = "__driver-ctr-camellia-aesni-avx2", |
383 | .cra_priority = 0, | 385 | .cra_priority = 0, |
384 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 386 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
387 | CRYPTO_ALG_INTERNAL, | ||
385 | .cra_blocksize = 1, | 388 | .cra_blocksize = 1, |
386 | .cra_ctxsize = sizeof(struct camellia_ctx), | 389 | .cra_ctxsize = sizeof(struct camellia_ctx), |
387 | .cra_alignmask = 0, | 390 | .cra_alignmask = 0, |
@@ -401,7 +404,8 @@ static struct crypto_alg cmll_algs[10] = { { | |||
401 | .cra_name = "__lrw-camellia-aesni-avx2", | 404 | .cra_name = "__lrw-camellia-aesni-avx2", |
402 | .cra_driver_name = "__driver-lrw-camellia-aesni-avx2", | 405 | .cra_driver_name = "__driver-lrw-camellia-aesni-avx2", |
403 | .cra_priority = 0, | 406 | .cra_priority = 0, |
404 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 407 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
408 | CRYPTO_ALG_INTERNAL, | ||
405 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | 409 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, |
406 | .cra_ctxsize = sizeof(struct camellia_lrw_ctx), | 410 | .cra_ctxsize = sizeof(struct camellia_lrw_ctx), |
407 | .cra_alignmask = 0, | 411 | .cra_alignmask = 0, |
@@ -424,7 +428,8 @@ static struct crypto_alg cmll_algs[10] = { { | |||
424 | .cra_name = "__xts-camellia-aesni-avx2", | 428 | .cra_name = "__xts-camellia-aesni-avx2", |
425 | .cra_driver_name = "__driver-xts-camellia-aesni-avx2", | 429 | .cra_driver_name = "__driver-xts-camellia-aesni-avx2", |
426 | .cra_priority = 0, | 430 | .cra_priority = 0, |
427 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 431 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
432 | CRYPTO_ALG_INTERNAL, | ||
428 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | 433 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, |
429 | .cra_ctxsize = sizeof(struct camellia_xts_ctx), | 434 | .cra_ctxsize = sizeof(struct camellia_xts_ctx), |
430 | .cra_alignmask = 0, | 435 | .cra_alignmask = 0, |
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index ed38d959add6..78818a1e73e3 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -335,7 +335,8 @@ static struct crypto_alg cmll_algs[10] = { { | |||
335 | .cra_name = "__ecb-camellia-aesni", | 335 | .cra_name = "__ecb-camellia-aesni", |
336 | .cra_driver_name = "__driver-ecb-camellia-aesni", | 336 | .cra_driver_name = "__driver-ecb-camellia-aesni", |
337 | .cra_priority = 0, | 337 | .cra_priority = 0, |
338 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 338 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
339 | CRYPTO_ALG_INTERNAL, | ||
339 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | 340 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, |
340 | .cra_ctxsize = sizeof(struct camellia_ctx), | 341 | .cra_ctxsize = sizeof(struct camellia_ctx), |
341 | .cra_alignmask = 0, | 342 | .cra_alignmask = 0, |
@@ -354,7 +355,8 @@ static struct crypto_alg cmll_algs[10] = { { | |||
354 | .cra_name = "__cbc-camellia-aesni", | 355 | .cra_name = "__cbc-camellia-aesni", |
355 | .cra_driver_name = "__driver-cbc-camellia-aesni", | 356 | .cra_driver_name = "__driver-cbc-camellia-aesni", |
356 | .cra_priority = 0, | 357 | .cra_priority = 0, |
357 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 358 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
359 | CRYPTO_ALG_INTERNAL, | ||
358 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | 360 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, |
359 | .cra_ctxsize = sizeof(struct camellia_ctx), | 361 | .cra_ctxsize = sizeof(struct camellia_ctx), |
360 | .cra_alignmask = 0, | 362 | .cra_alignmask = 0, |
@@ -373,7 +375,8 @@ static struct crypto_alg cmll_algs[10] = { { | |||
373 | .cra_name = "__ctr-camellia-aesni", | 375 | .cra_name = "__ctr-camellia-aesni", |
374 | .cra_driver_name = "__driver-ctr-camellia-aesni", | 376 | .cra_driver_name = "__driver-ctr-camellia-aesni", |
375 | .cra_priority = 0, | 377 | .cra_priority = 0, |
376 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 378 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
379 | CRYPTO_ALG_INTERNAL, | ||
377 | .cra_blocksize = 1, | 380 | .cra_blocksize = 1, |
378 | .cra_ctxsize = sizeof(struct camellia_ctx), | 381 | .cra_ctxsize = sizeof(struct camellia_ctx), |
379 | .cra_alignmask = 0, | 382 | .cra_alignmask = 0, |
@@ -393,7 +396,8 @@ static struct crypto_alg cmll_algs[10] = { { | |||
393 | .cra_name = "__lrw-camellia-aesni", | 396 | .cra_name = "__lrw-camellia-aesni", |
394 | .cra_driver_name = "__driver-lrw-camellia-aesni", | 397 | .cra_driver_name = "__driver-lrw-camellia-aesni", |
395 | .cra_priority = 0, | 398 | .cra_priority = 0, |
396 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 399 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
400 | CRYPTO_ALG_INTERNAL, | ||
397 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | 401 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, |
398 | .cra_ctxsize = sizeof(struct camellia_lrw_ctx), | 402 | .cra_ctxsize = sizeof(struct camellia_lrw_ctx), |
399 | .cra_alignmask = 0, | 403 | .cra_alignmask = 0, |
@@ -416,7 +420,8 @@ static struct crypto_alg cmll_algs[10] = { { | |||
416 | .cra_name = "__xts-camellia-aesni", | 420 | .cra_name = "__xts-camellia-aesni", |
417 | .cra_driver_name = "__driver-xts-camellia-aesni", | 421 | .cra_driver_name = "__driver-xts-camellia-aesni", |
418 | .cra_priority = 0, | 422 | .cra_priority = 0, |
419 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 423 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
424 | CRYPTO_ALG_INTERNAL, | ||
420 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | 425 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, |
421 | .cra_ctxsize = sizeof(struct camellia_xts_ctx), | 426 | .cra_ctxsize = sizeof(struct camellia_xts_ctx), |
422 | .cra_alignmask = 0, | 427 | .cra_alignmask = 0, |
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 60ada677a928..236c80974457 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -341,7 +341,8 @@ static struct crypto_alg cast5_algs[6] = { { | |||
341 | .cra_name = "__ecb-cast5-avx", | 341 | .cra_name = "__ecb-cast5-avx", |
342 | .cra_driver_name = "__driver-ecb-cast5-avx", | 342 | .cra_driver_name = "__driver-ecb-cast5-avx", |
343 | .cra_priority = 0, | 343 | .cra_priority = 0, |
344 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 344 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
345 | CRYPTO_ALG_INTERNAL, | ||
345 | .cra_blocksize = CAST5_BLOCK_SIZE, | 346 | .cra_blocksize = CAST5_BLOCK_SIZE, |
346 | .cra_ctxsize = sizeof(struct cast5_ctx), | 347 | .cra_ctxsize = sizeof(struct cast5_ctx), |
347 | .cra_alignmask = 0, | 348 | .cra_alignmask = 0, |
@@ -360,7 +361,8 @@ static struct crypto_alg cast5_algs[6] = { { | |||
360 | .cra_name = "__cbc-cast5-avx", | 361 | .cra_name = "__cbc-cast5-avx", |
361 | .cra_driver_name = "__driver-cbc-cast5-avx", | 362 | .cra_driver_name = "__driver-cbc-cast5-avx", |
362 | .cra_priority = 0, | 363 | .cra_priority = 0, |
363 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 364 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
365 | CRYPTO_ALG_INTERNAL, | ||
364 | .cra_blocksize = CAST5_BLOCK_SIZE, | 366 | .cra_blocksize = CAST5_BLOCK_SIZE, |
365 | .cra_ctxsize = sizeof(struct cast5_ctx), | 367 | .cra_ctxsize = sizeof(struct cast5_ctx), |
366 | .cra_alignmask = 0, | 368 | .cra_alignmask = 0, |
@@ -379,7 +381,8 @@ static struct crypto_alg cast5_algs[6] = { { | |||
379 | .cra_name = "__ctr-cast5-avx", | 381 | .cra_name = "__ctr-cast5-avx", |
380 | .cra_driver_name = "__driver-ctr-cast5-avx", | 382 | .cra_driver_name = "__driver-ctr-cast5-avx", |
381 | .cra_priority = 0, | 383 | .cra_priority = 0, |
382 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 384 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
385 | CRYPTO_ALG_INTERNAL, | ||
383 | .cra_blocksize = 1, | 386 | .cra_blocksize = 1, |
384 | .cra_ctxsize = sizeof(struct cast5_ctx), | 387 | .cra_ctxsize = sizeof(struct cast5_ctx), |
385 | .cra_alignmask = 0, | 388 | .cra_alignmask = 0, |
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 0160f68a57ff..f448810ca4ac 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -372,7 +372,8 @@ static struct crypto_alg cast6_algs[10] = { { | |||
372 | .cra_name = "__ecb-cast6-avx", | 372 | .cra_name = "__ecb-cast6-avx", |
373 | .cra_driver_name = "__driver-ecb-cast6-avx", | 373 | .cra_driver_name = "__driver-ecb-cast6-avx", |
374 | .cra_priority = 0, | 374 | .cra_priority = 0, |
375 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 375 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
376 | CRYPTO_ALG_INTERNAL, | ||
376 | .cra_blocksize = CAST6_BLOCK_SIZE, | 377 | .cra_blocksize = CAST6_BLOCK_SIZE, |
377 | .cra_ctxsize = sizeof(struct cast6_ctx), | 378 | .cra_ctxsize = sizeof(struct cast6_ctx), |
378 | .cra_alignmask = 0, | 379 | .cra_alignmask = 0, |
@@ -391,7 +392,8 @@ static struct crypto_alg cast6_algs[10] = { { | |||
391 | .cra_name = "__cbc-cast6-avx", | 392 | .cra_name = "__cbc-cast6-avx", |
392 | .cra_driver_name = "__driver-cbc-cast6-avx", | 393 | .cra_driver_name = "__driver-cbc-cast6-avx", |
393 | .cra_priority = 0, | 394 | .cra_priority = 0, |
394 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 395 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
396 | CRYPTO_ALG_INTERNAL, | ||
395 | .cra_blocksize = CAST6_BLOCK_SIZE, | 397 | .cra_blocksize = CAST6_BLOCK_SIZE, |
396 | .cra_ctxsize = sizeof(struct cast6_ctx), | 398 | .cra_ctxsize = sizeof(struct cast6_ctx), |
397 | .cra_alignmask = 0, | 399 | .cra_alignmask = 0, |
@@ -410,7 +412,8 @@ static struct crypto_alg cast6_algs[10] = { { | |||
410 | .cra_name = "__ctr-cast6-avx", | 412 | .cra_name = "__ctr-cast6-avx", |
411 | .cra_driver_name = "__driver-ctr-cast6-avx", | 413 | .cra_driver_name = "__driver-ctr-cast6-avx", |
412 | .cra_priority = 0, | 414 | .cra_priority = 0, |
413 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 415 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
416 | CRYPTO_ALG_INTERNAL, | ||
414 | .cra_blocksize = 1, | 417 | .cra_blocksize = 1, |
415 | .cra_ctxsize = sizeof(struct cast6_ctx), | 418 | .cra_ctxsize = sizeof(struct cast6_ctx), |
416 | .cra_alignmask = 0, | 419 | .cra_alignmask = 0, |
@@ -430,7 +433,8 @@ static struct crypto_alg cast6_algs[10] = { { | |||
430 | .cra_name = "__lrw-cast6-avx", | 433 | .cra_name = "__lrw-cast6-avx", |
431 | .cra_driver_name = "__driver-lrw-cast6-avx", | 434 | .cra_driver_name = "__driver-lrw-cast6-avx", |
432 | .cra_priority = 0, | 435 | .cra_priority = 0, |
433 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 436 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
437 | CRYPTO_ALG_INTERNAL, | ||
434 | .cra_blocksize = CAST6_BLOCK_SIZE, | 438 | .cra_blocksize = CAST6_BLOCK_SIZE, |
435 | .cra_ctxsize = sizeof(struct cast6_lrw_ctx), | 439 | .cra_ctxsize = sizeof(struct cast6_lrw_ctx), |
436 | .cra_alignmask = 0, | 440 | .cra_alignmask = 0, |
@@ -453,7 +457,8 @@ static struct crypto_alg cast6_algs[10] = { { | |||
453 | .cra_name = "__xts-cast6-avx", | 457 | .cra_name = "__xts-cast6-avx", |
454 | .cra_driver_name = "__driver-xts-cast6-avx", | 458 | .cra_driver_name = "__driver-xts-cast6-avx", |
455 | .cra_priority = 0, | 459 | .cra_priority = 0, |
456 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 460 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
461 | CRYPTO_ALG_INTERNAL, | ||
457 | .cra_blocksize = CAST6_BLOCK_SIZE, | 462 | .cra_blocksize = CAST6_BLOCK_SIZE, |
458 | .cra_ctxsize = sizeof(struct cast6_xts_ctx), | 463 | .cra_ctxsize = sizeof(struct cast6_xts_ctx), |
459 | .cra_alignmask = 0, | 464 | .cra_alignmask = 0, |
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 26d49ebae040..225be06edc80 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -178,7 +178,7 @@ continue_block: | |||
178 | ## 2a) PROCESS FULL BLOCKS: | 178 | ## 2a) PROCESS FULL BLOCKS: |
179 | ################################################################ | 179 | ################################################################ |
180 | full_block: | 180 | full_block: |
181 | movq $128,%rax | 181 | movl $128,%eax |
182 | lea 128*8*2(block_0), block_1 | 182 | lea 128*8*2(block_0), block_1 |
183 | lea 128*8*3(block_0), block_2 | 183 | lea 128*8*3(block_0), block_2 |
184 | add $128*8*1, block_0 | 184 | add $128*8*1, block_0 |
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 8253d85aa165..2079baf06bdd 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -154,7 +154,8 @@ static struct shash_alg ghash_alg = { | |||
154 | .cra_name = "__ghash", | 154 | .cra_name = "__ghash", |
155 | .cra_driver_name = "__ghash-pclmulqdqni", | 155 | .cra_driver_name = "__ghash-pclmulqdqni", |
156 | .cra_priority = 0, | 156 | .cra_priority = 0, |
157 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | 157 | .cra_flags = CRYPTO_ALG_TYPE_SHASH | |
158 | CRYPTO_ALG_INTERNAL, | ||
158 | .cra_blocksize = GHASH_BLOCK_SIZE, | 159 | .cra_blocksize = GHASH_BLOCK_SIZE, |
159 | .cra_ctxsize = sizeof(struct ghash_ctx), | 160 | .cra_ctxsize = sizeof(struct ghash_ctx), |
160 | .cra_module = THIS_MODULE, | 161 | .cra_module = THIS_MODULE, |
@@ -261,7 +262,9 @@ static int ghash_async_init_tfm(struct crypto_tfm *tfm) | |||
261 | struct cryptd_ahash *cryptd_tfm; | 262 | struct cryptd_ahash *cryptd_tfm; |
262 | struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); | 263 | struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); |
263 | 264 | ||
264 | cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0); | 265 | cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", |
266 | CRYPTO_ALG_INTERNAL, | ||
267 | CRYPTO_ALG_INTERNAL); | ||
265 | if (IS_ERR(cryptd_tfm)) | 268 | if (IS_ERR(cryptd_tfm)) |
266 | return PTR_ERR(cryptd_tfm); | 269 | return PTR_ERR(cryptd_tfm); |
267 | ctx->cryptd_tfm = cryptd_tfm; | 270 | ctx->cryptd_tfm = cryptd_tfm; |
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 432f1d76ceb8..6a85598931b5 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -232,7 +232,6 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, | |||
232 | 232 | ||
233 | le128_to_be128((be128 *)walk->iv, &ctrblk); | 233 | le128_to_be128((be128 *)walk->iv, &ctrblk); |
234 | } | 234 | } |
235 | EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); | ||
236 | 235 | ||
237 | static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, | 236 | static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, |
238 | struct blkcipher_desc *desc, | 237 | struct blkcipher_desc *desc, |
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 437e47a4d302..2f63dc89e7a9 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -309,7 +309,8 @@ static struct crypto_alg srp_algs[10] = { { | |||
309 | .cra_name = "__ecb-serpent-avx2", | 309 | .cra_name = "__ecb-serpent-avx2", |
310 | .cra_driver_name = "__driver-ecb-serpent-avx2", | 310 | .cra_driver_name = "__driver-ecb-serpent-avx2", |
311 | .cra_priority = 0, | 311 | .cra_priority = 0, |
312 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 312 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
313 | CRYPTO_ALG_INTERNAL, | ||
313 | .cra_blocksize = SERPENT_BLOCK_SIZE, | 314 | .cra_blocksize = SERPENT_BLOCK_SIZE, |
314 | .cra_ctxsize = sizeof(struct serpent_ctx), | 315 | .cra_ctxsize = sizeof(struct serpent_ctx), |
315 | .cra_alignmask = 0, | 316 | .cra_alignmask = 0, |
@@ -329,7 +330,8 @@ static struct crypto_alg srp_algs[10] = { { | |||
329 | .cra_name = "__cbc-serpent-avx2", | 330 | .cra_name = "__cbc-serpent-avx2", |
330 | .cra_driver_name = "__driver-cbc-serpent-avx2", | 331 | .cra_driver_name = "__driver-cbc-serpent-avx2", |
331 | .cra_priority = 0, | 332 | .cra_priority = 0, |
332 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 333 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
334 | CRYPTO_ALG_INTERNAL, | ||
333 | .cra_blocksize = SERPENT_BLOCK_SIZE, | 335 | .cra_blocksize = SERPENT_BLOCK_SIZE, |
334 | .cra_ctxsize = sizeof(struct serpent_ctx), | 336 | .cra_ctxsize = sizeof(struct serpent_ctx), |
335 | .cra_alignmask = 0, | 337 | .cra_alignmask = 0, |
@@ -349,7 +351,8 @@ static struct crypto_alg srp_algs[10] = { { | |||
349 | .cra_name = "__ctr-serpent-avx2", | 351 | .cra_name = "__ctr-serpent-avx2", |
350 | .cra_driver_name = "__driver-ctr-serpent-avx2", | 352 | .cra_driver_name = "__driver-ctr-serpent-avx2", |
351 | .cra_priority = 0, | 353 | .cra_priority = 0, |
352 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 354 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
355 | CRYPTO_ALG_INTERNAL, | ||
353 | .cra_blocksize = 1, | 356 | .cra_blocksize = 1, |
354 | .cra_ctxsize = sizeof(struct serpent_ctx), | 357 | .cra_ctxsize = sizeof(struct serpent_ctx), |
355 | .cra_alignmask = 0, | 358 | .cra_alignmask = 0, |
@@ -370,7 +373,8 @@ static struct crypto_alg srp_algs[10] = { { | |||
370 | .cra_name = "__lrw-serpent-avx2", | 373 | .cra_name = "__lrw-serpent-avx2", |
371 | .cra_driver_name = "__driver-lrw-serpent-avx2", | 374 | .cra_driver_name = "__driver-lrw-serpent-avx2", |
372 | .cra_priority = 0, | 375 | .cra_priority = 0, |
373 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 376 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
377 | CRYPTO_ALG_INTERNAL, | ||
374 | .cra_blocksize = SERPENT_BLOCK_SIZE, | 378 | .cra_blocksize = SERPENT_BLOCK_SIZE, |
375 | .cra_ctxsize = sizeof(struct serpent_lrw_ctx), | 379 | .cra_ctxsize = sizeof(struct serpent_lrw_ctx), |
376 | .cra_alignmask = 0, | 380 | .cra_alignmask = 0, |
@@ -394,7 +398,8 @@ static struct crypto_alg srp_algs[10] = { { | |||
394 | .cra_name = "__xts-serpent-avx2", | 398 | .cra_name = "__xts-serpent-avx2", |
395 | .cra_driver_name = "__driver-xts-serpent-avx2", | 399 | .cra_driver_name = "__driver-xts-serpent-avx2", |
396 | .cra_priority = 0, | 400 | .cra_priority = 0, |
397 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 401 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
402 | CRYPTO_ALG_INTERNAL, | ||
398 | .cra_blocksize = SERPENT_BLOCK_SIZE, | 403 | .cra_blocksize = SERPENT_BLOCK_SIZE, |
399 | .cra_ctxsize = sizeof(struct serpent_xts_ctx), | 404 | .cra_ctxsize = sizeof(struct serpent_xts_ctx), |
400 | .cra_alignmask = 0, | 405 | .cra_alignmask = 0, |
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 7e217398b4eb..c8d478af8456 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -378,7 +378,8 @@ static struct crypto_alg serpent_algs[10] = { { | |||
378 | .cra_name = "__ecb-serpent-avx", | 378 | .cra_name = "__ecb-serpent-avx", |
379 | .cra_driver_name = "__driver-ecb-serpent-avx", | 379 | .cra_driver_name = "__driver-ecb-serpent-avx", |
380 | .cra_priority = 0, | 380 | .cra_priority = 0, |
381 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 381 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
382 | CRYPTO_ALG_INTERNAL, | ||
382 | .cra_blocksize = SERPENT_BLOCK_SIZE, | 383 | .cra_blocksize = SERPENT_BLOCK_SIZE, |
383 | .cra_ctxsize = sizeof(struct serpent_ctx), | 384 | .cra_ctxsize = sizeof(struct serpent_ctx), |
384 | .cra_alignmask = 0, | 385 | .cra_alignmask = 0, |
@@ -397,7 +398,8 @@ static struct crypto_alg serpent_algs[10] = { { | |||
397 | .cra_name = "__cbc-serpent-avx", | 398 | .cra_name = "__cbc-serpent-avx", |
398 | .cra_driver_name = "__driver-cbc-serpent-avx", | 399 | .cra_driver_name = "__driver-cbc-serpent-avx", |
399 | .cra_priority = 0, | 400 | .cra_priority = 0, |
400 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 401 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
402 | CRYPTO_ALG_INTERNAL, | ||
401 | .cra_blocksize = SERPENT_BLOCK_SIZE, | 403 | .cra_blocksize = SERPENT_BLOCK_SIZE, |
402 | .cra_ctxsize = sizeof(struct serpent_ctx), | 404 | .cra_ctxsize = sizeof(struct serpent_ctx), |
403 | .cra_alignmask = 0, | 405 | .cra_alignmask = 0, |
@@ -416,7 +418,8 @@ static struct crypto_alg serpent_algs[10] = { { | |||
416 | .cra_name = "__ctr-serpent-avx", | 418 | .cra_name = "__ctr-serpent-avx", |
417 | .cra_driver_name = "__driver-ctr-serpent-avx", | 419 | .cra_driver_name = "__driver-ctr-serpent-avx", |
418 | .cra_priority = 0, | 420 | .cra_priority = 0, |
419 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 421 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
422 | CRYPTO_ALG_INTERNAL, | ||
420 | .cra_blocksize = 1, | 423 | .cra_blocksize = 1, |
421 | .cra_ctxsize = sizeof(struct serpent_ctx), | 424 | .cra_ctxsize = sizeof(struct serpent_ctx), |
422 | .cra_alignmask = 0, | 425 | .cra_alignmask = 0, |
@@ -436,7 +439,8 @@ static struct crypto_alg serpent_algs[10] = { { | |||
436 | .cra_name = "__lrw-serpent-avx", | 439 | .cra_name = "__lrw-serpent-avx", |
437 | .cra_driver_name = "__driver-lrw-serpent-avx", | 440 | .cra_driver_name = "__driver-lrw-serpent-avx", |
438 | .cra_priority = 0, | 441 | .cra_priority = 0, |
439 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 442 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
443 | CRYPTO_ALG_INTERNAL, | ||
440 | .cra_blocksize = SERPENT_BLOCK_SIZE, | 444 | .cra_blocksize = SERPENT_BLOCK_SIZE, |
441 | .cra_ctxsize = sizeof(struct serpent_lrw_ctx), | 445 | .cra_ctxsize = sizeof(struct serpent_lrw_ctx), |
442 | .cra_alignmask = 0, | 446 | .cra_alignmask = 0, |
@@ -459,7 +463,8 @@ static struct crypto_alg serpent_algs[10] = { { | |||
459 | .cra_name = "__xts-serpent-avx", | 463 | .cra_name = "__xts-serpent-avx", |
460 | .cra_driver_name = "__driver-xts-serpent-avx", | 464 | .cra_driver_name = "__driver-xts-serpent-avx", |
461 | .cra_priority = 0, | 465 | .cra_priority = 0, |
462 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 466 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
467 | CRYPTO_ALG_INTERNAL, | ||
463 | .cra_blocksize = SERPENT_BLOCK_SIZE, | 468 | .cra_blocksize = SERPENT_BLOCK_SIZE, |
464 | .cra_ctxsize = sizeof(struct serpent_xts_ctx), | 469 | .cra_ctxsize = sizeof(struct serpent_xts_ctx), |
465 | .cra_alignmask = 0, | 470 | .cra_alignmask = 0, |
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index bf025adaea01..3643dd508f45 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c | |||
@@ -387,7 +387,8 @@ static struct crypto_alg serpent_algs[10] = { { | |||
387 | .cra_name = "__ecb-serpent-sse2", | 387 | .cra_name = "__ecb-serpent-sse2", |
388 | .cra_driver_name = "__driver-ecb-serpent-sse2", | 388 | .cra_driver_name = "__driver-ecb-serpent-sse2", |
389 | .cra_priority = 0, | 389 | .cra_priority = 0, |
390 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 390 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
391 | CRYPTO_ALG_INTERNAL, | ||
391 | .cra_blocksize = SERPENT_BLOCK_SIZE, | 392 | .cra_blocksize = SERPENT_BLOCK_SIZE, |
392 | .cra_ctxsize = sizeof(struct serpent_ctx), | 393 | .cra_ctxsize = sizeof(struct serpent_ctx), |
393 | .cra_alignmask = 0, | 394 | .cra_alignmask = 0, |
@@ -406,7 +407,8 @@ static struct crypto_alg serpent_algs[10] = { { | |||
406 | .cra_name = "__cbc-serpent-sse2", | 407 | .cra_name = "__cbc-serpent-sse2", |
407 | .cra_driver_name = "__driver-cbc-serpent-sse2", | 408 | .cra_driver_name = "__driver-cbc-serpent-sse2", |
408 | .cra_priority = 0, | 409 | .cra_priority = 0, |
409 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 410 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
411 | CRYPTO_ALG_INTERNAL, | ||
410 | .cra_blocksize = SERPENT_BLOCK_SIZE, | 412 | .cra_blocksize = SERPENT_BLOCK_SIZE, |
411 | .cra_ctxsize = sizeof(struct serpent_ctx), | 413 | .cra_ctxsize = sizeof(struct serpent_ctx), |
412 | .cra_alignmask = 0, | 414 | .cra_alignmask = 0, |
@@ -425,7 +427,8 @@ static struct crypto_alg serpent_algs[10] = { { | |||
425 | .cra_name = "__ctr-serpent-sse2", | 427 | .cra_name = "__ctr-serpent-sse2", |
426 | .cra_driver_name = "__driver-ctr-serpent-sse2", | 428 | .cra_driver_name = "__driver-ctr-serpent-sse2", |
427 | .cra_priority = 0, | 429 | .cra_priority = 0, |
428 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 430 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
431 | CRYPTO_ALG_INTERNAL, | ||
429 | .cra_blocksize = 1, | 432 | .cra_blocksize = 1, |
430 | .cra_ctxsize = sizeof(struct serpent_ctx), | 433 | .cra_ctxsize = sizeof(struct serpent_ctx), |
431 | .cra_alignmask = 0, | 434 | .cra_alignmask = 0, |
@@ -445,7 +448,8 @@ static struct crypto_alg serpent_algs[10] = { { | |||
445 | .cra_name = "__lrw-serpent-sse2", | 448 | .cra_name = "__lrw-serpent-sse2", |
446 | .cra_driver_name = "__driver-lrw-serpent-sse2", | 449 | .cra_driver_name = "__driver-lrw-serpent-sse2", |
447 | .cra_priority = 0, | 450 | .cra_priority = 0, |
448 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 451 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
452 | CRYPTO_ALG_INTERNAL, | ||
449 | .cra_blocksize = SERPENT_BLOCK_SIZE, | 453 | .cra_blocksize = SERPENT_BLOCK_SIZE, |
450 | .cra_ctxsize = sizeof(struct serpent_lrw_ctx), | 454 | .cra_ctxsize = sizeof(struct serpent_lrw_ctx), |
451 | .cra_alignmask = 0, | 455 | .cra_alignmask = 0, |
@@ -468,7 +472,8 @@ static struct crypto_alg serpent_algs[10] = { { | |||
468 | .cra_name = "__xts-serpent-sse2", | 472 | .cra_name = "__xts-serpent-sse2", |
469 | .cra_driver_name = "__driver-xts-serpent-sse2", | 473 | .cra_driver_name = "__driver-xts-serpent-sse2", |
470 | .cra_priority = 0, | 474 | .cra_priority = 0, |
471 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 475 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
476 | CRYPTO_ALG_INTERNAL, | ||
472 | .cra_blocksize = SERPENT_BLOCK_SIZE, | 477 | .cra_blocksize = SERPENT_BLOCK_SIZE, |
473 | .cra_ctxsize = sizeof(struct serpent_xts_ctx), | 478 | .cra_ctxsize = sizeof(struct serpent_xts_ctx), |
474 | .cra_alignmask = 0, | 479 | .cra_alignmask = 0, |
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index fd9f6b035b16..e510b1c5d690 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c | |||
@@ -694,7 +694,8 @@ static struct shash_alg sha1_mb_shash_alg = { | |||
694 | * use ASYNC flag as some buffers in multi-buffer | 694 | * use ASYNC flag as some buffers in multi-buffer |
695 | * algo may not have completed before hashing thread sleep | 695 | * algo may not have completed before hashing thread sleep |
696 | */ | 696 | */ |
697 | .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC, | 697 | .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC | |
698 | CRYPTO_ALG_INTERNAL, | ||
698 | .cra_blocksize = SHA1_BLOCK_SIZE, | 699 | .cra_blocksize = SHA1_BLOCK_SIZE, |
699 | .cra_module = THIS_MODULE, | 700 | .cra_module = THIS_MODULE, |
700 | .cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list), | 701 | .cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list), |
@@ -770,7 +771,9 @@ static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm) | |||
770 | struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); | 771 | struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); |
771 | struct mcryptd_hash_ctx *mctx; | 772 | struct mcryptd_hash_ctx *mctx; |
772 | 773 | ||
773 | mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", 0, 0); | 774 | mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", |
775 | CRYPTO_ALG_INTERNAL, | ||
776 | CRYPTO_ALG_INTERNAL); | ||
774 | if (IS_ERR(mcryptd_tfm)) | 777 | if (IS_ERR(mcryptd_tfm)) |
775 | return PTR_ERR(mcryptd_tfm); | 778 | return PTR_ERR(mcryptd_tfm); |
776 | mctx = crypto_ahash_ctx(&mcryptd_tfm->base); | 779 | mctx = crypto_ahash_ctx(&mcryptd_tfm->base); |
@@ -828,7 +831,7 @@ static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate) | |||
828 | while (!list_empty(&cstate->work_list)) { | 831 | while (!list_empty(&cstate->work_list)) { |
829 | rctx = list_entry(cstate->work_list.next, | 832 | rctx = list_entry(cstate->work_list.next, |
830 | struct mcryptd_hash_request_ctx, waiter); | 833 | struct mcryptd_hash_request_ctx, waiter); |
831 | if time_before(cur_time, rctx->tag.expire) | 834 | if (time_before(cur_time, rctx->tag.expire)) |
832 | break; | 835 | break; |
833 | kernel_fpu_begin(); | 836 | kernel_fpu_begin(); |
834 | sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr); | 837 | sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr); |
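Two independent fixes above: the inner "__intel_sha1-mb" shash is now marked CRYPTO_ALG_INTERNAL and the async wrapper requests it from mcryptd with that flag in both the type and the mask; and the flusher's bare "if time_before(...)" gains conventional parentheses (it appears to have compiled before only because the macro expansion is itself parenthesized). A hedged sketch of the allocation side, using the exact call shown in the hunk:

	#include <crypto/mcryptd.h>
	#include <linux/crypto.h>

	/* Sketch: ask mcryptd for the internal multi-buffer shash. */
	static struct mcryptd_ahash *sha1_mb_get_inner(void)
	{
		return mcryptd_alloc_ahash("__intel_sha1-mb",
					   CRYPTO_ALG_INTERNAL,
					   CRYPTO_ALG_INTERNAL);
	}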
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c index 4ca7e166a2aa..822acb5b464c 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c | |||
@@ -56,7 +56,7 @@ | |||
56 | void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state) | 56 | void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state) |
57 | { | 57 | { |
58 | unsigned int j; | 58 | unsigned int j; |
59 | state->unused_lanes = 0xF76543210; | 59 | state->unused_lanes = 0xF76543210ULL; |
60 | for (j = 0; j < 8; j++) { | 60 | for (j = 0; j < 8; j++) { |
61 | state->lens[j] = 0xFFFFFFFF; | 61 | state->lens[j] = 0xFFFFFFFF; |
62 | state->ldata[j].job_in_lane = NULL; | 62 | state->ldata[j].job_in_lane = NULL; |
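unused_lanes packs eight 4-bit lane numbers plus a sentinel nibble, i.e. a value wider than 32 bits, so the initializer gets an explicit ULL suffix; the old form most likely still produced the right 64-bit value through the usual integer-constant typing rules, making this a clarity and portability fix rather than a behavioral one. Tiny illustration, assuming a u64 field like the one in the hunk:

	#include <linux/types.h>

	struct lanes_demo {
		u64 unused_lanes;
	};

	static void lanes_demo_init(struct lanes_demo *d)
	{
		/* 36-bit constant: make the 64-bit type explicit rather than
		 * relying on how the compiler types an unsuffixed literal. */
		d->unused_lanes = 0xF76543210ULL;
	}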
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 6c20fe04a738..33d1b9dc14cc 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c | |||
@@ -28,7 +28,7 @@ | |||
28 | #include <linux/cryptohash.h> | 28 | #include <linux/cryptohash.h> |
29 | #include <linux/types.h> | 29 | #include <linux/types.h> |
30 | #include <crypto/sha.h> | 30 | #include <crypto/sha.h> |
31 | #include <asm/byteorder.h> | 31 | #include <crypto/sha1_base.h> |
32 | #include <asm/i387.h> | 32 | #include <asm/i387.h> |
33 | #include <asm/xcr.h> | 33 | #include <asm/xcr.h> |
34 | #include <asm/xsave.h> | 34 | #include <asm/xsave.h> |
@@ -44,132 +44,51 @@ asmlinkage void sha1_transform_avx(u32 *digest, const char *data, | |||
44 | #define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ | 44 | #define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ |
45 | 45 | ||
46 | asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, | 46 | asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, |
47 | unsigned int rounds); | 47 | unsigned int rounds); |
48 | #endif | 48 | #endif |
49 | 49 | ||
50 | static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); | 50 | static void (*sha1_transform_asm)(u32 *, const char *, unsigned int); |
51 | |||
52 | |||
53 | static int sha1_ssse3_init(struct shash_desc *desc) | ||
54 | { | ||
55 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
56 | |||
57 | *sctx = (struct sha1_state){ | ||
58 | .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, | ||
59 | }; | ||
60 | |||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data, | ||
65 | unsigned int len, unsigned int partial) | ||
66 | { | ||
67 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
68 | unsigned int done = 0; | ||
69 | |||
70 | sctx->count += len; | ||
71 | |||
72 | if (partial) { | ||
73 | done = SHA1_BLOCK_SIZE - partial; | ||
74 | memcpy(sctx->buffer + partial, data, done); | ||
75 | sha1_transform_asm(sctx->state, sctx->buffer, 1); | ||
76 | } | ||
77 | |||
78 | if (len - done >= SHA1_BLOCK_SIZE) { | ||
79 | const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE; | ||
80 | |||
81 | sha1_transform_asm(sctx->state, data + done, rounds); | ||
82 | done += rounds * SHA1_BLOCK_SIZE; | ||
83 | } | ||
84 | |||
85 | memcpy(sctx->buffer, data + done, len - done); | ||
86 | |||
87 | return 0; | ||
88 | } | ||
89 | 51 | ||
90 | static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, | 52 | static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, |
91 | unsigned int len) | 53 | unsigned int len) |
92 | { | 54 | { |
93 | struct sha1_state *sctx = shash_desc_ctx(desc); | 55 | struct sha1_state *sctx = shash_desc_ctx(desc); |
94 | unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; | ||
95 | int res; | ||
96 | 56 | ||
97 | /* Handle the fast case right here */ | 57 | if (!irq_fpu_usable() || |
98 | if (partial + len < SHA1_BLOCK_SIZE) { | 58 | (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) |
99 | sctx->count += len; | 59 | return crypto_sha1_update(desc, data, len); |
100 | memcpy(sctx->buffer + partial, data, len); | ||
101 | 60 | ||
102 | return 0; | 61 | /* make sure casting to sha1_block_fn() is safe */ |
103 | } | 62 | BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); |
104 | 63 | ||
105 | if (!irq_fpu_usable()) { | 64 | kernel_fpu_begin(); |
106 | res = crypto_sha1_update(desc, data, len); | 65 | sha1_base_do_update(desc, data, len, |
107 | } else { | 66 | (sha1_block_fn *)sha1_transform_asm); |
108 | kernel_fpu_begin(); | 67 | kernel_fpu_end(); |
109 | res = __sha1_ssse3_update(desc, data, len, partial); | ||
110 | kernel_fpu_end(); | ||
111 | } | ||
112 | |||
113 | return res; | ||
114 | } | ||
115 | |||
116 | |||
117 | /* Add padding and return the message digest. */ | ||
118 | static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) | ||
119 | { | ||
120 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
121 | unsigned int i, index, padlen; | ||
122 | __be32 *dst = (__be32 *)out; | ||
123 | __be64 bits; | ||
124 | static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; | ||
125 | |||
126 | bits = cpu_to_be64(sctx->count << 3); | ||
127 | |||
128 | /* Pad out to 56 mod 64 and append length */ | ||
129 | index = sctx->count % SHA1_BLOCK_SIZE; | ||
130 | padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index); | ||
131 | if (!irq_fpu_usable()) { | ||
132 | crypto_sha1_update(desc, padding, padlen); | ||
133 | crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
134 | } else { | ||
135 | kernel_fpu_begin(); | ||
136 | /* We need to fill a whole block for __sha1_ssse3_update() */ | ||
137 | if (padlen <= 56) { | ||
138 | sctx->count += padlen; | ||
139 | memcpy(sctx->buffer + index, padding, padlen); | ||
140 | } else { | ||
141 | __sha1_ssse3_update(desc, padding, padlen, index); | ||
142 | } | ||
143 | __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56); | ||
144 | kernel_fpu_end(); | ||
145 | } | ||
146 | |||
147 | /* Store state in digest */ | ||
148 | for (i = 0; i < 5; i++) | ||
149 | dst[i] = cpu_to_be32(sctx->state[i]); | ||
150 | |||
151 | /* Wipe context */ | ||
152 | memset(sctx, 0, sizeof(*sctx)); | ||
153 | 68 | ||
154 | return 0; | 69 | return 0; |
155 | } | 70 | } |
156 | 71 | ||
157 | static int sha1_ssse3_export(struct shash_desc *desc, void *out) | 72 | static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, |
73 | unsigned int len, u8 *out) | ||
158 | { | 74 | { |
159 | struct sha1_state *sctx = shash_desc_ctx(desc); | 75 | if (!irq_fpu_usable()) |
76 | return crypto_sha1_finup(desc, data, len, out); | ||
160 | 77 | ||
161 | memcpy(out, sctx, sizeof(*sctx)); | 78 | kernel_fpu_begin(); |
79 | if (len) | ||
80 | sha1_base_do_update(desc, data, len, | ||
81 | (sha1_block_fn *)sha1_transform_asm); | ||
82 | sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm); | ||
83 | kernel_fpu_end(); | ||
162 | 84 | ||
163 | return 0; | 85 | return sha1_base_finish(desc, out); |
164 | } | 86 | } |
165 | 87 | ||
166 | static int sha1_ssse3_import(struct shash_desc *desc, const void *in) | 88 | /* Add padding and return the message digest. */ |
89 | static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) | ||
167 | { | 90 | { |
168 | struct sha1_state *sctx = shash_desc_ctx(desc); | 91 | return sha1_ssse3_finup(desc, NULL, 0, out); |
169 | |||
170 | memcpy(sctx, in, sizeof(*sctx)); | ||
171 | |||
172 | return 0; | ||
173 | } | 92 | } |
174 | 93 | ||
175 | #ifdef CONFIG_AS_AVX2 | 94 | #ifdef CONFIG_AS_AVX2 |
@@ -186,13 +105,11 @@ static void sha1_apply_transform_avx2(u32 *digest, const char *data, | |||
186 | 105 | ||
187 | static struct shash_alg alg = { | 106 | static struct shash_alg alg = { |
188 | .digestsize = SHA1_DIGEST_SIZE, | 107 | .digestsize = SHA1_DIGEST_SIZE, |
189 | .init = sha1_ssse3_init, | 108 | .init = sha1_base_init, |
190 | .update = sha1_ssse3_update, | 109 | .update = sha1_ssse3_update, |
191 | .final = sha1_ssse3_final, | 110 | .final = sha1_ssse3_final, |
192 | .export = sha1_ssse3_export, | 111 | .finup = sha1_ssse3_finup, |
193 | .import = sha1_ssse3_import, | ||
194 | .descsize = sizeof(struct sha1_state), | 112 | .descsize = sizeof(struct sha1_state), |
195 | .statesize = sizeof(struct sha1_state), | ||
196 | .base = { | 113 | .base = { |
197 | .cra_name = "sha1", | 114 | .cra_name = "sha1", |
198 | .cra_driver_name= "sha1-ssse3", | 115 | .cra_driver_name= "sha1-ssse3", |
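The sha1-ssse3 glue drops its private init/update/final/export/import code in favor of the shared sha1_base helpers: the driver now only decides whether the FPU is usable, wraps the assembly block function in kernel_fpu_begin()/kernel_fpu_end(), and falls back to the generic crypto_sha1_update()/crypto_sha1_finup() otherwise. Callers are unaffected; a usage sketch from an imaginary in-kernel user follows (the allocation-by-name and one-shot digest calls are standard shash API of this era, not something introduced by the patch):

	#include <crypto/hash.h>
	#include <crypto/sha.h>
	#include <linux/err.h>

	/* Sketch: whichever "sha1" driver has the highest priority is used. */
	static int demo_sha1_digest(const u8 *data, unsigned int len,
				    u8 digest[SHA1_DIGEST_SIZE])
	{
		struct crypto_shash *tfm;
		int err;

		tfm = crypto_alloc_shash("sha1", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		{
			SHASH_DESC_ON_STACK(desc, tfm);

			desc->tfm = tfm;
			desc->flags = 0;
			err = crypto_shash_digest(desc, data, len, digest);
		}

		crypto_free_shash(tfm);
		return err;
	}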
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 642f15687a0a..92b3b5d75ba9 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S | |||
@@ -96,10 +96,10 @@ SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 | |||
96 | BYTE_FLIP_MASK = %xmm13 | 96 | BYTE_FLIP_MASK = %xmm13 |
97 | 97 | ||
98 | NUM_BLKS = %rdx # 3rd arg | 98 | NUM_BLKS = %rdx # 3rd arg |
99 | CTX = %rsi # 2nd arg | 99 | INP = %rsi # 2nd arg |
100 | INP = %rdi # 1st arg | 100 | CTX = %rdi # 1st arg |
101 | 101 | ||
102 | SRND = %rdi # clobbers INP | 102 | SRND = %rsi # clobbers INP |
103 | c = %ecx | 103 | c = %ecx |
104 | d = %r8d | 104 | d = %r8d |
105 | e = %edx | 105 | e = %edx |
@@ -342,8 +342,8 @@ a = TMP_ | |||
342 | 342 | ||
343 | ######################################################################## | 343 | ######################################################################## |
344 | ## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) | 344 | ## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) |
345 | ## arg 1 : pointer to input data | 345 | ## arg 1 : pointer to digest |
346 | ## arg 2 : pointer to digest | 346 | ## arg 2 : pointer to input data |
347 | ## arg 3 : Num blocks | 347 | ## arg 3 : Num blocks |
348 | ######################################################################## | 348 | ######################################################################## |
349 | .text | 349 | .text |
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 9e86944c539d..570ec5ec62d7 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S | |||
@@ -91,12 +91,12 @@ BYTE_FLIP_MASK = %ymm13 | |||
91 | X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK | 91 | X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK |
92 | 92 | ||
93 | NUM_BLKS = %rdx # 3rd arg | 93 | NUM_BLKS = %rdx # 3rd arg |
94 | CTX = %rsi # 2nd arg | 94 | INP = %rsi # 2nd arg |
95 | INP = %rdi # 1st arg | 95 | CTX = %rdi # 1st arg |
96 | c = %ecx | 96 | c = %ecx |
97 | d = %r8d | 97 | d = %r8d |
98 | e = %edx # clobbers NUM_BLKS | 98 | e = %edx # clobbers NUM_BLKS |
99 | y3 = %edi # clobbers INP | 99 | y3 = %esi # clobbers INP |
100 | 100 | ||
101 | 101 | ||
102 | TBL = %rbp | 102 | TBL = %rbp |
@@ -523,8 +523,8 @@ STACK_SIZE = _RSP + _RSP_SIZE | |||
523 | 523 | ||
524 | ######################################################################## | 524 | ######################################################################## |
525 | ## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) | 525 | ## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) |
526 | ## arg 1 : pointer to input data | 526 | ## arg 1 : pointer to digest |
527 | ## arg 2 : pointer to digest | 527 | ## arg 2 : pointer to input data |
528 | ## arg 3 : Num blocks | 528 | ## arg 3 : Num blocks |
529 | ######################################################################## | 529 | ######################################################################## |
530 | .text | 530 | .text |
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index f833b74d902b..2cedc44e8121 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S | |||
@@ -88,10 +88,10 @@ SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 | |||
88 | BYTE_FLIP_MASK = %xmm12 | 88 | BYTE_FLIP_MASK = %xmm12 |
89 | 89 | ||
90 | NUM_BLKS = %rdx # 3rd arg | 90 | NUM_BLKS = %rdx # 3rd arg |
91 | CTX = %rsi # 2nd arg | 91 | INP = %rsi # 2nd arg |
92 | INP = %rdi # 1st arg | 92 | CTX = %rdi # 1st arg |
93 | 93 | ||
94 | SRND = %rdi # clobbers INP | 94 | SRND = %rsi # clobbers INP |
95 | c = %ecx | 95 | c = %ecx |
96 | d = %r8d | 96 | d = %r8d |
97 | e = %edx | 97 | e = %edx |
@@ -348,8 +348,8 @@ a = TMP_ | |||
348 | 348 | ||
349 | ######################################################################## | 349 | ######################################################################## |
350 | ## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks) | 350 | ## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks) |
351 | ## arg 1 : pointer to input data | 351 | ## arg 1 : pointer to digest |
352 | ## arg 2 : pointer to digest | 352 | ## arg 2 : pointer to input data |
353 | ## arg 3 : Num blocks | 353 | ## arg 3 : Num blocks |
354 | ######################################################################## | 354 | ######################################################################## |
355 | .text | 355 | .text |
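All three SHA-256 transform routines above (avx, avx2, ssse3) swap their first two arguments so that, under the x86-64 SysV calling convention, the digest pointer arrives in %rdi and the input pointer in %rsi. That puts the assembly in the same (state, data, blocks) order as the C prototypes in the glue code, so every variant can be called through one function-pointer type; the SHA-512 assembly further below gets the same treatment. A sketch of the shape this enables (the typedef name here is illustrative, not the kernel's):

	#include <linux/linkage.h>
	#include <linux/types.h>

	/* One pointer type fits every variant once the argument order agrees. */
	typedef void (sha256_xform_fn)(u32 *digest, const char *data, u64 blocks);

	asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, u64 blocks);
	asmlinkage void sha256_transform_avx(u32 *digest, const char *data, u64 blocks);

	static sha256_xform_fn *sha256_xform = sha256_transform_ssse3;

	static void sha256_pick_xform(bool have_avx)
	{
		if (have_avx)
			sha256_xform = sha256_transform_avx;	/* chosen once at init */
	}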
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 8fad72f4dfd2..ccc338881ee8 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c | |||
@@ -36,195 +36,74 @@ | |||
36 | #include <linux/cryptohash.h> | 36 | #include <linux/cryptohash.h> |
37 | #include <linux/types.h> | 37 | #include <linux/types.h> |
38 | #include <crypto/sha.h> | 38 | #include <crypto/sha.h> |
39 | #include <asm/byteorder.h> | 39 | #include <crypto/sha256_base.h> |
40 | #include <asm/i387.h> | 40 | #include <asm/i387.h> |
41 | #include <asm/xcr.h> | 41 | #include <asm/xcr.h> |
42 | #include <asm/xsave.h> | 42 | #include <asm/xsave.h> |
43 | #include <linux/string.h> | 43 | #include <linux/string.h> |
44 | 44 | ||
45 | asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest, | 45 | asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, |
46 | u64 rounds); | 46 | u64 rounds); |
47 | #ifdef CONFIG_AS_AVX | 47 | #ifdef CONFIG_AS_AVX |
48 | asmlinkage void sha256_transform_avx(const char *data, u32 *digest, | 48 | asmlinkage void sha256_transform_avx(u32 *digest, const char *data, |
49 | u64 rounds); | 49 | u64 rounds); |
50 | #endif | 50 | #endif |
51 | #ifdef CONFIG_AS_AVX2 | 51 | #ifdef CONFIG_AS_AVX2 |
52 | asmlinkage void sha256_transform_rorx(const char *data, u32 *digest, | 52 | asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, |
53 | u64 rounds); | 53 | u64 rounds); |
54 | #endif | 54 | #endif |
55 | 55 | ||
56 | static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64); | 56 | static void (*sha256_transform_asm)(u32 *, const char *, u64); |
57 | |||
58 | |||
59 | static int sha256_ssse3_init(struct shash_desc *desc) | ||
60 | { | ||
61 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
62 | |||
63 | sctx->state[0] = SHA256_H0; | ||
64 | sctx->state[1] = SHA256_H1; | ||
65 | sctx->state[2] = SHA256_H2; | ||
66 | sctx->state[3] = SHA256_H3; | ||
67 | sctx->state[4] = SHA256_H4; | ||
68 | sctx->state[5] = SHA256_H5; | ||
69 | sctx->state[6] = SHA256_H6; | ||
70 | sctx->state[7] = SHA256_H7; | ||
71 | sctx->count = 0; | ||
72 | |||
73 | return 0; | ||
74 | } | ||
75 | |||
76 | static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data, | ||
77 | unsigned int len, unsigned int partial) | ||
78 | { | ||
79 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
80 | unsigned int done = 0; | ||
81 | |||
82 | sctx->count += len; | ||
83 | |||
84 | if (partial) { | ||
85 | done = SHA256_BLOCK_SIZE - partial; | ||
86 | memcpy(sctx->buf + partial, data, done); | ||
87 | sha256_transform_asm(sctx->buf, sctx->state, 1); | ||
88 | } | ||
89 | |||
90 | if (len - done >= SHA256_BLOCK_SIZE) { | ||
91 | const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE; | ||
92 | |||
93 | sha256_transform_asm(data + done, sctx->state, (u64) rounds); | ||
94 | |||
95 | done += rounds * SHA256_BLOCK_SIZE; | ||
96 | } | ||
97 | |||
98 | memcpy(sctx->buf, data + done, len - done); | ||
99 | |||
100 | return 0; | ||
101 | } | ||
102 | 57 | ||
103 | static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, | 58 | static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, |
104 | unsigned int len) | 59 | unsigned int len) |
105 | { | 60 | { |
106 | struct sha256_state *sctx = shash_desc_ctx(desc); | 61 | struct sha256_state *sctx = shash_desc_ctx(desc); |
107 | unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; | ||
108 | int res; | ||
109 | 62 | ||
110 | /* Handle the fast case right here */ | 63 | if (!irq_fpu_usable() || |
111 | if (partial + len < SHA256_BLOCK_SIZE) { | 64 | (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) |
112 | sctx->count += len; | 65 | return crypto_sha256_update(desc, data, len); |
113 | memcpy(sctx->buf + partial, data, len); | ||
114 | 66 | ||
115 | return 0; | 67 | /* make sure casting to sha256_block_fn() is safe */ |
116 | } | 68 | BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0); |
117 | |||
118 | if (!irq_fpu_usable()) { | ||
119 | res = crypto_sha256_update(desc, data, len); | ||
120 | } else { | ||
121 | kernel_fpu_begin(); | ||
122 | res = __sha256_ssse3_update(desc, data, len, partial); | ||
123 | kernel_fpu_end(); | ||
124 | } | ||
125 | |||
126 | return res; | ||
127 | } | ||
128 | 69 | ||
129 | 70 | kernel_fpu_begin(); | |
130 | /* Add padding and return the message digest. */ | 71 | sha256_base_do_update(desc, data, len, |
131 | static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) | 72 | (sha256_block_fn *)sha256_transform_asm); |
132 | { | 73 | kernel_fpu_end(); |
133 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
134 | unsigned int i, index, padlen; | ||
135 | __be32 *dst = (__be32 *)out; | ||
136 | __be64 bits; | ||
137 | static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; | ||
138 | |||
139 | bits = cpu_to_be64(sctx->count << 3); | ||
140 | |||
141 | /* Pad out to 56 mod 64 and append length */ | ||
142 | index = sctx->count % SHA256_BLOCK_SIZE; | ||
143 | padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index); | ||
144 | |||
145 | if (!irq_fpu_usable()) { | ||
146 | crypto_sha256_update(desc, padding, padlen); | ||
147 | crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
148 | } else { | ||
149 | kernel_fpu_begin(); | ||
150 | /* We need to fill a whole block for __sha256_ssse3_update() */ | ||
151 | if (padlen <= 56) { | ||
152 | sctx->count += padlen; | ||
153 | memcpy(sctx->buf + index, padding, padlen); | ||
154 | } else { | ||
155 | __sha256_ssse3_update(desc, padding, padlen, index); | ||
156 | } | ||
157 | __sha256_ssse3_update(desc, (const u8 *)&bits, | ||
158 | sizeof(bits), 56); | ||
159 | kernel_fpu_end(); | ||
160 | } | ||
161 | |||
162 | /* Store state in digest */ | ||
163 | for (i = 0; i < 8; i++) | ||
164 | dst[i] = cpu_to_be32(sctx->state[i]); | ||
165 | |||
166 | /* Wipe context */ | ||
167 | memset(sctx, 0, sizeof(*sctx)); | ||
168 | 74 | ||
169 | return 0; | 75 | return 0; |
170 | } | 76 | } |
171 | 77 | ||
172 | static int sha256_ssse3_export(struct shash_desc *desc, void *out) | 78 | static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, |
79 | unsigned int len, u8 *out) | ||
173 | { | 80 | { |
174 | struct sha256_state *sctx = shash_desc_ctx(desc); | 81 | if (!irq_fpu_usable()) |
82 | return crypto_sha256_finup(desc, data, len, out); | ||
175 | 83 | ||
176 | memcpy(out, sctx, sizeof(*sctx)); | 84 | kernel_fpu_begin(); |
85 | if (len) | ||
86 | sha256_base_do_update(desc, data, len, | ||
87 | (sha256_block_fn *)sha256_transform_asm); | ||
88 | sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm); | ||
89 | kernel_fpu_end(); | ||
177 | 90 | ||
178 | return 0; | 91 | return sha256_base_finish(desc, out); |
179 | } | 92 | } |
180 | 93 | ||
181 | static int sha256_ssse3_import(struct shash_desc *desc, const void *in) | 94 | /* Add padding and return the message digest. */ |
182 | { | 95 | static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) |
183 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
184 | |||
185 | memcpy(sctx, in, sizeof(*sctx)); | ||
186 | |||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static int sha224_ssse3_init(struct shash_desc *desc) | ||
191 | { | ||
192 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
193 | |||
194 | sctx->state[0] = SHA224_H0; | ||
195 | sctx->state[1] = SHA224_H1; | ||
196 | sctx->state[2] = SHA224_H2; | ||
197 | sctx->state[3] = SHA224_H3; | ||
198 | sctx->state[4] = SHA224_H4; | ||
199 | sctx->state[5] = SHA224_H5; | ||
200 | sctx->state[6] = SHA224_H6; | ||
201 | sctx->state[7] = SHA224_H7; | ||
202 | sctx->count = 0; | ||
203 | |||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash) | ||
208 | { | 96 | { |
209 | u8 D[SHA256_DIGEST_SIZE]; | 97 | return sha256_ssse3_finup(desc, NULL, 0, out); |
210 | |||
211 | sha256_ssse3_final(desc, D); | ||
212 | |||
213 | memcpy(hash, D, SHA224_DIGEST_SIZE); | ||
214 | memzero_explicit(D, SHA256_DIGEST_SIZE); | ||
215 | |||
216 | return 0; | ||
217 | } | 98 | } |
218 | 99 | ||
219 | static struct shash_alg algs[] = { { | 100 | static struct shash_alg algs[] = { { |
220 | .digestsize = SHA256_DIGEST_SIZE, | 101 | .digestsize = SHA256_DIGEST_SIZE, |
221 | .init = sha256_ssse3_init, | 102 | .init = sha256_base_init, |
222 | .update = sha256_ssse3_update, | 103 | .update = sha256_ssse3_update, |
223 | .final = sha256_ssse3_final, | 104 | .final = sha256_ssse3_final, |
224 | .export = sha256_ssse3_export, | 105 | .finup = sha256_ssse3_finup, |
225 | .import = sha256_ssse3_import, | ||
226 | .descsize = sizeof(struct sha256_state), | 106 | .descsize = sizeof(struct sha256_state), |
227 | .statesize = sizeof(struct sha256_state), | ||
228 | .base = { | 107 | .base = { |
229 | .cra_name = "sha256", | 108 | .cra_name = "sha256", |
230 | .cra_driver_name = "sha256-ssse3", | 109 | .cra_driver_name = "sha256-ssse3", |
@@ -235,13 +114,11 @@ static struct shash_alg algs[] = { { | |||
235 | } | 114 | } |
236 | }, { | 115 | }, { |
237 | .digestsize = SHA224_DIGEST_SIZE, | 116 | .digestsize = SHA224_DIGEST_SIZE, |
238 | .init = sha224_ssse3_init, | 117 | .init = sha224_base_init, |
239 | .update = sha256_ssse3_update, | 118 | .update = sha256_ssse3_update, |
240 | .final = sha224_ssse3_final, | 119 | .final = sha256_ssse3_final, |
241 | .export = sha256_ssse3_export, | 120 | .finup = sha256_ssse3_finup, |
242 | .import = sha256_ssse3_import, | ||
243 | .descsize = sizeof(struct sha256_state), | 121 | .descsize = sizeof(struct sha256_state), |
244 | .statesize = sizeof(struct sha256_state), | ||
245 | .base = { | 122 | .base = { |
246 | .cra_name = "sha224", | 123 | .cra_name = "sha224", |
247 | .cra_driver_name = "sha224-ssse3", | 124 | .cra_driver_name = "sha224-ssse3", |
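As in the SHA-1 glue, sha256 and sha224 now share one update/finup path built on the sha256_base helpers, and the assembly routine is passed to them cast to the generic block-function type. The BUILD_BUG_ON in the hunk is what makes that cast defensible: the helpers hand over a struct sha256_state *, the assembly expects the digest words, and the two only coincide if the digest array is the first member. A compile-time sketch of that check (the same idea guards the sha1 and sha512 glue above and below):

	#include <crypto/sha.h>
	#include <linux/bug.h>
	#include <linux/kernel.h>

	/*
	 * Sketch: if anyone ever moves the 'state' member, this stops the
	 * build instead of letting the cast silently hash the wrong memory.
	 */
	static inline void check_sha256_state_layout(void)
	{
		BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
	}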
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 974dde9bc6cd..565274d6a641 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S | |||
@@ -54,9 +54,9 @@ | |||
54 | 54 | ||
55 | # Virtual Registers | 55 | # Virtual Registers |
56 | # ARG1 | 56 | # ARG1 |
57 | msg = %rdi | 57 | digest = %rdi |
58 | # ARG2 | 58 | # ARG2 |
59 | digest = %rsi | 59 | msg = %rsi |
60 | # ARG3 | 60 | # ARG3 |
61 | msglen = %rdx | 61 | msglen = %rdx |
62 | T1 = %rcx | 62 | T1 = %rcx |
@@ -271,7 +271,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE | |||
271 | .endm | 271 | .endm |
272 | 272 | ||
273 | ######################################################################## | 273 | ######################################################################## |
274 | # void sha512_transform_avx(const void* M, void* D, u64 L) | 274 | # void sha512_transform_avx(void* D, const void* M, u64 L) |
275 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. | 275 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. |
276 | # The size of the message pointed to by M must be an integer multiple of SHA512 | 276 | # The size of the message pointed to by M must be an integer multiple of SHA512 |
277 | # message blocks. | 277 | # message blocks. |
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 568b96105f5c..a4771dcd1fcf 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S | |||
@@ -70,9 +70,9 @@ XFER = YTMP0 | |||
70 | BYTE_FLIP_MASK = %ymm9 | 70 | BYTE_FLIP_MASK = %ymm9 |
71 | 71 | ||
72 | # 1st arg | 72 | # 1st arg |
73 | INP = %rdi | 73 | CTX = %rdi |
74 | # 2nd arg | 74 | # 2nd arg |
75 | CTX = %rsi | 75 | INP = %rsi |
76 | # 3rd arg | 76 | # 3rd arg |
77 | NUM_BLKS = %rdx | 77 | NUM_BLKS = %rdx |
78 | 78 | ||
@@ -562,7 +562,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE | |||
562 | .endm | 562 | .endm |
563 | 563 | ||
564 | ######################################################################## | 564 | ######################################################################## |
565 | # void sha512_transform_rorx(const void* M, void* D, uint64_t L)# | 565 | # void sha512_transform_rorx(void* D, const void* M, uint64_t L)# |
566 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. | 566 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. |
567 | # The size of the message pointed to by M must be an integer multiple of SHA512 | 567 | # The size of the message pointed to by M must be an integer multiple of SHA512 |
568 | # message blocks. | 568 | # message blocks. |
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index fb56855d51f5..e610e29cbc81 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S | |||
@@ -53,9 +53,9 @@ | |||
53 | 53 | ||
54 | # Virtual Registers | 54 | # Virtual Registers |
55 | # ARG1 | 55 | # ARG1 |
56 | msg = %rdi | 56 | digest = %rdi |
57 | # ARG2 | 57 | # ARG2 |
58 | digest = %rsi | 58 | msg = %rsi |
59 | # ARG3 | 59 | # ARG3 |
60 | msglen = %rdx | 60 | msglen = %rdx |
61 | T1 = %rcx | 61 | T1 = %rcx |
@@ -269,7 +269,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE | |||
269 | .endm | 269 | .endm |
270 | 270 | ||
271 | ######################################################################## | 271 | ######################################################################## |
272 | # void sha512_transform_ssse3(const void* M, void* D, u64 L)# | 272 | # void sha512_transform_ssse3(void* D, const void* M, u64 L)# |
273 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. | 273 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. |
274 | # The size of the message pointed to by M must be an integer multiple of SHA512 | 274 | # The size of the message pointed to by M must be an integer multiple of SHA512 |
275 | # message blocks. | 275 | # message blocks. |
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 0b6af26832bf..d9fa4c1e063f 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c | |||
@@ -34,205 +34,75 @@ | |||
34 | #include <linux/cryptohash.h> | 34 | #include <linux/cryptohash.h> |
35 | #include <linux/types.h> | 35 | #include <linux/types.h> |
36 | #include <crypto/sha.h> | 36 | #include <crypto/sha.h> |
37 | #include <asm/byteorder.h> | 37 | #include <crypto/sha512_base.h> |
38 | #include <asm/i387.h> | 38 | #include <asm/i387.h> |
39 | #include <asm/xcr.h> | 39 | #include <asm/xcr.h> |
40 | #include <asm/xsave.h> | 40 | #include <asm/xsave.h> |
41 | 41 | ||
42 | #include <linux/string.h> | 42 | #include <linux/string.h> |
43 | 43 | ||
44 | asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest, | 44 | asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, |
45 | u64 rounds); | 45 | u64 rounds); |
46 | #ifdef CONFIG_AS_AVX | 46 | #ifdef CONFIG_AS_AVX |
47 | asmlinkage void sha512_transform_avx(const char *data, u64 *digest, | 47 | asmlinkage void sha512_transform_avx(u64 *digest, const char *data, |
48 | u64 rounds); | 48 | u64 rounds); |
49 | #endif | 49 | #endif |
50 | #ifdef CONFIG_AS_AVX2 | 50 | #ifdef CONFIG_AS_AVX2 |
51 | asmlinkage void sha512_transform_rorx(const char *data, u64 *digest, | 51 | asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, |
52 | u64 rounds); | 52 | u64 rounds); |
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64); | 55 | static void (*sha512_transform_asm)(u64 *, const char *, u64); |
56 | |||
57 | |||
58 | static int sha512_ssse3_init(struct shash_desc *desc) | ||
59 | { | ||
60 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
61 | |||
62 | sctx->state[0] = SHA512_H0; | ||
63 | sctx->state[1] = SHA512_H1; | ||
64 | sctx->state[2] = SHA512_H2; | ||
65 | sctx->state[3] = SHA512_H3; | ||
66 | sctx->state[4] = SHA512_H4; | ||
67 | sctx->state[5] = SHA512_H5; | ||
68 | sctx->state[6] = SHA512_H6; | ||
69 | sctx->state[7] = SHA512_H7; | ||
70 | sctx->count[0] = sctx->count[1] = 0; | ||
71 | |||
72 | return 0; | ||
73 | } | ||
74 | 56 | ||
75 | static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data, | 57 | static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, |
76 | unsigned int len, unsigned int partial) | 58 | unsigned int len) |
77 | { | 59 | { |
78 | struct sha512_state *sctx = shash_desc_ctx(desc); | 60 | struct sha512_state *sctx = shash_desc_ctx(desc); |
79 | unsigned int done = 0; | ||
80 | |||
81 | sctx->count[0] += len; | ||
82 | if (sctx->count[0] < len) | ||
83 | sctx->count[1]++; | ||
84 | 61 | ||
85 | if (partial) { | 62 | if (!irq_fpu_usable() || |
86 | done = SHA512_BLOCK_SIZE - partial; | 63 | (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) |
87 | memcpy(sctx->buf + partial, data, done); | 64 | return crypto_sha512_update(desc, data, len); |
88 | sha512_transform_asm(sctx->buf, sctx->state, 1); | ||
89 | } | ||
90 | |||
91 | if (len - done >= SHA512_BLOCK_SIZE) { | ||
92 | const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE; | ||
93 | 65 | ||
94 | sha512_transform_asm(data + done, sctx->state, (u64) rounds); | 66 | /* make sure casting to sha512_block_fn() is safe */ |
95 | 67 | BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); | |
96 | done += rounds * SHA512_BLOCK_SIZE; | ||
97 | } | ||
98 | 68 | ||
99 | memcpy(sctx->buf, data + done, len - done); | 69 | kernel_fpu_begin(); |
70 | sha512_base_do_update(desc, data, len, | ||
71 | (sha512_block_fn *)sha512_transform_asm); | ||
72 | kernel_fpu_end(); | ||
100 | 73 | ||
101 | return 0; | 74 | return 0; |
102 | } | 75 | } |
103 | 76 | ||
104 | static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, | 77 | static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, |
105 | unsigned int len) | 78 | unsigned int len, u8 *out) |
106 | { | 79 | { |
107 | struct sha512_state *sctx = shash_desc_ctx(desc); | 80 | if (!irq_fpu_usable()) |
108 | unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE; | 81 | return crypto_sha512_finup(desc, data, len, out); |
109 | int res; | ||
110 | |||
111 | /* Handle the fast case right here */ | ||
112 | if (partial + len < SHA512_BLOCK_SIZE) { | ||
113 | sctx->count[0] += len; | ||
114 | if (sctx->count[0] < len) | ||
115 | sctx->count[1]++; | ||
116 | memcpy(sctx->buf + partial, data, len); | ||
117 | |||
118 | return 0; | ||
119 | } | ||
120 | 82 | ||
121 | if (!irq_fpu_usable()) { | 83 | kernel_fpu_begin(); |
122 | res = crypto_sha512_update(desc, data, len); | 84 | if (len) |
123 | } else { | 85 | sha512_base_do_update(desc, data, len, |
124 | kernel_fpu_begin(); | 86 | (sha512_block_fn *)sha512_transform_asm); |
125 | res = __sha512_ssse3_update(desc, data, len, partial); | 87 | sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm); |
126 | kernel_fpu_end(); | 88 | kernel_fpu_end(); |
127 | } | ||
128 | 89 | ||
129 | return res; | 90 | return sha512_base_finish(desc, out); |
130 | } | 91 | } |
131 | 92 | ||
132 | |||
133 | /* Add padding and return the message digest. */ | 93 | /* Add padding and return the message digest. */ |
134 | static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) | 94 | static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) |
135 | { | 95 | { |
136 | struct sha512_state *sctx = shash_desc_ctx(desc); | 96 | return sha512_ssse3_finup(desc, NULL, 0, out); |
137 | unsigned int i, index, padlen; | ||
138 | __be64 *dst = (__be64 *)out; | ||
139 | __be64 bits[2]; | ||
140 | static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, }; | ||
141 | |||
142 | /* save number of bits */ | ||
143 | bits[1] = cpu_to_be64(sctx->count[0] << 3); | ||
144 | bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61); | ||
145 | |||
146 | /* Pad out to 112 mod 128 and append length */ | ||
147 | index = sctx->count[0] & 0x7f; | ||
148 | padlen = (index < 112) ? (112 - index) : ((128+112) - index); | ||
149 | |||
150 | if (!irq_fpu_usable()) { | ||
151 | crypto_sha512_update(desc, padding, padlen); | ||
152 | crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
153 | } else { | ||
154 | kernel_fpu_begin(); | ||
155 | /* We need to fill a whole block for __sha512_ssse3_update() */ | ||
156 | if (padlen <= 112) { | ||
157 | sctx->count[0] += padlen; | ||
158 | if (sctx->count[0] < padlen) | ||
159 | sctx->count[1]++; | ||
160 | memcpy(sctx->buf + index, padding, padlen); | ||
161 | } else { | ||
162 | __sha512_ssse3_update(desc, padding, padlen, index); | ||
163 | } | ||
164 | __sha512_ssse3_update(desc, (const u8 *)&bits, | ||
165 | sizeof(bits), 112); | ||
166 | kernel_fpu_end(); | ||
167 | } | ||
168 | |||
169 | /* Store state in digest */ | ||
170 | for (i = 0; i < 8; i++) | ||
171 | dst[i] = cpu_to_be64(sctx->state[i]); | ||
172 | |||
173 | /* Wipe context */ | ||
174 | memset(sctx, 0, sizeof(*sctx)); | ||
175 | |||
176 | return 0; | ||
177 | } | ||
178 | |||
179 | static int sha512_ssse3_export(struct shash_desc *desc, void *out) | ||
180 | { | ||
181 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
182 | |||
183 | memcpy(out, sctx, sizeof(*sctx)); | ||
184 | |||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | static int sha512_ssse3_import(struct shash_desc *desc, const void *in) | ||
189 | { | ||
190 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
191 | |||
192 | memcpy(sctx, in, sizeof(*sctx)); | ||
193 | |||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | static int sha384_ssse3_init(struct shash_desc *desc) | ||
198 | { | ||
199 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
200 | |||
201 | sctx->state[0] = SHA384_H0; | ||
202 | sctx->state[1] = SHA384_H1; | ||
203 | sctx->state[2] = SHA384_H2; | ||
204 | sctx->state[3] = SHA384_H3; | ||
205 | sctx->state[4] = SHA384_H4; | ||
206 | sctx->state[5] = SHA384_H5; | ||
207 | sctx->state[6] = SHA384_H6; | ||
208 | sctx->state[7] = SHA384_H7; | ||
209 | |||
210 | sctx->count[0] = sctx->count[1] = 0; | ||
211 | |||
212 | return 0; | ||
213 | } | ||
214 | |||
215 | static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash) | ||
216 | { | ||
217 | u8 D[SHA512_DIGEST_SIZE]; | ||
218 | |||
219 | sha512_ssse3_final(desc, D); | ||
220 | |||
221 | memcpy(hash, D, SHA384_DIGEST_SIZE); | ||
222 | memzero_explicit(D, SHA512_DIGEST_SIZE); | ||
223 | |||
224 | return 0; | ||
225 | } | 97 | } |
226 | 98 | ||
227 | static struct shash_alg algs[] = { { | 99 | static struct shash_alg algs[] = { { |
228 | .digestsize = SHA512_DIGEST_SIZE, | 100 | .digestsize = SHA512_DIGEST_SIZE, |
229 | .init = sha512_ssse3_init, | 101 | .init = sha512_base_init, |
230 | .update = sha512_ssse3_update, | 102 | .update = sha512_ssse3_update, |
231 | .final = sha512_ssse3_final, | 103 | .final = sha512_ssse3_final, |
232 | .export = sha512_ssse3_export, | 104 | .finup = sha512_ssse3_finup, |
233 | .import = sha512_ssse3_import, | ||
234 | .descsize = sizeof(struct sha512_state), | 105 | .descsize = sizeof(struct sha512_state), |
235 | .statesize = sizeof(struct sha512_state), | ||
236 | .base = { | 106 | .base = { |
237 | .cra_name = "sha512", | 107 | .cra_name = "sha512", |
238 | .cra_driver_name = "sha512-ssse3", | 108 | .cra_driver_name = "sha512-ssse3", |
@@ -243,13 +113,11 @@ static struct shash_alg algs[] = { { | |||
243 | } | 113 | } |
244 | }, { | 114 | }, { |
245 | .digestsize = SHA384_DIGEST_SIZE, | 115 | .digestsize = SHA384_DIGEST_SIZE, |
246 | .init = sha384_ssse3_init, | 116 | .init = sha384_base_init, |
247 | .update = sha512_ssse3_update, | 117 | .update = sha512_ssse3_update, |
248 | .final = sha384_ssse3_final, | 118 | .final = sha512_ssse3_final, |
249 | .export = sha512_ssse3_export, | 119 | .finup = sha512_ssse3_finup, |
250 | .import = sha512_ssse3_import, | ||
251 | .descsize = sizeof(struct sha512_state), | 120 | .descsize = sizeof(struct sha512_state), |
252 | .statesize = sizeof(struct sha512_state), | ||
253 | .base = { | 121 | .base = { |
254 | .cra_name = "sha384", | 122 | .cra_name = "sha384", |
255 | .cra_driver_name = "sha384-ssse3", | 123 | .cra_driver_name = "sha384-ssse3", |
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index a039d21986a2..a350c990dc86 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S | |||
@@ -264,7 +264,7 @@ ENTRY(twofish_enc_blk) | |||
264 | movq R1, 8(%rsi) | 264 | movq R1, 8(%rsi) |
265 | 265 | ||
266 | popq R1 | 266 | popq R1 |
267 | movq $1,%rax | 267 | movl $1,%eax |
268 | ret | 268 | ret |
269 | ENDPROC(twofish_enc_blk) | 269 | ENDPROC(twofish_enc_blk) |
270 | 270 | ||
@@ -316,6 +316,6 @@ ENTRY(twofish_dec_blk) | |||
316 | movq R1, 8(%rsi) | 316 | movq R1, 8(%rsi) |
317 | 317 | ||
318 | popq R1 | 318 | popq R1 |
319 | movq $1,%rax | 319 | movl $1,%eax |
320 | ret | 320 | ret |
321 | ENDPROC(twofish_dec_blk) | 321 | ENDPROC(twofish_dec_blk) |
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 1ac531ea9bcc..b5e2d5651851 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c | |||
@@ -340,7 +340,8 @@ static struct crypto_alg twofish_algs[10] = { { | |||
340 | .cra_name = "__ecb-twofish-avx", | 340 | .cra_name = "__ecb-twofish-avx", |
341 | .cra_driver_name = "__driver-ecb-twofish-avx", | 341 | .cra_driver_name = "__driver-ecb-twofish-avx", |
342 | .cra_priority = 0, | 342 | .cra_priority = 0, |
343 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 343 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
344 | CRYPTO_ALG_INTERNAL, | ||
344 | .cra_blocksize = TF_BLOCK_SIZE, | 345 | .cra_blocksize = TF_BLOCK_SIZE, |
345 | .cra_ctxsize = sizeof(struct twofish_ctx), | 346 | .cra_ctxsize = sizeof(struct twofish_ctx), |
346 | .cra_alignmask = 0, | 347 | .cra_alignmask = 0, |
@@ -359,7 +360,8 @@ static struct crypto_alg twofish_algs[10] = { { | |||
359 | .cra_name = "__cbc-twofish-avx", | 360 | .cra_name = "__cbc-twofish-avx", |
360 | .cra_driver_name = "__driver-cbc-twofish-avx", | 361 | .cra_driver_name = "__driver-cbc-twofish-avx", |
361 | .cra_priority = 0, | 362 | .cra_priority = 0, |
362 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 363 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
364 | CRYPTO_ALG_INTERNAL, | ||
363 | .cra_blocksize = TF_BLOCK_SIZE, | 365 | .cra_blocksize = TF_BLOCK_SIZE, |
364 | .cra_ctxsize = sizeof(struct twofish_ctx), | 366 | .cra_ctxsize = sizeof(struct twofish_ctx), |
365 | .cra_alignmask = 0, | 367 | .cra_alignmask = 0, |
@@ -378,7 +380,8 @@ static struct crypto_alg twofish_algs[10] = { { | |||
378 | .cra_name = "__ctr-twofish-avx", | 380 | .cra_name = "__ctr-twofish-avx", |
379 | .cra_driver_name = "__driver-ctr-twofish-avx", | 381 | .cra_driver_name = "__driver-ctr-twofish-avx", |
380 | .cra_priority = 0, | 382 | .cra_priority = 0, |
381 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 383 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
384 | CRYPTO_ALG_INTERNAL, | ||
382 | .cra_blocksize = 1, | 385 | .cra_blocksize = 1, |
383 | .cra_ctxsize = sizeof(struct twofish_ctx), | 386 | .cra_ctxsize = sizeof(struct twofish_ctx), |
384 | .cra_alignmask = 0, | 387 | .cra_alignmask = 0, |
@@ -398,7 +401,8 @@ static struct crypto_alg twofish_algs[10] = { { | |||
398 | .cra_name = "__lrw-twofish-avx", | 401 | .cra_name = "__lrw-twofish-avx", |
399 | .cra_driver_name = "__driver-lrw-twofish-avx", | 402 | .cra_driver_name = "__driver-lrw-twofish-avx", |
400 | .cra_priority = 0, | 403 | .cra_priority = 0, |
401 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 404 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
405 | CRYPTO_ALG_INTERNAL, | ||
402 | .cra_blocksize = TF_BLOCK_SIZE, | 406 | .cra_blocksize = TF_BLOCK_SIZE, |
403 | .cra_ctxsize = sizeof(struct twofish_lrw_ctx), | 407 | .cra_ctxsize = sizeof(struct twofish_lrw_ctx), |
404 | .cra_alignmask = 0, | 408 | .cra_alignmask = 0, |
@@ -421,7 +425,8 @@ static struct crypto_alg twofish_algs[10] = { { | |||
421 | .cra_name = "__xts-twofish-avx", | 425 | .cra_name = "__xts-twofish-avx", |
422 | .cra_driver_name = "__driver-xts-twofish-avx", | 426 | .cra_driver_name = "__driver-xts-twofish-avx", |
423 | .cra_priority = 0, | 427 | .cra_priority = 0, |
424 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | 428 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | |
429 | CRYPTO_ALG_INTERNAL, | ||
425 | .cra_blocksize = TF_BLOCK_SIZE, | 430 | .cra_blocksize = TF_BLOCK_SIZE, |
426 | .cra_ctxsize = sizeof(struct twofish_xts_ctx), | 431 | .cra_ctxsize = sizeof(struct twofish_xts_ctx), |
427 | .cra_alignmask = 0, | 432 | .cra_alignmask = 0, |
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index e785b422b766..bb635c641869 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile | |||
@@ -3,7 +3,6 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o | 5 | obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o |
6 | obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o | ||
7 | 6 | ||
8 | obj-$(CONFIG_IA32_AOUT) += ia32_aout.o | 7 | obj-$(CONFIG_IA32_AOUT) += ia32_aout.o |
9 | 8 | ||
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index d0165c9a2932..c81d35e6c7f1 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c | |||
@@ -161,8 +161,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) | |||
161 | } | 161 | } |
162 | 162 | ||
163 | static int ia32_restore_sigcontext(struct pt_regs *regs, | 163 | static int ia32_restore_sigcontext(struct pt_regs *regs, |
164 | struct sigcontext_ia32 __user *sc, | 164 | struct sigcontext_ia32 __user *sc) |
165 | unsigned int *pax) | ||
166 | { | 165 | { |
167 | unsigned int tmpflags, err = 0; | 166 | unsigned int tmpflags, err = 0; |
168 | void __user *buf; | 167 | void __user *buf; |
@@ -184,7 +183,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, | |||
184 | RELOAD_SEG(es); | 183 | RELOAD_SEG(es); |
185 | 184 | ||
186 | COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); | 185 | COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); |
187 | COPY(dx); COPY(cx); COPY(ip); | 186 | COPY(dx); COPY(cx); COPY(ip); COPY(ax); |
188 | /* Don't touch extended registers */ | 187 | /* Don't touch extended registers */ |
189 | 188 | ||
190 | COPY_SEG_CPL3(cs); | 189 | COPY_SEG_CPL3(cs); |
@@ -197,12 +196,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, | |||
197 | 196 | ||
198 | get_user_ex(tmp, &sc->fpstate); | 197 | get_user_ex(tmp, &sc->fpstate); |
199 | buf = compat_ptr(tmp); | 198 | buf = compat_ptr(tmp); |
200 | |||
201 | get_user_ex(*pax, &sc->ax); | ||
202 | } get_user_catch(err); | 199 | } get_user_catch(err); |
203 | 200 | ||
204 | err |= restore_xstate_sig(buf, 1); | 201 | err |= restore_xstate_sig(buf, 1); |
205 | 202 | ||
203 | force_iret(); | ||
204 | |||
206 | return err; | 205 | return err; |
207 | } | 206 | } |
208 | 207 | ||
@@ -211,7 +210,6 @@ asmlinkage long sys32_sigreturn(void) | |||
211 | struct pt_regs *regs = current_pt_regs(); | 210 | struct pt_regs *regs = current_pt_regs(); |
212 | struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); | 211 | struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); |
213 | sigset_t set; | 212 | sigset_t set; |
214 | unsigned int ax; | ||
215 | 213 | ||
216 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) | 214 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) |
217 | goto badframe; | 215 | goto badframe; |
@@ -224,9 +222,9 @@ asmlinkage long sys32_sigreturn(void) | |||
224 | 222 | ||
225 | set_current_blocked(&set); | 223 | set_current_blocked(&set); |
226 | 224 | ||
227 | if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) | 225 | if (ia32_restore_sigcontext(regs, &frame->sc)) |
228 | goto badframe; | 226 | goto badframe; |
229 | return ax; | 227 | return regs->ax; |
230 | 228 | ||
231 | badframe: | 229 | badframe: |
232 | signal_fault(regs, frame, "32bit sigreturn"); | 230 | signal_fault(regs, frame, "32bit sigreturn"); |
@@ -238,7 +236,6 @@ asmlinkage long sys32_rt_sigreturn(void) | |||
238 | struct pt_regs *regs = current_pt_regs(); | 236 | struct pt_regs *regs = current_pt_regs(); |
239 | struct rt_sigframe_ia32 __user *frame; | 237 | struct rt_sigframe_ia32 __user *frame; |
240 | sigset_t set; | 238 | sigset_t set; |
241 | unsigned int ax; | ||
242 | 239 | ||
243 | frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); | 240 | frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); |
244 | 241 | ||
@@ -249,13 +246,13 @@ asmlinkage long sys32_rt_sigreturn(void) | |||
249 | 246 | ||
250 | set_current_blocked(&set); | 247 | set_current_blocked(&set); |
251 | 248 | ||
252 | if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) | 249 | if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) |
253 | goto badframe; | 250 | goto badframe; |
254 | 251 | ||
255 | if (compat_restore_altstack(&frame->uc.uc_stack)) | 252 | if (compat_restore_altstack(&frame->uc.uc_stack)) |
256 | goto badframe; | 253 | goto badframe; |
257 | 254 | ||
258 | return ax; | 255 | return regs->ax; |
259 | 256 | ||
260 | badframe: | 257 | badframe: |
261 | signal_fault(regs, frame, "32bit rt sigreturn"); | 258 | signal_fault(regs, frame, "32bit rt sigreturn"); |
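ia32_restore_sigcontext() used to leave eax out of the pt_regs restore and report it through a separate *pax argument, which the two sigreturn paths then returned as the syscall result. The rework restores ax together with the other registers (COPY(ax)), returns the freshly restored regs->ax, and calls force_iret() so the return to user space takes the full IRET path that honors every field the signal frame may have changed. Returning regs->ax works because the syscall exit code stores the return value back into regs->ax, which here is a no-op. A condensed sketch follows; the helper name and error value are illustrative, not the kernel's:

	#include <linux/compiler.h>
	#include <linux/errno.h>
	#include <asm/ptrace.h>
	#include <asm/sigcontext32.h>

	/* Stand-in for ia32_restore_sigcontext(): it also fills regs->ax now. */
	extern int example_restore_sigcontext(struct pt_regs *regs,
					      struct sigcontext_ia32 __user *sc);

	static long example_sigreturn(struct pt_regs *regs,
				      struct sigcontext_ia32 __user *sc)
	{
		if (example_restore_sigcontext(regs, sc))
			return -EFAULT;		/* real code reports a signal fault */

		return regs->ax;	/* written back into regs->ax on exit: a no-op */
	}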
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 156ebcab4ada..a821b1cd4fa7 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S | |||
@@ -30,24 +30,13 @@ | |||
30 | 30 | ||
31 | .section .entry.text, "ax" | 31 | .section .entry.text, "ax" |
32 | 32 | ||
33 | .macro IA32_ARG_FIXUP noebp=0 | 33 | /* clobbers %rax */ |
34 | movl %edi,%r8d | 34 | .macro CLEAR_RREGS _r9=rax |
35 | .if \noebp | ||
36 | .else | ||
37 | movl %ebp,%r9d | ||
38 | .endif | ||
39 | xchg %ecx,%esi | ||
40 | movl %ebx,%edi | ||
41 | movl %edx,%edx /* zero extension */ | ||
42 | .endm | ||
43 | |||
44 | /* clobbers %eax */ | ||
45 | .macro CLEAR_RREGS offset=0, _r9=rax | ||
46 | xorl %eax,%eax | 35 | xorl %eax,%eax |
47 | movq %rax,\offset+R11(%rsp) | 36 | movq %rax,R11(%rsp) |
48 | movq %rax,\offset+R10(%rsp) | 37 | movq %rax,R10(%rsp) |
49 | movq %\_r9,\offset+R9(%rsp) | 38 | movq %\_r9,R9(%rsp) |
50 | movq %rax,\offset+R8(%rsp) | 39 | movq %rax,R8(%rsp) |
51 | .endm | 40 | .endm |
52 | 41 | ||
53 | /* | 42 | /* |
@@ -60,14 +49,14 @@ | |||
60 | * If it's -1 to make us punt the syscall, then (u32)-1 is still | 49 | * If it's -1 to make us punt the syscall, then (u32)-1 is still |
61 | * an appropriately invalid value. | 50 | * an appropriately invalid value. |
62 | */ | 51 | */ |
63 | .macro LOAD_ARGS32 offset, _r9=0 | 52 | .macro LOAD_ARGS32 _r9=0 |
64 | .if \_r9 | 53 | .if \_r9 |
65 | movl \offset+16(%rsp),%r9d | 54 | movl R9(%rsp),%r9d |
66 | .endif | 55 | .endif |
67 | movl \offset+40(%rsp),%ecx | 56 | movl RCX(%rsp),%ecx |
68 | movl \offset+48(%rsp),%edx | 57 | movl RDX(%rsp),%edx |
69 | movl \offset+56(%rsp),%esi | 58 | movl RSI(%rsp),%esi |
70 | movl \offset+64(%rsp),%edi | 59 | movl RDI(%rsp),%edi |
71 | movl %eax,%eax /* zero extension */ | 60 | movl %eax,%eax /* zero extension */ |
72 | .endm | 61 | .endm |
73 | 62 | ||
@@ -99,54 +88,69 @@ ENDPROC(native_irq_enable_sysexit) | |||
99 | /* | 88 | /* |
100 | * 32bit SYSENTER instruction entry. | 89 | * 32bit SYSENTER instruction entry. |
101 | * | 90 | * |
91 | * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. | ||
92 | * IF and VM in rflags are cleared (IOW: interrupts are off). | ||
93 | * SYSENTER does not save anything on the stack, | ||
94 | * and does not save old rip (!!!) and rflags. | ||
95 | * | ||
102 | * Arguments: | 96 | * Arguments: |
103 | * %eax System call number. | 97 | * eax system call number |
104 | * %ebx Arg1 | 98 | * ebx arg1 |
105 | * %ecx Arg2 | 99 | * ecx arg2 |
106 | * %edx Arg3 | 100 | * edx arg3 |
107 | * %esi Arg4 | 101 | * esi arg4 |
108 | * %edi Arg5 | 102 | * edi arg5 |
109 | * %ebp user stack | 103 | * ebp user stack |
110 | * 0(%ebp) Arg6 | 104 | * 0(%ebp) arg6 |
111 | * | 105 | * |
112 | * Interrupts off. | ||
113 | * | ||
114 | * This is purely a fast path. For anything complicated we use the int 0x80 | 106 | * This is purely a fast path. For anything complicated we use the int 0x80 |
115 | * path below. Set up a complete hardware stack frame to share code | 107 | * path below. We set up a complete hardware stack frame to share code |
116 | * with the int 0x80 path. | 108 | * with the int 0x80 path. |
117 | */ | 109 | */ |
118 | ENTRY(ia32_sysenter_target) | 110 | ENTRY(ia32_sysenter_target) |
119 | CFI_STARTPROC32 simple | 111 | CFI_STARTPROC32 simple |
120 | CFI_SIGNAL_FRAME | 112 | CFI_SIGNAL_FRAME |
121 | CFI_DEF_CFA rsp,0 | 113 | CFI_DEF_CFA rsp,0 |
122 | CFI_REGISTER rsp,rbp | 114 | CFI_REGISTER rsp,rbp |
123 | SWAPGS_UNSAFE_STACK | 115 | |
124 | movq PER_CPU_VAR(kernel_stack), %rsp | ||
125 | addq $(KERNEL_STACK_OFFSET),%rsp | ||
126 | /* | 116 | /* |
127 | * No need to follow this irqs on/off section: the syscall | 117 | * Interrupts are off on entry. |
128 | * disabled irqs, here we enable it straight after entry: | 118 | * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, |
119 | * it is too small to ever cause noticeable irq latency. | ||
129 | */ | 120 | */ |
121 | SWAPGS_UNSAFE_STACK | ||
122 | movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp | ||
130 | ENABLE_INTERRUPTS(CLBR_NONE) | 123 | ENABLE_INTERRUPTS(CLBR_NONE) |
131 | movl %ebp,%ebp /* zero extension */ | 124 | |
132 | pushq_cfi $__USER32_DS | 125 | /* Zero-extending 32-bit regs, do not remove */ |
133 | /*CFI_REL_OFFSET ss,0*/ | 126 | movl %ebp, %ebp |
134 | pushq_cfi %rbp | ||
135 | CFI_REL_OFFSET rsp,0 | ||
136 | pushfq_cfi | ||
137 | /*CFI_REL_OFFSET rflags,0*/ | ||
138 | movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d | ||
139 | CFI_REGISTER rip,r10 | ||
140 | pushq_cfi $__USER32_CS | ||
141 | /*CFI_REL_OFFSET cs,0*/ | ||
142 | movl %eax, %eax | 127 | movl %eax, %eax |
143 | pushq_cfi %r10 | 128 | |
144 | CFI_REL_OFFSET rip,0 | 129 | movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d |
145 | pushq_cfi %rax | 130 | CFI_REGISTER rip,r10 |
131 | |||
132 | /* Construct struct pt_regs on stack */ | ||
133 | pushq_cfi $__USER32_DS /* pt_regs->ss */ | ||
134 | pushq_cfi %rbp /* pt_regs->sp */ | ||
135 | CFI_REL_OFFSET rsp,0 | ||
136 | pushfq_cfi /* pt_regs->flags */ | ||
137 | pushq_cfi $__USER32_CS /* pt_regs->cs */ | ||
138 | pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */ | ||
139 | CFI_REL_OFFSET rip,0 | ||
140 | pushq_cfi_reg rax /* pt_regs->orig_ax */ | ||
141 | pushq_cfi_reg rdi /* pt_regs->di */ | ||
142 | pushq_cfi_reg rsi /* pt_regs->si */ | ||
143 | pushq_cfi_reg rdx /* pt_regs->dx */ | ||
144 | pushq_cfi_reg rcx /* pt_regs->cx */ | ||
145 | pushq_cfi_reg rax /* pt_regs->ax */ | ||
146 | cld | 146 | cld |
147 | SAVE_ARGS 0,1,0 | 147 | sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ |
148 | /* no need to do an access_ok check here because rbp has been | 148 | CFI_ADJUST_CFA_OFFSET 10*8 |
149 | 32bit zero extended */ | 149 | |
150 | /* | ||
151 | * no need to do an access_ok check here because rbp has been | ||
152 | * 32bit zero extended | ||
153 | */ | ||
150 | ASM_STAC | 154 | ASM_STAC |
151 | 1: movl (%rbp),%ebp | 155 | 1: movl (%rbp),%ebp |
152 | _ASM_EXTABLE(1b,ia32_badarg) | 156 | _ASM_EXTABLE(1b,ia32_badarg) |
@@ -157,42 +161,80 @@ ENTRY(ia32_sysenter_target) | |||
157 | * ourselves. To save a few cycles, we can check whether | 161 | * ourselves. To save a few cycles, we can check whether |
158 | * NT was set instead of doing an unconditional popfq. | 162 | * NT was set instead of doing an unconditional popfq. |
159 | */ | 163 | */ |
160 | testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp) | 164 | testl $X86_EFLAGS_NT,EFLAGS(%rsp) |
161 | jnz sysenter_fix_flags | 165 | jnz sysenter_fix_flags |
162 | sysenter_flags_fixed: | 166 | sysenter_flags_fixed: |
163 | 167 | ||
164 | orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 168 | orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) |
165 | testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 169 | testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
166 | CFI_REMEMBER_STATE | 170 | CFI_REMEMBER_STATE |
167 | jnz sysenter_tracesys | 171 | jnz sysenter_tracesys |
168 | cmpq $(IA32_NR_syscalls-1),%rax | 172 | cmpq $(IA32_NR_syscalls-1),%rax |
169 | ja ia32_badsys | 173 | ja ia32_badsys |
170 | sysenter_do_call: | 174 | sysenter_do_call: |
171 | IA32_ARG_FIXUP | 175 | /* 32bit syscall -> 64bit C ABI argument conversion */ |
176 | movl %edi,%r8d /* arg5 */ | ||
177 | movl %ebp,%r9d /* arg6 */ | ||
178 | xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ | ||
179 | movl %ebx,%edi /* arg1 */ | ||
180 | movl %edx,%edx /* arg3 (zero extension) */ | ||
172 | sysenter_dispatch: | 181 | sysenter_dispatch: |
173 | call *ia32_sys_call_table(,%rax,8) | 182 | call *ia32_sys_call_table(,%rax,8) |
174 | movq %rax,RAX-ARGOFFSET(%rsp) | 183 | movq %rax,RAX(%rsp) |
175 | DISABLE_INTERRUPTS(CLBR_NONE) | 184 | DISABLE_INTERRUPTS(CLBR_NONE) |
176 | TRACE_IRQS_OFF | 185 | TRACE_IRQS_OFF |
177 | testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 186 | testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
178 | jnz sysexit_audit | 187 | jnz sysexit_audit |
179 | sysexit_from_sys_call: | 188 | sysexit_from_sys_call: |
180 | andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 189 | /* |
181 | /* clear IF, that popfq doesn't enable interrupts early */ | 190 | * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an |
182 | andl $~0x200,EFLAGS-ARGOFFSET(%rsp) | 191 | * NMI between STI and SYSEXIT has poorly specified behavior, |
183 | movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */ | 192 | * and an NMI followed by an IRQ with usergs is fatal. So |
184 | CFI_REGISTER rip,rdx | 193 | * we just pretend we're using SYSEXIT but we really use |
185 | RESTORE_ARGS 0,24,0,0,0,0 | 194 | * SYSRETL instead. |
195 | * | ||
196 | * This code path is still called 'sysexit' because it pairs | ||
197 | * with 'sysenter' and it uses the SYSENTER calling convention. | ||
198 | */ | ||
199 | andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) | ||
200 | movl RIP(%rsp),%ecx /* User %eip */ | ||
201 | CFI_REGISTER rip,rcx | ||
202 | RESTORE_RSI_RDI | ||
203 | xorl %edx,%edx /* avoid info leaks */ | ||
186 | xorq %r8,%r8 | 204 | xorq %r8,%r8 |
187 | xorq %r9,%r9 | 205 | xorq %r9,%r9 |
188 | xorq %r10,%r10 | 206 | xorq %r10,%r10 |
189 | xorq %r11,%r11 | 207 | movl EFLAGS(%rsp),%r11d /* User eflags */ |
190 | popfq_cfi | ||
191 | /*CFI_RESTORE rflags*/ | 208 | /*CFI_RESTORE rflags*/ |
192 | popq_cfi %rcx /* User %esp */ | ||
193 | CFI_REGISTER rsp,rcx | ||
194 | TRACE_IRQS_ON | 209 | TRACE_IRQS_ON |
195 | ENABLE_INTERRUPTS_SYSEXIT32 | 210 | |
211 | /* | ||
212 | * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, | ||
213 | * since it avoids a dicey window with interrupts enabled. | ||
214 | */ | ||
215 | movl RSP(%rsp),%esp | ||
216 | |||
217 | /* | ||
218 | * USERGS_SYSRET32 does: | ||
219 | * gsbase = user's gs base | ||
220 | * eip = ecx | ||
221 | * rflags = r11 | ||
222 | * cs = __USER32_CS | ||
223 | * ss = __USER_DS | ||
224 | * | ||
225 | * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: | ||
226 | * | ||
227 | * pop %ebp | ||
228 | * pop %edx | ||
229 | * pop %ecx | ||
230 | * | ||
231 | * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to | ||
232 | * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's | ||
233 | * address (already known to user code), and R12-R15 are | ||
234 | * callee-saved and therefore don't contain any interesting | ||
235 | * kernel data. | ||
236 | */ | ||
237 | USERGS_SYSRET32 | ||
196 | 238 | ||
197 | CFI_RESTORE_STATE | 239 | CFI_RESTORE_STATE |
198 | 240 | ||
@@ -205,18 +247,18 @@ sysexit_from_sys_call: | |||
205 | movl %ebx,%esi /* 2nd arg: 1st syscall arg */ | 247 | movl %ebx,%esi /* 2nd arg: 1st syscall arg */ |
206 | movl %eax,%edi /* 1st arg: syscall number */ | 248 | movl %eax,%edi /* 1st arg: syscall number */ |
207 | call __audit_syscall_entry | 249 | call __audit_syscall_entry |
208 | movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ | 250 | movl RAX(%rsp),%eax /* reload syscall number */ |
209 | cmpq $(IA32_NR_syscalls-1),%rax | 251 | cmpq $(IA32_NR_syscalls-1),%rax |
210 | ja ia32_badsys | 252 | ja ia32_badsys |
211 | movl %ebx,%edi /* reload 1st syscall arg */ | 253 | movl %ebx,%edi /* reload 1st syscall arg */ |
212 | movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ | 254 | movl RCX(%rsp),%esi /* reload 2nd syscall arg */ |
213 | movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ | 255 | movl RDX(%rsp),%edx /* reload 3rd syscall arg */ |
214 | movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */ | 256 | movl RSI(%rsp),%ecx /* reload 4th syscall arg */ |
215 | movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ | 257 | movl RDI(%rsp),%r8d /* reload 5th syscall arg */ |
216 | .endm | 258 | .endm |
217 | 259 | ||
218 | .macro auditsys_exit exit | 260 | .macro auditsys_exit exit |
219 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 261 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
220 | jnz ia32_ret_from_sys_call | 262 | jnz ia32_ret_from_sys_call |
221 | TRACE_IRQS_ON | 263 | TRACE_IRQS_ON |
222 | ENABLE_INTERRUPTS(CLBR_NONE) | 264 | ENABLE_INTERRUPTS(CLBR_NONE) |
@@ -227,13 +269,13 @@ sysexit_from_sys_call: | |||
227 | 1: setbe %al /* 1 if error, 0 if not */ | 269 | 1: setbe %al /* 1 if error, 0 if not */ |
228 | movzbl %al,%edi /* zero-extend that into %edi */ | 270 | movzbl %al,%edi /* zero-extend that into %edi */ |
229 | call __audit_syscall_exit | 271 | call __audit_syscall_exit |
230 | movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ | 272 | movq RAX(%rsp),%rax /* reload syscall return value */ |
231 | movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi | 273 | movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi |
232 | DISABLE_INTERRUPTS(CLBR_NONE) | 274 | DISABLE_INTERRUPTS(CLBR_NONE) |
233 | TRACE_IRQS_OFF | 275 | TRACE_IRQS_OFF |
234 | testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 276 | testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
235 | jz \exit | 277 | jz \exit |
236 | CLEAR_RREGS -ARGOFFSET | 278 | CLEAR_RREGS |
237 | jmp int_with_check | 279 | jmp int_with_check |
238 | .endm | 280 | .endm |
239 | 281 | ||
@@ -253,16 +295,16 @@ sysenter_fix_flags: | |||
253 | 295 | ||
254 | sysenter_tracesys: | 296 | sysenter_tracesys: |
255 | #ifdef CONFIG_AUDITSYSCALL | 297 | #ifdef CONFIG_AUDITSYSCALL |
256 | testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 298 | testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
257 | jz sysenter_auditsys | 299 | jz sysenter_auditsys |
258 | #endif | 300 | #endif |
259 | SAVE_REST | 301 | SAVE_EXTRA_REGS |
260 | CLEAR_RREGS | 302 | CLEAR_RREGS |
261 | movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ | 303 | movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ |
262 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | 304 | movq %rsp,%rdi /* &pt_regs -> arg1 */ |
263 | call syscall_trace_enter | 305 | call syscall_trace_enter |
264 | LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ | 306 | LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ |
265 | RESTORE_REST | 307 | RESTORE_EXTRA_REGS |
266 | cmpq $(IA32_NR_syscalls-1),%rax | 308 | cmpq $(IA32_NR_syscalls-1),%rax |
267 | ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ | 309 | ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ |
268 | jmp sysenter_do_call | 310 | jmp sysenter_do_call |
@@ -272,94 +314,128 @@ ENDPROC(ia32_sysenter_target) | |||
272 | /* | 314 | /* |
273 | * 32bit SYSCALL instruction entry. | 315 | * 32bit SYSCALL instruction entry. |
274 | * | 316 | * |
317 | * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, | ||
318 | * then loads new ss, cs, and rip from previously programmed MSRs. | ||
319 | * rflags gets masked by a value from another MSR (so CLD and CLAC | ||
320 | * are not needed). SYSCALL does not save anything on the stack | ||
321 | * and does not change rsp. | ||
322 | * | ||
323 | * Note: rflags saving+masking-with-MSR happens only in Long mode | ||
324 | * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). | ||
325 | * Don't get confused: rflags saving+masking depends on Long Mode Active bit | ||
326 | * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes | ||
327 | * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). | ||
328 | * | ||
275 | * Arguments: | 329 | * Arguments: |
276 | * %eax System call number. | 330 | * eax system call number |
277 | * %ebx Arg1 | 331 | * ecx return address |
278 | * %ecx return EIP | 332 | * ebx arg1 |
279 | * %edx Arg3 | 333 | * ebp arg2 (note: not saved in the stack frame, should not be touched) |
280 | * %esi Arg4 | 334 | * edx arg3 |
281 | * %edi Arg5 | 335 | * esi arg4 |
282 | * %ebp Arg2 [note: not saved in the stack frame, should not be touched] | 336 | * edi arg5 |
283 | * %esp user stack | 337 | * esp user stack |
284 | * 0(%esp) Arg6 | 338 | * 0(%esp) arg6 |
285 | * | 339 | * |
286 | * Interrupts off. | ||
287 | * | ||
288 | * This is purely a fast path. For anything complicated we use the int 0x80 | 340 | * This is purely a fast path. For anything complicated we use the int 0x80 |
289 | * path below. Set up a complete hardware stack frame to share code | 341 | * path below. We set up a complete hardware stack frame to share code |
290 | * with the int 0x80 path. | 342 | * with the int 0x80 path. |
291 | */ | 343 | */ |
292 | ENTRY(ia32_cstar_target) | 344 | ENTRY(ia32_cstar_target) |
293 | CFI_STARTPROC32 simple | 345 | CFI_STARTPROC32 simple |
294 | CFI_SIGNAL_FRAME | 346 | CFI_SIGNAL_FRAME |
295 | CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET | 347 | CFI_DEF_CFA rsp,0 |
296 | CFI_REGISTER rip,rcx | 348 | CFI_REGISTER rip,rcx |
297 | /*CFI_REGISTER rflags,r11*/ | 349 | /*CFI_REGISTER rflags,r11*/ |
350 | |||
351 | /* | ||
352 | * Interrupts are off on entry. | ||
353 | * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, | ||
354 | * it is too small to ever cause noticeable irq latency. | ||
355 | */ | ||
298 | SWAPGS_UNSAFE_STACK | 356 | SWAPGS_UNSAFE_STACK |
299 | movl %esp,%r8d | 357 | movl %esp,%r8d |
300 | CFI_REGISTER rsp,r8 | 358 | CFI_REGISTER rsp,r8 |
301 | movq PER_CPU_VAR(kernel_stack),%rsp | 359 | movq PER_CPU_VAR(kernel_stack),%rsp |
302 | /* | ||
303 | * No need to follow this irqs on/off section: the syscall | ||
304 | * disabled irqs and here we enable it straight after entry: | ||
305 | */ | ||
306 | ENABLE_INTERRUPTS(CLBR_NONE) | 360 | ENABLE_INTERRUPTS(CLBR_NONE) |
307 | SAVE_ARGS 8,0,0 | 361 | |
308 | movl %eax,%eax /* zero extension */ | 362 | /* Zero-extending 32-bit regs, do not remove */ |
309 | movq %rax,ORIG_RAX-ARGOFFSET(%rsp) | 363 | movl %eax,%eax |
310 | movq %rcx,RIP-ARGOFFSET(%rsp) | 364 | |
311 | CFI_REL_OFFSET rip,RIP-ARGOFFSET | 365 | /* Construct struct pt_regs on stack */ |
312 | movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ | 366 | pushq_cfi $__USER32_DS /* pt_regs->ss */ |
367 | pushq_cfi %r8 /* pt_regs->sp */ | ||
368 | CFI_REL_OFFSET rsp,0 | ||
369 | pushq_cfi %r11 /* pt_regs->flags */ | ||
370 | pushq_cfi $__USER32_CS /* pt_regs->cs */ | ||
371 | pushq_cfi %rcx /* pt_regs->ip */ | ||
372 | CFI_REL_OFFSET rip,0 | ||
373 | pushq_cfi_reg rax /* pt_regs->orig_ax */ | ||
374 | pushq_cfi_reg rdi /* pt_regs->di */ | ||
375 | pushq_cfi_reg rsi /* pt_regs->si */ | ||
376 | pushq_cfi_reg rdx /* pt_regs->dx */ | ||
377 | pushq_cfi_reg rbp /* pt_regs->cx */ | ||
313 | movl %ebp,%ecx | 378 | movl %ebp,%ecx |
314 | movq $__USER32_CS,CS-ARGOFFSET(%rsp) | 379 | pushq_cfi_reg rax /* pt_regs->ax */ |
315 | movq $__USER32_DS,SS-ARGOFFSET(%rsp) | 380 | sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ |
316 | movq %r11,EFLAGS-ARGOFFSET(%rsp) | 381 | CFI_ADJUST_CFA_OFFSET 10*8 |
317 | /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ | 382 | |
318 | movq %r8,RSP-ARGOFFSET(%rsp) | 383 | /* |
319 | CFI_REL_OFFSET rsp,RSP-ARGOFFSET | 384 | * no need to do an access_ok check here because r8 has been |
320 | /* no need to do an access_ok check here because r8 has been | 385 | * 32bit zero extended |
321 | 32bit zero extended */ | 386 | */ |
322 | /* hardware stack frame is complete now */ | ||
323 | ASM_STAC | 387 | ASM_STAC |
324 | 1: movl (%r8),%r9d | 388 | 1: movl (%r8),%r9d |
325 | _ASM_EXTABLE(1b,ia32_badarg) | 389 | _ASM_EXTABLE(1b,ia32_badarg) |
326 | ASM_CLAC | 390 | ASM_CLAC |
327 | orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 391 | orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) |
328 | testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 392 | testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
329 | CFI_REMEMBER_STATE | 393 | CFI_REMEMBER_STATE |
330 | jnz cstar_tracesys | 394 | jnz cstar_tracesys |
331 | cmpq $IA32_NR_syscalls-1,%rax | 395 | cmpq $IA32_NR_syscalls-1,%rax |
332 | ja ia32_badsys | 396 | ja ia32_badsys |
333 | cstar_do_call: | 397 | cstar_do_call: |
334 | IA32_ARG_FIXUP 1 | 398 | /* 32bit syscall -> 64bit C ABI argument conversion */ |
399 | movl %edi,%r8d /* arg5 */ | ||
400 | /* r9 already loaded */ /* arg6 */ | ||
401 | xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ | ||
402 | movl %ebx,%edi /* arg1 */ | ||
403 | movl %edx,%edx /* arg3 (zero extension) */ | ||
335 | cstar_dispatch: | 404 | cstar_dispatch: |
336 | call *ia32_sys_call_table(,%rax,8) | 405 | call *ia32_sys_call_table(,%rax,8) |
337 | movq %rax,RAX-ARGOFFSET(%rsp) | 406 | movq %rax,RAX(%rsp) |
338 | DISABLE_INTERRUPTS(CLBR_NONE) | 407 | DISABLE_INTERRUPTS(CLBR_NONE) |
339 | TRACE_IRQS_OFF | 408 | TRACE_IRQS_OFF |
340 | testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 409 | testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
341 | jnz sysretl_audit | 410 | jnz sysretl_audit |
342 | sysretl_from_sys_call: | 411 | sysretl_from_sys_call: |
343 | andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 412 | andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) |
344 | RESTORE_ARGS 0,-ARG_SKIP,0,0,0 | 413 | RESTORE_RSI_RDI_RDX |
345 | movl RIP-ARGOFFSET(%rsp),%ecx | 414 | movl RIP(%rsp),%ecx |
346 | CFI_REGISTER rip,rcx | 415 | CFI_REGISTER rip,rcx |
347 | movl EFLAGS-ARGOFFSET(%rsp),%r11d | 416 | movl EFLAGS(%rsp),%r11d |
348 | /*CFI_REGISTER rflags,r11*/ | 417 | /*CFI_REGISTER rflags,r11*/ |
349 | xorq %r10,%r10 | 418 | xorq %r10,%r10 |
350 | xorq %r9,%r9 | 419 | xorq %r9,%r9 |
351 | xorq %r8,%r8 | 420 | xorq %r8,%r8 |
352 | TRACE_IRQS_ON | 421 | TRACE_IRQS_ON |
353 | movl RSP-ARGOFFSET(%rsp),%esp | 422 | movl RSP(%rsp),%esp |
354 | CFI_RESTORE rsp | 423 | CFI_RESTORE rsp |
424 | /* | ||
425 | * 64bit->32bit SYSRET restores eip from ecx, | ||
426 | * eflags from r11 (but RF and VM bits are forced to 0), | ||
427 | * cs and ss are loaded from MSRs. | ||
428 | * (Note: 32bit->32bit SYSRET is different: since r11 | ||
429 | * does not exist, it merely sets eflags.IF=1). | ||
430 | */ | ||
355 | USERGS_SYSRET32 | 431 | USERGS_SYSRET32 |
356 | 432 | ||
357 | #ifdef CONFIG_AUDITSYSCALL | 433 | #ifdef CONFIG_AUDITSYSCALL |
358 | cstar_auditsys: | 434 | cstar_auditsys: |
359 | CFI_RESTORE_STATE | 435 | CFI_RESTORE_STATE |
360 | movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ | 436 | movl %r9d,R9(%rsp) /* register to be clobbered by call */ |
361 | auditsys_entry_common | 437 | auditsys_entry_common |
362 | movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ | 438 | movl R9(%rsp),%r9d /* reload 6th syscall arg */ |
363 | jmp cstar_dispatch | 439 | jmp cstar_dispatch |
364 | 440 | ||
365 | sysretl_audit: | 441 | sysretl_audit: |
@@ -368,17 +444,17 @@ sysretl_audit: | |||
368 | 444 | ||
369 | cstar_tracesys: | 445 | cstar_tracesys: |
370 | #ifdef CONFIG_AUDITSYSCALL | 446 | #ifdef CONFIG_AUDITSYSCALL |
371 | testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 447 | testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
372 | jz cstar_auditsys | 448 | jz cstar_auditsys |
373 | #endif | 449 | #endif |
374 | xchgl %r9d,%ebp | 450 | xchgl %r9d,%ebp |
375 | SAVE_REST | 451 | SAVE_EXTRA_REGS |
376 | CLEAR_RREGS 0, r9 | 452 | CLEAR_RREGS r9 |
377 | movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ | 453 | movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ |
378 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | 454 | movq %rsp,%rdi /* &pt_regs -> arg1 */ |
379 | call syscall_trace_enter | 455 | call syscall_trace_enter |
380 | LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ | 456 | LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ |
381 | RESTORE_REST | 457 | RESTORE_EXTRA_REGS |
382 | xchgl %ebp,%r9d | 458 | xchgl %ebp,%r9d |
383 | cmpq $(IA32_NR_syscalls-1),%rax | 459 | cmpq $(IA32_NR_syscalls-1),%rax |
384 | ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ | 460 | ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ |
@@ -391,78 +467,94 @@ ia32_badarg: | |||
391 | jmp ia32_sysret | 467 | jmp ia32_sysret |
392 | CFI_ENDPROC | 468 | CFI_ENDPROC |
393 | 469 | ||
394 | /* | 470 | /* |
395 | * Emulated IA32 system calls via int 0x80. | 471 | * Emulated IA32 system calls via int 0x80. |
396 | * | 472 | * |
397 | * Arguments: | 473 | * Arguments: |
398 | * %eax System call number. | 474 | * eax system call number |
399 | * %ebx Arg1 | 475 | * ebx arg1 |
400 | * %ecx Arg2 | 476 | * ecx arg2 |
401 | * %edx Arg3 | 477 | * edx arg3 |
402 | * %esi Arg4 | 478 | * esi arg4 |
403 | * %edi Arg5 | 479 | * edi arg5 |
404 | * %ebp Arg6 [note: not saved in the stack frame, should not be touched] | 480 | * ebp arg6 (note: not saved in the stack frame, should not be touched) |
405 | * | 481 | * |
406 | * Notes: | 482 | * Notes: |
407 | * Uses the same stack frame as the x86-64 version. | 483 | * Uses the same stack frame as the x86-64 version. |
408 | * All registers except %eax must be saved (but ptrace may violate that) | 484 | * All registers except eax must be saved (but ptrace may violate that). |
409 | * Arguments are zero extended. For system calls that want sign extension and | 485 | * Arguments are zero extended. For system calls that want sign extension and |
410 | * take long arguments a wrapper is needed. Most calls can just be called | 486 | * take long arguments a wrapper is needed. Most calls can just be called |
411 | * directly. | 487 | * directly. |
412 | * Assumes it is only called from user space and entered with interrupts off. | 488 | * Assumes it is only called from user space and entered with interrupts off. |
413 | */ | 489 | */ |
414 | 490 | ||
415 | ENTRY(ia32_syscall) | 491 | ENTRY(ia32_syscall) |
416 | CFI_STARTPROC32 simple | 492 | CFI_STARTPROC32 simple |
417 | CFI_SIGNAL_FRAME | 493 | CFI_SIGNAL_FRAME |
418 | CFI_DEF_CFA rsp,SS+8-RIP | 494 | CFI_DEF_CFA rsp,5*8 |
419 | /*CFI_REL_OFFSET ss,SS-RIP*/ | 495 | /*CFI_REL_OFFSET ss,4*8 */ |
420 | CFI_REL_OFFSET rsp,RSP-RIP | 496 | CFI_REL_OFFSET rsp,3*8 |
421 | /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ | 497 | /*CFI_REL_OFFSET rflags,2*8 */ |
422 | /*CFI_REL_OFFSET cs,CS-RIP*/ | 498 | /*CFI_REL_OFFSET cs,1*8 */ |
423 | CFI_REL_OFFSET rip,RIP-RIP | 499 | CFI_REL_OFFSET rip,0*8 |
424 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 500 | |
425 | SWAPGS | ||
426 | /* | 501 | /* |
427 | * No need to follow this irqs on/off section: the syscall | 502 | * Interrupts are off on entry. |
428 | * disabled irqs and here we enable it straight after entry: | 503 | * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, |
504 | * it is too small to ever cause noticeable irq latency. | ||
429 | */ | 505 | */ |
506 | PARAVIRT_ADJUST_EXCEPTION_FRAME | ||
507 | SWAPGS | ||
430 | ENABLE_INTERRUPTS(CLBR_NONE) | 508 | ENABLE_INTERRUPTS(CLBR_NONE) |
431 | movl %eax,%eax | 509 | |
432 | pushq_cfi %rax | 510 | /* Zero-extending 32-bit regs, do not remove */ |
511 | movl %eax,%eax | ||
512 | |||
513 | /* Construct struct pt_regs on stack (iret frame is already on stack) */ | ||
514 | pushq_cfi_reg rax /* pt_regs->orig_ax */ | ||
515 | pushq_cfi_reg rdi /* pt_regs->di */ | ||
516 | pushq_cfi_reg rsi /* pt_regs->si */ | ||
517 | pushq_cfi_reg rdx /* pt_regs->dx */ | ||
518 | pushq_cfi_reg rcx /* pt_regs->cx */ | ||
519 | pushq_cfi_reg rax /* pt_regs->ax */ | ||
433 | cld | 520 | cld |
434 | /* note the registers are not zero extended to the sf. | 521 | sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ |
435 | this could be a problem. */ | 522 | CFI_ADJUST_CFA_OFFSET 10*8 |
436 | SAVE_ARGS 0,1,0 | 523 | |
437 | orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 524 | orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) |
438 | testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 525 | testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
439 | jnz ia32_tracesys | 526 | jnz ia32_tracesys |
440 | cmpq $(IA32_NR_syscalls-1),%rax | 527 | cmpq $(IA32_NR_syscalls-1),%rax |
441 | ja ia32_badsys | 528 | ja ia32_badsys |
442 | ia32_do_call: | 529 | ia32_do_call: |
443 | IA32_ARG_FIXUP | 530 | /* 32bit syscall -> 64bit C ABI argument conversion */ |
531 | movl %edi,%r8d /* arg5 */ | ||
532 | movl %ebp,%r9d /* arg6 */ | ||
533 | xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ | ||
534 | movl %ebx,%edi /* arg1 */ | ||
535 | movl %edx,%edx /* arg3 (zero extension) */ | ||
444 | call *ia32_sys_call_table(,%rax,8) # xxx: rip relative | 536 | call *ia32_sys_call_table(,%rax,8) # xxx: rip relative |
445 | ia32_sysret: | 537 | ia32_sysret: |
446 | movq %rax,RAX-ARGOFFSET(%rsp) | 538 | movq %rax,RAX(%rsp) |
447 | ia32_ret_from_sys_call: | 539 | ia32_ret_from_sys_call: |
448 | CLEAR_RREGS -ARGOFFSET | 540 | CLEAR_RREGS |
449 | jmp int_ret_from_sys_call | 541 | jmp int_ret_from_sys_call |
450 | 542 | ||
451 | ia32_tracesys: | 543 | ia32_tracesys: |
452 | SAVE_REST | 544 | SAVE_EXTRA_REGS |
453 | CLEAR_RREGS | 545 | CLEAR_RREGS |
454 | movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ | 546 | movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ |
455 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | 547 | movq %rsp,%rdi /* &pt_regs -> arg1 */ |
456 | call syscall_trace_enter | 548 | call syscall_trace_enter |
457 | LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ | 549 | LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ |
458 | RESTORE_REST | 550 | RESTORE_EXTRA_REGS |
459 | cmpq $(IA32_NR_syscalls-1),%rax | 551 | cmpq $(IA32_NR_syscalls-1),%rax |
460 | ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ | 552 | ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ |
461 | jmp ia32_do_call | 553 | jmp ia32_do_call |
462 | END(ia32_syscall) | 554 | END(ia32_syscall) |
463 | 555 | ||
464 | ia32_badsys: | 556 | ia32_badsys: |
465 | movq $0,ORIG_RAX-ARGOFFSET(%rsp) | 557 | movq $0,ORIG_RAX(%rsp) |
466 | movq $-ENOSYS,%rax | 558 | movq $-ENOSYS,%rax |
467 | jmp ia32_sysret | 559 | jmp ia32_sysret |
468 | 560 | ||
@@ -479,8 +571,6 @@ GLOBAL(\label) | |||
479 | 571 | ||
480 | PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn | 572 | PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn |
481 | PTREGSCALL stub32_sigreturn, sys32_sigreturn | 573 | PTREGSCALL stub32_sigreturn, sys32_sigreturn |
482 | PTREGSCALL stub32_execve, compat_sys_execve | ||
483 | PTREGSCALL stub32_execveat, compat_sys_execveat | ||
484 | PTREGSCALL stub32_fork, sys_fork | 574 | PTREGSCALL stub32_fork, sys_fork |
485 | PTREGSCALL stub32_vfork, sys_vfork | 575 | PTREGSCALL stub32_vfork, sys_vfork |
486 | 576 | ||
@@ -492,24 +582,23 @@ GLOBAL(stub32_clone) | |||
492 | 582 | ||
493 | ALIGN | 583 | ALIGN |
494 | ia32_ptregs_common: | 584 | ia32_ptregs_common: |
495 | popq %r11 | ||
496 | CFI_ENDPROC | 585 | CFI_ENDPROC |
497 | CFI_STARTPROC32 simple | 586 | CFI_STARTPROC32 simple |
498 | CFI_SIGNAL_FRAME | 587 | CFI_SIGNAL_FRAME |
499 | CFI_DEF_CFA rsp,SS+8-ARGOFFSET | 588 | CFI_DEF_CFA rsp,SIZEOF_PTREGS |
500 | CFI_REL_OFFSET rax,RAX-ARGOFFSET | 589 | CFI_REL_OFFSET rax,RAX |
501 | CFI_REL_OFFSET rcx,RCX-ARGOFFSET | 590 | CFI_REL_OFFSET rcx,RCX |
502 | CFI_REL_OFFSET rdx,RDX-ARGOFFSET | 591 | CFI_REL_OFFSET rdx,RDX |
503 | CFI_REL_OFFSET rsi,RSI-ARGOFFSET | 592 | CFI_REL_OFFSET rsi,RSI |
504 | CFI_REL_OFFSET rdi,RDI-ARGOFFSET | 593 | CFI_REL_OFFSET rdi,RDI |
505 | CFI_REL_OFFSET rip,RIP-ARGOFFSET | 594 | CFI_REL_OFFSET rip,RIP |
506 | /* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ | 595 | /* CFI_REL_OFFSET cs,CS*/ |
507 | /* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ | 596 | /* CFI_REL_OFFSET rflags,EFLAGS*/ |
508 | CFI_REL_OFFSET rsp,RSP-ARGOFFSET | 597 | CFI_REL_OFFSET rsp,RSP |
509 | /* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ | 598 | /* CFI_REL_OFFSET ss,SS*/ |
510 | SAVE_REST | 599 | SAVE_EXTRA_REGS 8 |
511 | call *%rax | 600 | call *%rax |
512 | RESTORE_REST | 601 | RESTORE_EXTRA_REGS 8 |
513 | jmp ia32_sysret /* misbalances the return cache */ | 602 | ret |
514 | CFI_ENDPROC | 603 | CFI_ENDPROC |
515 | END(ia32_ptregs_common) | 604 | END(ia32_ptregs_common) |
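All three entry points above (sysenter, cstar/SYSCALL and int 0x80) now open-code the same "32bit syscall -> 64bit C ABI argument conversion" that the removed IA32_ARG_FIXUP macro used to hide. A hedged C sketch of that remapping for the int 0x80/sysenter register order (the struct and function names here are illustrative, not kernel symbols; the SYSCALL path only differs in where arg2 and arg6 start out before being shuffled into the same shape):

#include <stdio.h>

/* ia32 passes syscall args in ebx, ecx, edx, esi, edi, ebp; eax is the number */
struct ia32_args { unsigned int bx, cx, dx, si, di, bp; };

/* x86-64 C ABI: arg1..arg6 arrive in rdi, rsi, rdx, rcx, r8, r9 */
typedef long (*sys64_fn)(long, long, long, long, long, long);

static long dispatch_ia32(sys64_fn fn, const struct ia32_args *a)
{
	/* zero-extend each 32-bit value, then reorder for the 64-bit ABI */
	return fn((long)a->bx,	/* arg1: ebx -> rdi */
		  (long)a->cx,	/* arg2: ecx -> rsi */
		  (long)a->dx,	/* arg3: edx -> rdx */
		  (long)a->si,	/* arg4: esi -> rcx */
		  (long)a->di,	/* arg5: edi -> r8  */
		  (long)a->bp);	/* arg6: ebp -> r9  */
}

static long toy_syscall(long a1, long a2, long a3, long a4, long a5, long a6)
{
	printf("%ld %ld %ld %ld %ld %ld\n", a1, a2, a3, a4, a5, a6);
	return 0;
}

int main(void)
{
	const struct ia32_args a = { 1, 2, 3, 4, 5, 6 };

	return (int)dispatch_ia32(toy_syscall, &a);
}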
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c deleted file mode 100644 index 51ecd5b4e787..000000000000 --- a/arch/x86/ia32/nosyscall.c +++ /dev/null | |||
@@ -1,7 +0,0 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/errno.h> | ||
3 | |||
4 | long compat_ni_syscall(void) | ||
5 | { | ||
6 | return -ENOSYS; | ||
7 | } | ||
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 8e0ceecdc957..719cd702b0a4 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c | |||
@@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, | |||
201 | advice); | 201 | advice); |
202 | } | 202 | } |
203 | 203 | ||
204 | long sys32_vm86_warning(void) | ||
205 | { | ||
206 | struct task_struct *me = current; | ||
207 | static char lastcomm[sizeof(me->comm)]; | ||
208 | |||
209 | if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { | ||
210 | compat_printk(KERN_INFO | ||
211 | "%s: vm86 mode not supported on 64 bit kernel\n", | ||
212 | me->comm); | ||
213 | strncpy(lastcomm, me->comm, sizeof(lastcomm)); | ||
214 | } | ||
215 | return -ENOSYS; | ||
216 | } | ||
217 | |||
218 | asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, | 204 | asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, |
219 | size_t count) | 205 | size_t count) |
220 | { | 206 | { |
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c deleted file mode 100644 index 4754ba0f5d9f..000000000000 --- a/arch/x86/ia32/syscall_ia32.c +++ /dev/null | |||
@@ -1,25 +0,0 @@ | |||
1 | /* System call table for ia32 emulation. */ | ||
2 | |||
3 | #include <linux/linkage.h> | ||
4 | #include <linux/sys.h> | ||
5 | #include <linux/cache.h> | ||
6 | #include <asm/asm-offsets.h> | ||
7 | |||
8 | #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ; | ||
9 | #include <asm/syscalls_32.h> | ||
10 | #undef __SYSCALL_I386 | ||
11 | |||
12 | #define __SYSCALL_I386(nr, sym, compat) [nr] = compat, | ||
13 | |||
14 | typedef void (*sys_call_ptr_t)(void); | ||
15 | |||
16 | extern void compat_ni_syscall(void); | ||
17 | |||
18 | const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { | ||
19 | /* | ||
20 | * Smells like a compiler bug -- it doesn't work | ||
21 | * when the & below is removed. | ||
22 | */ | ||
23 | [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall, | ||
24 | #include <asm/syscalls_32.h> | ||
25 | }; | ||
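The deleted table above leans on two GNU C features: a range designator to point every slot at compat_ni_syscall, and the ability of later designated initializers to override earlier ones when <asm/syscalls_32.h> is expanded into the same initializer list. A small self-contained sketch of that pattern with toy handlers (compiles with gcc or clang, typically with a -Woverride-init warning):

#include <stdio.h>

typedef void (*call_ptr_t)(void);

static void ni_call(void)   { puts("-ENOSYS"); }
static void call_one(void)  { puts("handler 1"); }
static void call_four(void) { puts("handler 4"); }

#define MAX_NR 6

/* every slot defaults to ni_call; later designators override specific slots */
static const call_ptr_t table[MAX_NR + 1] = {
	[0 ... MAX_NR] = &ni_call,
	[1] = call_one,
	[4] = call_four,
};

int main(void)
{
	for (int nr = 0; nr <= MAX_NR; nr++)
		table[nr]();
	return 0;
}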
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 372231c22a47..bdf02eeee765 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h | |||
@@ -18,12 +18,63 @@ | |||
18 | .endm | 18 | .endm |
19 | #endif | 19 | #endif |
20 | 20 | ||
21 | .macro altinstruction_entry orig alt feature orig_len alt_len | 21 | .macro altinstruction_entry orig alt feature orig_len alt_len pad_len |
22 | .long \orig - . | 22 | .long \orig - . |
23 | .long \alt - . | 23 | .long \alt - . |
24 | .word \feature | 24 | .word \feature |
25 | .byte \orig_len | 25 | .byte \orig_len |
26 | .byte \alt_len | 26 | .byte \alt_len |
27 | .byte \pad_len | ||
28 | .endm | ||
29 | |||
30 | .macro ALTERNATIVE oldinstr, newinstr, feature | ||
31 | 140: | ||
32 | \oldinstr | ||
33 | 141: | ||
34 | .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 | ||
35 | 142: | ||
36 | |||
37 | .pushsection .altinstructions,"a" | ||
38 | altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b | ||
39 | .popsection | ||
40 | |||
41 | .pushsection .altinstr_replacement,"ax" | ||
42 | 143: | ||
43 | \newinstr | ||
44 | 144: | ||
45 | .popsection | ||
46 | .endm | ||
47 | |||
48 | #define old_len 141b-140b | ||
49 | #define new_len1 144f-143f | ||
50 | #define new_len2 145f-144f | ||
51 | |||
52 | /* | ||
53 | * max without conditionals. Idea adapted from: | ||
54 | * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax | ||
55 | */ | ||
56 | #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) | ||
57 | |||
58 | .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 | ||
59 | 140: | ||
60 | \oldinstr | ||
61 | 141: | ||
62 | .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ | ||
63 | (alt_max_short(new_len1, new_len2) - (old_len)),0x90 | ||
64 | 142: | ||
65 | |||
66 | .pushsection .altinstructions,"a" | ||
67 | altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b | ||
68 | altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b | ||
69 | .popsection | ||
70 | |||
71 | .pushsection .altinstr_replacement,"ax" | ||
72 | 143: | ||
73 | \newinstr1 | ||
74 | 144: | ||
75 | \newinstr2 | ||
76 | 145: | ||
77 | .popsection | ||
27 | .endm | 78 | .endm |
28 | 79 | ||
29 | #endif /* __ASSEMBLY__ */ | 80 | #endif /* __ASSEMBLY__ */ |
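alt_max_short() above picks the larger of the two replacement lengths without a conditional, using the bit-hacks identity max(a,b) = a ^ ((a ^ b) & -(a < b)); the kernel spelling carries an extra negation because it is evaluated by gas, whose relational operators yield -1 for true (see also the s32 remark in the C header below). A standard-C sketch of the same identity, checked exhaustively for small lengths:

#include <assert.h>
#include <stdio.h>

/* branchless max: -(a < b) is an all-ones mask exactly when a < b,
 * so the XOR either swaps in b or leaves a unchanged */
static unsigned int max_branchless(unsigned int a, unsigned int b)
{
	return a ^ ((a ^ b) & -(unsigned int)(a < b));
}

int main(void)
{
	for (unsigned int a = 0; a < 256; a++)
		for (unsigned int b = 0; b < 256; b++)
			assert(max_branchless(a, b) == (a > b ? a : b));
	puts("branchless max agrees with (a > b ? a : b) for 0..255");
	return 0;
}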
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 473bdbee378a..ba32af062f61 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h | |||
@@ -48,8 +48,9 @@ struct alt_instr { | |||
48 | s32 repl_offset; /* offset to replacement instruction */ | 48 | s32 repl_offset; /* offset to replacement instruction */ |
49 | u16 cpuid; /* cpuid bit set for replacement */ | 49 | u16 cpuid; /* cpuid bit set for replacement */ |
50 | u8 instrlen; /* length of original instruction */ | 50 | u8 instrlen; /* length of original instruction */ |
51 | u8 replacementlen; /* length of new instruction, <= instrlen */ | 51 | u8 replacementlen; /* length of new instruction */ |
52 | }; | 52 | u8 padlen; /* length of build-time padding */ |
53 | } __packed; | ||
53 | 54 | ||
54 | extern void alternative_instructions(void); | 55 | extern void alternative_instructions(void); |
55 | extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); | 56 | extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); |
@@ -76,50 +77,69 @@ static inline int alternatives_text_reserved(void *start, void *end) | |||
76 | } | 77 | } |
77 | #endif /* CONFIG_SMP */ | 78 | #endif /* CONFIG_SMP */ |
78 | 79 | ||
79 | #define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" | 80 | #define b_replacement(num) "664"#num |
81 | #define e_replacement(num) "665"#num | ||
80 | 82 | ||
81 | #define b_replacement(number) "663"#number | 83 | #define alt_end_marker "663" |
82 | #define e_replacement(number) "664"#number | 84 | #define alt_slen "662b-661b" |
85 | #define alt_pad_len alt_end_marker"b-662b" | ||
86 | #define alt_total_slen alt_end_marker"b-661b" | ||
87 | #define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" | ||
83 | 88 | ||
84 | #define alt_slen "662b-661b" | 89 | #define __OLDINSTR(oldinstr, num) \ |
85 | #define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" | 90 | "661:\n\t" oldinstr "\n662:\n" \ |
91 | ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \ | ||
92 | "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" | ||
86 | 93 | ||
87 | #define ALTINSTR_ENTRY(feature, number) \ | 94 | #define OLDINSTR(oldinstr, num) \ |
95 | __OLDINSTR(oldinstr, num) \ | ||
96 | alt_end_marker ":\n" | ||
97 | |||
98 | /* | ||
99 | * max without conditionals. Idea adapted from: | ||
100 | * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax | ||
101 | * | ||
102 | * The additional "-" is needed because gas works with s32s. | ||
103 | */ | ||
104 | #define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" | ||
105 | |||
106 | /* | ||
107 | * Pad the second replacement alternative with additional NOPs if it is | ||
108 | * additionally longer than the first replacement alternative. | ||
109 | */ | ||
110 | #define OLDINSTR_2(oldinstr, num1, num2) \ | ||
111 | "661:\n\t" oldinstr "\n662:\n" \ | ||
112 | ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \ | ||
113 | "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \ | ||
114 | alt_end_marker ":\n" | ||
115 | |||
116 | #define ALTINSTR_ENTRY(feature, num) \ | ||
88 | " .long 661b - .\n" /* label */ \ | 117 | " .long 661b - .\n" /* label */ \ |
89 | " .long " b_replacement(number)"f - .\n" /* new instruction */ \ | 118 | " .long " b_replacement(num)"f - .\n" /* new instruction */ \ |
90 | " .word " __stringify(feature) "\n" /* feature bit */ \ | 119 | " .word " __stringify(feature) "\n" /* feature bit */ \ |
91 | " .byte " alt_slen "\n" /* source len */ \ | 120 | " .byte " alt_total_slen "\n" /* source len */ \ |
92 | " .byte " alt_rlen(number) "\n" /* replacement len */ | 121 | " .byte " alt_rlen(num) "\n" /* replacement len */ \ |
93 | 122 | " .byte " alt_pad_len "\n" /* pad len */ | |
94 | #define DISCARD_ENTRY(number) /* rlen <= slen */ \ | ||
95 | " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" | ||
96 | 123 | ||
97 | #define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ | 124 | #define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ |
98 | b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" | 125 | b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t" |
99 | 126 | ||
100 | /* alternative assembly primitive: */ | 127 | /* alternative assembly primitive: */ |
101 | #define ALTERNATIVE(oldinstr, newinstr, feature) \ | 128 | #define ALTERNATIVE(oldinstr, newinstr, feature) \ |
102 | OLDINSTR(oldinstr) \ | 129 | OLDINSTR(oldinstr, 1) \ |
103 | ".pushsection .altinstructions,\"a\"\n" \ | 130 | ".pushsection .altinstructions,\"a\"\n" \ |
104 | ALTINSTR_ENTRY(feature, 1) \ | 131 | ALTINSTR_ENTRY(feature, 1) \ |
105 | ".popsection\n" \ | 132 | ".popsection\n" \ |
106 | ".pushsection .discard,\"aw\",@progbits\n" \ | ||
107 | DISCARD_ENTRY(1) \ | ||
108 | ".popsection\n" \ | ||
109 | ".pushsection .altinstr_replacement, \"ax\"\n" \ | 133 | ".pushsection .altinstr_replacement, \"ax\"\n" \ |
110 | ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ | 134 | ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ |
111 | ".popsection" | 135 | ".popsection" |
112 | 136 | ||
113 | #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ | 137 | #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ |
114 | OLDINSTR(oldinstr) \ | 138 | OLDINSTR_2(oldinstr, 1, 2) \ |
115 | ".pushsection .altinstructions,\"a\"\n" \ | 139 | ".pushsection .altinstructions,\"a\"\n" \ |
116 | ALTINSTR_ENTRY(feature1, 1) \ | 140 | ALTINSTR_ENTRY(feature1, 1) \ |
117 | ALTINSTR_ENTRY(feature2, 2) \ | 141 | ALTINSTR_ENTRY(feature2, 2) \ |
118 | ".popsection\n" \ | 142 | ".popsection\n" \ |
119 | ".pushsection .discard,\"aw\",@progbits\n" \ | ||
120 | DISCARD_ENTRY(1) \ | ||
121 | DISCARD_ENTRY(2) \ | ||
122 | ".popsection\n" \ | ||
123 | ".pushsection .altinstr_replacement, \"ax\"\n" \ | 143 | ".pushsection .altinstr_replacement, \"ax\"\n" \ |
124 | ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ | 144 | ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ |
125 | ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ | 145 | ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ |
@@ -146,6 +166,9 @@ static inline int alternatives_text_reserved(void *start, void *end) | |||
146 | #define alternative(oldinstr, newinstr, feature) \ | 166 | #define alternative(oldinstr, newinstr, feature) \ |
147 | asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") | 167 | asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") |
148 | 168 | ||
169 | #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ | ||
170 | asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") | ||
171 | |||
149 | /* | 172 | /* |
150 | * Alternative inline assembly with input. | 173 | * Alternative inline assembly with input. |
151 | * | 174 | * |
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index efc3b22d896e..976b86a325e5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h | |||
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v) | |||
91 | { | 91 | { |
92 | volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); | 92 | volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); |
93 | 93 | ||
94 | alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, | 94 | alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP, |
95 | ASM_OUTPUT2("=r" (v), "=m" (*addr)), | 95 | ASM_OUTPUT2("=r" (v), "=m" (*addr)), |
96 | ASM_OUTPUT2("0" (v), "m" (*addr))); | 96 | ASM_OUTPUT2("0" (v), "m" (*addr))); |
97 | } | 97 | } |
@@ -204,7 +204,6 @@ extern void clear_local_APIC(void); | |||
204 | extern void disconnect_bsp_APIC(int virt_wire_setup); | 204 | extern void disconnect_bsp_APIC(int virt_wire_setup); |
205 | extern void disable_local_APIC(void); | 205 | extern void disable_local_APIC(void); |
206 | extern void lapic_shutdown(void); | 206 | extern void lapic_shutdown(void); |
207 | extern int verify_local_APIC(void); | ||
208 | extern void sync_Arb_IDs(void); | 207 | extern void sync_Arb_IDs(void); |
209 | extern void init_bsp_APIC(void); | 208 | extern void init_bsp_APIC(void); |
210 | extern void setup_local_APIC(void); | 209 | extern void setup_local_APIC(void); |
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 2ab1eb33106e..959e45b81fe2 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h | |||
@@ -95,13 +95,11 @@ do { \ | |||
95 | * Stop RDTSC speculation. This is needed when you need to use RDTSC | 95 | * Stop RDTSC speculation. This is needed when you need to use RDTSC |
96 | * (or get_cycles or vread that possibly accesses the TSC) in a defined | 96 | * (or get_cycles or vread that possibly accesses the TSC) in a defined |
97 | * code region. | 97 | * code region. |
98 | * | ||
99 | * (Could use an alternative three way for this if there was one.) | ||
100 | */ | 98 | */ |
101 | static __always_inline void rdtsc_barrier(void) | 99 | static __always_inline void rdtsc_barrier(void) |
102 | { | 100 | { |
103 | alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); | 101 | alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, |
104 | alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); | 102 | "lfence", X86_FEATURE_LFENCE_RDTSC); |
105 | } | 103 | } |
106 | 104 | ||
107 | #endif /* _ASM_X86_BARRIER_H */ | 105 | #endif /* _ASM_X86_BARRIER_H */ |
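rdtsc_barrier() now emits plain padding by default and lets the alternatives machinery patch in MFENCE or LFENCE depending on the CPU's *FENCE_RDTSC feature bits. For reference, a hedged userspace analogue of the fence-then-read pattern the barrier exists for, using compiler intrinsics rather than the kernel's patched alternative:

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

/* fence speculation, then read the TSC -- roughly the effect the patched
 * LFENCE alternative has for the kernel's rdtsc_barrier() users */
static inline uint64_t fenced_rdtsc(void)
{
	_mm_lfence();
	return __rdtsc();
}

int main(void)
{
	uint64_t t0 = fenced_rdtsc();
	uint64_t t1 = fenced_rdtsc();

	printf("delta: %llu cycles\n", (unsigned long long)(t1 - t0));
	return 0;
}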
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 1f1297b46f83..1c8b50edb2db 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h | |||
@@ -55,143 +55,157 @@ For 32-bit we have the following conventions - kernel is built with | |||
55 | * for assembly code: | 55 | * for assembly code: |
56 | */ | 56 | */ |
57 | 57 | ||
58 | #define R15 0 | 58 | /* The layout forms the "struct pt_regs" on the stack: */ |
59 | #define R14 8 | 59 | /* |
60 | #define R13 16 | 60 | * C ABI says these regs are callee-preserved. They aren't saved on kernel entry |
61 | #define R12 24 | 61 | * unless syscall needs a complete, fully filled "struct pt_regs". |
62 | #define RBP 32 | 62 | */ |
63 | #define RBX 40 | 63 | #define R15 0*8 |
64 | 64 | #define R14 1*8 | |
65 | /* arguments: interrupts/non tracing syscalls only save up to here: */ | 65 | #define R13 2*8 |
66 | #define R11 48 | 66 | #define R12 3*8 |
67 | #define R10 56 | 67 | #define RBP 4*8 |
68 | #define R9 64 | 68 | #define RBX 5*8 |
69 | #define R8 72 | 69 | /* These regs are callee-clobbered. Always saved on kernel entry. */ |
70 | #define RAX 80 | 70 | #define R11 6*8 |
71 | #define RCX 88 | 71 | #define R10 7*8 |
72 | #define RDX 96 | 72 | #define R9 8*8 |
73 | #define RSI 104 | 73 | #define R8 9*8 |
74 | #define RDI 112 | 74 | #define RAX 10*8 |
75 | #define ORIG_RAX 120 /* + error_code */ | 75 | #define RCX 11*8 |
76 | /* end of arguments */ | 76 | #define RDX 12*8 |
77 | 77 | #define RSI 13*8 | |
78 | /* cpu exception frame or undefined in case of fast syscall: */ | 78 | #define RDI 14*8 |
79 | #define RIP 128 | 79 | /* |
80 | #define CS 136 | 80 | * On syscall entry, this is syscall#. On CPU exception, this is error code. |
81 | #define EFLAGS 144 | 81 | * On hw interrupt, it's IRQ number: |
82 | #define RSP 152 | 82 | */ |
83 | #define SS 160 | 83 | #define ORIG_RAX 15*8 |
84 | 84 | /* Return frame for iretq */ | |
85 | #define ARGOFFSET R11 | 85 | #define RIP 16*8 |
86 | 86 | #define CS 17*8 | |
87 | .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 | 87 | #define EFLAGS 18*8 |
88 | subq $9*8+\addskip, %rsp | 88 | #define RSP 19*8 |
89 | CFI_ADJUST_CFA_OFFSET 9*8+\addskip | 89 | #define SS 20*8 |
90 | movq_cfi rdi, 8*8 | 90 | |
91 | movq_cfi rsi, 7*8 | 91 | #define SIZEOF_PTREGS 21*8 |
92 | movq_cfi rdx, 6*8 | 92 | |
93 | 93 | .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 | |
94 | .if \save_rcx | 94 | subq $15*8+\addskip, %rsp |
95 | movq_cfi rcx, 5*8 | 95 | CFI_ADJUST_CFA_OFFSET 15*8+\addskip |
96 | .endif | 96 | .endm |
97 | 97 | ||
98 | .if \rax_enosys | 98 | .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 |
99 | movq $-ENOSYS, 4*8(%rsp) | 99 | .if \r11 |
100 | .else | 100 | movq_cfi r11, 6*8+\offset |
101 | movq_cfi rax, 4*8 | ||
102 | .endif | 101 | .endif |
103 | 102 | .if \r8910 | |
104 | .if \save_r891011 | 103 | movq_cfi r10, 7*8+\offset |
105 | movq_cfi r8, 3*8 | 104 | movq_cfi r9, 8*8+\offset |
106 | movq_cfi r9, 2*8 | 105 | movq_cfi r8, 9*8+\offset |
107 | movq_cfi r10, 1*8 | 106 | .endif |
108 | movq_cfi r11, 0*8 | 107 | .if \rax |
108 | movq_cfi rax, 10*8+\offset | ||
109 | .endif | ||
110 | .if \rcx | ||
111 | movq_cfi rcx, 11*8+\offset | ||
109 | .endif | 112 | .endif |
113 | movq_cfi rdx, 12*8+\offset | ||
114 | movq_cfi rsi, 13*8+\offset | ||
115 | movq_cfi rdi, 14*8+\offset | ||
116 | .endm | ||
117 | .macro SAVE_C_REGS offset=0 | ||
118 | SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 | ||
119 | .endm | ||
120 | .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 | ||
121 | SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1 | ||
122 | .endm | ||
123 | .macro SAVE_C_REGS_EXCEPT_R891011 | ||
124 | SAVE_C_REGS_HELPER 0, 1, 1, 0, 0 | ||
125 | .endm | ||
126 | .macro SAVE_C_REGS_EXCEPT_RCX_R891011 | ||
127 | SAVE_C_REGS_HELPER 0, 1, 0, 0, 0 | ||
128 | .endm | ||
129 | .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11 | ||
130 | SAVE_C_REGS_HELPER 0, 0, 0, 1, 0 | ||
131 | .endm | ||
132 | |||
133 | .macro SAVE_EXTRA_REGS offset=0 | ||
134 | movq_cfi r15, 0*8+\offset | ||
135 | movq_cfi r14, 1*8+\offset | ||
136 | movq_cfi r13, 2*8+\offset | ||
137 | movq_cfi r12, 3*8+\offset | ||
138 | movq_cfi rbp, 4*8+\offset | ||
139 | movq_cfi rbx, 5*8+\offset | ||
140 | .endm | ||
141 | .macro SAVE_EXTRA_REGS_RBP offset=0 | ||
142 | movq_cfi rbp, 4*8+\offset | ||
143 | .endm | ||
110 | 144 | ||
145 | .macro RESTORE_EXTRA_REGS offset=0 | ||
146 | movq_cfi_restore 0*8+\offset, r15 | ||
147 | movq_cfi_restore 1*8+\offset, r14 | ||
148 | movq_cfi_restore 2*8+\offset, r13 | ||
149 | movq_cfi_restore 3*8+\offset, r12 | ||
150 | movq_cfi_restore 4*8+\offset, rbp | ||
151 | movq_cfi_restore 5*8+\offset, rbx | ||
111 | .endm | 152 | .endm |
112 | 153 | ||
113 | #define ARG_SKIP (9*8) | 154 | .macro ZERO_EXTRA_REGS |
155 | xorl %r15d, %r15d | ||
156 | xorl %r14d, %r14d | ||
157 | xorl %r13d, %r13d | ||
158 | xorl %r12d, %r12d | ||
159 | xorl %ebp, %ebp | ||
160 | xorl %ebx, %ebx | ||
161 | .endm | ||
114 | 162 | ||
115 | .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \ | 163 | .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 |
116 | rstor_r8910=1, rstor_rdx=1 | ||
117 | .if \rstor_r11 | 164 | .if \rstor_r11 |
118 | movq_cfi_restore 0*8, r11 | 165 | movq_cfi_restore 6*8, r11 |
119 | .endif | 166 | .endif |
120 | |||
121 | .if \rstor_r8910 | 167 | .if \rstor_r8910 |
122 | movq_cfi_restore 1*8, r10 | 168 | movq_cfi_restore 7*8, r10 |
123 | movq_cfi_restore 2*8, r9 | 169 | movq_cfi_restore 8*8, r9 |
124 | movq_cfi_restore 3*8, r8 | 170 | movq_cfi_restore 9*8, r8 |
125 | .endif | 171 | .endif |
126 | |||
127 | .if \rstor_rax | 172 | .if \rstor_rax |
128 | movq_cfi_restore 4*8, rax | 173 | movq_cfi_restore 10*8, rax |
129 | .endif | 174 | .endif |
130 | |||
131 | .if \rstor_rcx | 175 | .if \rstor_rcx |
132 | movq_cfi_restore 5*8, rcx | 176 | movq_cfi_restore 11*8, rcx |
133 | .endif | 177 | .endif |
134 | |||
135 | .if \rstor_rdx | 178 | .if \rstor_rdx |
136 | movq_cfi_restore 6*8, rdx | 179 | movq_cfi_restore 12*8, rdx |
137 | .endif | ||
138 | |||
139 | movq_cfi_restore 7*8, rsi | ||
140 | movq_cfi_restore 8*8, rdi | ||
141 | |||
142 | .if ARG_SKIP+\addskip > 0 | ||
143 | addq $ARG_SKIP+\addskip, %rsp | ||
144 | CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip) | ||
145 | .endif | 180 | .endif |
181 | movq_cfi_restore 13*8, rsi | ||
182 | movq_cfi_restore 14*8, rdi | ||
146 | .endm | 183 | .endm |
147 | 184 | .macro RESTORE_C_REGS | |
148 | .macro LOAD_ARGS offset, skiprax=0 | 185 | RESTORE_C_REGS_HELPER 1,1,1,1,1 |
149 | movq \offset(%rsp), %r11 | ||
150 | movq \offset+8(%rsp), %r10 | ||
151 | movq \offset+16(%rsp), %r9 | ||
152 | movq \offset+24(%rsp), %r8 | ||
153 | movq \offset+40(%rsp), %rcx | ||
154 | movq \offset+48(%rsp), %rdx | ||
155 | movq \offset+56(%rsp), %rsi | ||
156 | movq \offset+64(%rsp), %rdi | ||
157 | .if \skiprax | ||
158 | .else | ||
159 | movq \offset+72(%rsp), %rax | ||
160 | .endif | ||
161 | .endm | 186 | .endm |
162 | 187 | .macro RESTORE_C_REGS_EXCEPT_RAX | |
163 | #define REST_SKIP (6*8) | 188 | RESTORE_C_REGS_HELPER 0,1,1,1,1 |
164 | |||
165 | .macro SAVE_REST | ||
166 | subq $REST_SKIP, %rsp | ||
167 | CFI_ADJUST_CFA_OFFSET REST_SKIP | ||
168 | movq_cfi rbx, 5*8 | ||
169 | movq_cfi rbp, 4*8 | ||
170 | movq_cfi r12, 3*8 | ||
171 | movq_cfi r13, 2*8 | ||
172 | movq_cfi r14, 1*8 | ||
173 | movq_cfi r15, 0*8 | ||
174 | .endm | 189 | .endm |
175 | 190 | .macro RESTORE_C_REGS_EXCEPT_RCX | |
176 | .macro RESTORE_REST | 191 | RESTORE_C_REGS_HELPER 1,0,1,1,1 |
177 | movq_cfi_restore 0*8, r15 | ||
178 | movq_cfi_restore 1*8, r14 | ||
179 | movq_cfi_restore 2*8, r13 | ||
180 | movq_cfi_restore 3*8, r12 | ||
181 | movq_cfi_restore 4*8, rbp | ||
182 | movq_cfi_restore 5*8, rbx | ||
183 | addq $REST_SKIP, %rsp | ||
184 | CFI_ADJUST_CFA_OFFSET -(REST_SKIP) | ||
185 | .endm | 192 | .endm |
186 | 193 | .macro RESTORE_C_REGS_EXCEPT_R11 | |
187 | .macro SAVE_ALL | 194 | RESTORE_C_REGS_HELPER 1,1,0,1,1 |
188 | SAVE_ARGS | 195 | .endm |
189 | SAVE_REST | 196 | .macro RESTORE_C_REGS_EXCEPT_RCX_R11 |
197 | RESTORE_C_REGS_HELPER 1,0,0,1,1 | ||
198 | .endm | ||
199 | .macro RESTORE_RSI_RDI | ||
200 | RESTORE_C_REGS_HELPER 0,0,0,0,0 | ||
201 | .endm | ||
202 | .macro RESTORE_RSI_RDI_RDX | ||
203 | RESTORE_C_REGS_HELPER 0,0,0,0,1 | ||
190 | .endm | 204 | .endm |
191 | 205 | ||
192 | .macro RESTORE_ALL addskip=0 | 206 | .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 |
193 | RESTORE_REST | 207 | addq $15*8+\addskip, %rsp |
194 | RESTORE_ARGS 1, \addskip | 208 | CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) |
195 | .endm | 209 | .endm |
196 | 210 | ||
197 | .macro icebp | 211 | .macro icebp |
@@ -210,37 +224,23 @@ For 32-bit we have the following conventions - kernel is built with | |||
210 | */ | 224 | */ |
211 | 225 | ||
212 | .macro SAVE_ALL | 226 | .macro SAVE_ALL |
213 | pushl_cfi %eax | 227 | pushl_cfi_reg eax |
214 | CFI_REL_OFFSET eax, 0 | 228 | pushl_cfi_reg ebp |
215 | pushl_cfi %ebp | 229 | pushl_cfi_reg edi |
216 | CFI_REL_OFFSET ebp, 0 | 230 | pushl_cfi_reg esi |
217 | pushl_cfi %edi | 231 | pushl_cfi_reg edx |
218 | CFI_REL_OFFSET edi, 0 | 232 | pushl_cfi_reg ecx |
219 | pushl_cfi %esi | 233 | pushl_cfi_reg ebx |
220 | CFI_REL_OFFSET esi, 0 | ||
221 | pushl_cfi %edx | ||
222 | CFI_REL_OFFSET edx, 0 | ||
223 | pushl_cfi %ecx | ||
224 | CFI_REL_OFFSET ecx, 0 | ||
225 | pushl_cfi %ebx | ||
226 | CFI_REL_OFFSET ebx, 0 | ||
227 | .endm | 234 | .endm |
228 | 235 | ||
229 | .macro RESTORE_ALL | 236 | .macro RESTORE_ALL |
230 | popl_cfi %ebx | 237 | popl_cfi_reg ebx |
231 | CFI_RESTORE ebx | 238 | popl_cfi_reg ecx |
232 | popl_cfi %ecx | 239 | popl_cfi_reg edx |
233 | CFI_RESTORE ecx | 240 | popl_cfi_reg esi |
234 | popl_cfi %edx | 241 | popl_cfi_reg edi |
235 | CFI_RESTORE edx | 242 | popl_cfi_reg ebp |
236 | popl_cfi %esi | 243 | popl_cfi_reg eax |
237 | CFI_RESTORE esi | ||
238 | popl_cfi %edi | ||
239 | CFI_RESTORE edi | ||
240 | popl_cfi %ebp | ||
241 | CFI_RESTORE ebp | ||
242 | popl_cfi %eax | ||
243 | CFI_RESTORE eax | ||
244 | .endm | 244 | .endm |
245 | 245 | ||
246 | #endif /* CONFIG_X86_64 */ | 246 | #endif /* CONFIG_X86_64 */ |
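The R15..SS constants above now spell out the struct pt_regs stack layout directly, with SIZEOF_PTREGS = 21*8 covering the six callee-preserved registers, the nine callee-clobbered ones, orig_ax and the hardware iret frame. As a cross-check, a hedged C mirror of that layout (an illustrative struct, not the kernel's pt_regs declaration) with compile-time assertions on the same offsets:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* illustrative mirror of the stack layout; field order follows the
 * R15..SS offsets defined in calling.h above */
struct ptregs_layout {
	uint64_t r15, r14, r13, r12, bp, bx;		/* callee-preserved */
	uint64_t r11, r10, r9, r8, ax, cx, dx, si, di;	/* callee-clobbered */
	uint64_t orig_ax;				/* syscall#/error code/IRQ# */
	uint64_t ip, cs, flags, sp, ss;			/* iret frame */
};

_Static_assert(offsetof(struct ptregs_layout, bx)      ==  5 * 8, "RBX");
_Static_assert(offsetof(struct ptregs_layout, r11)     ==  6 * 8, "R11");
_Static_assert(offsetof(struct ptregs_layout, ax)      == 10 * 8, "RAX");
_Static_assert(offsetof(struct ptregs_layout, orig_ax) == 15 * 8, "ORIG_RAX");
_Static_assert(offsetof(struct ptregs_layout, ip)      == 16 * 8, "RIP");
_Static_assert(offsetof(struct ptregs_layout, ss)      == 20 * 8, "SS");
_Static_assert(sizeof(struct ptregs_layout)            == 21 * 8, "SIZEOF_PTREGS");

int main(void)
{
	printf("pt_regs frame is %zu bytes\n", sizeof(struct ptregs_layout));
	return 0;
}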
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 59c6c401f79f..acdee09228b3 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h | |||
@@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) | |||
301 | sp = task_pt_regs(current)->sp; | 301 | sp = task_pt_regs(current)->sp; |
302 | } else { | 302 | } else { |
303 | /* -128 for the x32 ABI redzone */ | 303 | /* -128 for the x32 ABI redzone */ |
304 | sp = this_cpu_read(old_rsp) - 128; | 304 | sp = task_pt_regs(current)->sp - 128; |
305 | } | 305 | } |
306 | 306 | ||
307 | return (void __user *)round_down(sp - len, 16); | 307 | return (void __user *)round_down(sp - len, 16); |
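The compat_alloc_user_space() change above takes the user stack pointer from task_pt_regs() instead of the old per-CPU old_rsp, then skips the 128-byte x32 redzone and carves out a 16-byte-aligned area below it. A toy C sketch of that address arithmetic (round_down and the function name here are local stand-ins, not the kernel helpers):

#include <stdint.h>
#include <stdio.h>

#define round_down(x, y) ((x) & ~((uintptr_t)(y) - 1))

/* skip a 128-byte redzone, then carve 'len' bytes, 16-byte aligned,
 * below the user stack pointer */
static uintptr_t alloc_below_sp(uintptr_t sp, size_t len)
{
	sp -= 128;			/* x32 ABI redzone */
	return round_down(sp - len, 16);
}

int main(void)
{
	uintptr_t p = alloc_below_sp(0x7ffffffff000u, 67);

	printf("scratch area at %#lx\n", (unsigned long)p);
	return 0;
}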
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index d2b12988d2ed..bf2caa1dedc5 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h | |||
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action); | |||
34 | #endif | 34 | #endif |
35 | #endif | 35 | #endif |
36 | 36 | ||
37 | DECLARE_PER_CPU(int, cpu_state); | ||
38 | |||
39 | int mwait_usable(const struct cpuinfo_x86 *); | 37 | int mwait_usable(const struct cpuinfo_x86 *); |
40 | 38 | ||
41 | #endif /* _ASM_X86_CPU_H */ | 39 | #endif /* _ASM_X86_CPU_H */ |
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 90a54851aedc..7ee9b94d9921 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <asm/disabled-features.h> | 12 | #include <asm/disabled-features.h> |
13 | #endif | 13 | #endif |
14 | 14 | ||
15 | #define NCAPINTS 11 /* N 32-bit words worth of info */ | 15 | #define NCAPINTS 13 /* N 32-bit words worth of info */ |
16 | #define NBUGINTS 1 /* N 32-bit bug flags */ | 16 | #define NBUGINTS 1 /* N 32-bit bug flags */ |
17 | 17 | ||
18 | /* | 18 | /* |
@@ -195,6 +195,7 @@ | |||
195 | #define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ | 195 | #define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ |
196 | #define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ | 196 | #define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ |
197 | #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ | 197 | #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ |
198 | #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ | ||
198 | 199 | ||
199 | /* Virtualization flags: Linux defined, word 8 */ | 200 | /* Virtualization flags: Linux defined, word 8 */ |
200 | #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ | 201 | #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ |
@@ -226,12 +227,15 @@ | |||
226 | #define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ | 227 | #define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ |
227 | #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ | 228 | #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ |
228 | #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ | 229 | #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ |
230 | #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ | ||
229 | #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ | 231 | #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ |
230 | #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ | 232 | #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ |
231 | #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ | 233 | #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ |
232 | #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ | 234 | #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ |
233 | #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ | 235 | #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ |
236 | #define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ | ||
234 | #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ | 237 | #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ |
238 | #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ | ||
235 | #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ | 239 | #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ |
236 | #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ | 240 | #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ |
237 | #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ | 241 | #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ |
@@ -242,6 +246,12 @@ | |||
242 | #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ | 246 | #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ |
243 | #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ | 247 | #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ |
244 | 248 | ||
249 | /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ | ||
250 | #define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ | ||
251 | |||
252 | /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ | ||
253 | #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ | ||
254 | |||
245 | /* | 255 | /* |
246 | * BUG word(s) | 256 | * BUG word(s) |
247 | */ | 257 | */ |
@@ -418,6 +428,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) | |||
418 | " .word %P0\n" /* 1: do replace */ | 428 | " .word %P0\n" /* 1: do replace */ |
419 | " .byte 2b - 1b\n" /* source len */ | 429 | " .byte 2b - 1b\n" /* source len */ |
420 | " .byte 0\n" /* replacement len */ | 430 | " .byte 0\n" /* replacement len */ |
431 | " .byte 0\n" /* pad len */ | ||
421 | ".previous\n" | 432 | ".previous\n" |
422 | /* skipping size check since replacement size = 0 */ | 433 | /* skipping size check since replacement size = 0 */ |
423 | : : "i" (X86_FEATURE_ALWAYS) : : t_warn); | 434 | : : "i" (X86_FEATURE_ALWAYS) : : t_warn); |
@@ -432,6 +443,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) | |||
432 | " .word %P0\n" /* feature bit */ | 443 | " .word %P0\n" /* feature bit */ |
433 | " .byte 2b - 1b\n" /* source len */ | 444 | " .byte 2b - 1b\n" /* source len */ |
434 | " .byte 0\n" /* replacement len */ | 445 | " .byte 0\n" /* replacement len */ |
446 | " .byte 0\n" /* pad len */ | ||
435 | ".previous\n" | 447 | ".previous\n" |
436 | /* skipping size check since replacement size = 0 */ | 448 | /* skipping size check since replacement size = 0 */ |
437 | : : "i" (bit) : : t_no); | 449 | : : "i" (bit) : : t_no); |
@@ -457,6 +469,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) | |||
457 | " .word %P1\n" /* feature bit */ | 469 | " .word %P1\n" /* feature bit */ |
458 | " .byte 2b - 1b\n" /* source len */ | 470 | " .byte 2b - 1b\n" /* source len */ |
459 | " .byte 4f - 3f\n" /* replacement len */ | 471 | " .byte 4f - 3f\n" /* replacement len */ |
472 | " .byte 0\n" /* pad len */ | ||
460 | ".previous\n" | 473 | ".previous\n" |
461 | ".section .discard,\"aw\",@progbits\n" | 474 | ".section .discard,\"aw\",@progbits\n" |
462 | " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ | 475 | " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ |
@@ -483,31 +496,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) | |||
483 | static __always_inline __pure bool _static_cpu_has_safe(u16 bit) | 496 | static __always_inline __pure bool _static_cpu_has_safe(u16 bit) |
484 | { | 497 | { |
485 | #ifdef CC_HAVE_ASM_GOTO | 498 | #ifdef CC_HAVE_ASM_GOTO |
486 | /* | 499 | asm_volatile_goto("1: jmp %l[t_dynamic]\n" |
487 | * We need to spell the jumps to the compiler because, depending on the offset, | ||
488 | * the replacement jump can be bigger than the original jump, and this we cannot | ||
489 | * have. Thus, we force the jump to the widest, 4-byte, signed relative | ||
490 | * offset even though the last would often fit in less bytes. | ||
491 | */ | ||
492 | asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" | ||
493 | "2:\n" | 500 | "2:\n" |
501 | ".skip -(((5f-4f) - (2b-1b)) > 0) * " | ||
502 | "((5f-4f) - (2b-1b)),0x90\n" | ||
503 | "3:\n" | ||
494 | ".section .altinstructions,\"a\"\n" | 504 | ".section .altinstructions,\"a\"\n" |
495 | " .long 1b - .\n" /* src offset */ | 505 | " .long 1b - .\n" /* src offset */ |
496 | " .long 3f - .\n" /* repl offset */ | 506 | " .long 4f - .\n" /* repl offset */ |
497 | " .word %P1\n" /* always replace */ | 507 | " .word %P1\n" /* always replace */ |
498 | " .byte 2b - 1b\n" /* src len */ | 508 | " .byte 3b - 1b\n" /* src len */ |
499 | " .byte 4f - 3f\n" /* repl len */ | 509 | " .byte 5f - 4f\n" /* repl len */ |
510 | " .byte 3b - 2b\n" /* pad len */ | ||
500 | ".previous\n" | 511 | ".previous\n" |
501 | ".section .altinstr_replacement,\"ax\"\n" | 512 | ".section .altinstr_replacement,\"ax\"\n" |
502 | "3: .byte 0xe9\n .long %l[t_no] - 2b\n" | 513 | "4: jmp %l[t_no]\n" |
503 | "4:\n" | 514 | "5:\n" |
504 | ".previous\n" | 515 | ".previous\n" |
505 | ".section .altinstructions,\"a\"\n" | 516 | ".section .altinstructions,\"a\"\n" |
506 | " .long 1b - .\n" /* src offset */ | 517 | " .long 1b - .\n" /* src offset */ |
507 | " .long 0\n" /* no replacement */ | 518 | " .long 0\n" /* no replacement */ |
508 | " .word %P0\n" /* feature bit */ | 519 | " .word %P0\n" /* feature bit */ |
509 | " .byte 2b - 1b\n" /* src len */ | 520 | " .byte 3b - 1b\n" /* src len */ |
510 | " .byte 0\n" /* repl len */ | 521 | " .byte 0\n" /* repl len */ |
522 | " .byte 0\n" /* pad len */ | ||
511 | ".previous\n" | 523 | ".previous\n" |
512 | : : "i" (bit), "i" (X86_FEATURE_ALWAYS) | 524 | : : "i" (bit), "i" (X86_FEATURE_ALWAYS) |
513 | : : t_dynamic, t_no); | 525 | : : t_dynamic, t_no); |
@@ -527,6 +539,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) | |||
527 | " .word %P2\n" /* always replace */ | 539 | " .word %P2\n" /* always replace */ |
528 | " .byte 2b - 1b\n" /* source len */ | 540 | " .byte 2b - 1b\n" /* source len */ |
529 | " .byte 4f - 3f\n" /* replacement len */ | 541 | " .byte 4f - 3f\n" /* replacement len */ |
542 | " .byte 0\n" /* pad len */ | ||
530 | ".previous\n" | 543 | ".previous\n" |
531 | ".section .discard,\"aw\",@progbits\n" | 544 | ".section .discard,\"aw\",@progbits\n" |
532 | " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ | 545 | " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ |
@@ -541,6 +554,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) | |||
541 | " .word %P1\n" /* feature bit */ | 554 | " .word %P1\n" /* feature bit */ |
542 | " .byte 4b - 3b\n" /* src len */ | 555 | " .byte 4b - 3b\n" /* src len */ |
543 | " .byte 6f - 5f\n" /* repl len */ | 556 | " .byte 6f - 5f\n" /* repl len */ |
557 | " .byte 0\n" /* pad len */ | ||
544 | ".previous\n" | 558 | ".previous\n" |
545 | ".section .discard,\"aw\",@progbits\n" | 559 | ".section .discard,\"aw\",@progbits\n" |
546 | " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ | 560 | " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ |
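
Every .altinstructions record emitted by these asm blocks now carries an extra "pad len" byte after the source and replacement lengths. The sketch below is only an illustration of the resulting record layout; the field names are assumptions chosen to mirror the asm comments (the authoritative definition lives in asm/alternative.h), but the sizes show the entry growing to 13 bytes:

    #include <stdint.h>
    #include <stdio.h>

    /* One .altinstructions record as emitted above (names are illustrative):
     * two relative offsets, the feature word, and three length bytes --
     * source, replacement and, new in this patch, padding. */
    struct alt_entry {
        int32_t  instr_offset;    /* .long 1b - . : src offset       */
        int32_t  repl_offset;     /* .long ...    : repl offset      */
        uint16_t cpuid;           /* .word %P0    : feature bit      */
        uint8_t  instrlen;        /* .byte ...    : source len       */
        uint8_t  replacementlen;  /* .byte ...    : replacement len  */
        uint8_t  padlen;          /* .byte 0      : pad len (new)    */
    } __attribute__((packed));

    int main(void)
    {
        /* 4 + 4 + 2 + 1 + 1 + 1 = 13 bytes per entry after this change. */
        printf("altinstructions entry size: %zu bytes\n", sizeof(struct alt_entry));
        return 0;
    }
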
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index a94b82e8f156..a0bf89fd2647 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h | |||
@@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr, | |||
376 | * Pentium F0 0F bugfix can have resulted in the mapped | 376 | * Pentium F0 0F bugfix can have resulted in the mapped |
377 | * IDT being write-protected. | 377 | * IDT being write-protected. |
378 | */ | 378 | */ |
379 | #define set_intr_gate(n, addr) \ | 379 | #define set_intr_gate_notrace(n, addr) \ |
380 | do { \ | 380 | do { \ |
381 | BUG_ON((unsigned)n > 0xFF); \ | 381 | BUG_ON((unsigned)n > 0xFF); \ |
382 | _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ | 382 | _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ |
383 | __KERNEL_CS); \ | 383 | __KERNEL_CS); \ |
384 | } while (0) | ||
385 | |||
386 | #define set_intr_gate(n, addr) \ | ||
387 | do { \ | ||
388 | set_intr_gate_notrace(n, addr); \ | ||
384 | _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ | 389 | _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ |
385 | 0, 0, __KERNEL_CS); \ | 390 | 0, 0, __KERNEL_CS); \ |
386 | } while (0) | 391 | } while (0) |
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index f6f15986df6c..de1cdaf4d743 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h | |||
@@ -86,11 +86,23 @@ | |||
86 | CFI_ADJUST_CFA_OFFSET 8 | 86 | CFI_ADJUST_CFA_OFFSET 8 |
87 | .endm | 87 | .endm |
88 | 88 | ||
89 | .macro pushq_cfi_reg reg | ||
90 | pushq %\reg | ||
91 | CFI_ADJUST_CFA_OFFSET 8 | ||
92 | CFI_REL_OFFSET \reg, 0 | ||
93 | .endm | ||
94 | |||
89 | .macro popq_cfi reg | 95 | .macro popq_cfi reg |
90 | popq \reg | 96 | popq \reg |
91 | CFI_ADJUST_CFA_OFFSET -8 | 97 | CFI_ADJUST_CFA_OFFSET -8 |
92 | .endm | 98 | .endm |
93 | 99 | ||
100 | .macro popq_cfi_reg reg | ||
101 | popq %\reg | ||
102 | CFI_ADJUST_CFA_OFFSET -8 | ||
103 | CFI_RESTORE \reg | ||
104 | .endm | ||
105 | |||
94 | .macro pushfq_cfi | 106 | .macro pushfq_cfi |
95 | pushfq | 107 | pushfq |
96 | CFI_ADJUST_CFA_OFFSET 8 | 108 | CFI_ADJUST_CFA_OFFSET 8 |
@@ -116,11 +128,23 @@ | |||
116 | CFI_ADJUST_CFA_OFFSET 4 | 128 | CFI_ADJUST_CFA_OFFSET 4 |
117 | .endm | 129 | .endm |
118 | 130 | ||
131 | .macro pushl_cfi_reg reg | ||
132 | pushl %\reg | ||
133 | CFI_ADJUST_CFA_OFFSET 4 | ||
134 | CFI_REL_OFFSET \reg, 0 | ||
135 | .endm | ||
136 | |||
119 | .macro popl_cfi reg | 137 | .macro popl_cfi reg |
120 | popl \reg | 138 | popl \reg |
121 | CFI_ADJUST_CFA_OFFSET -4 | 139 | CFI_ADJUST_CFA_OFFSET -4 |
122 | .endm | 140 | .endm |
123 | 141 | ||
142 | .macro popl_cfi_reg reg | ||
143 | popl %\reg | ||
144 | CFI_ADJUST_CFA_OFFSET -4 | ||
145 | CFI_RESTORE \reg | ||
146 | .endm | ||
147 | |||
124 | .macro pushfl_cfi | 148 | .macro pushfl_cfi |
125 | pushfl | 149 | pushfl |
126 | CFI_ADJUST_CFA_OFFSET 4 | 150 | CFI_ADJUST_CFA_OFFSET 4 |
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 779c2efe2e97..3ab0537872fb 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h | |||
@@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn) | |||
40 | } | 40 | } |
41 | #endif | 41 | #endif |
42 | 42 | ||
43 | #ifdef CONFIG_MEMTEST | ||
44 | extern void early_memtest(unsigned long start, unsigned long end); | ||
45 | #else | ||
46 | static inline void early_memtest(unsigned long start, unsigned long end) | ||
47 | { | ||
48 | } | ||
49 | #endif | ||
50 | |||
51 | extern unsigned long e820_end_of_ram_pfn(void); | 43 | extern unsigned long e820_end_of_ram_pfn(void); |
52 | extern unsigned long e820_end_of_low_ram_pfn(void); | 44 | extern unsigned long e820_end_of_low_ram_pfn(void); |
53 | extern u64 early_reserve_e820(u64 sizet, u64 align); | 45 | extern u64 early_reserve_e820(u64 sizet, u64 align); |
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 25bce45c6fc4..3738b138b843 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h | |||
@@ -2,6 +2,8 @@ | |||
2 | #define _ASM_X86_EFI_H | 2 | #define _ASM_X86_EFI_H |
3 | 3 | ||
4 | #include <asm/i387.h> | 4 | #include <asm/i387.h> |
5 | #include <asm/pgtable.h> | ||
6 | |||
5 | /* | 7 | /* |
6 | * We map the EFI regions needed for runtime services non-contiguously, | 8 | * We map the EFI regions needed for runtime services non-contiguously, |
7 | * with preserved alignment on virtual addresses starting from -4G down | 9 | * with preserved alignment on virtual addresses starting from -4G down |
@@ -89,8 +91,8 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, | |||
89 | extern struct efi_scratch efi_scratch; | 91 | extern struct efi_scratch efi_scratch; |
90 | extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); | 92 | extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); |
91 | extern int __init efi_memblock_x86_reserve_range(void); | 93 | extern int __init efi_memblock_x86_reserve_range(void); |
92 | extern void __init efi_call_phys_prolog(void); | 94 | extern pgd_t * __init efi_call_phys_prolog(void); |
93 | extern void __init efi_call_phys_epilog(void); | 95 | extern void __init efi_call_phys_epilog(pgd_t *save_pgd); |
94 | extern void __init efi_unmap_memmap(void); | 96 | extern void __init efi_unmap_memmap(void); |
95 | extern void __init efi_memory_uc(u64 addr, unsigned long size); | 97 | extern void __init efi_memory_uc(u64 addr, unsigned long size); |
96 | extern void __init efi_map_region(efi_memory_desc_t *md); | 98 | extern void __init efi_map_region(efi_memory_desc_t *md); |
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index ca3347a9dab5..f161c189c27b 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h | |||
@@ -171,10 +171,11 @@ do { \ | |||
171 | static inline void elf_common_init(struct thread_struct *t, | 171 | static inline void elf_common_init(struct thread_struct *t, |
172 | struct pt_regs *regs, const u16 ds) | 172 | struct pt_regs *regs, const u16 ds) |
173 | { | 173 | { |
174 | regs->ax = regs->bx = regs->cx = regs->dx = 0; | 174 | /* Commented-out registers are cleared in stub_execve */ |
175 | regs->si = regs->di = regs->bp = 0; | 175 | /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0; |
176 | regs->si = regs->di /*= regs->bp*/ = 0; | ||
176 | regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; | 177 | regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; |
177 | regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; | 178 | /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/ |
178 | t->fs = t->gs = 0; | 179 | t->fs = t->gs = 0; |
179 | t->fsindex = t->gsindex = 0; | 180 | t->fsindex = t->gsindex = 0; |
180 | t->ds = t->es = ds; | 181 | t->ds = t->es = ds; |
@@ -338,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, | |||
338 | int uses_interp); | 339 | int uses_interp); |
339 | #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages | 340 | #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages |
340 | 341 | ||
341 | extern unsigned long arch_randomize_brk(struct mm_struct *mm); | ||
342 | #define arch_randomize_brk arch_randomize_brk | ||
343 | |||
344 | /* | 342 | /* |
345 | * True on X86_32 or when emulating IA32 on X86_64 | 343 | * True on X86_32 or when emulating IA32 on X86_64 |
346 | */ | 344 | */ |
@@ -365,6 +363,7 @@ enum align_flags { | |||
365 | struct va_alignment { | 363 | struct va_alignment { |
366 | int flags; | 364 | int flags; |
367 | unsigned long mask; | 365 | unsigned long mask; |
366 | unsigned long bits; | ||
368 | } ____cacheline_aligned; | 367 | } ____cacheline_aligned; |
369 | 368 | ||
370 | extern struct va_alignment va_align; | 369 | extern struct va_alignment va_align; |
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 72ba21a8b5fc..da5e96756570 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h | |||
@@ -67,6 +67,34 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft); | |||
67 | static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} | 67 | static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} |
68 | #endif | 68 | #endif |
69 | 69 | ||
70 | /* | ||
71 | * Must be run with preemption disabled: this clears the fpu_owner_task | ||
72 | * on this CPU. | ||
73 | * | ||
74 | * This will disable any lazy FPU state restore of the current FPU state, | ||
75 | * but if the current thread owns the FPU, its state will still be saved. | ||
76 | */ | ||
77 | static inline void __cpu_disable_lazy_restore(unsigned int cpu) | ||
78 | { | ||
79 | per_cpu(fpu_owner_task, cpu) = NULL; | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Used to indicate that the FPU state in memory is newer than the FPU | ||
84 | * state in registers, and the FPU state should be reloaded next time the | ||
85 | * task is run. Only safe on the current task, or non-running tasks. | ||
86 | */ | ||
87 | static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk) | ||
88 | { | ||
89 | tsk->thread.fpu.last_cpu = ~0; | ||
90 | } | ||
91 | |||
92 | static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) | ||
93 | { | ||
94 | return new == this_cpu_read_stable(fpu_owner_task) && | ||
95 | cpu == new->thread.fpu.last_cpu; | ||
96 | } | ||
97 | |||
70 | static inline int is_ia32_compat_frame(void) | 98 | static inline int is_ia32_compat_frame(void) |
71 | { | 99 | { |
72 | return config_enabled(CONFIG_IA32_EMULATION) && | 100 | return config_enabled(CONFIG_IA32_EMULATION) && |
@@ -107,7 +135,6 @@ static __always_inline __pure bool use_fxsr(void) | |||
107 | 135 | ||
108 | static inline void fx_finit(struct i387_fxsave_struct *fx) | 136 | static inline void fx_finit(struct i387_fxsave_struct *fx) |
109 | { | 137 | { |
110 | memset(fx, 0, xstate_size); | ||
111 | fx->cwd = 0x37f; | 138 | fx->cwd = 0x37f; |
112 | fx->mxcsr = MXCSR_DEFAULT; | 139 | fx->mxcsr = MXCSR_DEFAULT; |
113 | } | 140 | } |
@@ -351,8 +378,14 @@ static inline void __thread_fpu_begin(struct task_struct *tsk) | |||
351 | __thread_set_has_fpu(tsk); | 378 | __thread_set_has_fpu(tsk); |
352 | } | 379 | } |
353 | 380 | ||
354 | static inline void __drop_fpu(struct task_struct *tsk) | 381 | static inline void drop_fpu(struct task_struct *tsk) |
355 | { | 382 | { |
383 | /* | ||
384 | * Forget coprocessor state.. | ||
385 | */ | ||
386 | preempt_disable(); | ||
387 | tsk->thread.fpu_counter = 0; | ||
388 | |||
356 | if (__thread_has_fpu(tsk)) { | 389 | if (__thread_has_fpu(tsk)) { |
357 | /* Ignore delayed exceptions from user space */ | 390 | /* Ignore delayed exceptions from user space */ |
358 | asm volatile("1: fwait\n" | 391 | asm volatile("1: fwait\n" |
@@ -360,30 +393,29 @@ static inline void __drop_fpu(struct task_struct *tsk) | |||
360 | _ASM_EXTABLE(1b, 2b)); | 393 | _ASM_EXTABLE(1b, 2b)); |
361 | __thread_fpu_end(tsk); | 394 | __thread_fpu_end(tsk); |
362 | } | 395 | } |
363 | } | ||
364 | 396 | ||
365 | static inline void drop_fpu(struct task_struct *tsk) | ||
366 | { | ||
367 | /* | ||
368 | * Forget coprocessor state.. | ||
369 | */ | ||
370 | preempt_disable(); | ||
371 | tsk->thread.fpu_counter = 0; | ||
372 | __drop_fpu(tsk); | ||
373 | clear_stopped_child_used_math(tsk); | 397 | clear_stopped_child_used_math(tsk); |
374 | preempt_enable(); | 398 | preempt_enable(); |
375 | } | 399 | } |
376 | 400 | ||
377 | static inline void drop_init_fpu(struct task_struct *tsk) | 401 | static inline void restore_init_xstate(void) |
402 | { | ||
403 | if (use_xsave()) | ||
404 | xrstor_state(init_xstate_buf, -1); | ||
405 | else | ||
406 | fxrstor_checking(&init_xstate_buf->i387); | ||
407 | } | ||
408 | |||
409 | /* | ||
410 | * Reset the FPU state in the eager case and drop it in the lazy case (later use | ||
411 | * will reinit it). | ||
412 | */ | ||
413 | static inline void fpu_reset_state(struct task_struct *tsk) | ||
378 | { | 414 | { |
379 | if (!use_eager_fpu()) | 415 | if (!use_eager_fpu()) |
380 | drop_fpu(tsk); | 416 | drop_fpu(tsk); |
381 | else { | 417 | else |
382 | if (use_xsave()) | 418 | restore_init_xstate(); |
383 | xrstor_state(init_xstate_buf, -1); | ||
384 | else | ||
385 | fxrstor_checking(&init_xstate_buf->i387); | ||
386 | } | ||
387 | } | 419 | } |
388 | 420 | ||
389 | /* | 421 | /* |
@@ -400,24 +432,6 @@ static inline void drop_init_fpu(struct task_struct *tsk) | |||
400 | */ | 432 | */ |
401 | typedef struct { int preload; } fpu_switch_t; | 433 | typedef struct { int preload; } fpu_switch_t; |
402 | 434 | ||
403 | /* | ||
404 | * Must be run with preemption disabled: this clears the fpu_owner_task, | ||
405 | * on this CPU. | ||
406 | * | ||
407 | * This will disable any lazy FPU state restore of the current FPU state, | ||
408 | * but if the current thread owns the FPU, it will still be saved by. | ||
409 | */ | ||
410 | static inline void __cpu_disable_lazy_restore(unsigned int cpu) | ||
411 | { | ||
412 | per_cpu(fpu_owner_task, cpu) = NULL; | ||
413 | } | ||
414 | |||
415 | static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) | ||
416 | { | ||
417 | return new == this_cpu_read_stable(fpu_owner_task) && | ||
418 | cpu == new->thread.fpu.last_cpu; | ||
419 | } | ||
420 | |||
421 | static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) | 435 | static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) |
422 | { | 436 | { |
423 | fpu_switch_t fpu; | 437 | fpu_switch_t fpu; |
@@ -426,13 +440,17 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta | |||
426 | * If the task has used the math, pre-load the FPU on xsave processors | 440 | * If the task has used the math, pre-load the FPU on xsave processors |
427 | * or if the past 5 consecutive context-switches used math. | 441 | * or if the past 5 consecutive context-switches used math. |
428 | */ | 442 | */ |
429 | fpu.preload = tsk_used_math(new) && (use_eager_fpu() || | 443 | fpu.preload = tsk_used_math(new) && |
430 | new->thread.fpu_counter > 5); | 444 | (use_eager_fpu() || new->thread.fpu_counter > 5); |
445 | |||
431 | if (__thread_has_fpu(old)) { | 446 | if (__thread_has_fpu(old)) { |
432 | if (!__save_init_fpu(old)) | 447 | if (!__save_init_fpu(old)) |
433 | cpu = ~0; | 448 | task_disable_lazy_fpu_restore(old); |
434 | old->thread.fpu.last_cpu = cpu; | 449 | else |
435 | old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ | 450 | old->thread.fpu.last_cpu = cpu; |
451 | |||
452 | /* But leave fpu_owner_task! */ | ||
453 | old->thread.fpu.has_fpu = 0; | ||
436 | 454 | ||
437 | /* Don't change CR0.TS if we just switch! */ | 455 | /* Don't change CR0.TS if we just switch! */ |
438 | if (fpu.preload) { | 456 | if (fpu.preload) { |
@@ -443,10 +461,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta | |||
443 | stts(); | 461 | stts(); |
444 | } else { | 462 | } else { |
445 | old->thread.fpu_counter = 0; | 463 | old->thread.fpu_counter = 0; |
446 | old->thread.fpu.last_cpu = ~0; | 464 | task_disable_lazy_fpu_restore(old); |
447 | if (fpu.preload) { | 465 | if (fpu.preload) { |
448 | new->thread.fpu_counter++; | 466 | new->thread.fpu_counter++; |
449 | if (!use_eager_fpu() && fpu_lazy_restore(new, cpu)) | 467 | if (fpu_lazy_restore(new, cpu)) |
450 | fpu.preload = 0; | 468 | fpu.preload = 0; |
451 | else | 469 | else |
452 | prefetch(new->thread.fpu.state); | 470 | prefetch(new->thread.fpu.state); |
@@ -466,7 +484,7 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) | |||
466 | { | 484 | { |
467 | if (fpu.preload) { | 485 | if (fpu.preload) { |
468 | if (unlikely(restore_fpu_checking(new))) | 486 | if (unlikely(restore_fpu_checking(new))) |
469 | drop_init_fpu(new); | 487 | fpu_reset_state(new); |
470 | } | 488 | } |
471 | } | 489 | } |
472 | 490 | ||
@@ -495,10 +513,12 @@ static inline int restore_xstate_sig(void __user *buf, int ia32_frame) | |||
495 | } | 513 | } |
496 | 514 | ||
497 | /* | 515 | /* |
498 | * Need to be preemption-safe. | 516 | * Needs to be preemption-safe. |
499 | * | 517 | * |
500 | * NOTE! user_fpu_begin() must be used only immediately before restoring | 518 | * NOTE! user_fpu_begin() must be used only immediately before restoring |
501 | * it. This function does not do any save/restore on their own. | 519 | * the save state. It does not do any saving/restoring on its own. In |
520 | * lazy FPU mode, it is just an optimization to avoid a #NM exception; | ||
521 | * the task can lose the FPU right after preempt_enable(). | ||
502 | */ | 522 | */ |
503 | static inline void user_fpu_begin(void) | 523 | static inline void user_fpu_begin(void) |
504 | { | 524 | { |
@@ -520,24 +540,6 @@ static inline void __save_fpu(struct task_struct *tsk) | |||
520 | } | 540 | } |
521 | 541 | ||
522 | /* | 542 | /* |
523 | * These disable preemption on their own and are safe | ||
524 | */ | ||
525 | static inline void save_init_fpu(struct task_struct *tsk) | ||
526 | { | ||
527 | WARN_ON_ONCE(!__thread_has_fpu(tsk)); | ||
528 | |||
529 | if (use_eager_fpu()) { | ||
530 | __save_fpu(tsk); | ||
531 | return; | ||
532 | } | ||
533 | |||
534 | preempt_disable(); | ||
535 | __save_init_fpu(tsk); | ||
536 | __thread_fpu_end(tsk); | ||
537 | preempt_enable(); | ||
538 | } | ||
539 | |||
540 | /* | ||
541 | * i387 state interaction | 543 | * i387 state interaction |
542 | */ | 544 | */ |
543 | static inline unsigned short get_fpu_cwd(struct task_struct *tsk) | 545 | static inline unsigned short get_fpu_cwd(struct task_struct *tsk) |
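
The reshuffled FPU helpers above all encode one rule: a task's FPU registers may be reused without a reload only if that task is still the FPU owner on this CPU and its in-memory state has not been marked newer in the meantime. A toy user-space model of that bookkeeping, with invented types that merely mirror fpu_owner_task and thread.fpu.last_cpu from the hunk:

    #include <stdio.h>

    #define NCPUS 2

    struct task { int last_cpu; };           /* mirrors thread.fpu.last_cpu    */
    static struct task *fpu_owner[NCPUS];    /* mirrors per-cpu fpu_owner_task */

    /* task_disable_lazy_fpu_restore(): memory state is newer than registers. */
    static void disable_lazy_restore(struct task *t) { t->last_cpu = -1; }

    /* fpu_lazy_restore(): registers can be reused only if this task is still
     * the owner on this CPU and last used the FPU here. */
    static int lazy_restore_ok(struct task *t, int cpu)
    {
        return fpu_owner[cpu] == t && t->last_cpu == cpu;
    }

    int main(void)
    {
        struct task a = { .last_cpu = 0 };
        fpu_owner[0] = &a;

        printf("same cpu, still owner : %d\n", lazy_restore_ok(&a, 0)); /* 1 */
        disable_lazy_restore(&a);          /* e.g. state was modified in memory */
        printf("after disable         : %d\n", lazy_restore_ok(&a, 0)); /* 0 */
        return 0;
    }
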
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 9662290e0b20..e9571ddabc4f 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h | |||
@@ -181,10 +181,9 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *); | |||
181 | extern __visible void smp_invalidate_interrupt(struct pt_regs *); | 181 | extern __visible void smp_invalidate_interrupt(struct pt_regs *); |
182 | #endif | 182 | #endif |
183 | 183 | ||
184 | extern void (*__initconst interrupt[FIRST_SYSTEM_VECTOR | 184 | extern char irq_entries_start[]; |
185 | - FIRST_EXTERNAL_VECTOR])(void); | ||
186 | #ifdef CONFIG_TRACING | 185 | #ifdef CONFIG_TRACING |
187 | #define trace_interrupt interrupt | 186 | #define trace_irq_entries_start irq_entries_start |
188 | #endif | 187 | #endif |
189 | 188 | ||
190 | #define VECTOR_UNDEFINED (-1) | 189 | #define VECTOR_UNDEFINED (-1) |
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 47f29b1d1846..e7814b74caf8 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h | |||
@@ -69,7 +69,7 @@ struct insn { | |||
69 | const insn_byte_t *next_byte; | 69 | const insn_byte_t *next_byte; |
70 | }; | 70 | }; |
71 | 71 | ||
72 | #define MAX_INSN_SIZE 16 | 72 | #define MAX_INSN_SIZE 15 |
73 | 73 | ||
74 | #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) | 74 | #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) |
75 | #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) | 75 | #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) |
diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index f42a04735a0a..e37d6b3ad983 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h | |||
@@ -79,11 +79,12 @@ struct iommu_table_entry { | |||
79 | * d). Similar to the 'init', except that this gets called from pci_iommu_init | 79 | * d). Similar to the 'init', except that this gets called from pci_iommu_init |
80 | * where we do have a memory allocator. | 80 | * where we do have a memory allocator. |
81 | * | 81 | * |
82 | * The standard vs the _FINISH differs in that the _FINISH variant will | 82 | * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant |
83 | * continue detecting other IOMMUs in the call list after the | 83 | * in that the former will continue detecting other IOMMUs in the call |
84 | * the detection routine returns a positive number. The _FINISH will | 84 | * list after the detection routine returns a positive number, while the |
85 | * stop the execution chain. Both will still call the 'init' and | 85 | * latter will stop the execution chain upon first successful detection. |
86 | * 'late_init' functions if they are set. | 86 | * Both variants will still call the 'init' and 'late_init' functions if |
87 | * they are set. | ||
87 | */ | 88 | */ |
88 | #define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \ | 89 | #define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \ |
89 | __IOMMU_INIT(_detect, _depend, _init, _late_init, 1) | 90 | __IOMMU_INIT(_detect, _depend, _init, _late_init, 1) |
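
The reworded comment pins down the only behavioural difference between IOMMU_INIT and IOMMU_INIT_FINISH: whether a successful detect() ends the scan of the table. A hedged sketch of such a scan loop, with invented entry and detection names (the real walk lives in the IOMMU table handling code, which this diff does not touch):

    #include <stdio.h>

    struct iommu_entry {
        const char *name;
        int (*detect)(void);      /* > 0 means "found" */
        int stop_on_detect;       /* set by the _FINISH variant, clear otherwise */
    };

    static int found_vtd(void)  { return 1; }
    static int found_gart(void) { return 0; }

    int main(void)
    {
        struct iommu_entry table[] = {
            { "intel-iommu", found_vtd,  1 },   /* registered via _FINISH */
            { "gart",        found_gart, 0 },
        };

        for (unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
            if (table[i].detect() > 0) {
                printf("detected %s\n", table[i].name);
                if (table[i].stop_on_detect)
                    break;        /* _FINISH: stop the execution chain here */
            }
        }
        return 0;
    }
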
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 0a8b519226b8..b77f5edb03b0 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h | |||
@@ -136,10 +136,6 @@ static inline notrace unsigned long arch_local_irq_save(void) | |||
136 | #define USERGS_SYSRET32 \ | 136 | #define USERGS_SYSRET32 \ |
137 | swapgs; \ | 137 | swapgs; \ |
138 | sysretl | 138 | sysretl |
139 | #define ENABLE_INTERRUPTS_SYSEXIT32 \ | ||
140 | swapgs; \ | ||
141 | sti; \ | ||
142 | sysexit | ||
143 | 139 | ||
144 | #else | 140 | #else |
145 | #define INTERRUPT_RETURN iret | 141 | #define INTERRUPT_RETURN iret |
@@ -163,22 +159,27 @@ static inline int arch_irqs_disabled(void) | |||
163 | 159 | ||
164 | return arch_irqs_disabled_flags(flags); | 160 | return arch_irqs_disabled_flags(flags); |
165 | } | 161 | } |
162 | #endif /* !__ASSEMBLY__ */ | ||
166 | 163 | ||
164 | #ifdef __ASSEMBLY__ | ||
165 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
166 | # define TRACE_IRQS_ON call trace_hardirqs_on_thunk; | ||
167 | # define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; | ||
167 | #else | 168 | #else |
168 | 169 | # define TRACE_IRQS_ON | |
169 | #ifdef CONFIG_X86_64 | 170 | # define TRACE_IRQS_OFF |
170 | #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk | 171 | #endif |
171 | #define ARCH_LOCKDEP_SYS_EXIT_IRQ \ | 172 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
173 | # ifdef CONFIG_X86_64 | ||
174 | # define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk | ||
175 | # define LOCKDEP_SYS_EXIT_IRQ \ | ||
172 | TRACE_IRQS_ON; \ | 176 | TRACE_IRQS_ON; \ |
173 | sti; \ | 177 | sti; \ |
174 | SAVE_REST; \ | 178 | call lockdep_sys_exit_thunk; \ |
175 | LOCKDEP_SYS_EXIT; \ | ||
176 | RESTORE_REST; \ | ||
177 | cli; \ | 179 | cli; \ |
178 | TRACE_IRQS_OFF; | 180 | TRACE_IRQS_OFF; |
179 | 181 | # else | |
180 | #else | 182 | # define LOCKDEP_SYS_EXIT \ |
181 | #define ARCH_LOCKDEP_SYS_EXIT \ | ||
182 | pushl %eax; \ | 183 | pushl %eax; \ |
183 | pushl %ecx; \ | 184 | pushl %ecx; \ |
184 | pushl %edx; \ | 185 | pushl %edx; \ |
@@ -186,24 +187,12 @@ static inline int arch_irqs_disabled(void) | |||
186 | popl %edx; \ | 187 | popl %edx; \ |
187 | popl %ecx; \ | 188 | popl %ecx; \ |
188 | popl %eax; | 189 | popl %eax; |
189 | 190 | # define LOCKDEP_SYS_EXIT_IRQ | |
190 | #define ARCH_LOCKDEP_SYS_EXIT_IRQ | 191 | # endif |
191 | #endif | ||
192 | |||
193 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
194 | # define TRACE_IRQS_ON call trace_hardirqs_on_thunk; | ||
195 | # define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; | ||
196 | #else | 192 | #else |
197 | # define TRACE_IRQS_ON | ||
198 | # define TRACE_IRQS_OFF | ||
199 | #endif | ||
200 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
201 | # define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT | ||
202 | # define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ | ||
203 | # else | ||
204 | # define LOCKDEP_SYS_EXIT | 193 | # define LOCKDEP_SYS_EXIT |
205 | # define LOCKDEP_SYS_EXIT_IRQ | 194 | # define LOCKDEP_SYS_EXIT_IRQ |
206 | # endif | 195 | #endif |
207 | |||
208 | #endif /* __ASSEMBLY__ */ | 196 | #endif /* __ASSEMBLY__ */ |
197 | |||
209 | #endif | 198 | #endif |
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 6a2cefb4395a..a4c1cf7e93f8 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h | |||
@@ -1,7 +1,7 @@ | |||
1 | #ifndef _ASM_X86_JUMP_LABEL_H | 1 | #ifndef _ASM_X86_JUMP_LABEL_H |
2 | #define _ASM_X86_JUMP_LABEL_H | 2 | #define _ASM_X86_JUMP_LABEL_H |
3 | 3 | ||
4 | #ifdef __KERNEL__ | 4 | #ifndef __ASSEMBLY__ |
5 | 5 | ||
6 | #include <linux/stringify.h> | 6 | #include <linux/stringify.h> |
7 | #include <linux/types.h> | 7 | #include <linux/types.h> |
@@ -30,8 +30,6 @@ l_yes: | |||
30 | return true; | 30 | return true; |
31 | } | 31 | } |
32 | 32 | ||
33 | #endif /* __KERNEL__ */ | ||
34 | |||
35 | #ifdef CONFIG_X86_64 | 33 | #ifdef CONFIG_X86_64 |
36 | typedef u64 jump_label_t; | 34 | typedef u64 jump_label_t; |
37 | #else | 35 | #else |
@@ -44,4 +42,5 @@ struct jump_entry { | |||
44 | jump_label_t key; | 42 | jump_label_t key; |
45 | }; | 43 | }; |
46 | 44 | ||
45 | #endif /* __ASSEMBLY__ */ | ||
47 | #endif | 46 | #endif |
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a236e39cc385..dea2e7e962e3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
@@ -81,11 +81,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) | |||
81 | (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); | 81 | (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); |
82 | } | 82 | } |
83 | 83 | ||
84 | #define SELECTOR_TI_MASK (1 << 2) | ||
85 | #define SELECTOR_RPL_MASK 0x03 | ||
86 | |||
87 | #define IOPL_SHIFT 12 | ||
88 | |||
89 | #define KVM_PERMILLE_MMU_PAGES 20 | 84 | #define KVM_PERMILLE_MMU_PAGES 20 |
90 | #define KVM_MIN_ALLOC_MMU_PAGES 64 | 85 | #define KVM_MIN_ALLOC_MMU_PAGES 64 |
91 | #define KVM_MMU_HASH_SHIFT 10 | 86 | #define KVM_MMU_HASH_SHIFT 10 |
@@ -345,6 +340,7 @@ struct kvm_pmu { | |||
345 | enum { | 340 | enum { |
346 | KVM_DEBUGREG_BP_ENABLED = 1, | 341 | KVM_DEBUGREG_BP_ENABLED = 1, |
347 | KVM_DEBUGREG_WONT_EXIT = 2, | 342 | KVM_DEBUGREG_WONT_EXIT = 2, |
343 | KVM_DEBUGREG_RELOAD = 4, | ||
348 | }; | 344 | }; |
349 | 345 | ||
350 | struct kvm_vcpu_arch { | 346 | struct kvm_vcpu_arch { |
@@ -431,6 +427,9 @@ struct kvm_vcpu_arch { | |||
431 | 427 | ||
432 | int cpuid_nent; | 428 | int cpuid_nent; |
433 | struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; | 429 | struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; |
430 | |||
431 | int maxphyaddr; | ||
432 | |||
434 | /* emulate context */ | 433 | /* emulate context */ |
435 | 434 | ||
436 | struct x86_emulate_ctxt emulate_ctxt; | 435 | struct x86_emulate_ctxt emulate_ctxt; |
@@ -550,11 +549,20 @@ struct kvm_arch_memory_slot { | |||
550 | struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; | 549 | struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; |
551 | }; | 550 | }; |
552 | 551 | ||
552 | /* | ||
553 | * The mode is the number of bits allocated in the LDR for the logical | ||
554 | * processor ID. It happens that these are all powers of two. | ||
555 | * This makes it very easy to detect cases where the APICs are | ||
556 | * configured for multiple modes; in that case, we cannot use the map and | ||
557 | * hence cannot use kvm_irq_delivery_to_apic_fast either. | ||
558 | */ | ||
559 | #define KVM_APIC_MODE_XAPIC_CLUSTER 4 | ||
560 | #define KVM_APIC_MODE_XAPIC_FLAT 8 | ||
561 | #define KVM_APIC_MODE_X2APIC 16 | ||
562 | |||
553 | struct kvm_apic_map { | 563 | struct kvm_apic_map { |
554 | struct rcu_head rcu; | 564 | struct rcu_head rcu; |
555 | u8 ldr_bits; | 565 | u8 mode; |
556 | /* fields bellow are used to decode ldr values in different modes */ | ||
557 | u32 cid_shift, cid_mask, lid_mask, broadcast; | ||
558 | struct kvm_lapic *phys_map[256]; | 566 | struct kvm_lapic *phys_map[256]; |
559 | /* first index is cluster id second is cpu id in a cluster */ | 567 | /* first index is cluster id second is cpu id in a cluster */ |
560 | struct kvm_lapic *logical_map[16][16]; | 568 | struct kvm_lapic *logical_map[16][16]; |
@@ -859,6 +867,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | |||
859 | void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); | 867 | void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); |
860 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, | 868 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, |
861 | struct kvm_memory_slot *memslot); | 869 | struct kvm_memory_slot *memslot); |
870 | void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, | ||
871 | struct kvm_memory_slot *memslot); | ||
862 | void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, | 872 | void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, |
863 | struct kvm_memory_slot *memslot); | 873 | struct kvm_memory_slot *memslot); |
864 | void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, | 874 | void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, |
@@ -933,6 +943,7 @@ struct x86_emulate_ctxt; | |||
933 | int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); | 943 | int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); |
934 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); | 944 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); |
935 | int kvm_emulate_halt(struct kvm_vcpu *vcpu); | 945 | int kvm_emulate_halt(struct kvm_vcpu *vcpu); |
946 | int kvm_vcpu_halt(struct kvm_vcpu *vcpu); | ||
936 | int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); | 947 | int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); |
937 | 948 | ||
938 | void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); | 949 | void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); |
@@ -1128,7 +1139,6 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) | |||
1128 | int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); | 1139 | int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); |
1129 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); | 1140 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); |
1130 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); | 1141 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); |
1131 | int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); | ||
1132 | int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); | 1142 | int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); |
1133 | int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); | 1143 | int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); |
1134 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); | 1144 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); |
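
With the APIC map keyed by a "mode" equal to the number of LDR bits holding the logical processor ID, splitting a logical destination into cluster and in-cluster bits is just a shift and a mask. The helper below only illustrates that arithmetic with the three new constants; the input values are made up:

    #include <stdio.h>

    #define KVM_APIC_MODE_XAPIC_CLUSTER 4
    #define KVM_APIC_MODE_XAPIC_FLAT    8
    #define KVM_APIC_MODE_X2APIC        16

    /* Split a logical APIC ID into (cluster, in-cluster bits), where 'mode'
     * is the number of low bits that hold the logical processor ID. */
    static void split_logical_id(unsigned int ldr_id, unsigned int mode,
                                 unsigned int *cluster, unsigned int *lid)
    {
        *cluster = ldr_id >> mode;
        *lid     = ldr_id & ((1u << mode) - 1);
    }

    int main(void)
    {
        unsigned int cluster, lid;

        /* xAPIC cluster mode (4 ID bits): 0x24 -> cluster 2, member bit 0x4. */
        split_logical_id(0x24, KVM_APIC_MODE_XAPIC_CLUSTER, &cluster, &lid);
        printf("xAPIC cluster: cluster=%u lid=%#x\n", cluster, lid);

        /* x2APIC (16 ID bits): 0x00020008 -> cluster 2, member bit 0x8. */
        split_logical_id(0x00020008, KVM_APIC_MODE_X2APIC, &cluster, &lid);
        printf("x2APIC:        cluster=%u lid=%#x\n", cluster, lid);
        return 0;
    }
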
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index e62cf897f781..c1adf33fdd0d 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h | |||
@@ -115,7 +115,7 @@ static inline void kvm_spinlock_init(void) | |||
115 | 115 | ||
116 | static inline bool kvm_para_available(void) | 116 | static inline bool kvm_para_available(void) |
117 | { | 117 | { |
118 | return 0; | 118 | return false; |
119 | } | 119 | } |
120 | 120 | ||
121 | static inline unsigned int kvm_arch_para_features(void) | 121 | static inline unsigned int kvm_arch_para_features(void) |
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index a455a53d789a..2d29197bd2fb 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h | |||
@@ -32,8 +32,8 @@ static inline int klp_check_compiler_support(void) | |||
32 | #endif | 32 | #endif |
33 | return 0; | 33 | return 0; |
34 | } | 34 | } |
35 | extern int klp_write_module_reloc(struct module *mod, unsigned long type, | 35 | int klp_write_module_reloc(struct module *mod, unsigned long type, |
36 | unsigned long loc, unsigned long value); | 36 | unsigned long loc, unsigned long value); |
37 | 37 | ||
38 | static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) | 38 | static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) |
39 | { | 39 | { |
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9b3de99dc004..1f5a86d518db 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
@@ -116,6 +116,12 @@ struct mca_config { | |||
116 | u32 rip_msr; | 116 | u32 rip_msr; |
117 | }; | 117 | }; |
118 | 118 | ||
119 | struct mce_vendor_flags { | ||
120 | __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */ | ||
121 | __reserved_0 : 63; | ||
122 | }; | ||
123 | extern struct mce_vendor_flags mce_flags; | ||
124 | |||
119 | extern struct mca_config mca_cfg; | 125 | extern struct mca_config mca_cfg; |
120 | extern void mce_register_decode_chain(struct notifier_block *nb); | 126 | extern void mce_register_decode_chain(struct notifier_block *nb); |
121 | extern void mce_unregister_decode_chain(struct notifier_block *nb); | 127 | extern void mce_unregister_decode_chain(struct notifier_block *nb); |
@@ -128,9 +134,11 @@ extern int mce_p5_enabled; | |||
128 | #ifdef CONFIG_X86_MCE | 134 | #ifdef CONFIG_X86_MCE |
129 | int mcheck_init(void); | 135 | int mcheck_init(void); |
130 | void mcheck_cpu_init(struct cpuinfo_x86 *c); | 136 | void mcheck_cpu_init(struct cpuinfo_x86 *c); |
137 | void mcheck_vendor_init_severity(void); | ||
131 | #else | 138 | #else |
132 | static inline int mcheck_init(void) { return 0; } | 139 | static inline int mcheck_init(void) { return 0; } |
133 | static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} | 140 | static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} |
141 | static inline void mcheck_vendor_init_severity(void) {} | ||
134 | #endif | 142 | #endif |
135 | 143 | ||
136 | #ifdef CONFIG_X86_ANCIENT_MCE | 144 | #ifdef CONFIG_X86_ANCIENT_MCE |
@@ -183,11 +191,11 @@ typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); | |||
183 | DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); | 191 | DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); |
184 | 192 | ||
185 | enum mcp_flags { | 193 | enum mcp_flags { |
186 | MCP_TIMESTAMP = (1 << 0), /* log time stamp */ | 194 | MCP_TIMESTAMP = BIT(0), /* log time stamp */ |
187 | MCP_UC = (1 << 1), /* log uncorrected errors */ | 195 | MCP_UC = BIT(1), /* log uncorrected errors */ |
188 | MCP_DONTLOG = (1 << 2), /* only clear, don't log */ | 196 | MCP_DONTLOG = BIT(2), /* only clear, don't log */ |
189 | }; | 197 | }; |
190 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); | 198 | bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b); |
191 | 199 | ||
192 | int mce_notify_irq(void); | 200 | int mce_notify_irq(void); |
193 | 201 | ||
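
The new mce_vendor_flags.overflow_recov bit caches CPUID leaf 0x80000007, EBX bit 0 (AMD's MCA overflow recovery indicator). The same bit can be probed from user space with the compiler-provided <cpuid.h>; this only probes the CPUID bit, not what the kernel does with it:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx)) {
            puts("CPUID leaf 0x80000007 not available");
            return 1;
        }

        /* EBX bit 0: MCA overflow recovery, what mce_flags.overflow_recov caches. */
        printf("overflow_recov: %u\n", ebx & 1);
        return 0;
    }
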
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 201b520521ed..2fb20d6f7e23 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h | |||
@@ -75,6 +75,79 @@ static inline void __exit exit_amd_microcode(void) {} | |||
75 | 75 | ||
76 | #ifdef CONFIG_MICROCODE_EARLY | 76 | #ifdef CONFIG_MICROCODE_EARLY |
77 | #define MAX_UCODE_COUNT 128 | 77 | #define MAX_UCODE_COUNT 128 |
78 | |||
79 | #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) | ||
80 | #define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') | ||
81 | #define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') | ||
82 | #define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') | ||
83 | #define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') | ||
84 | #define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') | ||
85 | #define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') | ||
86 | |||
87 | #define CPUID_IS(a, b, c, ebx, ecx, edx) \ | ||
88 | (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) | ||
89 | |||
90 | /* | ||
91 | * During the early microcode loading phase on the BSP, boot_cpu_data is not | ||
92 | * set up yet, so x86_vendor() gets the vendor id for the BSP. | ||
93 | * | ||
94 | * In the 32-bit AP case, accessing boot_cpu_data would need a linear address. | ||
95 | * To simplify the code, we still use x86_vendor() to get the vendor id for APs. | ||
96 | * | ||
97 | * x86_vendor() gets vendor information directly from CPUID. | ||
98 | */ | ||
99 | static inline int x86_vendor(void) | ||
100 | { | ||
101 | u32 eax = 0x00000000; | ||
102 | u32 ebx, ecx = 0, edx; | ||
103 | |||
104 | native_cpuid(&eax, &ebx, &ecx, &edx); | ||
105 | |||
106 | if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) | ||
107 | return X86_VENDOR_INTEL; | ||
108 | |||
109 | if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) | ||
110 | return X86_VENDOR_AMD; | ||
111 | |||
112 | return X86_VENDOR_UNKNOWN; | ||
113 | } | ||
114 | |||
115 | static inline unsigned int __x86_family(unsigned int sig) | ||
116 | { | ||
117 | unsigned int x86; | ||
118 | |||
119 | x86 = (sig >> 8) & 0xf; | ||
120 | |||
121 | if (x86 == 0xf) | ||
122 | x86 += (sig >> 20) & 0xff; | ||
123 | |||
124 | return x86; | ||
125 | } | ||
126 | |||
127 | static inline unsigned int x86_family(void) | ||
128 | { | ||
129 | u32 eax = 0x00000001; | ||
130 | u32 ebx, ecx = 0, edx; | ||
131 | |||
132 | native_cpuid(&eax, &ebx, &ecx, &edx); | ||
133 | |||
134 | return __x86_family(eax); | ||
135 | } | ||
136 | |||
137 | static inline unsigned int x86_model(unsigned int sig) | ||
138 | { | ||
139 | unsigned int x86, model; | ||
140 | |||
141 | x86 = __x86_family(sig); | ||
142 | |||
143 | model = (sig >> 4) & 0xf; | ||
144 | |||
145 | if (x86 == 0x6 || x86 == 0xf) | ||
146 | model += ((sig >> 16) & 0xf) << 4; | ||
147 | |||
148 | return model; | ||
149 | } | ||
150 | |||
78 | extern void __init load_ucode_bsp(void); | 151 | extern void __init load_ucode_bsp(void); |
79 | extern void load_ucode_ap(void); | 152 | extern void load_ucode_ap(void); |
80 | extern int __init save_microcode_in_initrd(void); | 153 | extern int __init save_microcode_in_initrd(void); |
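
The new microcode helpers pack the CPUID vendor string into 32-bit words and decode family/model from the raw signature, including the extended-family and extended-model rules. A standalone sketch repeating that arithmetic on a sample signature (0x000306c3 is just an example value):

    #include <stdio.h>

    #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))

    static unsigned int family_of(unsigned int sig)
    {
        unsigned int x86 = (sig >> 8) & 0xf;

        if (x86 == 0xf)                   /* extended family only for family 15 */
            x86 += (sig >> 20) & 0xff;
        return x86;
    }

    static unsigned int model_of(unsigned int sig)
    {
        unsigned int x86   = family_of(sig);
        unsigned int model = (sig >> 4) & 0xf;

        if (x86 == 0x6 || x86 == 0xf)     /* extended model for families 6 and 15 */
            model += ((sig >> 16) & 0xf) << 4;
        return model;
    }

    int main(void)
    {
        unsigned int sig = 0x000306c3;    /* sample signature: family 6, model 0x3c */

        /* "Genu" packed little-endian, i.e. what CPUID leaf 0 returns in EBX on Intel. */
        printf("CPUID_INTEL1 = %#x\n", QCHAR('G', 'e', 'n', 'u'));
        printf("family = %#x, model = %#x\n", family_of(sig), model_of(sig));
        return 0;
    }
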
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index dd4c20043ce7..2b9209c46ca9 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h | |||
@@ -56,12 +56,15 @@ struct extended_sigtable { | |||
56 | 56 | ||
57 | #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) | 57 | #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) |
58 | 58 | ||
59 | extern int | 59 | extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc); |
60 | get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev); | ||
61 | extern int microcode_sanity_check(void *mc, int print_err); | 60 | extern int microcode_sanity_check(void *mc, int print_err); |
62 | extern int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev); | 61 | extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc); |
63 | extern int | 62 | |
64 | update_match_revision(struct microcode_header_intel *mc_header, int rev); | 63 | static inline int |
64 | revision_is_newer(struct microcode_header_intel *mc_header, int rev) | ||
65 | { | ||
66 | return (mc_header->rev <= rev) ? 0 : 1; | ||
67 | } | ||
65 | 68 | ||
66 | #ifdef CONFIG_MICROCODE_INTEL_EARLY | 69 | #ifdef CONFIG_MICROCODE_INTEL_EARLY |
67 | extern void __init load_ucode_intel_bsp(void); | 70 | extern void __init load_ucode_intel_bsp(void); |
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index a1410db38a1a..653dfa7662e1 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h | |||
@@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) | |||
30 | :: "a" (eax), "c" (ecx)); | 30 | :: "a" (eax), "c" (ecx)); |
31 | } | 31 | } |
32 | 32 | ||
33 | static inline void __sti_mwait(unsigned long eax, unsigned long ecx) | ||
34 | { | ||
35 | trace_hardirqs_on(); | ||
36 | /* "mwait %eax, %ecx;" */ | ||
37 | asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" | ||
38 | :: "a" (eax), "c" (ecx)); | ||
39 | } | ||
40 | |||
33 | /* | 41 | /* |
34 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, | 42 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, |
35 | * which can obviate IPI to trigger checking of need_resched. | 43 | * which can obviate IPI to trigger checking of need_resched. |
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index f97fbe3abb67..c7c712f2648b 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h | |||
@@ -40,8 +40,10 @@ | |||
40 | 40 | ||
41 | #ifdef CONFIG_X86_64 | 41 | #ifdef CONFIG_X86_64 |
42 | #include <asm/page_64_types.h> | 42 | #include <asm/page_64_types.h> |
43 | #define IOREMAP_MAX_ORDER (PUD_SHIFT) | ||
43 | #else | 44 | #else |
44 | #include <asm/page_32_types.h> | 45 | #include <asm/page_32_types.h> |
46 | #define IOREMAP_MAX_ORDER (PMD_SHIFT) | ||
45 | #endif /* CONFIG_X86_64 */ | 47 | #endif /* CONFIG_X86_64 */ |
46 | 48 | ||
47 | #ifndef __ASSEMBLY__ | 49 | #ifndef __ASSEMBLY__ |
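
IOREMAP_MAX_ORDER bounds how strongly an ioremap area can be aligned, and the new per-arch values allow PUD-sized alignment on 64-bit and PMD-sized alignment on 32-bit. Assuming 4 KiB pages, those shifts translate to 1 GiB and 2 MiB (4 MiB on non-PAE 32-bit); the hard-coded shift values below are assumptions for the 64-bit and PAE cases, shown only to make the arithmetic concrete:

    #include <stdio.h>

    int main(void)
    {
        /* Assumed values for x86 with 4 KiB pages: PMD_SHIFT on PAE/64-bit is 21
         * and PUD_SHIFT on 64-bit is 30 (non-PAE 32-bit uses PMD_SHIFT 22). */
        unsigned int pmd_shift = 21, pud_shift = 30;

        printf("32-bit cap: shift %u -> %lu MiB\n",
               pmd_shift, (1UL << pmd_shift) >> 20);
        printf("64-bit cap: shift %u -> %lu GiB\n",
               pud_shift, (1UL << pud_shift) >> 30);
        return 0;
    }
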
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 965c47d254aa..8957810ad7d1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h | |||
@@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) | |||
545 | PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val); | 545 | PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val); |
546 | } | 546 | } |
547 | 547 | ||
548 | #if PAGETABLE_LEVELS >= 3 | 548 | #if CONFIG_PGTABLE_LEVELS >= 3 |
549 | static inline pmd_t __pmd(pmdval_t val) | 549 | static inline pmd_t __pmd(pmdval_t val) |
550 | { | 550 | { |
551 | pmdval_t ret; | 551 | pmdval_t ret; |
@@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud) | |||
585 | PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, | 585 | PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, |
586 | val); | 586 | val); |
587 | } | 587 | } |
588 | #if PAGETABLE_LEVELS == 4 | 588 | #if CONFIG_PGTABLE_LEVELS == 4 |
589 | static inline pud_t __pud(pudval_t val) | 589 | static inline pud_t __pud(pudval_t val) |
590 | { | 590 | { |
591 | pudval_t ret; | 591 | pudval_t ret; |
@@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp) | |||
636 | set_pud(pudp, __pud(0)); | 636 | set_pud(pudp, __pud(0)); |
637 | } | 637 | } |
638 | 638 | ||
639 | #endif /* PAGETABLE_LEVELS == 4 */ | 639 | #endif /* CONFIG_PGTABLE_LEVELS == 4 */ |
640 | 640 | ||
641 | #endif /* PAGETABLE_LEVELS >= 3 */ | 641 | #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ |
642 | 642 | ||
643 | #ifdef CONFIG_X86_PAE | 643 | #ifdef CONFIG_X86_PAE |
644 | /* Special-case pte-setting operations for PAE, which can't update a | 644 | /* Special-case pte-setting operations for PAE, which can't update a |
@@ -976,11 +976,6 @@ extern void default_banner(void); | |||
976 | PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ | 976 | PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ |
977 | CLBR_NONE, \ | 977 | CLBR_NONE, \ |
978 | jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) | 978 | jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) |
979 | |||
980 | #define ENABLE_INTERRUPTS_SYSEXIT32 \ | ||
981 | PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ | ||
982 | CLBR_NONE, \ | ||
983 | jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) | ||
984 | #endif /* CONFIG_X86_32 */ | 979 | #endif /* CONFIG_X86_32 */ |
985 | 980 | ||
986 | #endif /* __ASSEMBLY__ */ | 981 | #endif /* __ASSEMBLY__ */ |
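
All of the PAGETABLE_LEVELS preprocessor tests in these headers now key off CONFIG_PGTABLE_LEVELS, which Kconfig sets to 2, 3 or 4 for x86 instead of each pagetable-mode header defining its own value. A minimal sketch of the pattern; the symbol is defined by hand here only because a standalone build has no Kconfig-generated header:

    #include <stdio.h>

    /* Normally provided by the build system via Kconfig; defined here only so
     * the example compiles on its own. */
    #ifndef CONFIG_PGTABLE_LEVELS
    #define CONFIG_PGTABLE_LEVELS 4
    #endif

    int main(void)
    {
    #if CONFIG_PGTABLE_LEVELS >= 3
        puts("pmd is a real level (3+ level paging): pmd helpers compiled in");
    #endif
    #if CONFIG_PGTABLE_LEVELS == 4
        puts("pud is a real level (4-level paging): pud helpers compiled in");
    #endif
        printf("paging levels: %d\n", CONFIG_PGTABLE_LEVELS);
        return 0;
    }
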
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7549b8b369e4..f7b0b5c112f2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h | |||
@@ -294,7 +294,7 @@ struct pv_mmu_ops { | |||
294 | struct paravirt_callee_save pgd_val; | 294 | struct paravirt_callee_save pgd_val; |
295 | struct paravirt_callee_save make_pgd; | 295 | struct paravirt_callee_save make_pgd; |
296 | 296 | ||
297 | #if PAGETABLE_LEVELS >= 3 | 297 | #if CONFIG_PGTABLE_LEVELS >= 3 |
298 | #ifdef CONFIG_X86_PAE | 298 | #ifdef CONFIG_X86_PAE |
299 | void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); | 299 | void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); |
300 | void (*pte_clear)(struct mm_struct *mm, unsigned long addr, | 300 | void (*pte_clear)(struct mm_struct *mm, unsigned long addr, |
@@ -308,13 +308,13 @@ struct pv_mmu_ops { | |||
308 | struct paravirt_callee_save pmd_val; | 308 | struct paravirt_callee_save pmd_val; |
309 | struct paravirt_callee_save make_pmd; | 309 | struct paravirt_callee_save make_pmd; |
310 | 310 | ||
311 | #if PAGETABLE_LEVELS == 4 | 311 | #if CONFIG_PGTABLE_LEVELS == 4 |
312 | struct paravirt_callee_save pud_val; | 312 | struct paravirt_callee_save pud_val; |
313 | struct paravirt_callee_save make_pud; | 313 | struct paravirt_callee_save make_pud; |
314 | 314 | ||
315 | void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); | 315 | void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); |
316 | #endif /* PAGETABLE_LEVELS == 4 */ | 316 | #endif /* CONFIG_PGTABLE_LEVELS == 4 */ |
317 | #endif /* PAGETABLE_LEVELS >= 3 */ | 317 | #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ |
318 | 318 | ||
319 | struct pv_lazy_ops lazy_mode; | 319 | struct pv_lazy_ops lazy_mode; |
320 | 320 | ||
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index c4412e972bbd..bf7f8b55b0f9 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h | |||
@@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, | |||
77 | 77 | ||
78 | #define pmd_pgtable(pmd) pmd_page(pmd) | 78 | #define pmd_pgtable(pmd) pmd_page(pmd) |
79 | 79 | ||
80 | #if PAGETABLE_LEVELS > 2 | 80 | #if CONFIG_PGTABLE_LEVELS > 2 |
81 | static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) | 81 | static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) |
82 | { | 82 | { |
83 | struct page *page; | 83 | struct page *page; |
@@ -116,7 +116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) | |||
116 | } | 116 | } |
117 | #endif /* CONFIG_X86_PAE */ | 117 | #endif /* CONFIG_X86_PAE */ |
118 | 118 | ||
119 | #if PAGETABLE_LEVELS > 3 | 119 | #if CONFIG_PGTABLE_LEVELS > 3 |
120 | static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) | 120 | static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) |
121 | { | 121 | { |
122 | paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); | 122 | paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); |
@@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, | |||
142 | ___pud_free_tlb(tlb, pud); | 142 | ___pud_free_tlb(tlb, pud); |
143 | } | 143 | } |
144 | 144 | ||
145 | #endif /* PAGETABLE_LEVELS > 3 */ | 145 | #endif /* CONFIG_PGTABLE_LEVELS > 3 */ |
146 | #endif /* PAGETABLE_LEVELS > 2 */ | 146 | #endif /* CONFIG_PGTABLE_LEVELS > 2 */ |
147 | 147 | ||
148 | #endif /* _ASM_X86_PGALLOC_H */ | 148 | #endif /* _ASM_X86_PGALLOC_H */ |
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index daacc23e3fb9..392576433e77 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h | |||
@@ -17,7 +17,6 @@ typedef union { | |||
17 | #endif /* !__ASSEMBLY__ */ | 17 | #endif /* !__ASSEMBLY__ */ |
18 | 18 | ||
19 | #define SHARED_KERNEL_PMD 0 | 19 | #define SHARED_KERNEL_PMD 0 |
20 | #define PAGETABLE_LEVELS 2 | ||
21 | 20 | ||
22 | /* | 21 | /* |
23 | * traditional i386 two-level paging structure: | 22 | * traditional i386 two-level paging structure: |
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 1bd5876c8649..bcc89625ebe5 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h | |||
@@ -24,8 +24,6 @@ typedef union { | |||
24 | #define SHARED_KERNEL_PMD 1 | 24 | #define SHARED_KERNEL_PMD 1 |
25 | #endif | 25 | #endif |
26 | 26 | ||
27 | #define PAGETABLE_LEVELS 3 | ||
28 | |||
29 | /* | 27 | /* |
30 | * PGDIR_SHIFT determines what a top-level page table entry can map | 28 | * PGDIR_SHIFT determines what a top-level page table entry can map |
31 | */ | 29 | */ |
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a0c35bf6cb92..fe57e7a98839 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
@@ -551,7 +551,7 @@ static inline unsigned long pages_to_mb(unsigned long npg) | |||
551 | return npg >> (20 - PAGE_SHIFT); | 551 | return npg >> (20 - PAGE_SHIFT); |
552 | } | 552 | } |
553 | 553 | ||
554 | #if PAGETABLE_LEVELS > 2 | 554 | #if CONFIG_PGTABLE_LEVELS > 2 |
555 | static inline int pud_none(pud_t pud) | 555 | static inline int pud_none(pud_t pud) |
556 | { | 556 | { |
557 | return native_pud_val(pud) == 0; | 557 | return native_pud_val(pud) == 0; |
@@ -594,9 +594,9 @@ static inline int pud_large(pud_t pud) | |||
594 | { | 594 | { |
595 | return 0; | 595 | return 0; |
596 | } | 596 | } |
597 | #endif /* PAGETABLE_LEVELS > 2 */ | 597 | #endif /* CONFIG_PGTABLE_LEVELS > 2 */ |
598 | 598 | ||
599 | #if PAGETABLE_LEVELS > 3 | 599 | #if CONFIG_PGTABLE_LEVELS > 3 |
600 | static inline int pgd_present(pgd_t pgd) | 600 | static inline int pgd_present(pgd_t pgd) |
601 | { | 601 | { |
602 | return pgd_flags(pgd) & _PAGE_PRESENT; | 602 | return pgd_flags(pgd) & _PAGE_PRESENT; |
@@ -633,7 +633,7 @@ static inline int pgd_none(pgd_t pgd) | |||
633 | { | 633 | { |
634 | return !native_pgd_val(pgd); | 634 | return !native_pgd_val(pgd); |
635 | } | 635 | } |
636 | #endif /* PAGETABLE_LEVELS > 3 */ | 636 | #endif /* CONFIG_PGTABLE_LEVELS > 3 */ |
637 | 637 | ||
638 | #endif /* __ASSEMBLY__ */ | 638 | #endif /* __ASSEMBLY__ */ |
639 | 639 | ||
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 602b6028c5b6..e6844dfb4471 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h | |||
@@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t; | |||
20 | #endif /* !__ASSEMBLY__ */ | 20 | #endif /* !__ASSEMBLY__ */ |
21 | 21 | ||
22 | #define SHARED_KERNEL_PMD 0 | 22 | #define SHARED_KERNEL_PMD 0 |
23 | #define PAGETABLE_LEVELS 4 | ||
24 | 23 | ||
25 | /* | 24 | /* |
26 | * PGDIR_SHIFT determines what a top-level page table entry can map | 25 | * PGDIR_SHIFT determines what a top-level page table entry can map |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 8c7c10802e9c..78f0c8cbe316 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
@@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd) | |||
234 | return native_pgd_val(pgd) & PTE_FLAGS_MASK; | 234 | return native_pgd_val(pgd) & PTE_FLAGS_MASK; |
235 | } | 235 | } |
236 | 236 | ||
237 | #if PAGETABLE_LEVELS > 3 | 237 | #if CONFIG_PGTABLE_LEVELS > 3 |
238 | typedef struct { pudval_t pud; } pud_t; | 238 | typedef struct { pudval_t pud; } pud_t; |
239 | 239 | ||
240 | static inline pud_t native_make_pud(pmdval_t val) | 240 | static inline pud_t native_make_pud(pmdval_t val) |
@@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud) | |||
255 | } | 255 | } |
256 | #endif | 256 | #endif |
257 | 257 | ||
258 | #if PAGETABLE_LEVELS > 2 | 258 | #if CONFIG_PGTABLE_LEVELS > 2 |
259 | typedef struct { pmdval_t pmd; } pmd_t; | 259 | typedef struct { pmdval_t pmd; } pmd_t; |
260 | 260 | ||
261 | static inline pmd_t native_make_pmd(pmdval_t val) | 261 | static inline pmd_t native_make_pmd(pmdval_t val) |
diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/pm-trace.h index 3ff1c2cb1da5..7b7ac42c3661 100644 --- a/arch/x86/include/asm/resume-trace.h +++ b/arch/x86/include/asm/pm-trace.h | |||
@@ -1,5 +1,5 @@ | |||
1 | #ifndef _ASM_X86_RESUME_TRACE_H | 1 | #ifndef _ASM_X86_PM_TRACE_H |
2 | #define _ASM_X86_RESUME_TRACE_H | 2 | #define _ASM_X86_PM_TRACE_H |
3 | 3 | ||
4 | #include <asm/asm.h> | 4 | #include <asm/asm.h> |
5 | 5 | ||
@@ -14,8 +14,10 @@ do { \ | |||
14 | ".previous" \ | 14 | ".previous" \ |
15 | :"=r" (tracedata) \ | 15 | :"=r" (tracedata) \ |
16 | : "i" (__LINE__), "i" (__FILE__)); \ | 16 | : "i" (__LINE__), "i" (__FILE__)); \ |
17 | generate_resume_trace(tracedata, user); \ | 17 | generate_pm_trace(tracedata, user); \ |
18 | } \ | 18 | } \ |
19 | } while (0) | 19 | } while (0) |
20 | 20 | ||
21 | #endif /* _ASM_X86_RESUME_TRACE_H */ | 21 | #define TRACE_SUSPEND(user) TRACE_RESUME(user) |
22 | |||
23 | #endif /* _ASM_X86_PM_TRACE_H */ | ||
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ec1c93588cef..23ba6765b718 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h | |||
@@ -109,6 +109,9 @@ struct cpuinfo_x86 { | |||
109 | /* in KB - valid for CPUS which support this call: */ | 109 | /* in KB - valid for CPUS which support this call: */ |
110 | int x86_cache_size; | 110 | int x86_cache_size; |
111 | int x86_cache_alignment; /* In bytes */ | 111 | int x86_cache_alignment; /* In bytes */ |
112 | /* Cache QoS architectural values: */ | ||
113 | int x86_cache_max_rmid; /* max index */ | ||
114 | int x86_cache_occ_scale; /* scale to bytes */ | ||
112 | int x86_power; | 115 | int x86_power; |
113 | unsigned long loops_per_jiffy; | 116 | unsigned long loops_per_jiffy; |
114 | /* cpuid returned max cores value: */ | 117 | /* cpuid returned max cores value: */ |
@@ -210,8 +213,23 @@ struct x86_hw_tss { | |||
210 | unsigned long sp0; | 213 | unsigned long sp0; |
211 | unsigned short ss0, __ss0h; | 214 | unsigned short ss0, __ss0h; |
212 | unsigned long sp1; | 215 | unsigned long sp1; |
213 | /* ss1 caches MSR_IA32_SYSENTER_CS: */ | 216 | |
214 | unsigned short ss1, __ss1h; | 217 | /* |
218 | * We don't use ring 1, so ss1 is a convenient scratch space in | ||
219 | * the same cacheline as sp0. We use ss1 to cache the value in | ||
220 | * MSR_IA32_SYSENTER_CS. When we context switch | ||
221 | * MSR_IA32_SYSENTER_CS, we first check if the new value being | ||
222 | * written matches ss1, and, if it's not, then we wrmsr the new | ||
223 | * value and update ss1. | ||
224 | * | ||
225 | * The only reason we context switch MSR_IA32_SYSENTER_CS is | ||
226 | * that we set it to zero in vm86 tasks to avoid corrupting the | ||
227 | * stack if we were to go through the sysenter path from vm86 | ||
228 | * mode. | ||
229 | */ | ||
230 | unsigned short ss1; /* MSR_IA32_SYSENTER_CS */ | ||
231 | |||
232 | unsigned short __ss1h; | ||
215 | unsigned long sp2; | 233 | unsigned long sp2; |
216 | unsigned short ss2, __ss2h; | 234 | unsigned short ss2, __ss2h; |
217 | unsigned long __cr3; | 235 | unsigned long __cr3; |
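The new ss1 comment describes a shadow-and-compare pattern: the context-switch code only issues the (slow) wrmsr when the value actually changes, and ss1 is a convenient shadow because it shares a cache line with sp0. A hedged sketch of that idea, with stand-in names rather than the kernel's switch_to() code:

	/*
	 * Illustrative only: cache the last MSR_IA32_SYSENTER_CS value in ss1
	 * so an unchanged value skips the wrmsr. Names are stand-ins.
	 */
	struct tss_shadow {
		unsigned short ss1;	/* last MSR_IA32_SYSENTER_CS value written */
	};

	static void update_sysenter_cs(struct tss_shadow *tss, unsigned short new_cs)
	{
		if (tss->ss1 == new_cs)
			return;		/* cached copy matches: no MSR write needed */

		tss->ss1 = new_cs;
		/* the real code would do wrmsr(MSR_IA32_SYSENTER_CS, new_cs, 0) here */
	}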
@@ -276,13 +294,17 @@ struct tss_struct { | |||
276 | unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; | 294 | unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; |
277 | 295 | ||
278 | /* | 296 | /* |
279 | * .. and then another 0x100 bytes for the emergency kernel stack: | 297 | * Space for the temporary SYSENTER stack: |
280 | */ | 298 | */ |
281 | unsigned long stack[64]; | 299 | unsigned long SYSENTER_stack[64]; |
282 | 300 | ||
283 | } ____cacheline_aligned; | 301 | } ____cacheline_aligned; |
284 | 302 | ||
285 | DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); | 303 | DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); |
304 | |||
305 | #ifdef CONFIG_X86_32 | ||
306 | DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); | ||
307 | #endif | ||
286 | 308 | ||
287 | /* | 309 | /* |
288 | * Save the original ist values for checking stack pointers during debugging | 310 | * Save the original ist values for checking stack pointers during debugging |
@@ -474,7 +496,6 @@ struct thread_struct { | |||
474 | #ifdef CONFIG_X86_32 | 496 | #ifdef CONFIG_X86_32 |
475 | unsigned long sysenter_cs; | 497 | unsigned long sysenter_cs; |
476 | #else | 498 | #else |
477 | unsigned long usersp; /* Copy from PDA */ | ||
478 | unsigned short es; | 499 | unsigned short es; |
479 | unsigned short ds; | 500 | unsigned short ds; |
480 | unsigned short fsindex; | 501 | unsigned short fsindex; |
@@ -564,6 +585,16 @@ static inline void native_swapgs(void) | |||
564 | #endif | 585 | #endif |
565 | } | 586 | } |
566 | 587 | ||
588 | static inline unsigned long current_top_of_stack(void) | ||
589 | { | ||
590 | #ifdef CONFIG_X86_64 | ||
591 | return this_cpu_read_stable(cpu_tss.x86_tss.sp0); | ||
592 | #else | ||
593 | /* sp0 on x86_32 is special in and around vm86 mode. */ | ||
594 | return this_cpu_read_stable(cpu_current_top_of_stack); | ||
595 | #endif | ||
596 | } | ||
597 | |||
567 | #ifdef CONFIG_PARAVIRT | 598 | #ifdef CONFIG_PARAVIRT |
568 | #include <asm/paravirt.h> | 599 | #include <asm/paravirt.h> |
569 | #else | 600 | #else |
@@ -761,10 +792,10 @@ extern char ignore_fpu_irq; | |||
761 | #define ARCH_HAS_SPINLOCK_PREFETCH | 792 | #define ARCH_HAS_SPINLOCK_PREFETCH |
762 | 793 | ||
763 | #ifdef CONFIG_X86_32 | 794 | #ifdef CONFIG_X86_32 |
764 | # define BASE_PREFETCH ASM_NOP4 | 795 | # define BASE_PREFETCH "" |
765 | # define ARCH_HAS_PREFETCH | 796 | # define ARCH_HAS_PREFETCH |
766 | #else | 797 | #else |
767 | # define BASE_PREFETCH "prefetcht0 (%1)" | 798 | # define BASE_PREFETCH "prefetcht0 %P1" |
768 | #endif | 799 | #endif |
769 | 800 | ||
770 | /* | 801 | /* |
@@ -775,10 +806,9 @@ extern char ignore_fpu_irq; | |||
775 | */ | 806 | */ |
776 | static inline void prefetch(const void *x) | 807 | static inline void prefetch(const void *x) |
777 | { | 808 | { |
778 | alternative_input(BASE_PREFETCH, | 809 | alternative_input(BASE_PREFETCH, "prefetchnta %P1", |
779 | "prefetchnta (%1)", | ||
780 | X86_FEATURE_XMM, | 810 | X86_FEATURE_XMM, |
781 | "r" (x)); | 811 | "m" (*(const char *)x)); |
782 | } | 812 | } |
783 | 813 | ||
784 | /* | 814 | /* |
@@ -788,10 +818,9 @@ static inline void prefetch(const void *x) | |||
788 | */ | 818 | */ |
789 | static inline void prefetchw(const void *x) | 819 | static inline void prefetchw(const void *x) |
790 | { | 820 | { |
791 | alternative_input(BASE_PREFETCH, | 821 | alternative_input(BASE_PREFETCH, "prefetchw %P1", |
792 | "prefetchw (%1)", | 822 | X86_FEATURE_3DNOWPREFETCH, |
793 | X86_FEATURE_3DNOW, | 823 | "m" (*(const char *)x)); |
794 | "r" (x)); | ||
795 | } | 824 | } |
796 | 825 | ||
797 | static inline void spin_lock_prefetch(const void *x) | 826 | static inline void spin_lock_prefetch(const void *x) |
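The prefetch hunks replace a register operand dereferenced as (%1) with a memory operand referenced as %P1, which gives the compiler freedom over the addressing mode and ties the asm to the object being prefetched. A stripped-down sketch of just that operand style; it deliberately omits the feature-dependent alternative_input() patching shown above:

	/*
	 * Sketch: an "m" constraint plus the %P modifier lets the compiler
	 * choose the addressing mode and emit it inside the instruction.
	 */
	static inline void prefetch_nta(const void *p)
	{
		asm volatile("prefetchnta %P0" : : "m" (*(const char *)p));
	}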
@@ -799,6 +828,9 @@ static inline void spin_lock_prefetch(const void *x) | |||
799 | prefetchw(x); | 828 | prefetchw(x); |
800 | } | 829 | } |
801 | 830 | ||
831 | #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ | ||
832 | TOP_OF_KERNEL_STACK_PADDING) | ||
833 | |||
802 | #ifdef CONFIG_X86_32 | 834 | #ifdef CONFIG_X86_32 |
803 | /* | 835 | /* |
804 | * User space process size: 3GB (default). | 836 | * User space process size: 3GB (default). |
@@ -809,39 +841,16 @@ static inline void spin_lock_prefetch(const void *x) | |||
809 | #define STACK_TOP_MAX STACK_TOP | 841 | #define STACK_TOP_MAX STACK_TOP |
810 | 842 | ||
811 | #define INIT_THREAD { \ | 843 | #define INIT_THREAD { \ |
812 | .sp0 = sizeof(init_stack) + (long)&init_stack, \ | 844 | .sp0 = TOP_OF_INIT_STACK, \ |
813 | .vm86_info = NULL, \ | 845 | .vm86_info = NULL, \ |
814 | .sysenter_cs = __KERNEL_CS, \ | 846 | .sysenter_cs = __KERNEL_CS, \ |
815 | .io_bitmap_ptr = NULL, \ | 847 | .io_bitmap_ptr = NULL, \ |
816 | } | 848 | } |
817 | 849 | ||
818 | /* | ||
819 | * Note that the .io_bitmap member must be extra-big. This is because | ||
820 | * the CPU will access an additional byte beyond the end of the IO | ||
821 | * permission bitmap. The extra byte must be all 1 bits, and must | ||
822 | * be within the limit. | ||
823 | */ | ||
824 | #define INIT_TSS { \ | ||
825 | .x86_tss = { \ | ||
826 | .sp0 = sizeof(init_stack) + (long)&init_stack, \ | ||
827 | .ss0 = __KERNEL_DS, \ | ||
828 | .ss1 = __KERNEL_CS, \ | ||
829 | .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ | ||
830 | }, \ | ||
831 | .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ | ||
832 | } | ||
833 | |||
834 | extern unsigned long thread_saved_pc(struct task_struct *tsk); | 850 | extern unsigned long thread_saved_pc(struct task_struct *tsk); |
835 | 851 | ||
836 | #define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) | ||
837 | #define KSTK_TOP(info) \ | ||
838 | ({ \ | ||
839 | unsigned long *__ptr = (unsigned long *)(info); \ | ||
840 | (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ | ||
841 | }) | ||
842 | |||
843 | /* | 852 | /* |
844 | * The below -8 is to reserve 8 bytes on top of the ring0 stack. | 853 | * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. |
845 | * This is necessary to guarantee that the entire "struct pt_regs" | 854 | * This is necessary to guarantee that the entire "struct pt_regs" |
846 | * is accessible even if the CPU haven't stored the SS/ESP registers | 855 | * is accessible even if the CPU haven't stored the SS/ESP registers |
847 | * on the stack (interrupt gate does not save these registers | 856 | * on the stack (interrupt gate does not save these registers |
@@ -850,11 +859,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); | |||
850 | * "struct pt_regs" is possible, but they may contain the | 859 | * "struct pt_regs" is possible, but they may contain the |
851 | * completely wrong values. | 860 | * completely wrong values. |
852 | */ | 861 | */ |
853 | #define task_pt_regs(task) \ | 862 | #define task_pt_regs(task) \ |
854 | ({ \ | 863 | ({ \ |
855 | struct pt_regs *__regs__; \ | 864 | unsigned long __ptr = (unsigned long)task_stack_page(task); \ |
856 | __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ | 865 | __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ |
857 | __regs__ - 1; \ | 866 | ((struct pt_regs *)__ptr) - 1; \ |
858 | }) | 867 | }) |
859 | 868 | ||
860 | #define KSTK_ESP(task) (task_pt_regs(task)->sp) | 869 | #define KSTK_ESP(task) (task_pt_regs(task)->sp) |
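The 32-bit task_pt_regs() now states the layout directly: go to the top of the task's stack area, step back by TOP_OF_KERNEL_STACK_PADDING, and the saved pt_regs sits immediately below. A small user-space model of the pointer arithmetic; THREAD_SIZE and the register-frame size here are illustrative:

	#include <stdio.h>

	#define THREAD_SIZE			(4 * 4096)	/* illustrative */
	#define TOP_OF_KERNEL_STACK_PADDING	8		/* x86_32 value in this series */

	struct pt_regs_model { unsigned long regs[17]; };	/* stand-in for struct pt_regs */

	int main(void)
	{
		static unsigned char stack_page[THREAD_SIZE];
		unsigned long top = (unsigned long)stack_page + THREAD_SIZE
				    - TOP_OF_KERNEL_STACK_PADDING;
		struct pt_regs_model *regs = (struct pt_regs_model *)top - 1;

		printf("stack page %p, pt_regs at %p (top of stack %#lx)\n",
		       (void *)stack_page, (void *)regs, top);
		return 0;
	}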
@@ -886,11 +895,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); | |||
886 | #define STACK_TOP_MAX TASK_SIZE_MAX | 895 | #define STACK_TOP_MAX TASK_SIZE_MAX |
887 | 896 | ||
888 | #define INIT_THREAD { \ | 897 | #define INIT_THREAD { \ |
889 | .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ | 898 | .sp0 = TOP_OF_INIT_STACK \ |
890 | } | ||
891 | |||
892 | #define INIT_TSS { \ | ||
893 | .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ | ||
894 | } | 899 | } |
895 | 900 | ||
896 | /* | 901 | /* |
@@ -902,11 +907,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); | |||
902 | #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) | 907 | #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) |
903 | extern unsigned long KSTK_ESP(struct task_struct *task); | 908 | extern unsigned long KSTK_ESP(struct task_struct *task); |
904 | 909 | ||
905 | /* | ||
906 | * User space RSP while inside the SYSCALL fast path | ||
907 | */ | ||
908 | DECLARE_PER_CPU(unsigned long, old_rsp); | ||
909 | |||
910 | #endif /* CONFIG_X86_64 */ | 910 | #endif /* CONFIG_X86_64 */ |
911 | 911 | ||
912 | extern void start_thread(struct pt_regs *regs, unsigned long new_ip, | 912 | extern void start_thread(struct pt_regs *regs, unsigned long new_ip, |
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 86fc2bb82287..19507ffa5d28 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h | |||
@@ -31,13 +31,17 @@ struct pt_regs { | |||
31 | #else /* __i386__ */ | 31 | #else /* __i386__ */ |
32 | 32 | ||
33 | struct pt_regs { | 33 | struct pt_regs { |
34 | /* | ||
35 | * C ABI says these regs are callee-preserved. They aren't saved on kernel entry | ||
36 | * unless syscall needs a complete, fully filled "struct pt_regs". | ||
37 | */ | ||
34 | unsigned long r15; | 38 | unsigned long r15; |
35 | unsigned long r14; | 39 | unsigned long r14; |
36 | unsigned long r13; | 40 | unsigned long r13; |
37 | unsigned long r12; | 41 | unsigned long r12; |
38 | unsigned long bp; | 42 | unsigned long bp; |
39 | unsigned long bx; | 43 | unsigned long bx; |
40 | /* arguments: non interrupts/non tracing syscalls only save up to here*/ | 44 | /* These regs are callee-clobbered. Always saved on kernel entry. */ |
41 | unsigned long r11; | 45 | unsigned long r11; |
42 | unsigned long r10; | 46 | unsigned long r10; |
43 | unsigned long r9; | 47 | unsigned long r9; |
@@ -47,9 +51,12 @@ struct pt_regs { | |||
47 | unsigned long dx; | 51 | unsigned long dx; |
48 | unsigned long si; | 52 | unsigned long si; |
49 | unsigned long di; | 53 | unsigned long di; |
54 | /* | ||
55 | * On syscall entry, this is syscall#. On CPU exception, this is error code. | ||
56 | * On hw interrupt, it's IRQ number: | ||
57 | */ | ||
50 | unsigned long orig_ax; | 58 | unsigned long orig_ax; |
51 | /* end of arguments */ | 59 | /* Return frame for iretq */ |
52 | /* cpu exception frame or undefined */ | ||
53 | unsigned long ip; | 60 | unsigned long ip; |
54 | unsigned long cs; | 61 | unsigned long cs; |
55 | unsigned long flags; | 62 | unsigned long flags; |
@@ -89,11 +96,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) | |||
89 | } | 96 | } |
90 | 97 | ||
91 | /* | 98 | /* |
92 | * user_mode_vm(regs) determines whether a register set came from user mode. | 99 | * user_mode(regs) determines whether a register set came from user |
93 | * This is true if V8086 mode was enabled OR if the register set was from | 100 | * mode. On x86_32, this is true if V8086 mode was enabled OR if the |
94 | * protected mode with RPL-3 CS value. This tricky test checks that with | 101 | * register set was from protected mode with RPL-3 CS value. This |
95 | * one comparison. Many places in the kernel can bypass this full check | 102 | * tricky test checks that with one comparison. |
96 | * if they have already ruled out V8086 mode, so user_mode(regs) can be used. | 103 | * |
104 | * On x86_64, vm86 mode is mercifully nonexistent, and we don't need | ||
105 | * the extra check. | ||
97 | */ | 106 | */ |
98 | static inline int user_mode(struct pt_regs *regs) | 107 | static inline int user_mode(struct pt_regs *regs) |
99 | { | 108 | { |
@@ -104,16 +113,6 @@ static inline int user_mode(struct pt_regs *regs) | |||
104 | #endif | 113 | #endif |
105 | } | 114 | } |
106 | 115 | ||
107 | static inline int user_mode_vm(struct pt_regs *regs) | ||
108 | { | ||
109 | #ifdef CONFIG_X86_32 | ||
110 | return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= | ||
111 | USER_RPL; | ||
112 | #else | ||
113 | return user_mode(regs); | ||
114 | #endif | ||
115 | } | ||
116 | |||
117 | static inline int v8086_mode(struct pt_regs *regs) | 116 | static inline int v8086_mode(struct pt_regs *regs) |
118 | { | 117 | { |
119 | #ifdef CONFIG_X86_32 | 118 | #ifdef CONFIG_X86_32 |
@@ -138,12 +137,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs) | |||
138 | #endif | 137 | #endif |
139 | } | 138 | } |
140 | 139 | ||
141 | #define current_user_stack_pointer() this_cpu_read(old_rsp) | 140 | #define current_user_stack_pointer() current_pt_regs()->sp |
142 | /* ia32 vs. x32 difference */ | 141 | #define compat_user_stack_pointer() current_pt_regs()->sp |
143 | #define compat_user_stack_pointer() \ | ||
144 | (test_thread_flag(TIF_IA32) \ | ||
145 | ? current_pt_regs()->sp \ | ||
146 | : this_cpu_read(old_rsp)) | ||
147 | #endif | 142 | #endif |
148 | 143 | ||
149 | #ifdef CONFIG_X86_32 | 144 | #ifdef CONFIG_X86_32 |
@@ -248,7 +243,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, | |||
248 | */ | 243 | */ |
249 | #define arch_ptrace_stop_needed(code, info) \ | 244 | #define arch_ptrace_stop_needed(code, info) \ |
250 | ({ \ | 245 | ({ \ |
251 | set_thread_flag(TIF_NOTIFY_RESUME); \ | 246 | force_iret(); \ |
252 | false; \ | 247 | false; \ |
253 | }) | 248 | }) |
254 | 249 | ||
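The user_mode()/user_mode_vm() merge above keeps the old single-comparison trick on 32-bit: OR the RPL bits of the saved CS with the EFLAGS VM bit, and anything >= USER_RPL means the trap came from ring 3 or from v8086 mode. A hedged model of that test with the usual constant values spelled out:

	#include <stdbool.h>

	#define SEGMENT_RPL_MASK	0x3UL
	#define USER_RPL		0x3UL
	#define X86_VM_MASK		0x00020000UL	/* EFLAGS.VM, set in v8086 mode */

	/* stand-in for the 32-bit branch of user_mode() */
	static bool regs_from_user(unsigned long cs, unsigned long flags)
	{
		return ((cs & SEGMENT_RPL_MASK) | (flags & X86_VM_MASK)) >= USER_RPL;
	}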
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index d6b078e9fa28..25b1cc07d496 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h | |||
@@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, | |||
95 | 95 | ||
96 | struct pvclock_vsyscall_time_info { | 96 | struct pvclock_vsyscall_time_info { |
97 | struct pvclock_vcpu_time_info pvti; | 97 | struct pvclock_vcpu_time_info pvti; |
98 | u32 migrate_count; | ||
98 | } __attribute__((__aligned__(SMP_CACHE_BYTES))); | 99 | } __attribute__((__aligned__(SMP_CACHE_BYTES))); |
99 | 100 | ||
100 | #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) | 101 | #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) |
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index 0f3d7f099224..0c8c7c8861b4 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h | |||
@@ -1,5 +1,20 @@ | |||
1 | #ifndef _ASM_X86_SECCOMP_H | ||
2 | #define _ASM_X86_SECCOMP_H | ||
3 | |||
4 | #include <asm/unistd.h> | ||
5 | |||
1 | #ifdef CONFIG_X86_32 | 6 | #ifdef CONFIG_X86_32 |
2 | # include <asm/seccomp_32.h> | 7 | #define __NR_seccomp_sigreturn __NR_sigreturn |
3 | #else | ||
4 | # include <asm/seccomp_64.h> | ||
5 | #endif | 8 | #endif |
9 | |||
10 | #ifdef CONFIG_COMPAT | ||
11 | #include <asm/ia32_unistd.h> | ||
12 | #define __NR_seccomp_read_32 __NR_ia32_read | ||
13 | #define __NR_seccomp_write_32 __NR_ia32_write | ||
14 | #define __NR_seccomp_exit_32 __NR_ia32_exit | ||
15 | #define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn | ||
16 | #endif | ||
17 | |||
18 | #include <asm-generic/seccomp.h> | ||
19 | |||
20 | #endif /* _ASM_X86_SECCOMP_H */ | ||
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h deleted file mode 100644 index b811d6f5780c..000000000000 --- a/arch/x86/include/asm/seccomp_32.h +++ /dev/null | |||
@@ -1,11 +0,0 @@ | |||
1 | #ifndef _ASM_X86_SECCOMP_32_H | ||
2 | #define _ASM_X86_SECCOMP_32_H | ||
3 | |||
4 | #include <linux/unistd.h> | ||
5 | |||
6 | #define __NR_seccomp_read __NR_read | ||
7 | #define __NR_seccomp_write __NR_write | ||
8 | #define __NR_seccomp_exit __NR_exit | ||
9 | #define __NR_seccomp_sigreturn __NR_sigreturn | ||
10 | |||
11 | #endif /* _ASM_X86_SECCOMP_32_H */ | ||
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h deleted file mode 100644 index 84ec1bd161a5..000000000000 --- a/arch/x86/include/asm/seccomp_64.h +++ /dev/null | |||
@@ -1,17 +0,0 @@ | |||
1 | #ifndef _ASM_X86_SECCOMP_64_H | ||
2 | #define _ASM_X86_SECCOMP_64_H | ||
3 | |||
4 | #include <linux/unistd.h> | ||
5 | #include <asm/ia32_unistd.h> | ||
6 | |||
7 | #define __NR_seccomp_read __NR_read | ||
8 | #define __NR_seccomp_write __NR_write | ||
9 | #define __NR_seccomp_exit __NR_exit | ||
10 | #define __NR_seccomp_sigreturn __NR_rt_sigreturn | ||
11 | |||
12 | #define __NR_seccomp_read_32 __NR_ia32_read | ||
13 | #define __NR_seccomp_write_32 __NR_ia32_write | ||
14 | #define __NR_seccomp_exit_32 __NR_ia32_exit | ||
15 | #define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn | ||
16 | |||
17 | #endif /* _ASM_X86_SECCOMP_64_H */ | ||
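The two per-bitness seccomp headers collapse into one: x86 keeps only the overrides that differ from the generic defaults (the 32-bit native sigreturn and the compat ia32 numbers) and then includes asm-generic/seccomp.h, which, as far as I can tell, supplies the remaining __NR_seccomp_* names only where the arch has not pre-defined them. A tiny sketch of that override-then-default preprocessor pattern, with illustrative macro names and values:

	#include <stdio.h>

	/* "arch" side of the sketch: pre-define only the number that differs */
	#define SC_SIGRETURN	119	/* illustrative override */

	/* "generic" side: defaults for anything the arch left undefined */
	#ifndef SC_SIGRETURN
	# define SC_SIGRETURN	15
	#endif
	#ifndef SC_EXIT
	# define SC_EXIT	60
	#endif

	int main(void)
	{
		printf("sigreturn=%d exit=%d\n", SC_SIGRETURN, SC_EXIT);	/* 119, 60 */
		return 0;
	}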
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index db257a58571f..5a9856eb12ba 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h | |||
@@ -3,8 +3,10 @@ | |||
3 | 3 | ||
4 | #include <linux/const.h> | 4 | #include <linux/const.h> |
5 | 5 | ||
6 | /* Constructor for a conventional segment GDT (or LDT) entry */ | 6 | /* |
7 | /* This is a macro so it can be used in initializers */ | 7 | * Constructor for a conventional segment GDT (or LDT) entry. |
8 | * This is a macro so it can be used in initializers. | ||
9 | */ | ||
8 | #define GDT_ENTRY(flags, base, limit) \ | 10 | #define GDT_ENTRY(flags, base, limit) \ |
9 | ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ | 11 | ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ |
10 | (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ | 12 | (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ |
@@ -12,198 +14,228 @@ | |||
12 | (((base) & _AC(0x00ffffff,ULL)) << 16) | \ | 14 | (((base) & _AC(0x00ffffff,ULL)) << 16) | \ |
13 | (((limit) & _AC(0x0000ffff,ULL)))) | 15 | (((limit) & _AC(0x0000ffff,ULL)))) |
14 | 16 | ||
15 | /* Simple and small GDT entries for booting only */ | 17 | /* Simple and small GDT entries for booting only: */ |
16 | 18 | ||
17 | #define GDT_ENTRY_BOOT_CS 2 | 19 | #define GDT_ENTRY_BOOT_CS 2 |
18 | #define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) | 20 | #define GDT_ENTRY_BOOT_DS 3 |
21 | #define GDT_ENTRY_BOOT_TSS 4 | ||
22 | #define __BOOT_CS (GDT_ENTRY_BOOT_CS*8) | ||
23 | #define __BOOT_DS (GDT_ENTRY_BOOT_DS*8) | ||
24 | #define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8) | ||
25 | |||
26 | /* | ||
27 | * Bottom two bits of selector give the ring | ||
28 | * privilege level | ||
29 | */ | ||
30 | #define SEGMENT_RPL_MASK 0x3 | ||
19 | 31 | ||
20 | #define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) | 32 | /* User mode is privilege level 3: */ |
21 | #define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) | 33 | #define USER_RPL 0x3 |
22 | 34 | ||
23 | #define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) | 35 | /* Bit 2 is Table Indicator (TI): selects between LDT or GDT */ |
24 | #define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) | 36 | #define SEGMENT_TI_MASK 0x4 |
37 | /* LDT segment has TI set ... */ | ||
38 | #define SEGMENT_LDT 0x4 | ||
39 | /* ... GDT has it cleared */ | ||
40 | #define SEGMENT_GDT 0x0 | ||
25 | 41 | ||
26 | #define SEGMENT_RPL_MASK 0x3 /* | 42 | #define GDT_ENTRY_INVALID_SEG 0 |
27 | * Bottom two bits of selector give the ring | ||
28 | * privilege level | ||
29 | */ | ||
30 | #define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */ | ||
31 | #define USER_RPL 0x3 /* User mode is privilege level 3 */ | ||
32 | #define SEGMENT_LDT 0x4 /* LDT segment has TI set... */ | ||
33 | #define SEGMENT_GDT 0x0 /* ... GDT has it cleared */ | ||
34 | 43 | ||
35 | #ifdef CONFIG_X86_32 | 44 | #ifdef CONFIG_X86_32 |
36 | /* | 45 | /* |
37 | * The layout of the per-CPU GDT under Linux: | 46 | * The layout of the per-CPU GDT under Linux: |
38 | * | 47 | * |
39 | * 0 - null | 48 | * 0 - null <=== cacheline #1 |
40 | * 1 - reserved | 49 | * 1 - reserved |
41 | * 2 - reserved | 50 | * 2 - reserved |
42 | * 3 - reserved | 51 | * 3 - reserved |
43 | * | 52 | * |
44 | * 4 - unused <==== new cacheline | 53 | * 4 - unused <=== cacheline #2 |
45 | * 5 - unused | 54 | * 5 - unused |
46 | * | 55 | * |
47 | * ------- start of TLS (Thread-Local Storage) segments: | 56 | * ------- start of TLS (Thread-Local Storage) segments: |
48 | * | 57 | * |
49 | * 6 - TLS segment #1 [ glibc's TLS segment ] | 58 | * 6 - TLS segment #1 [ glibc's TLS segment ] |
50 | * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] | 59 | * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] |
51 | * 8 - TLS segment #3 | 60 | * 8 - TLS segment #3 <=== cacheline #3 |
52 | * 9 - reserved | 61 | * 9 - reserved |
53 | * 10 - reserved | 62 | * 10 - reserved |
54 | * 11 - reserved | 63 | * 11 - reserved |
55 | * | 64 | * |
56 | * ------- start of kernel segments: | 65 | * ------- start of kernel segments: |
57 | * | 66 | * |
58 | * 12 - kernel code segment <==== new cacheline | 67 | * 12 - kernel code segment <=== cacheline #4 |
59 | * 13 - kernel data segment | 68 | * 13 - kernel data segment |
60 | * 14 - default user CS | 69 | * 14 - default user CS |
61 | * 15 - default user DS | 70 | * 15 - default user DS |
62 | * 16 - TSS | 71 | * 16 - TSS <=== cacheline #5 |
63 | * 17 - LDT | 72 | * 17 - LDT |
64 | * 18 - PNPBIOS support (16->32 gate) | 73 | * 18 - PNPBIOS support (16->32 gate) |
65 | * 19 - PNPBIOS support | 74 | * 19 - PNPBIOS support |
66 | * 20 - PNPBIOS support | 75 | * 20 - PNPBIOS support <=== cacheline #6 |
67 | * 21 - PNPBIOS support | 76 | * 21 - PNPBIOS support |
68 | * 22 - PNPBIOS support | 77 | * 22 - PNPBIOS support |
69 | * 23 - APM BIOS support | 78 | * 23 - APM BIOS support |
70 | * 24 - APM BIOS support | 79 | * 24 - APM BIOS support <=== cacheline #7 |
71 | * 25 - APM BIOS support | 80 | * 25 - APM BIOS support |
72 | * | 81 | * |
73 | * 26 - ESPFIX small SS | 82 | * 26 - ESPFIX small SS |
74 | * 27 - per-cpu [ offset to per-cpu data area ] | 83 | * 27 - per-cpu [ offset to per-cpu data area ] |
75 | * 28 - stack_canary-20 [ for stack protector ] | 84 | * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8 |
76 | * 29 - unused | 85 | * 29 - unused |
77 | * 30 - unused | 86 | * 30 - unused |
78 | * 31 - TSS for double fault handler | 87 | * 31 - TSS for double fault handler |
79 | */ | 88 | */ |
80 | #define GDT_ENTRY_TLS_MIN 6 | 89 | #define GDT_ENTRY_TLS_MIN 6 |
81 | #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) | 90 | #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) |
82 | 91 | ||
92 | #define GDT_ENTRY_KERNEL_CS 12 | ||
93 | #define GDT_ENTRY_KERNEL_DS 13 | ||
83 | #define GDT_ENTRY_DEFAULT_USER_CS 14 | 94 | #define GDT_ENTRY_DEFAULT_USER_CS 14 |
84 | |||
85 | #define GDT_ENTRY_DEFAULT_USER_DS 15 | 95 | #define GDT_ENTRY_DEFAULT_USER_DS 15 |
96 | #define GDT_ENTRY_TSS 16 | ||
97 | #define GDT_ENTRY_LDT 17 | ||
98 | #define GDT_ENTRY_PNPBIOS_CS32 18 | ||
99 | #define GDT_ENTRY_PNPBIOS_CS16 19 | ||
100 | #define GDT_ENTRY_PNPBIOS_DS 20 | ||
101 | #define GDT_ENTRY_PNPBIOS_TS1 21 | ||
102 | #define GDT_ENTRY_PNPBIOS_TS2 22 | ||
103 | #define GDT_ENTRY_APMBIOS_BASE 23 | ||
104 | |||
105 | #define GDT_ENTRY_ESPFIX_SS 26 | ||
106 | #define GDT_ENTRY_PERCPU 27 | ||
107 | #define GDT_ENTRY_STACK_CANARY 28 | ||
108 | |||
109 | #define GDT_ENTRY_DOUBLEFAULT_TSS 31 | ||
86 | 110 | ||
87 | #define GDT_ENTRY_KERNEL_BASE (12) | 111 | /* |
112 | * Number of entries in the GDT table: | ||
113 | */ | ||
114 | #define GDT_ENTRIES 32 | ||
88 | 115 | ||
89 | #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0) | 116 | /* |
117 | * Segment selector values corresponding to the above entries: | ||
118 | */ | ||
90 | 119 | ||
91 | #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1) | 120 | #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) |
121 | #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) | ||
122 | #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) | ||
123 | #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) | ||
124 | #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) | ||
92 | 125 | ||
93 | #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4) | 126 | /* segment for calling fn: */ |
94 | #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5) | 127 | #define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8) |
128 | /* code segment for BIOS: */ | ||
129 | #define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8) | ||
95 | 130 | ||
96 | #define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6) | 131 | /* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */ |
97 | #define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11) | 132 | #define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32) |
98 | 133 | ||
99 | #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14) | 134 | /* data segment for BIOS: */ |
100 | #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) | 135 | #define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8) |
136 | /* transfer data segment: */ | ||
137 | #define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8) | ||
138 | /* another data segment: */ | ||
139 | #define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8) | ||
101 | 140 | ||
102 | #define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15) | ||
103 | #ifdef CONFIG_SMP | 141 | #ifdef CONFIG_SMP |
104 | #define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) | 142 | # define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8) |
105 | #else | 143 | #else |
106 | #define __KERNEL_PERCPU 0 | 144 | # define __KERNEL_PERCPU 0 |
107 | #endif | 145 | #endif |
108 | 146 | ||
109 | #define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16) | ||
110 | #ifdef CONFIG_CC_STACKPROTECTOR | 147 | #ifdef CONFIG_CC_STACKPROTECTOR |
111 | #define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) | 148 | # define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) |
112 | #else | 149 | #else |
113 | #define __KERNEL_STACK_CANARY 0 | 150 | # define __KERNEL_STACK_CANARY 0 |
114 | #endif | 151 | #endif |
115 | 152 | ||
116 | #define GDT_ENTRY_DOUBLEFAULT_TSS 31 | 153 | #else /* 64-bit: */ |
117 | |||
118 | /* | ||
119 | * The GDT has 32 entries | ||
120 | */ | ||
121 | #define GDT_ENTRIES 32 | ||
122 | 154 | ||
123 | /* The PnP BIOS entries in the GDT */ | 155 | #include <asm/cache.h> |
124 | #define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0) | ||
125 | #define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1) | ||
126 | #define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2) | ||
127 | #define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3) | ||
128 | #define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4) | ||
129 | |||
130 | /* The PnP BIOS selectors */ | ||
131 | #define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */ | ||
132 | #define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */ | ||
133 | #define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */ | ||
134 | #define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ | ||
135 | #define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ | ||
136 | 156 | ||
157 | #define GDT_ENTRY_KERNEL32_CS 1 | ||
158 | #define GDT_ENTRY_KERNEL_CS 2 | ||
159 | #define GDT_ENTRY_KERNEL_DS 3 | ||
137 | 160 | ||
138 | /* | 161 | /* |
139 | * Matching rules for certain types of segments. | 162 | * We cannot use the same code segment descriptor for user and kernel mode, |
163 | * not even in long flat mode, because of different DPL. | ||
164 | * | ||
165 | * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes | ||
166 | * selectors: | ||
167 | * | ||
168 | * if returning to 32-bit userspace: cs = STAR.SYSRET_CS, | ||
169 | * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16, | ||
170 | * | ||
171 | * ss = STAR.SYSRET_CS+8 (in either case) | ||
172 | * | ||
173 | * thus USER_DS should be between 32-bit and 64-bit code selectors: | ||
140 | */ | 174 | */ |
175 | #define GDT_ENTRY_DEFAULT_USER32_CS 4 | ||
176 | #define GDT_ENTRY_DEFAULT_USER_DS 5 | ||
177 | #define GDT_ENTRY_DEFAULT_USER_CS 6 | ||
141 | 178 | ||
142 | /* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ | 179 | /* Needs two entries */ |
143 | #define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) | 180 | #define GDT_ENTRY_TSS 8 |
144 | 181 | /* Needs two entries */ | |
182 | #define GDT_ENTRY_LDT 10 | ||
145 | 183 | ||
146 | #else | 184 | #define GDT_ENTRY_TLS_MIN 12 |
147 | #include <asm/cache.h> | 185 | #define GDT_ENTRY_TLS_MAX 14 |
148 | |||
149 | #define GDT_ENTRY_KERNEL32_CS 1 | ||
150 | #define GDT_ENTRY_KERNEL_CS 2 | ||
151 | #define GDT_ENTRY_KERNEL_DS 3 | ||
152 | 186 | ||
153 | #define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8) | 187 | /* Abused to load per CPU data from limit */ |
188 | #define GDT_ENTRY_PER_CPU 15 | ||
154 | 189 | ||
155 | /* | 190 | /* |
156 | * we cannot use the same code segment descriptor for user and kernel | 191 | * Number of entries in the GDT table: |
157 | * -- not even in the long flat mode, because of different DPL /kkeil | ||
158 | * The segment offset needs to contain a RPL. Grr. -AK | ||
159 | * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) | ||
160 | */ | 192 | */ |
161 | #define GDT_ENTRY_DEFAULT_USER32_CS 4 | 193 | #define GDT_ENTRIES 16 |
162 | #define GDT_ENTRY_DEFAULT_USER_DS 5 | ||
163 | #define GDT_ENTRY_DEFAULT_USER_CS 6 | ||
164 | #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3) | ||
165 | #define __USER32_DS __USER_DS | ||
166 | |||
167 | #define GDT_ENTRY_TSS 8 /* needs two entries */ | ||
168 | #define GDT_ENTRY_LDT 10 /* needs two entries */ | ||
169 | #define GDT_ENTRY_TLS_MIN 12 | ||
170 | #define GDT_ENTRY_TLS_MAX 14 | ||
171 | |||
172 | #define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */ | ||
173 | #define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3) | ||
174 | 194 | ||
175 | /* TLS indexes for 64bit - hardcoded in arch_prctl */ | 195 | /* |
176 | #define FS_TLS 0 | 196 | * Segment selector values corresponding to the above entries: |
177 | #define GS_TLS 1 | 197 | * |
178 | 198 | * Note, selectors also need to have a correct RPL, | |
179 | #define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) | 199 | * expressed with the +3 value for user-space selectors: |
180 | #define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) | 200 | */ |
181 | 201 | #define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8) | |
182 | #define GDT_ENTRIES 16 | 202 | #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) |
203 | #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) | ||
204 | #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) | ||
205 | #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) | ||
206 | #define __USER32_DS __USER_DS | ||
207 | #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) | ||
208 | #define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3) | ||
209 | |||
210 | /* TLS indexes for 64-bit - hardcoded in arch_prctl(): */ | ||
211 | #define FS_TLS 0 | ||
212 | #define GS_TLS 1 | ||
213 | |||
214 | #define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) | ||
215 | #define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) | ||
183 | 216 | ||
184 | #endif | 217 | #endif |
185 | 218 | ||
186 | #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) | ||
187 | #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) | ||
188 | #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) | ||
189 | #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) | ||
190 | #ifndef CONFIG_PARAVIRT | 219 | #ifndef CONFIG_PARAVIRT |
191 | #define get_kernel_rpl() 0 | 220 | # define get_kernel_rpl() 0 |
192 | #endif | 221 | #endif |
193 | 222 | ||
194 | #define IDT_ENTRIES 256 | 223 | #define IDT_ENTRIES 256 |
195 | #define NUM_EXCEPTION_VECTORS 32 | 224 | #define NUM_EXCEPTION_VECTORS 32 |
196 | /* Bitmask of exception vectors which push an error code on the stack */ | 225 | |
197 | #define EXCEPTION_ERRCODE_MASK 0x00027d00 | 226 | /* Bitmask of exception vectors which push an error code on the stack: */ |
198 | #define GDT_SIZE (GDT_ENTRIES * 8) | 227 | #define EXCEPTION_ERRCODE_MASK 0x00027d00 |
199 | #define GDT_ENTRY_TLS_ENTRIES 3 | 228 | |
200 | #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) | 229 | #define GDT_SIZE (GDT_ENTRIES*8) |
230 | #define GDT_ENTRY_TLS_ENTRIES 3 | ||
231 | #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) | ||
201 | 232 | ||
202 | #ifdef __KERNEL__ | 233 | #ifdef __KERNEL__ |
203 | #ifndef __ASSEMBLY__ | 234 | #ifndef __ASSEMBLY__ |
235 | |||
204 | extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; | 236 | extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; |
205 | #ifdef CONFIG_TRACING | 237 | #ifdef CONFIG_TRACING |
206 | #define trace_early_idt_handlers early_idt_handlers | 238 | # define trace_early_idt_handlers early_idt_handlers |
207 | #endif | 239 | #endif |
208 | 240 | ||
209 | /* | 241 | /* |
@@ -228,37 +260,30 @@ do { \ | |||
228 | } while (0) | 260 | } while (0) |
229 | 261 | ||
230 | /* | 262 | /* |
231 | * Save a segment register away | 263 | * Save a segment register away: |
232 | */ | 264 | */ |
233 | #define savesegment(seg, value) \ | 265 | #define savesegment(seg, value) \ |
234 | asm("mov %%" #seg ",%0":"=r" (value) : : "memory") | 266 | asm("mov %%" #seg ",%0":"=r" (value) : : "memory") |
235 | 267 | ||
236 | /* | 268 | /* |
237 | * x86_32 user gs accessors. | 269 | * x86-32 user GS accessors: |
238 | */ | 270 | */ |
239 | #ifdef CONFIG_X86_32 | 271 | #ifdef CONFIG_X86_32 |
240 | #ifdef CONFIG_X86_32_LAZY_GS | 272 | # ifdef CONFIG_X86_32_LAZY_GS |
241 | #define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) | 273 | # define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; }) |
242 | #define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) | 274 | # define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) |
243 | #define task_user_gs(tsk) ((tsk)->thread.gs) | 275 | # define task_user_gs(tsk) ((tsk)->thread.gs) |
244 | #define lazy_save_gs(v) savesegment(gs, (v)) | 276 | # define lazy_save_gs(v) savesegment(gs, (v)) |
245 | #define lazy_load_gs(v) loadsegment(gs, (v)) | 277 | # define lazy_load_gs(v) loadsegment(gs, (v)) |
246 | #else /* X86_32_LAZY_GS */ | 278 | # else /* X86_32_LAZY_GS */ |
247 | #define get_user_gs(regs) (u16)((regs)->gs) | 279 | # define get_user_gs(regs) (u16)((regs)->gs) |
248 | #define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) | 280 | # define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) |
249 | #define task_user_gs(tsk) (task_pt_regs(tsk)->gs) | 281 | # define task_user_gs(tsk) (task_pt_regs(tsk)->gs) |
250 | #define lazy_save_gs(v) do { } while (0) | 282 | # define lazy_save_gs(v) do { } while (0) |
251 | #define lazy_load_gs(v) do { } while (0) | 283 | # define lazy_load_gs(v) do { } while (0) |
252 | #endif /* X86_32_LAZY_GS */ | 284 | # endif /* X86_32_LAZY_GS */ |
253 | #endif /* X86_32 */ | 285 | #endif /* X86_32 */ |
254 | 286 | ||
255 | static inline unsigned long get_limit(unsigned long segment) | ||
256 | { | ||
257 | unsigned long __limit; | ||
258 | asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); | ||
259 | return __limit + 1; | ||
260 | } | ||
261 | |||
262 | #endif /* !__ASSEMBLY__ */ | 287 | #endif /* !__ASSEMBLY__ */ |
263 | #endif /* __KERNEL__ */ | 288 | #endif /* __KERNEL__ */ |
264 | 289 | ||
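The rewritten selector block makes the encoding easy to check by hand: a selector is the descriptor index times 8, bit 2 is the table indicator (clear for GDT selectors), and the low two bits carry the RPL, which is 3 for the user-space selectors. A runnable sketch of that arithmetic using the 64-bit entry numbers listed above:

	#include <stdio.h>

	#define GDT_ENTRY_KERNEL_CS		2
	#define GDT_ENTRY_DEFAULT_USER_CS	6
	#define USER_RPL			3

	static unsigned int selector(unsigned int index, unsigned int rpl)
	{
		return index * 8 + rpl;	/* TI bit (0x4) left clear: GDT, not LDT */
	}

	int main(void)
	{
		printf("__KERNEL_CS = 0x%02x\n", selector(GDT_ENTRY_KERNEL_CS, 0));
		printf("__USER_CS   = 0x%02x\n", selector(GDT_ENTRY_DEFAULT_USER_CS, USER_RPL));
		return 0;
	}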
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ff4e7b236e21..f69e06b283fb 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h | |||
@@ -66,6 +66,11 @@ static inline void x86_ce4100_early_setup(void) { } | |||
66 | */ | 66 | */ |
67 | extern struct boot_params boot_params; | 67 | extern struct boot_params boot_params; |
68 | 68 | ||
69 | static inline bool kaslr_enabled(void) | ||
70 | { | ||
71 | return !!(boot_params.hdr.loadflags & KASLR_FLAG); | ||
72 | } | ||
73 | |||
69 | /* | 74 | /* |
70 | * Do NOT EVER look at the BIOS memory size location. | 75 | * Do NOT EVER look at the BIOS memory size location. |
71 | * It does not work on many machines. | 76 | * It does not work on many machines. |
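kaslr_enabled() is just a test of the new KASLR_FLAG bit (bit 1 of boot_params.hdr.loadflags, defined in the bootparam.h hunk further down), which the early boot code sets when it actually randomizes the kernel base. A trivial model of the check with an illustrative loadflags value:

	#include <stdbool.h>
	#include <stdio.h>

	#define KASLR_FLAG (1 << 1)

	int main(void)
	{
		unsigned char loadflags = 0x03;	/* illustrative: LOADED_HIGH | KASLR_FLAG */
		bool kaslr = loadflags & KASLR_FLAG;

		printf("kaslr_enabled: %d\n", kaslr);
		return 0;
	}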
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 9dfce4e0417d..6fe6b182c998 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h | |||
@@ -57,9 +57,9 @@ struct sigcontext { | |||
57 | unsigned long ip; | 57 | unsigned long ip; |
58 | unsigned long flags; | 58 | unsigned long flags; |
59 | unsigned short cs; | 59 | unsigned short cs; |
60 | unsigned short gs; | 60 | unsigned short __pad2; /* Was called gs, but was always zero. */ |
61 | unsigned short fs; | 61 | unsigned short __pad1; /* Was called fs, but was always zero. */ |
62 | unsigned short __pad0; | 62 | unsigned short ss; |
63 | unsigned long err; | 63 | unsigned long err; |
64 | unsigned long trapno; | 64 | unsigned long trapno; |
65 | unsigned long oldmask; | 65 | unsigned long oldmask; |
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 7a958164088c..89db46752a8f 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h | |||
@@ -13,9 +13,7 @@ | |||
13 | X86_EFLAGS_CF | X86_EFLAGS_RF) | 13 | X86_EFLAGS_CF | X86_EFLAGS_RF) |
14 | 14 | ||
15 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where); | 15 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where); |
16 | 16 | int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc); | |
17 | int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | ||
18 | unsigned long *pax); | ||
19 | int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, | 17 | int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, |
20 | struct pt_regs *regs, unsigned long mask); | 18 | struct pt_regs *regs, unsigned long mask); |
21 | 19 | ||
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8d3120f4e270..ba665ebd17bb 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h | |||
@@ -27,23 +27,11 @@ | |||
27 | 27 | ||
28 | #ifdef CONFIG_X86_SMAP | 28 | #ifdef CONFIG_X86_SMAP |
29 | 29 | ||
30 | #define ASM_CLAC \ | 30 | #define ASM_CLAC \ |
31 | 661: ASM_NOP3 ; \ | 31 | ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP |
32 | .pushsection .altinstr_replacement, "ax" ; \ | 32 | |
33 | 662: __ASM_CLAC ; \ | 33 | #define ASM_STAC \ |
34 | .popsection ; \ | 34 | ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP |
35 | .pushsection .altinstructions, "a" ; \ | ||
36 | altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ | ||
37 | .popsection | ||
38 | |||
39 | #define ASM_STAC \ | ||
40 | 661: ASM_NOP3 ; \ | ||
41 | .pushsection .altinstr_replacement, "ax" ; \ | ||
42 | 662: __ASM_STAC ; \ | ||
43 | .popsection ; \ | ||
44 | .pushsection .altinstructions, "a" ; \ | ||
45 | altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ | ||
46 | .popsection | ||
47 | 35 | ||
48 | #else /* CONFIG_X86_SMAP */ | 36 | #else /* CONFIG_X86_SMAP */ |
49 | 37 | ||
@@ -61,20 +49,20 @@ | |||
61 | static __always_inline void clac(void) | 49 | static __always_inline void clac(void) |
62 | { | 50 | { |
63 | /* Note: a barrier is implicit in alternative() */ | 51 | /* Note: a barrier is implicit in alternative() */ |
64 | alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); | 52 | alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); |
65 | } | 53 | } |
66 | 54 | ||
67 | static __always_inline void stac(void) | 55 | static __always_inline void stac(void) |
68 | { | 56 | { |
69 | /* Note: a barrier is implicit in alternative() */ | 57 | /* Note: a barrier is implicit in alternative() */ |
70 | alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); | 58 | alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); |
71 | } | 59 | } |
72 | 60 | ||
73 | /* These macros can be used in asm() statements */ | 61 | /* These macros can be used in asm() statements */ |
74 | #define ASM_CLAC \ | 62 | #define ASM_CLAC \ |
75 | ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) | 63 | ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) |
76 | #define ASM_STAC \ | 64 | #define ASM_STAC \ |
77 | ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) | 65 | ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP) |
78 | 66 | ||
79 | #else /* CONFIG_X86_SMAP */ | 67 | #else /* CONFIG_X86_SMAP */ |
80 | 68 | ||
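The open-coded .altinstructions sections are gone; ASM_CLAC/ASM_STAC and clac()/stac() now hand an empty "old" instruction to ALTERNATIVE and let the patching machinery pad with NOPs of the right length. Call sites are unchanged; roughly, a user-memory access is still bracketed like this (a hedged sketch assuming <asm/smap.h>, not the real uaccess code):

	/* sketch: SMAP bracketing around a user-memory access */
	static inline unsigned char read_user_byte(const unsigned char *uaddr)
	{
		unsigned char val;

		stac();		/* becomes a real STAC only if X86_FEATURE_SMAP is set */
		val = *uaddr;	/* the real code would use get_user()/__copy_from_user() */
		clac();		/* close the SMAP window again */
		return val;
	}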
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 8cd1cc3bc835..17a8dced12da 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h | |||
@@ -150,12 +150,13 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) | |||
150 | } | 150 | } |
151 | 151 | ||
152 | void cpu_disable_common(void); | 152 | void cpu_disable_common(void); |
153 | void cpu_die_common(unsigned int cpu); | ||
154 | void native_smp_prepare_boot_cpu(void); | 153 | void native_smp_prepare_boot_cpu(void); |
155 | void native_smp_prepare_cpus(unsigned int max_cpus); | 154 | void native_smp_prepare_cpus(unsigned int max_cpus); |
156 | void native_smp_cpus_done(unsigned int max_cpus); | 155 | void native_smp_cpus_done(unsigned int max_cpus); |
156 | void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); | ||
157 | int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); | 157 | int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); |
158 | int native_cpu_disable(void); | 158 | int native_cpu_disable(void); |
159 | int common_cpu_die(unsigned int cpu); | ||
159 | void native_cpu_die(unsigned int cpu); | 160 | void native_cpu_die(unsigned int cpu); |
160 | void native_play_dead(void); | 161 | void native_play_dead(void); |
161 | void play_dead_common(void); | 162 | void play_dead_common(void); |
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 6a4b00fafb00..aeb4666e0c0a 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h | |||
@@ -4,6 +4,8 @@ | |||
4 | 4 | ||
5 | #ifdef __KERNEL__ | 5 | #ifdef __KERNEL__ |
6 | 6 | ||
7 | #include <asm/nops.h> | ||
8 | |||
7 | static inline void native_clts(void) | 9 | static inline void native_clts(void) |
8 | { | 10 | { |
9 | asm volatile("clts"); | 11 | asm volatile("clts"); |
@@ -199,6 +201,28 @@ static inline void clflushopt(volatile void *__p) | |||
199 | "+m" (*(volatile char __force *)__p)); | 201 | "+m" (*(volatile char __force *)__p)); |
200 | } | 202 | } |
201 | 203 | ||
204 | static inline void clwb(volatile void *__p) | ||
205 | { | ||
206 | volatile struct { char x[64]; } *p = __p; | ||
207 | |||
208 | asm volatile(ALTERNATIVE_2( | ||
209 | ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])", | ||
210 | ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ | ||
211 | X86_FEATURE_CLFLUSHOPT, | ||
212 | ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ | ||
213 | X86_FEATURE_CLWB) | ||
214 | : [p] "+m" (*p) | ||
215 | : [pax] "a" (p)); | ||
216 | } | ||
217 | |||
218 | static inline void pcommit_sfence(void) | ||
219 | { | ||
220 | alternative(ASM_NOP7, | ||
221 | ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */ | ||
222 | "sfence", | ||
223 | X86_FEATURE_PCOMMIT); | ||
224 | } | ||
225 | |||
202 | #define nop() asm volatile ("nop") | 226 | #define nop() asm volatile ("nop") |
203 | 227 | ||
204 | 228 | ||
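clwb() writes a cache line back to memory while (on CPUs that support it) leaving the line cached, and pcommit_sfence() is intended to make such write-backs durable on persistent memory. A hedged sketch of the flush loop they are aimed at, assuming these helpers and a 64-byte cache line; the function name is illustrative:

	/* illustrative: write back every cache line covering a pmem buffer */
	static void flush_pmem_range(void *addr, unsigned long size)
	{
		unsigned long line = 64;	/* real code reads boot_cpu_data.x86_clflush_size */
		char *p = (char *)((unsigned long)addr & ~(line - 1));
		char *end = (char *)addr + size;

		for (; p < end; p += line)
			clwb(p);		/* CLWB, or the CLFLUSH/CLFLUSHOPT fallback, per line */
		pcommit_sfence();		/* fence, plus PCOMMIT where the CPU has it */
	}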
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1d4e4f279a32..b4bdec3e9523 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h | |||
@@ -13,19 +13,44 @@ | |||
13 | #include <asm/types.h> | 13 | #include <asm/types.h> |
14 | 14 | ||
15 | /* | 15 | /* |
16 | * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we | ||
17 | * reserve at the top of the kernel stack. We do it because of a nasty | ||
18 | * 32-bit corner case. On x86_32, the hardware stack frame is | ||
19 | * variable-length. Except for vm86 mode, struct pt_regs assumes a | ||
20 | * maximum-length frame. If we enter from CPL 0, the top 8 bytes of | ||
21 | * pt_regs don't actually exist. Ordinarily this doesn't matter, but it | ||
22 | * does in at least one case: | ||
23 | * | ||
24 | * If we take an NMI early enough in SYSENTER, then we can end up with | ||
25 | * pt_regs that extends above sp0. On the way out, in the espfix code, | ||
26 | * we can read the saved SS value, but that value will be above sp0. | ||
27 | * Without this offset, that can result in a page fault. (We are | ||
28 | * careful that, in this case, the value we read doesn't matter.) | ||
29 | * | ||
30 | * In vm86 mode, the hardware frame is much longer still, but we neither | ||
31 | * access the extra members from NMI context, nor do we write such a | ||
32 | * frame at sp0 at all. | ||
33 | * | ||
34 | * x86_64 has a fixed-length stack frame. | ||
35 | */ | ||
36 | #ifdef CONFIG_X86_32 | ||
37 | # define TOP_OF_KERNEL_STACK_PADDING 8 | ||
38 | #else | ||
39 | # define TOP_OF_KERNEL_STACK_PADDING 0 | ||
40 | #endif | ||
41 | |||
42 | /* | ||
16 | * low level task data that entry.S needs immediate access to | 43 | * low level task data that entry.S needs immediate access to |
17 | * - this struct should fit entirely inside of one cache line | 44 | * - this struct should fit entirely inside of one cache line |
18 | * - this struct shares the supervisor stack pages | 45 | * - this struct shares the supervisor stack pages |
19 | */ | 46 | */ |
20 | #ifndef __ASSEMBLY__ | 47 | #ifndef __ASSEMBLY__ |
21 | struct task_struct; | 48 | struct task_struct; |
22 | struct exec_domain; | ||
23 | #include <asm/processor.h> | 49 | #include <asm/processor.h> |
24 | #include <linux/atomic.h> | 50 | #include <linux/atomic.h> |
25 | 51 | ||
26 | struct thread_info { | 52 | struct thread_info { |
27 | struct task_struct *task; /* main task structure */ | 53 | struct task_struct *task; /* main task structure */ |
28 | struct exec_domain *exec_domain; /* execution domain */ | ||
29 | __u32 flags; /* low level flags */ | 54 | __u32 flags; /* low level flags */ |
30 | __u32 status; /* thread synchronous flags */ | 55 | __u32 status; /* thread synchronous flags */ |
31 | __u32 cpu; /* current CPU */ | 56 | __u32 cpu; /* current CPU */ |
@@ -39,7 +64,6 @@ struct thread_info { | |||
39 | #define INIT_THREAD_INFO(tsk) \ | 64 | #define INIT_THREAD_INFO(tsk) \ |
40 | { \ | 65 | { \ |
41 | .task = &tsk, \ | 66 | .task = &tsk, \ |
42 | .exec_domain = &default_exec_domain, \ | ||
43 | .flags = 0, \ | 67 | .flags = 0, \ |
44 | .cpu = 0, \ | 68 | .cpu = 0, \ |
45 | .saved_preempt_count = INIT_PREEMPT_COUNT, \ | 69 | .saved_preempt_count = INIT_PREEMPT_COUNT, \ |
@@ -145,7 +169,6 @@ struct thread_info { | |||
145 | #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) | 169 | #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) |
146 | 170 | ||
147 | #define STACK_WARN (THREAD_SIZE/8) | 171 | #define STACK_WARN (THREAD_SIZE/8) |
148 | #define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8)) | ||
149 | 172 | ||
150 | /* | 173 | /* |
151 | * macros/functions for gaining access to the thread information structure | 174 | * macros/functions for gaining access to the thread information structure |
@@ -158,10 +181,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); | |||
158 | 181 | ||
159 | static inline struct thread_info *current_thread_info(void) | 182 | static inline struct thread_info *current_thread_info(void) |
160 | { | 183 | { |
161 | struct thread_info *ti; | 184 | return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); |
162 | ti = (void *)(this_cpu_read_stable(kernel_stack) + | ||
163 | KERNEL_STACK_OFFSET - THREAD_SIZE); | ||
164 | return ti; | ||
165 | } | 185 | } |
166 | 186 | ||
167 | static inline unsigned long current_stack_pointer(void) | 187 | static inline unsigned long current_stack_pointer(void) |
@@ -177,16 +197,37 @@ static inline unsigned long current_stack_pointer(void) | |||
177 | 197 | ||
178 | #else /* !__ASSEMBLY__ */ | 198 | #else /* !__ASSEMBLY__ */ |
179 | 199 | ||
180 | /* how to get the thread information struct from ASM */ | 200 | /* Load thread_info address into "reg" */ |
181 | #define GET_THREAD_INFO(reg) \ | 201 | #define GET_THREAD_INFO(reg) \ |
182 | _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ | 202 | _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ |
183 | _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ; | 203 | _ASM_SUB $(THREAD_SIZE),reg ; |
184 | 204 | ||
185 | /* | 205 | /* |
186 | * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in | 206 | * ASM operand which evaluates to a 'thread_info' address of |
187 | * a certain register (to be used in assembler memory operands). | 207 | * the current task, if it is known that "reg" is exactly "off" |
208 | * bytes below the top of the stack currently. | ||
209 | * | ||
210 | * ( The kernel stack's size is known at build time, it is usually | ||
211 | * 2 or 4 pages, and the bottom of the kernel stack contains | ||
212 | * the thread_info structure. So to access the thread_info very | ||
213 | * quickly from assembly code we can calculate down from the | ||
214 | * top of the kernel stack to the bottom, using constant, | ||
215 | * build-time calculations only. ) | ||
216 | * | ||
217 | * For example, to fetch the current thread_info->flags value into %eax | ||
218 | * on x86-64 defconfig kernels, in syscall entry code where RSP is | ||
219 | * currently at exactly SIZEOF_PTREGS bytes away from the top of the | ||
220 | * stack: | ||
221 | * | ||
222 | * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax | ||
223 | * | ||
224 | * will translate to: | ||
225 | * | ||
226 | * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax | ||
227 | * | ||
228 | * which is below the current RSP by almost 16K. | ||
188 | */ | 229 | */ |
189 | #define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) | 230 | #define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) |
190 | 231 | ||
191 | #endif | 232 | #endif |
192 | 233 | ||
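A minimal userspace sketch of the displacement arithmetic behind ASM_THREAD_INFO(). The THREAD_SIZE, TI_FLAGS and SIZEOF_PTREGS constants below are assumptions chosen so the numbers line up with the worked example in the comment above; they are not taken from any particular kernel configuration:

    #include <stdio.h>
    #include <stdint.h>

    #define THREAD_SIZE     (4 * 4096) /* assumed: 4-page kernel stack */
    #define TI_FLAGS        0x10       /* assumed offset of thread_info.flags */
    #define SIZEOF_PTREGS   0xa8       /* assumed size of the saved pt_regs frame */

    /* ASM_THREAD_INFO(field, reg, off) folds to the constant (field) + (off) - THREAD_SIZE. */
    static int asm_thread_info_disp(int field, int off)
    {
        return field + off - THREAD_SIZE;
    }

    int main(void)
    {
        int d = asm_thread_info_disp(TI_FLAGS, SIZEOF_PTREGS);

        /* Prints -16200, i.e. disp32 0xffffc0b8 ("b8 c0 ff ff"), matching the example. */
        printf("displacement = %d (disp32 0x%08x)\n", d, (uint32_t)d);
        return 0;
    }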
@@ -236,6 +277,16 @@ static inline bool is_ia32_task(void) | |||
236 | #endif | 277 | #endif |
237 | return false; | 278 | return false; |
238 | } | 279 | } |
280 | |||
281 | /* | ||
282 | * Force syscall return via IRET by making it look as if there was | ||
283 | * some work pending. IRET is our most capable (but slowest) syscall | ||
284 | * return path, which is able to restore modified SS, CS and certain | ||
285 | * EFLAGS values that other (fast) syscall return instructions | ||
286 | * are not able to restore properly. | ||
287 | */ | ||
288 | #define force_iret() set_thread_flag(TIF_NOTIFY_RESUME) | ||
289 | |||
239 | #endif /* !__ASSEMBLY__ */ | 290 | #endif /* !__ASSEMBLY__ */ |
240 | 291 | ||
241 | #ifndef __ASSEMBLY__ | 292 | #ifndef __ASSEMBLY__ |
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 225b0988043a..ab456dc233b5 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h | |||
@@ -15,6 +15,7 @@ | |||
15 | 15 | ||
16 | /* loadflags */ | 16 | /* loadflags */ |
17 | #define LOADED_HIGH (1<<0) | 17 | #define LOADED_HIGH (1<<0) |
18 | #define KASLR_FLAG (1<<1) | ||
18 | #define QUIET_FLAG (1<<5) | 19 | #define QUIET_FLAG (1<<5) |
19 | #define KEEP_SEGMENTS (1<<6) | 20 | #define KEEP_SEGMENTS (1<<6) |
20 | #define CAN_USE_HEAP (1<<7) | 21 | #define CAN_USE_HEAP (1<<7) |
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index d993e33f5236..960a8a9dc4ab 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h | |||
@@ -33,6 +33,16 @@ | |||
33 | #define E820_NVS 4 | 33 | #define E820_NVS 4 |
34 | #define E820_UNUSABLE 5 | 34 | #define E820_UNUSABLE 5 |
35 | 35 | ||
36 | /* | ||
37 | * This is a non-standardized way to represent ADR or NVDIMM regions that | ||
38 | * persist over a reboot. The kernel will ignore their special capabilities | ||
39 | * unless the CONFIG_X86_PMEM_LEGACY=y option is set. | ||
40 | * | ||
41 | * ( Note that older platforms also used 6 for the same type of memory, | ||
42 | * but newer versions switched to 12 as 6 was assigned differently. Some | ||
43 | * day they will learn... ) | ||
44 | */ | ||
45 | #define E820_PRAM 12 | ||
36 | 46 | ||
37 | /* | 47 | /* |
38 | * reserved RAM used by kernel itself | 48 | * reserved RAM used by kernel itself |
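The E820_PRAM note above explains that legacy persistent-memory (ADR/NVDIMM) ranges are reported with type 12, while some older firmware reused type 6. A small userspace sketch of classifying e820 type codes with the values from this header; the sample input array is invented for illustration:

    #include <stdio.h>

    #define E820_RAM        1
    #define E820_RESERVED   2
    #define E820_ACPI       3
    #define E820_NVS        4
    #define E820_UNUSABLE   5
    #define E820_PRAM       12  /* non-standardized: legacy persistent memory */

    static const char *e820_type_name(unsigned int type)
    {
        switch (type) {
        case E820_RAM:       return "usable RAM";
        case E820_RESERVED:  return "reserved";
        case E820_ACPI:      return "ACPI data";
        case E820_NVS:       return "ACPI NVS";
        case E820_UNUSABLE:  return "unusable";
        case E820_PRAM:      return "persistent memory (legacy PRAM)";
        default:             return "unknown";
        }
    }

    int main(void)
    {
        /* Invented sample of type codes, as a firmware map might report them. */
        unsigned int types[] = { 1, 2, 12, 6 };
        unsigned int i;

        for (i = 0; i < sizeof(types) / sizeof(types[0]); i++)
            printf("type %2u -> %s\n", types[i], e820_type_name(types[i]));
        return 0;
    }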
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index fe01b0a784e7..c469490db4a8 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h | |||
@@ -77,6 +77,24 @@ | |||
77 | #define MSR_IA32_PERF_CAPABILITIES 0x00000345 | 77 | #define MSR_IA32_PERF_CAPABILITIES 0x00000345 |
78 | #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 | 78 | #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 |
79 | 79 | ||
80 | #define MSR_IA32_RTIT_CTL 0x00000570 | ||
81 | #define RTIT_CTL_TRACEEN BIT(0) | ||
82 | #define RTIT_CTL_OS BIT(2) | ||
83 | #define RTIT_CTL_USR BIT(3) | ||
84 | #define RTIT_CTL_CR3EN BIT(7) | ||
85 | #define RTIT_CTL_TOPA BIT(8) | ||
86 | #define RTIT_CTL_TSC_EN BIT(10) | ||
87 | #define RTIT_CTL_DISRETC BIT(11) | ||
88 | #define RTIT_CTL_BRANCH_EN BIT(13) | ||
89 | #define MSR_IA32_RTIT_STATUS 0x00000571 | ||
90 | #define RTIT_STATUS_CONTEXTEN BIT(1) | ||
91 | #define RTIT_STATUS_TRIGGEREN BIT(2) | ||
92 | #define RTIT_STATUS_ERROR BIT(4) | ||
93 | #define RTIT_STATUS_STOPPED BIT(5) | ||
94 | #define MSR_IA32_RTIT_CR3_MATCH 0x00000572 | ||
95 | #define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560 | ||
96 | #define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561 | ||
97 | |||
80 | #define MSR_MTRRfix64K_00000 0x00000250 | 98 | #define MSR_MTRRfix64K_00000 0x00000250 |
81 | #define MSR_MTRRfix16K_80000 0x00000258 | 99 | #define MSR_MTRRfix16K_80000 0x00000258 |
82 | #define MSR_MTRRfix16K_A0000 0x00000259 | 100 | #define MSR_MTRRfix16K_A0000 0x00000259 |
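The RTIT_* additions describe Intel PT's control MSR bit by bit. A minimal sketch of composing an MSR_IA32_RTIT_CTL value from those bits; this is plain userspace arithmetic, since actually writing MSR 0x570 needs ring 0 and PT-capable hardware, and the chosen bit combination is just an illustrative configuration:

    #include <stdio.h>
    #include <stdint.h>

    #define BIT(x)                  (1ULL << (x))

    #define RTIT_CTL_TRACEEN        BIT(0)
    #define RTIT_CTL_USR            BIT(3)
    #define RTIT_CTL_TOPA           BIT(8)
    #define RTIT_CTL_TSC_EN         BIT(10)
    #define RTIT_CTL_BRANCH_EN      BIT(13)

    int main(void)
    {
        /* Trace user-space branches into a ToPA buffer, with TSC packets. */
        uint64_t ctl = RTIT_CTL_TRACEEN | RTIT_CTL_USR | RTIT_CTL_TOPA |
                       RTIT_CTL_TSC_EN | RTIT_CTL_BRANCH_EN;

        printf("MSR_IA32_RTIT_CTL (0x570) = 0x%016llx\n",
               (unsigned long long)ctl);
        return 0;
    }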
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h index 7b0a55a88851..580aee3072e0 100644 --- a/arch/x86/include/uapi/asm/ptrace-abi.h +++ b/arch/x86/include/uapi/asm/ptrace-abi.h | |||
@@ -25,13 +25,17 @@ | |||
25 | #else /* __i386__ */ | 25 | #else /* __i386__ */ |
26 | 26 | ||
27 | #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) | 27 | #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) |
28 | /* | ||
29 | * C ABI says these regs are callee-preserved. They aren't saved on kernel entry | ||
30 | * unless syscall needs a complete, fully filled "struct pt_regs". | ||
31 | */ | ||
28 | #define R15 0 | 32 | #define R15 0 |
29 | #define R14 8 | 33 | #define R14 8 |
30 | #define R13 16 | 34 | #define R13 16 |
31 | #define R12 24 | 35 | #define R12 24 |
32 | #define RBP 32 | 36 | #define RBP 32 |
33 | #define RBX 40 | 37 | #define RBX 40 |
34 | /* arguments: interrupts/non tracing syscalls only save up to here*/ | 38 | /* These regs are callee-clobbered. Always saved on kernel entry. */ |
35 | #define R11 48 | 39 | #define R11 48 |
36 | #define R10 56 | 40 | #define R10 56 |
37 | #define R9 64 | 41 | #define R9 64 |
@@ -41,15 +45,17 @@ | |||
41 | #define RDX 96 | 45 | #define RDX 96 |
42 | #define RSI 104 | 46 | #define RSI 104 |
43 | #define RDI 112 | 47 | #define RDI 112 |
44 | #define ORIG_RAX 120 /* = ERROR */ | 48 | /* |
45 | /* end of arguments */ | 49 | * On syscall entry, this is syscall#. On CPU exception, this is error code. |
46 | /* cpu exception frame or undefined in case of fast syscall. */ | 50 | * On hw interrupt, it's IRQ number: |
51 | */ | ||
52 | #define ORIG_RAX 120 | ||
53 | /* Return frame for iretq */ | ||
47 | #define RIP 128 | 54 | #define RIP 128 |
48 | #define CS 136 | 55 | #define CS 136 |
49 | #define EFLAGS 144 | 56 | #define EFLAGS 144 |
50 | #define RSP 152 | 57 | #define RSP 152 |
51 | #define SS 160 | 58 | #define SS 160 |
52 | #define ARGOFFSET R11 | ||
53 | #endif /* __ASSEMBLY__ */ | 59 | #endif /* __ASSEMBLY__ */ |
54 | 60 | ||
55 | /* top of stack page */ | 61 | /* top of stack page */ |
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h index ac4b9aa4d999..bc16115af39b 100644 --- a/arch/x86/include/uapi/asm/ptrace.h +++ b/arch/x86/include/uapi/asm/ptrace.h | |||
@@ -41,13 +41,17 @@ struct pt_regs { | |||
41 | #ifndef __KERNEL__ | 41 | #ifndef __KERNEL__ |
42 | 42 | ||
43 | struct pt_regs { | 43 | struct pt_regs { |
44 | /* | ||
45 | * C ABI says these regs are callee-preserved. They aren't saved on kernel entry | ||
46 | * unless syscall needs a complete, fully filled "struct pt_regs". | ||
47 | */ | ||
44 | unsigned long r15; | 48 | unsigned long r15; |
45 | unsigned long r14; | 49 | unsigned long r14; |
46 | unsigned long r13; | 50 | unsigned long r13; |
47 | unsigned long r12; | 51 | unsigned long r12; |
48 | unsigned long rbp; | 52 | unsigned long rbp; |
49 | unsigned long rbx; | 53 | unsigned long rbx; |
50 | /* arguments: non interrupts/non tracing syscalls only save up to here*/ | 54 | /* These regs are callee-clobbered. Always saved on kernel entry. */ |
51 | unsigned long r11; | 55 | unsigned long r11; |
52 | unsigned long r10; | 56 | unsigned long r10; |
53 | unsigned long r9; | 57 | unsigned long r9; |
@@ -57,9 +61,12 @@ struct pt_regs { | |||
57 | unsigned long rdx; | 61 | unsigned long rdx; |
58 | unsigned long rsi; | 62 | unsigned long rsi; |
59 | unsigned long rdi; | 63 | unsigned long rdi; |
64 | /* | ||
65 | * On syscall entry, this is syscall#. On CPU exception, this is error code. | ||
66 | * On hw interrupt, it's IRQ number: | ||
67 | */ | ||
60 | unsigned long orig_rax; | 68 | unsigned long orig_rax; |
61 | /* end of arguments */ | 69 | /* Return frame for iretq */ |
62 | /* cpu exception frame or undefined */ | ||
63 | unsigned long rip; | 70 | unsigned long rip; |
64 | unsigned long cs; | 71 | unsigned long cs; |
65 | unsigned long eflags; | 72 | unsigned long eflags; |
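The comments added above split the 64-bit pt_regs into the callee-preserved block, the always-saved callee-clobbered block, orig_rax and the iretq return frame. A userspace mirror of that layout with offset checks, assuming an LP64 build where unsigned long is 8 bytes; the asserted constants are the ORIG_RAX/RIP values from ptrace-abi.h above:

    #include <stdio.h>
    #include <stddef.h>
    #include <assert.h>

    struct pt_regs_mirror {
        /* Callee-preserved; only saved when a full pt_regs is needed. */
        unsigned long r15, r14, r13, r12, rbp, rbx;
        /* Callee-clobbered; always saved on kernel entry. */
        unsigned long r11, r10, r9, r8, rax, rcx, rdx, rsi, rdi;
        /* Syscall number, error code or IRQ number, depending on entry type. */
        unsigned long orig_rax;
        /* Return frame for iretq. */
        unsigned long rip, cs, eflags, rsp, ss;
    };

    int main(void)
    {
        assert(offsetof(struct pt_regs_mirror, orig_rax) == 120); /* ORIG_RAX */
        assert(offsetof(struct pt_regs_mirror, rip) == 128);      /* RIP */
        printf("orig_rax at %zu, iret frame starts at %zu\n",
               offsetof(struct pt_regs_mirror, orig_rax),
               offsetof(struct pt_regs_mirror, rip));
        return 0;
    }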
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d8b9f9081e86..16dc4e8a2cd3 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h | |||
@@ -177,9 +177,24 @@ struct sigcontext { | |||
177 | __u64 rip; | 177 | __u64 rip; |
178 | __u64 eflags; /* RFLAGS */ | 178 | __u64 eflags; /* RFLAGS */ |
179 | __u16 cs; | 179 | __u16 cs; |
180 | __u16 gs; | 180 | |
181 | __u16 fs; | 181 | /* |
182 | __u16 __pad0; | 182 | * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), |
183 | * Linux saved and restored fs and gs in these slots. This | ||
184 | * was counterproductive, as fsbase and gsbase were never | ||
185 | * saved, so arch_prctl was presumably unreliable. | ||
186 | * | ||
187 | * If these slots are ever needed for any other purpose, there | ||
188 | * is some risk that very old 64-bit binaries could get | ||
189 | * confused. I doubt that many such binaries still work, | ||
190 | * though, since the same patch in 2.5.64 also removed the | ||
191 | * 64-bit set_thread_area syscall, so it appears that there is | ||
192 | * no TLS API that works in both pre- and post-2.5.64 kernels. | ||
193 | */ | ||
194 | __u16 __pad2; /* Was gs. */ | ||
195 | __u16 __pad1; /* Was fs. */ | ||
196 | |||
197 | __u16 ss; | ||
183 | __u64 err; | 198 | __u64 err; |
184 | __u64 trapno; | 199 | __u64 trapno; |
185 | __u64 oldmask; | 200 | __u64 oldmask; |
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index c5f1a1deb91a..1fe92181ee9e 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h | |||
@@ -67,6 +67,7 @@ | |||
67 | #define EXIT_REASON_EPT_VIOLATION 48 | 67 | #define EXIT_REASON_EPT_VIOLATION 48 |
68 | #define EXIT_REASON_EPT_MISCONFIG 49 | 68 | #define EXIT_REASON_EPT_MISCONFIG 49 |
69 | #define EXIT_REASON_INVEPT 50 | 69 | #define EXIT_REASON_INVEPT 50 |
70 | #define EXIT_REASON_RDTSCP 51 | ||
70 | #define EXIT_REASON_PREEMPTION_TIMER 52 | 71 | #define EXIT_REASON_PREEMPTION_TIMER 52 |
71 | #define EXIT_REASON_INVVPID 53 | 72 | #define EXIT_REASON_INVVPID 53 |
72 | #define EXIT_REASON_WBINVD 54 | 73 | #define EXIT_REASON_WBINVD 54 |
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cdb1b70ddad0..9bcd0b56ca17 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o | |||
32 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o | 32 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o |
33 | obj-$(CONFIG_X86_64) += mcount_64.o | 33 | obj-$(CONFIG_X86_64) += mcount_64.o |
34 | obj-y += syscall_$(BITS).o vsyscall_gtod.o | 34 | obj-y += syscall_$(BITS).o vsyscall_gtod.o |
35 | obj-$(CONFIG_IA32_EMULATION) += syscall_32.o | ||
35 | obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o | 36 | obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o |
36 | obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o | 37 | obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o |
37 | obj-$(CONFIG_SYSFS) += ksysfs.o | 38 | obj-$(CONFIG_SYSFS) += ksysfs.o |
@@ -94,6 +95,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o | |||
94 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o | 95 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o |
95 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o | 96 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o |
96 | obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o | 97 | obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o |
98 | obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o | ||
97 | 99 | ||
98 | obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o | 100 | obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o |
99 | 101 | ||
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 703130f469ec..aef653193160 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c | |||
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str) | |||
52 | __setup("noreplace-paravirt", setup_noreplace_paravirt); | 52 | __setup("noreplace-paravirt", setup_noreplace_paravirt); |
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | #define DPRINTK(fmt, ...) \ | 55 | #define DPRINTK(fmt, args...) \ |
56 | do { \ | 56 | do { \ |
57 | if (debug_alternative) \ | 57 | if (debug_alternative) \ |
58 | printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ | 58 | printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ |
59 | } while (0) | ||
60 | |||
61 | #define DUMP_BYTES(buf, len, fmt, args...) \ | ||
62 | do { \ | ||
63 | if (unlikely(debug_alternative)) { \ | ||
64 | int j; \ | ||
65 | \ | ||
66 | if (!(len)) \ | ||
67 | break; \ | ||
68 | \ | ||
69 | printk(KERN_DEBUG fmt, ##args); \ | ||
70 | for (j = 0; j < (len) - 1; j++) \ | ||
71 | printk(KERN_CONT "%02hhx ", buf[j]); \ | ||
72 | printk(KERN_CONT "%02hhx\n", buf[j]); \ | ||
73 | } \ | ||
59 | } while (0) | 74 | } while (0) |
60 | 75 | ||
61 | /* | 76 | /* |
@@ -243,12 +258,89 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; | |||
243 | extern s32 __smp_locks[], __smp_locks_end[]; | 258 | extern s32 __smp_locks[], __smp_locks_end[]; |
244 | void *text_poke_early(void *addr, const void *opcode, size_t len); | 259 | void *text_poke_early(void *addr, const void *opcode, size_t len); |
245 | 260 | ||
246 | /* Replace instructions with better alternatives for this CPU type. | 261 | /* |
247 | This runs before SMP is initialized to avoid SMP problems with | 262 | * Are we looking at a near JMP with a 1- or 4-byte displacement? |
248 | self modifying code. This implies that asymmetric systems where | 263 | */ |
249 | APs have less capabilities than the boot processor are not handled. | 264 | static inline bool is_jmp(const u8 opcode) |
250 | Tough. Make sure you disable such features by hand. */ | 265 | { |
266 | return opcode == 0xeb || opcode == 0xe9; | ||
267 | } | ||
268 | |||
269 | static void __init_or_module | ||
270 | recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) | ||
271 | { | ||
272 | u8 *next_rip, *tgt_rip; | ||
273 | s32 n_dspl, o_dspl; | ||
274 | int repl_len; | ||
275 | |||
276 | if (a->replacementlen != 5) | ||
277 | return; | ||
278 | |||
279 | o_dspl = *(s32 *)(insnbuf + 1); | ||
280 | |||
281 | /* next_rip of the replacement JMP */ | ||
282 | next_rip = repl_insn + a->replacementlen; | ||
283 | /* target rip of the replacement JMP */ | ||
284 | tgt_rip = next_rip + o_dspl; | ||
285 | n_dspl = tgt_rip - orig_insn; | ||
286 | |||
287 | DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); | ||
288 | |||
289 | if (tgt_rip - orig_insn >= 0) { | ||
290 | if (n_dspl - 2 <= 127) | ||
291 | goto two_byte_jmp; | ||
292 | else | ||
293 | goto five_byte_jmp; | ||
294 | /* negative offset */ | ||
295 | } else { | ||
296 | if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) | ||
297 | goto two_byte_jmp; | ||
298 | else | ||
299 | goto five_byte_jmp; | ||
300 | } | ||
301 | |||
302 | two_byte_jmp: | ||
303 | n_dspl -= 2; | ||
304 | |||
305 | insnbuf[0] = 0xeb; | ||
306 | insnbuf[1] = (s8)n_dspl; | ||
307 | add_nops(insnbuf + 2, 3); | ||
308 | |||
309 | repl_len = 2; | ||
310 | goto done; | ||
311 | |||
312 | five_byte_jmp: | ||
313 | n_dspl -= 5; | ||
314 | |||
315 | insnbuf[0] = 0xe9; | ||
316 | *(s32 *)&insnbuf[1] = n_dspl; | ||
251 | 317 | ||
318 | repl_len = 5; | ||
319 | |||
320 | done: | ||
321 | |||
322 | DPRINTK("final displ: 0x%08x, JMP 0x%lx", | ||
323 | n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); | ||
324 | } | ||
325 | |||
326 | static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) | ||
327 | { | ||
328 | if (instr[0] != 0x90) | ||
329 | return; | ||
330 | |||
331 | add_nops(instr + (a->instrlen - a->padlen), a->padlen); | ||
332 | |||
333 | DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", | ||
334 | instr, a->instrlen - a->padlen, a->padlen); | ||
335 | } | ||
336 | |||
337 | /* | ||
338 | * Replace instructions with better alternatives for this CPU type. This runs | ||
339 | * before SMP is initialized to avoid SMP problems with self modifying code. | ||
340 | * This implies that asymmetric systems where APs have less capabilities than | ||
341 | * the boot processor are not handled. Tough. Make sure you disable such | ||
342 | * features by hand. | ||
343 | */ | ||
252 | void __init_or_module apply_alternatives(struct alt_instr *start, | 344 | void __init_or_module apply_alternatives(struct alt_instr *start, |
253 | struct alt_instr *end) | 345 | struct alt_instr *end) |
254 | { | 346 | { |
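A userspace sketch of the displacement handling that recompute_jump() above performs: when a 5-byte JMP written for the replacement site is copied to the original site, rebase its rel32 target, then emit the 2-byte EB rel8 form if the displacement fits in a signed byte and the 5-byte E9 rel32 form otherwise. Addresses and buffer contents are invented, and the range check is the simplified intent rather than a byte-for-byte copy of the kernel code:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /*
     * Re-encode a 5-byte "E9 rel32" JMP that was assembled for repl_addr so
     * that it still reaches the same target when placed at orig_addr.
     * Fills out[5] (padded with 0x90 NOPs for the short form) and returns
     * the new instruction length, 2 or 5.  Little-endian, as on x86.
     */
    static int rebase_jmp(uint8_t *out, const uint8_t *repl_insn,
                          uintptr_t repl_addr, uintptr_t orig_addr)
    {
        int32_t o_dspl, n_dspl;
        uintptr_t target;

        memcpy(&o_dspl, repl_insn + 1, sizeof(o_dspl));
        target = repl_addr + 5 + o_dspl;          /* next RIP + old displacement */
        n_dspl = (int32_t)(target - orig_addr);   /* displacement from orig_addr */

        if (n_dspl - 2 >= -128 && n_dspl - 2 <= 127) {
            out[0] = 0xeb;                        /* JMP rel8 */
            out[1] = (uint8_t)(n_dspl - 2);
            memset(out + 2, 0x90, 3);             /* pad with NOPs */
            return 2;
        }
        out[0] = 0xe9;                            /* JMP rel32 */
        n_dspl -= 5;
        memcpy(out + 1, &n_dspl, sizeof(n_dspl));
        return 5;
    }

    int main(void)
    {
        /* Invented layout: replacement copy lives 0x40 bytes after the original. */
        uintptr_t orig = 0x1000, repl = 0x1040;
        uint8_t jmp_at_repl[5] = { 0xe9, 0x10, 0x00, 0x00, 0x00 }; /* JMP +0x10 */
        uint8_t out[5];
        int len = rebase_jmp(out, jmp_at_repl, repl, orig);

        /* Prints "2-byte JMP: eb 53" for the values above. */
        printf("%d-byte JMP: %02x %02x\n", len, out[0], out[1]);
        return 0;
    }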
@@ -256,10 +348,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start, | |||
256 | u8 *instr, *replacement; | 348 | u8 *instr, *replacement; |
257 | u8 insnbuf[MAX_PATCH_LEN]; | 349 | u8 insnbuf[MAX_PATCH_LEN]; |
258 | 350 | ||
259 | DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); | 351 | DPRINTK("alt table %p -> %p", start, end); |
260 | /* | 352 | /* |
261 | * The scan order should be from start to end. A later scanned | 353 | * The scan order should be from start to end. A later scanned |
262 | * alternative code can overwrite a previous scanned alternative code. | 354 | * alternative code can overwrite previously scanned alternative code. |
263 | * Some kernel functions (e.g. memcpy, memset, etc) use this order to | 355 | * Some kernel functions (e.g. memcpy, memset, etc) use this order to |
264 | * patch code. | 356 | * patch code. |
265 | * | 357 | * |
@@ -267,29 +359,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start, | |||
267 | * order. | 359 | * order. |
268 | */ | 360 | */ |
269 | for (a = start; a < end; a++) { | 361 | for (a = start; a < end; a++) { |
362 | int insnbuf_sz = 0; | ||
363 | |||
270 | instr = (u8 *)&a->instr_offset + a->instr_offset; | 364 | instr = (u8 *)&a->instr_offset + a->instr_offset; |
271 | replacement = (u8 *)&a->repl_offset + a->repl_offset; | 365 | replacement = (u8 *)&a->repl_offset + a->repl_offset; |
272 | BUG_ON(a->replacementlen > a->instrlen); | ||
273 | BUG_ON(a->instrlen > sizeof(insnbuf)); | 366 | BUG_ON(a->instrlen > sizeof(insnbuf)); |
274 | BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); | 367 | BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); |
275 | if (!boot_cpu_has(a->cpuid)) | 368 | if (!boot_cpu_has(a->cpuid)) { |
369 | if (a->padlen > 1) | ||
370 | optimize_nops(a, instr); | ||
371 | |||
276 | continue; | 372 | continue; |
373 | } | ||
374 | |||
375 | DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d", | ||
376 | a->cpuid >> 5, | ||
377 | a->cpuid & 0x1f, | ||
378 | instr, a->instrlen, | ||
379 | replacement, a->replacementlen, a->padlen); | ||
380 | |||
381 | DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); | ||
382 | DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); | ||
277 | 383 | ||
278 | memcpy(insnbuf, replacement, a->replacementlen); | 384 | memcpy(insnbuf, replacement, a->replacementlen); |
385 | insnbuf_sz = a->replacementlen; | ||
279 | 386 | ||
280 | /* 0xe8 is a relative jump; fix the offset. */ | 387 | /* 0xe8 is a relative jump; fix the offset. */ |
281 | if (*insnbuf == 0xe8 && a->replacementlen == 5) | 388 | if (*insnbuf == 0xe8 && a->replacementlen == 5) { |
282 | *(s32 *)(insnbuf + 1) += replacement - instr; | 389 | *(s32 *)(insnbuf + 1) += replacement - instr; |
390 | DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", | ||
391 | *(s32 *)(insnbuf + 1), | ||
392 | (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); | ||
393 | } | ||
394 | |||
395 | if (a->replacementlen && is_jmp(replacement[0])) | ||
396 | recompute_jump(a, instr, replacement, insnbuf); | ||
283 | 397 | ||
284 | add_nops(insnbuf + a->replacementlen, | 398 | if (a->instrlen > a->replacementlen) { |
285 | a->instrlen - a->replacementlen); | 399 | add_nops(insnbuf + a->replacementlen, |
400 | a->instrlen - a->replacementlen); | ||
401 | insnbuf_sz += a->instrlen - a->replacementlen; | ||
402 | } | ||
403 | DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); | ||
286 | 404 | ||
287 | text_poke_early(instr, insnbuf, a->instrlen); | 405 | text_poke_early(instr, insnbuf, insnbuf_sz); |
288 | } | 406 | } |
289 | } | 407 | } |
290 | 408 | ||
291 | #ifdef CONFIG_SMP | 409 | #ifdef CONFIG_SMP |
292 | |||
293 | static void alternatives_smp_lock(const s32 *start, const s32 *end, | 410 | static void alternatives_smp_lock(const s32 *start, const s32 *end, |
294 | u8 *text, u8 *text_end) | 411 | u8 *text, u8 *text_end) |
295 | { | 412 | { |
@@ -371,8 +488,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, | |||
371 | smp->locks_end = locks_end; | 488 | smp->locks_end = locks_end; |
372 | smp->text = text; | 489 | smp->text = text; |
373 | smp->text_end = text_end; | 490 | smp->text_end = text_end; |
374 | DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", | 491 | DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", |
375 | __func__, smp->locks, smp->locks_end, | 492 | smp->locks, smp->locks_end, |
376 | smp->text, smp->text_end, smp->name); | 493 | smp->text, smp->text_end, smp->name); |
377 | 494 | ||
378 | list_add_tail(&smp->next, &smp_alt_modules); | 495 | list_add_tail(&smp->next, &smp_alt_modules); |
@@ -440,7 +557,7 @@ int alternatives_text_reserved(void *start, void *end) | |||
440 | 557 | ||
441 | return 0; | 558 | return 0; |
442 | } | 559 | } |
443 | #endif | 560 | #endif /* CONFIG_SMP */ |
444 | 561 | ||
445 | #ifdef CONFIG_PARAVIRT | 562 | #ifdef CONFIG_PARAVIRT |
446 | void __init_or_module apply_paravirt(struct paravirt_patch_site *start, | 563 | void __init_or_module apply_paravirt(struct paravirt_patch_site *start, |
@@ -601,7 +718,7 @@ int poke_int3_handler(struct pt_regs *regs) | |||
601 | if (likely(!bp_patching_in_progress)) | 718 | if (likely(!bp_patching_in_progress)) |
602 | return 0; | 719 | return 0; |
603 | 720 | ||
604 | if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) | 721 | if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr) |
605 | return 0; | 722 | return 0; |
606 | 723 | ||
607 | /* set up the specified breakpoint handler */ | 724 | /* set up the specified breakpoint handler */ |
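The DUMP_BYTES() helper added near the top of this file prints the old, replacement and final instruction bytes when the kernel boots with debug-alternative. A plain userspace sketch of the same dump pattern (prefix, space-separated hex bytes, trailing newline); the buffer contents are invented:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    static void dump_bytes(const uint8_t *buf, size_t len, const char *prefix)
    {
        size_t j;

        if (!len)
            return;

        printf("%s", prefix);
        for (j = 0; j < len - 1; j++)
            printf("%02hhx ", buf[j]);
        printf("%02hhx\n", buf[j]);
    }

    int main(void)
    {
        /* Invented bytes: a 5-byte CALL followed by two NOPs of padding. */
        uint8_t insnbuf[] = { 0xe8, 0x12, 0x34, 0x56, 0x78, 0x90, 0x90 };

        dump_bytes(insnbuf, sizeof(insnbuf), "final_insn: ");
        return 0;
    }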
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ad3639ae1b9b..dcb52850a28f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
@@ -1084,67 +1084,6 @@ void lapic_shutdown(void) | |||
1084 | local_irq_restore(flags); | 1084 | local_irq_restore(flags); |
1085 | } | 1085 | } |
1086 | 1086 | ||
1087 | /* | ||
1088 | * This is to verify that we're looking at a real local APIC. | ||
1089 | * Check these against your board if the CPUs aren't getting | ||
1090 | * started for no apparent reason. | ||
1091 | */ | ||
1092 | int __init verify_local_APIC(void) | ||
1093 | { | ||
1094 | unsigned int reg0, reg1; | ||
1095 | |||
1096 | /* | ||
1097 | * The version register is read-only in a real APIC. | ||
1098 | */ | ||
1099 | reg0 = apic_read(APIC_LVR); | ||
1100 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); | ||
1101 | apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); | ||
1102 | reg1 = apic_read(APIC_LVR); | ||
1103 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); | ||
1104 | |||
1105 | /* | ||
1106 | * The two version reads above should print the same | ||
1107 | * numbers. If the second one is different, then we | ||
1108 | * poke at a non-APIC. | ||
1109 | */ | ||
1110 | if (reg1 != reg0) | ||
1111 | return 0; | ||
1112 | |||
1113 | /* | ||
1114 | * Check if the version looks reasonable. | ||
1115 | */ | ||
1116 | reg1 = GET_APIC_VERSION(reg0); | ||
1117 | if (reg1 == 0x00 || reg1 == 0xff) | ||
1118 | return 0; | ||
1119 | reg1 = lapic_get_maxlvt(); | ||
1120 | if (reg1 < 0x02 || reg1 == 0xff) | ||
1121 | return 0; | ||
1122 | |||
1123 | /* | ||
1124 | * The ID register is read/write in a real APIC. | ||
1125 | */ | ||
1126 | reg0 = apic_read(APIC_ID); | ||
1127 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); | ||
1128 | apic_write(APIC_ID, reg0 ^ apic->apic_id_mask); | ||
1129 | reg1 = apic_read(APIC_ID); | ||
1130 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); | ||
1131 | apic_write(APIC_ID, reg0); | ||
1132 | if (reg1 != (reg0 ^ apic->apic_id_mask)) | ||
1133 | return 0; | ||
1134 | |||
1135 | /* | ||
1136 | * The next two are just to see if we have sane values. | ||
1137 | * They're only really relevant if we're in Virtual Wire | ||
1138 | * compatibility mode, but most boxes aren't anymore. | ||
1139 | */ | ||
1140 | reg0 = apic_read(APIC_LVT0); | ||
1141 | apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); | ||
1142 | reg1 = apic_read(APIC_LVT1); | ||
1143 | apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); | ||
1144 | |||
1145 | return 1; | ||
1146 | } | ||
1147 | |||
1148 | /** | 1087 | /** |
1149 | * sync_Arb_IDs - synchronize APIC bus arbitration IDs | 1088 | * sync_Arb_IDs - synchronize APIC bus arbitration IDs |
1150 | */ | 1089 | */ |
@@ -2283,7 +2222,6 @@ int __init APIC_init_uniprocessor(void) | |||
2283 | disable_ioapic_support(); | 2222 | disable_ioapic_support(); |
2284 | 2223 | ||
2285 | default_setup_apic_routing(); | 2224 | default_setup_apic_routing(); |
2286 | verify_local_APIC(); | ||
2287 | apic_bsp_setup(true); | 2225 | apic_bsp_setup(true); |
2288 | return 0; | 2226 | return 0; |
2289 | } | 2227 | } |
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index e658f21681c8..d9d0bd2faaf4 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c | |||
@@ -135,12 +135,12 @@ static void init_x2apic_ldr(void) | |||
135 | 135 | ||
136 | per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); | 136 | per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); |
137 | 137 | ||
138 | __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); | 138 | cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); |
139 | for_each_online_cpu(cpu) { | 139 | for_each_online_cpu(cpu) { |
140 | if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) | 140 | if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) |
141 | continue; | 141 | continue; |
142 | __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu)); | 142 | cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); |
143 | __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu)); | 143 | cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); |
144 | } | 144 | } |
145 | } | 145 | } |
146 | 146 | ||
@@ -195,7 +195,7 @@ static int x2apic_init_cpu_notifier(void) | |||
195 | 195 | ||
196 | BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); | 196 | BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); |
197 | 197 | ||
198 | __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu)); | 198 | cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); |
199 | register_hotcpu_notifier(&x2apic_cpu_notifier); | 199 | register_hotcpu_notifier(&x2apic_cpu_notifier); |
200 | return 1; | 200 | return 1; |
201 | } | 201 | } |
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 8e9dcfd630e4..c8d92950bc04 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c | |||
@@ -144,33 +144,60 @@ static void __init uv_set_apicid_hibit(void) | |||
144 | 144 | ||
145 | static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 145 | static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
146 | { | 146 | { |
147 | int pnodeid, is_uv1, is_uv2, is_uv3; | 147 | int pnodeid; |
148 | 148 | int uv_apic; | |
149 | is_uv1 = !strcmp(oem_id, "SGI"); | 149 | |
150 | is_uv2 = !strcmp(oem_id, "SGI2"); | 150 | if (strncmp(oem_id, "SGI", 3) != 0) |
151 | is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */ | 151 | return 0; |
152 | if (is_uv1 || is_uv2 || is_uv3) { | 152 | |
153 | uv_hub_info->hub_revision = | 153 | /* |
154 | (is_uv1 ? UV1_HUB_REVISION_BASE : | 154 | * Determine UV arch type. |
155 | (is_uv2 ? UV2_HUB_REVISION_BASE : | 155 | * SGI: UV100/1000 |
156 | UV3_HUB_REVISION_BASE)); | 156 | * SGI2: UV2000/3000 |
157 | pnodeid = early_get_pnodeid(); | 157 | * SGI3: UV300 (truncated to 4 chars because of different varieties) |
158 | early_get_apic_pnode_shift(); | 158 | */ |
159 | x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; | 159 | uv_hub_info->hub_revision = |
160 | x86_platform.nmi_init = uv_nmi_init; | 160 | !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE : |
161 | if (!strcmp(oem_table_id, "UVL")) | 161 | !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE : |
162 | uv_system_type = UV_LEGACY_APIC; | 162 | !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0; |
163 | else if (!strcmp(oem_table_id, "UVX")) | 163 | |
164 | uv_system_type = UV_X2APIC; | 164 | if (uv_hub_info->hub_revision == 0) |
165 | else if (!strcmp(oem_table_id, "UVH")) { | 165 | goto badbios; |
166 | __this_cpu_write(x2apic_extra_bits, | 166 | |
167 | pnodeid << uvh_apicid.s.pnode_shift); | 167 | pnodeid = early_get_pnodeid(); |
168 | uv_system_type = UV_NON_UNIQUE_APIC; | 168 | early_get_apic_pnode_shift(); |
169 | uv_set_apicid_hibit(); | 169 | x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; |
170 | return 1; | 170 | x86_platform.nmi_init = uv_nmi_init; |
171 | } | 171 | |
172 | if (!strcmp(oem_table_id, "UVX")) { /* most common */ | ||
173 | uv_system_type = UV_X2APIC; | ||
174 | uv_apic = 0; | ||
175 | |||
176 | } else if (!strcmp(oem_table_id, "UVH")) { /* only UV1 systems */ | ||
177 | uv_system_type = UV_NON_UNIQUE_APIC; | ||
178 | __this_cpu_write(x2apic_extra_bits, | ||
179 | pnodeid << uvh_apicid.s.pnode_shift); | ||
180 | uv_set_apicid_hibit(); | ||
181 | uv_apic = 1; | ||
182 | |||
183 | } else if (!strcmp(oem_table_id, "UVL")) { /* only used for */ | ||
184 | uv_system_type = UV_LEGACY_APIC; /* very small systems */ | ||
185 | uv_apic = 0; | ||
186 | |||
187 | } else { | ||
188 | goto badbios; | ||
172 | } | 189 | } |
173 | return 0; | 190 | |
191 | pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", | ||
192 | oem_id, oem_table_id, uv_system_type, | ||
193 | uv_min_hub_revision_id, uv_apic); | ||
194 | |||
195 | return uv_apic; | ||
196 | |||
197 | badbios: | ||
198 | pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id); | ||
199 | pr_err("Current BIOS not supported, update kernel and/or BIOS\n"); | ||
200 | BUG(); | ||
174 | } | 201 | } |
175 | 202 | ||
176 | enum uv_system_type get_uv_system_type(void) | 203 | enum uv_system_type get_uv_system_type(void) |
@@ -854,10 +881,14 @@ void __init uv_system_init(void) | |||
854 | unsigned long mmr_base, present, paddr; | 881 | unsigned long mmr_base, present, paddr; |
855 | unsigned short pnode_mask; | 882 | unsigned short pnode_mask; |
856 | unsigned char n_lshift; | 883 | unsigned char n_lshift; |
857 | char *hub = (is_uv1_hub() ? "UV1" : | 884 | char *hub = (is_uv1_hub() ? "UV100/1000" : |
858 | (is_uv2_hub() ? "UV2" : | 885 | (is_uv2_hub() ? "UV2000/3000" : |
859 | "UV3")); | 886 | (is_uv3_hub() ? "UV300" : NULL))); |
860 | 887 | ||
888 | if (!hub) { | ||
889 | pr_err("UV: Unknown/unsupported UV hub\n"); | ||
890 | return; | ||
891 | } | ||
861 | pr_info("UV: Found %s hub\n", hub); | 892 | pr_info("UV: Found %s hub\n", hub); |
862 | map_low_mmrs(); | 893 | map_low_mmrs(); |
863 | 894 | ||
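The rewritten uv_acpi_madt_oem_check() above reduces hub detection to one strncmp()/strcmp() chain over the ACPI OEM ID. A userspace sketch of that classification; the UV*_HUB_REVISION_BASE numbers here are placeholders, not the real header values:

    #include <stdio.h>
    #include <string.h>

    /* Placeholder revision bases, for illustration only. */
    #define UV1_HUB_REVISION_BASE   1
    #define UV2_HUB_REVISION_BASE   2
    #define UV3_HUB_REVISION_BASE   3

    static int uv_hub_revision(const char *oem_id)
    {
        if (strncmp(oem_id, "SGI", 3) != 0)
            return 0;                             /* not a UV system */

        /* SGI: UV100/1000, SGI2: UV2000/3000, SGI3*: UV300 varieties. */
        return !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
               !strcmp(oem_id, "SGI2")     ? UV2_HUB_REVISION_BASE :
               !strcmp(oem_id, "SGI")      ? UV1_HUB_REVISION_BASE :
               0;                                 /* "SGI..." but unrecognized */
    }

    int main(void)
    {
        const char *ids[] = { "SGI", "SGI2", "SGI3X", "ACME" };
        unsigned int i;

        for (i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
            printf("%-6s -> revision base %d\n", ids[i], uv_hub_revision(ids[i]));
        return 0;
    }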
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 3b3b9d33ac1d..47703aed74cf 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c | |||
@@ -68,7 +68,7 @@ void foo(void) | |||
68 | 68 | ||
69 | /* Offset from the sysenter stack to tss.sp0 */ | 69 | /* Offset from the sysenter stack to tss.sp0 */ |
70 | DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - | 70 | DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - |
71 | sizeof(struct tss_struct)); | 71 | offsetofend(struct tss_struct, SYSENTER_stack)); |
72 | 72 | ||
73 | #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) | 73 | #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) |
74 | BLANK(); | 74 | BLANK(); |
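The TSS_sysenter_sp0 change relies on offsetofend(), the offset of the first byte past a member, instead of subtracting the whole structure size. A small userspace sketch of that helper using the kernel's definition; the structure here is an invented stand-in, not the real tss_struct:

    #include <stdio.h>
    #include <stddef.h>

    /* offsetofend(TYPE, MEMBER): offset of the first byte after MEMBER. */
    #define offsetofend(TYPE, MEMBER) \
        (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

    /* Invented stand-in with an sp0-like field and a trailing per-CPU stack. */
    struct fake_tss {
        unsigned long sp0;
        unsigned long other[4];
        unsigned long sysenter_stack[8];
    };

    int main(void)
    {
        /* How far below the end of sysenter_stack the sp0 field sits. */
        long delta = (long)offsetofend(struct fake_tss, sysenter_stack) -
                     (long)offsetof(struct fake_tss, sp0);

        printf("sp0 is %ld bytes below the end of sysenter_stack\n", delta);
        return 0;
    }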
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index fdcbb4d27c9f..5ce6f2da8763 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c | |||
@@ -81,6 +81,7 @@ int main(void) | |||
81 | #undef ENTRY | 81 | #undef ENTRY |
82 | 82 | ||
83 | OFFSET(TSS_ist, tss_struct, x86_tss.ist); | 83 | OFFSET(TSS_ist, tss_struct, x86_tss.ist); |
84 | OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); | ||
84 | BLANK(); | 85 | BLANK(); |
85 | 86 | ||
86 | DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); | 87 | DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); |
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 80091ae54c2b..9bff68798836 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
@@ -39,7 +39,8 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o | |||
39 | endif | 39 | endif |
40 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o | 40 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o |
41 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o | 41 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o |
42 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o | 42 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o |
43 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o | ||
43 | 44 | ||
44 | obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ | 45 | obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ |
45 | perf_event_intel_uncore_snb.o \ | 46 | perf_event_intel_uncore_snb.o \ |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a220239cea65..fd470ebf924e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -5,6 +5,7 @@ | |||
5 | 5 | ||
6 | #include <linux/io.h> | 6 | #include <linux/io.h> |
7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
8 | #include <linux/random.h> | ||
8 | #include <asm/processor.h> | 9 | #include <asm/processor.h> |
9 | #include <asm/apic.h> | 10 | #include <asm/apic.h> |
10 | #include <asm/cpu.h> | 11 | #include <asm/cpu.h> |
@@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) | |||
488 | 489 | ||
489 | va_align.mask = (upperbit - 1) & PAGE_MASK; | 490 | va_align.mask = (upperbit - 1) & PAGE_MASK; |
490 | va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; | 491 | va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; |
492 | |||
493 | /* A random value per boot for bit slice [12:upper_bit) */ | ||
494 | va_align.bits = get_random_int() & va_align.mask; | ||
491 | } | 495 | } |
492 | } | 496 | } |
493 | 497 | ||
@@ -711,6 +715,11 @@ static void init_amd(struct cpuinfo_x86 *c) | |||
711 | set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); | 715 | set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); |
712 | 716 | ||
713 | rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); | 717 | rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); |
718 | |||
719 | /* 3DNow or LM implies PREFETCHW */ | ||
720 | if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) | ||
721 | if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) | ||
722 | set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); | ||
714 | } | 723 | } |
715 | 724 | ||
716 | #ifdef CONFIG_X86_32 | 725 | #ifdef CONFIG_X86_32 |
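The bsp_init_amd() hunk fills va_align.bits with one random value per boot, restricted to the bit slice [12:upper_bit) by the previously computed va_align.mask. A userspace sketch of that masking; the upper bit and the random source are assumptions standing in for the real per-CPU values and get_random_int():

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define PAGE_SHIFT  12
    #define PAGE_MASK   (~((1UL << PAGE_SHIFT) - 1))

    int main(void)
    {
        /* Assume randomization of bits [12:15), i.e. upper bit 15. */
        unsigned long upperbit = 1UL << 15;
        unsigned long mask = (upperbit - 1) & PAGE_MASK;
        unsigned long bits;

        srand((unsigned int)time(NULL));        /* stand-in for get_random_int() */
        bits = (unsigned long)rand() & mask;

        /* mask is 0x7000 here, so bits is one of 0x0000..0x7000 in 4 KiB steps. */
        printf("va_align.mask = 0x%lx, va_align.bits = 0x%lx\n", mask, bits);
        return 0;
    }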
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2346c95c6ab1..a62cf04dac8a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -646,6 +646,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c) | |||
646 | c->x86_capability[10] = eax; | 646 | c->x86_capability[10] = eax; |
647 | } | 647 | } |
648 | 648 | ||
649 | /* Additional Intel-defined flags: level 0x0000000F */ | ||
650 | if (c->cpuid_level >= 0x0000000F) { | ||
651 | u32 eax, ebx, ecx, edx; | ||
652 | |||
653 | /* QoS sub-leaf, EAX=0Fh, ECX=0 */ | ||
654 | cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); | ||
655 | c->x86_capability[11] = edx; | ||
656 | if (cpu_has(c, X86_FEATURE_CQM_LLC)) { | ||
657 | /* will be overridden if occupancy monitoring exists */ | ||
658 | c->x86_cache_max_rmid = ebx; | ||
659 | |||
660 | /* QoS sub-leaf, EAX=0Fh, ECX=1 */ | ||
661 | cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); | ||
662 | c->x86_capability[12] = edx; | ||
663 | if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) { | ||
664 | c->x86_cache_max_rmid = ecx; | ||
665 | c->x86_cache_occ_scale = ebx; | ||
666 | } | ||
667 | } else { | ||
668 | c->x86_cache_max_rmid = -1; | ||
669 | c->x86_cache_occ_scale = -1; | ||
670 | } | ||
671 | } | ||
672 | |||
649 | /* AMD-defined flags: level 0x80000001 */ | 673 | /* AMD-defined flags: level 0x80000001 */ |
650 | xlvl = cpuid_eax(0x80000000); | 674 | xlvl = cpuid_eax(0x80000000); |
651 | c->extended_cpuid_level = xlvl; | 675 | c->extended_cpuid_level = xlvl; |
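The new leaf-0xF code above reads the cache QoS monitoring limits: sub-leaf 0 reports the monitoring feature bits in EDX and a provisional max RMID in EBX, and sub-leaf 1 reports the L3-occupancy max RMID in ECX and the occupancy scaling factor in EBX. A userspace sketch reading the same sub-leaves with the __get_cpuid_count() helper from a reasonably recent GCC/clang <cpuid.h>; on hardware without CQM the leaf is simply absent or reads as zero:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx)) {
            printf("CPUID leaf 0xF not supported\n");
            return 0;
        }

        /* EDX bit 1: L3 cache QoS monitoring supported; EBX: provisional max RMID. */
        printf("sub-leaf 0: L3 monitoring %s, max RMID (all events) %u\n",
               (edx & (1u << 1)) ? "yes" : "no", ebx);

        if (__get_cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx)) {
            /* ECX overrides the max RMID; EBX scales occupancy counts to bytes. */
            printf("sub-leaf 1: max RMID %u, occupancy scale %u bytes\n", ecx, ebx);
        }
        return 0;
    }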
@@ -834,6 +858,20 @@ static void generic_identify(struct cpuinfo_x86 *c) | |||
834 | detect_nopl(c); | 858 | detect_nopl(c); |
835 | } | 859 | } |
836 | 860 | ||
861 | static void x86_init_cache_qos(struct cpuinfo_x86 *c) | ||
862 | { | ||
863 | * The heavy lifting of max_rmid and cache_occ_scale is handled | ||
864 | * The heavy lifting of max_rmid and cache_occ_scale are handled | ||
865 | * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu | ||
866 | * in case CQM bits really aren't there in this CPU. | ||
867 | */ | ||
868 | if (c != &boot_cpu_data) { | ||
869 | boot_cpu_data.x86_cache_max_rmid = | ||
870 | min(boot_cpu_data.x86_cache_max_rmid, | ||
871 | c->x86_cache_max_rmid); | ||
872 | } | ||
873 | } | ||
874 | |||
837 | /* | 875 | /* |
838 | * This does the hard work of actually picking apart the CPU stuff... | 876 | * This does the hard work of actually picking apart the CPU stuff... |
839 | */ | 877 | */ |
@@ -923,6 +961,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) | |||
923 | 961 | ||
924 | init_hypervisor(c); | 962 | init_hypervisor(c); |
925 | x86_init_rdrand(c); | 963 | x86_init_rdrand(c); |
964 | x86_init_cache_qos(c); | ||
926 | 965 | ||
927 | /* | 966 | /* |
928 | * Clear/Set all flags overridden by options; we need to do it | 967 |
@@ -959,38 +998,37 @@ static void identify_cpu(struct cpuinfo_x86 *c) | |||
959 | #endif | 998 | #endif |
960 | } | 999 | } |
961 | 1000 | ||
962 | #ifdef CONFIG_X86_64 | 1001 | /* |
963 | #ifdef CONFIG_IA32_EMULATION | 1002 | * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions |
964 | /* May not be __init: called during resume */ | 1003 | * on 32-bit kernels: |
965 | static void syscall32_cpu_init(void) | 1004 | */ |
966 | { | ||
967 | /* Load these always in case some future AMD CPU supports | ||
968 | SYSENTER from compat mode too. */ | ||
969 | wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); | ||
970 | wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); | ||
971 | wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); | ||
972 | |||
973 | wrmsrl(MSR_CSTAR, ia32_cstar_target); | ||
974 | } | ||
975 | #endif /* CONFIG_IA32_EMULATION */ | ||
976 | #endif /* CONFIG_X86_64 */ | ||
977 | |||
978 | #ifdef CONFIG_X86_32 | 1005 | #ifdef CONFIG_X86_32 |
979 | void enable_sep_cpu(void) | 1006 | void enable_sep_cpu(void) |
980 | { | 1007 | { |
981 | int cpu = get_cpu(); | 1008 | struct tss_struct *tss; |
982 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 1009 | int cpu; |
983 | 1010 | ||
984 | if (!boot_cpu_has(X86_FEATURE_SEP)) { | 1011 | cpu = get_cpu(); |
985 | put_cpu(); | 1012 | tss = &per_cpu(cpu_tss, cpu); |
986 | return; | 1013 | |
987 | } | 1014 | if (!boot_cpu_has(X86_FEATURE_SEP)) |
1015 | goto out; | ||
1016 | |||
1017 | /* | ||
1018 | * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- | ||
1019 | * see the big comment in struct x86_hw_tss's definition. | ||
1020 | */ | ||
988 | 1021 | ||
989 | tss->x86_tss.ss1 = __KERNEL_CS; | 1022 | tss->x86_tss.ss1 = __KERNEL_CS; |
990 | tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; | 1023 | wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); |
991 | wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); | 1024 | |
992 | wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); | 1025 | wrmsr(MSR_IA32_SYSENTER_ESP, |
993 | wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); | 1026 | (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), |
1027 | 0); | ||
1028 | |||
1029 | wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); | ||
1030 | |||
1031 | out: | ||
994 | put_cpu(); | 1032 | put_cpu(); |
995 | } | 1033 | } |
996 | #endif | 1034 | #endif |
@@ -1118,7 +1156,7 @@ static __init int setup_disablecpuid(char *arg) | |||
1118 | __setup("clearcpuid=", setup_disablecpuid); | 1156 | __setup("clearcpuid=", setup_disablecpuid); |
1119 | 1157 | ||
1120 | DEFINE_PER_CPU(unsigned long, kernel_stack) = | 1158 | DEFINE_PER_CPU(unsigned long, kernel_stack) = |
1121 | (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; | 1159 | (unsigned long)&init_thread_union + THREAD_SIZE; |
1122 | EXPORT_PER_CPU_SYMBOL(kernel_stack); | 1160 | EXPORT_PER_CPU_SYMBOL(kernel_stack); |
1123 | 1161 | ||
1124 | #ifdef CONFIG_X86_64 | 1162 | #ifdef CONFIG_X86_64 |
@@ -1130,8 +1168,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union, | |||
1130 | irq_stack_union) __aligned(PAGE_SIZE) __visible; | 1168 | irq_stack_union) __aligned(PAGE_SIZE) __visible; |
1131 | 1169 | ||
1132 | /* | 1170 | /* |
1133 | * The following four percpu variables are hot. Align current_task to | 1171 | * The following percpu variables are hot. Align current_task to |
1134 | * cacheline size such that all four fall in the same cacheline. | 1172 | * cacheline size such that they fall in the same cacheline. |
1135 | */ | 1173 | */ |
1136 | DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = | 1174 | DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = |
1137 | &init_task; | 1175 | &init_task; |
@@ -1171,10 +1209,23 @@ void syscall_init(void) | |||
1171 | */ | 1209 | */ |
1172 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); | 1210 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); |
1173 | wrmsrl(MSR_LSTAR, system_call); | 1211 | wrmsrl(MSR_LSTAR, system_call); |
1174 | wrmsrl(MSR_CSTAR, ignore_sysret); | ||
1175 | 1212 | ||
1176 | #ifdef CONFIG_IA32_EMULATION | 1213 | #ifdef CONFIG_IA32_EMULATION |
1177 | syscall32_cpu_init(); | 1214 | wrmsrl(MSR_CSTAR, ia32_cstar_target); |
1215 | /* | ||
1216 | * This only works on Intel CPUs. | ||
1217 | * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. | ||
1218 | * This does not cause SYSENTER to jump to the wrong location, because | ||
1219 | * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). | ||
1220 | */ | ||
1221 | wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); | ||
1222 | wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); | ||
1223 | wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); | ||
1224 | #else | ||
1225 | wrmsrl(MSR_CSTAR, ignore_sysret); | ||
1226 | wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); | ||
1227 | wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); | ||
1228 | wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); | ||
1178 | #endif | 1229 | #endif |
1179 | 1230 | ||
1180 | /* Flags to clear on syscall */ | 1231 | /* Flags to clear on syscall */ |
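The syscall_init() hunk packs the segment selector bases into MSR_STAR: bits 63:48 take the user selector base used on SYSRET (__USER32_CS) and bits 47:32 the kernel CS used on SYSCALL entry. A userspace sketch of that packing, assuming the conventional selector values 0x23 and 0x10 purely for illustration:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t user32_cs = 0x23;  /* assumed __USER32_CS */
        uint64_t kernel_cs = 0x10;  /* assumed __KERNEL_CS */

        uint64_t star = (user32_cs << 48) | (kernel_cs << 32);

        /* Prints 0x0023001000000000 with the values above. */
        printf("MSR_STAR = 0x%016llx\n", (unsigned long long)star);
        return 0;
    }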
@@ -1226,6 +1277,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; | |||
1226 | EXPORT_PER_CPU_SYMBOL(__preempt_count); | 1277 | EXPORT_PER_CPU_SYMBOL(__preempt_count); |
1227 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); | 1278 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); |
1228 | 1279 | ||
1280 | /* | ||
1281 | * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find | ||
1282 | * the top of the kernel stack. Use an extra percpu variable to track the | ||
1283 | * top of the kernel stack directly. | ||
1284 | */ | ||
1285 | DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = | ||
1286 | (unsigned long)&init_thread_union + THREAD_SIZE; | ||
1287 | EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); | ||
1288 | |||
1229 | #ifdef CONFIG_CC_STACKPROTECTOR | 1289 | #ifdef CONFIG_CC_STACKPROTECTOR |
1230 | DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); | 1290 | DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); |
1231 | #endif | 1291 | #endif |
@@ -1307,7 +1367,7 @@ void cpu_init(void) | |||
1307 | */ | 1367 | */ |
1308 | load_ucode_ap(); | 1368 | load_ucode_ap(); |
1309 | 1369 | ||
1310 | t = &per_cpu(init_tss, cpu); | 1370 | t = &per_cpu(cpu_tss, cpu); |
1311 | oist = &per_cpu(orig_ist, cpu); | 1371 | oist = &per_cpu(orig_ist, cpu); |
1312 | 1372 | ||
1313 | #ifdef CONFIG_NUMA | 1373 | #ifdef CONFIG_NUMA |
@@ -1391,7 +1451,7 @@ void cpu_init(void) | |||
1391 | { | 1451 | { |
1392 | int cpu = smp_processor_id(); | 1452 | int cpu = smp_processor_id(); |
1393 | struct task_struct *curr = current; | 1453 | struct task_struct *curr = current; |
1394 | struct tss_struct *t = &per_cpu(init_tss, cpu); | 1454 | struct tss_struct *t = &per_cpu(cpu_tss, cpu); |
1395 | struct thread_struct *thread = &curr->thread; | 1455 | struct thread_struct *thread = &curr->thread; |
1396 | 1456 | ||
1397 | wait_for_master_cpu(cpu); | 1457 | wait_for_master_cpu(cpu); |
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 659643376dbf..edcb0e28c336 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c | |||
@@ -7,16 +7,14 @@ | |||
7 | * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. | 7 | * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
12 | #include <linux/device.h> | 11 | #include <linux/cacheinfo.h> |
13 | #include <linux/compiler.h> | ||
14 | #include <linux/cpu.h> | 12 | #include <linux/cpu.h> |
15 | #include <linux/sched.h> | 13 | #include <linux/sched.h> |
14 | #include <linux/sysfs.h> | ||
16 | #include <linux/pci.h> | 15 | #include <linux/pci.h> |
17 | 16 | ||
18 | #include <asm/processor.h> | 17 | #include <asm/processor.h> |
19 | #include <linux/smp.h> | ||
20 | #include <asm/amd_nb.h> | 18 | #include <asm/amd_nb.h> |
21 | #include <asm/smp.h> | 19 | #include <asm/smp.h> |
22 | 20 | ||
@@ -116,10 +114,10 @@ static const struct _cache_table cache_table[] = | |||
116 | 114 | ||
117 | 115 | ||
118 | enum _cache_type { | 116 | enum _cache_type { |
119 | CACHE_TYPE_NULL = 0, | 117 | CTYPE_NULL = 0, |
120 | CACHE_TYPE_DATA = 1, | 118 | CTYPE_DATA = 1, |
121 | CACHE_TYPE_INST = 2, | 119 | CTYPE_INST = 2, |
122 | CACHE_TYPE_UNIFIED = 3 | 120 | CTYPE_UNIFIED = 3 |
123 | }; | 121 | }; |
124 | 122 | ||
125 | union _cpuid4_leaf_eax { | 123 | union _cpuid4_leaf_eax { |
@@ -159,11 +157,6 @@ struct _cpuid4_info_regs { | |||
159 | struct amd_northbridge *nb; | 157 | struct amd_northbridge *nb; |
160 | }; | 158 | }; |
161 | 159 | ||
162 | struct _cpuid4_info { | ||
163 | struct _cpuid4_info_regs base; | ||
164 | DECLARE_BITMAP(shared_cpu_map, NR_CPUS); | ||
165 | }; | ||
166 | |||
167 | unsigned short num_cache_leaves; | 160 | unsigned short num_cache_leaves; |
168 | 161 | ||
169 | /* AMD doesn't have CPUID4. Emulate it here to report the same | 162 | /* AMD doesn't have CPUID4. Emulate it here to report the same |
@@ -220,6 +213,13 @@ static const unsigned short assocs[] = { | |||
220 | static const unsigned char levels[] = { 1, 1, 2, 3 }; | 213 | static const unsigned char levels[] = { 1, 1, 2, 3 }; |
221 | static const unsigned char types[] = { 1, 2, 3, 3 }; | 214 | static const unsigned char types[] = { 1, 2, 3, 3 }; |
222 | 215 | ||
216 | static const enum cache_type cache_type_map[] = { | ||
217 | [CTYPE_NULL] = CACHE_TYPE_NOCACHE, | ||
218 | [CTYPE_DATA] = CACHE_TYPE_DATA, | ||
219 | [CTYPE_INST] = CACHE_TYPE_INST, | ||
220 | [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, | ||
221 | }; | ||
222 | |||
223 | static void | 223 | static void |
224 | amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, | 224 | amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, |
225 | union _cpuid4_leaf_ebx *ebx, | 225 | union _cpuid4_leaf_ebx *ebx, |
@@ -291,14 +291,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, | |||
291 | (ebx->split.ways_of_associativity + 1) - 1; | 291 | (ebx->split.ways_of_associativity + 1) - 1; |
292 | } | 292 | } |
293 | 293 | ||
294 | struct _cache_attr { | ||
295 | struct attribute attr; | ||
296 | ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int); | ||
297 | ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count, | ||
298 | unsigned int); | ||
299 | }; | ||
300 | |||
301 | #if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) | 294 | #if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) |
295 | |||
302 | /* | 296 | /* |
303 | * L3 cache descriptors | 297 | * L3 cache descriptors |
304 | */ | 298 | */ |
@@ -325,20 +319,6 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb) | |||
325 | l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; | 319 | l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; |
326 | } | 320 | } |
327 | 321 | ||
328 | static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) | ||
329 | { | ||
330 | int node; | ||
331 | |||
332 | /* only for L3, and not in virtualized environments */ | ||
333 | if (index < 3) | ||
334 | return; | ||
335 | |||
336 | node = amd_get_nb_id(smp_processor_id()); | ||
337 | this_leaf->nb = node_to_amd_nb(node); | ||
338 | if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) | ||
339 | amd_calc_l3_indices(this_leaf->nb); | ||
340 | } | ||
341 | |||
342 | /* | 322 | /* |
343 | * check whether a slot used for disabling an L3 index is occupied. | 323 | * check whether a slot used for disabling an L3 index is occupied. |
344 | * @l3: L3 cache descriptor | 324 | * @l3: L3 cache descriptor |
@@ -359,15 +339,13 @@ int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) | |||
359 | return -1; | 339 | return -1; |
360 | } | 340 | } |
361 | 341 | ||
362 | static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, | 342 | static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf, |
363 | unsigned int slot) | 343 | unsigned int slot) |
364 | { | 344 | { |
365 | int index; | 345 | int index; |
346 | struct amd_northbridge *nb = this_leaf->priv; | ||
366 | 347 | ||
367 | if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) | 348 | index = amd_get_l3_disable_slot(nb, slot); |
368 | return -EINVAL; | ||
369 | |||
370 | index = amd_get_l3_disable_slot(this_leaf->base.nb, slot); | ||
371 | if (index >= 0) | 349 | if (index >= 0) |
372 | return sprintf(buf, "%d\n", index); | 350 | return sprintf(buf, "%d\n", index); |
373 | 351 | ||
@@ -376,9 +354,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, | |||
376 | 354 | ||
377 | #define SHOW_CACHE_DISABLE(slot) \ | 355 | #define SHOW_CACHE_DISABLE(slot) \ |
378 | static ssize_t \ | 356 | static ssize_t \ |
379 | show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ | 357 | cache_disable_##slot##_show(struct device *dev, \ |
380 | unsigned int cpu) \ | 358 | struct device_attribute *attr, char *buf) \ |
381 | { \ | 359 | { \ |
360 | struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ | ||
382 | return show_cache_disable(this_leaf, buf, slot); \ | 361 | return show_cache_disable(this_leaf, buf, slot); \ |
383 | } | 362 | } |
384 | SHOW_CACHE_DISABLE(0) | 363 | SHOW_CACHE_DISABLE(0) |
@@ -446,25 +425,23 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, | |||
446 | return 0; | 425 | return 0; |
447 | } | 426 | } |
448 | 427 | ||
449 | static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, | 428 | static ssize_t store_cache_disable(struct cacheinfo *this_leaf, |
450 | const char *buf, size_t count, | 429 | const char *buf, size_t count, |
451 | unsigned int slot) | 430 | unsigned int slot) |
452 | { | 431 | { |
453 | unsigned long val = 0; | 432 | unsigned long val = 0; |
454 | int cpu, err = 0; | 433 | int cpu, err = 0; |
434 | struct amd_northbridge *nb = this_leaf->priv; | ||
455 | 435 | ||
456 | if (!capable(CAP_SYS_ADMIN)) | 436 | if (!capable(CAP_SYS_ADMIN)) |
457 | return -EPERM; | 437 | return -EPERM; |
458 | 438 | ||
459 | if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) | 439 | cpu = cpumask_first(&this_leaf->shared_cpu_map); |
460 | return -EINVAL; | ||
461 | |||
462 | cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); | ||
463 | 440 | ||
464 | if (kstrtoul(buf, 10, &val) < 0) | 441 | if (kstrtoul(buf, 10, &val) < 0) |
465 | return -EINVAL; | 442 | return -EINVAL; |
466 | 443 | ||
467 | err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); | 444 | err = amd_set_l3_disable_slot(nb, cpu, slot, val); |
468 | if (err) { | 445 | if (err) { |
469 | if (err == -EEXIST) | 446 | if (err == -EEXIST) |
470 | pr_warning("L3 slot %d in use/index already disabled!\n", | 447 | pr_warning("L3 slot %d in use/index already disabled!\n", |
@@ -476,41 +453,36 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, | |||
476 | 453 | ||
477 | #define STORE_CACHE_DISABLE(slot) \ | 454 | #define STORE_CACHE_DISABLE(slot) \ |
478 | static ssize_t \ | 455 | static ssize_t \ |
479 | store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ | 456 | cache_disable_##slot##_store(struct device *dev, \ |
480 | const char *buf, size_t count, \ | 457 | struct device_attribute *attr, \ |
481 | unsigned int cpu) \ | 458 | const char *buf, size_t count) \ |
482 | { \ | 459 | { \ |
460 | struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ | ||
483 | return store_cache_disable(this_leaf, buf, count, slot); \ | 461 | return store_cache_disable(this_leaf, buf, count, slot); \ |
484 | } | 462 | } |
485 | STORE_CACHE_DISABLE(0) | 463 | STORE_CACHE_DISABLE(0) |
486 | STORE_CACHE_DISABLE(1) | 464 | STORE_CACHE_DISABLE(1) |
487 | 465 | ||
488 | static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, | 466 | static ssize_t subcaches_show(struct device *dev, |
489 | show_cache_disable_0, store_cache_disable_0); | 467 | struct device_attribute *attr, char *buf) |
490 | static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, | ||
491 | show_cache_disable_1, store_cache_disable_1); | ||
492 | |||
493 | static ssize_t | ||
494 | show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) | ||
495 | { | 468 | { |
496 | if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) | 469 | struct cacheinfo *this_leaf = dev_get_drvdata(dev); |
497 | return -EINVAL; | 470 | int cpu = cpumask_first(&this_leaf->shared_cpu_map); |
498 | 471 | ||
499 | return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); | 472 | return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); |
500 | } | 473 | } |
501 | 474 | ||
502 | static ssize_t | 475 | static ssize_t subcaches_store(struct device *dev, |
503 | store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, | 476 | struct device_attribute *attr, |
504 | unsigned int cpu) | 477 | const char *buf, size_t count) |
505 | { | 478 | { |
479 | struct cacheinfo *this_leaf = dev_get_drvdata(dev); | ||
480 | int cpu = cpumask_first(&this_leaf->shared_cpu_map); | ||
506 | unsigned long val; | 481 | unsigned long val; |
507 | 482 | ||
508 | if (!capable(CAP_SYS_ADMIN)) | 483 | if (!capable(CAP_SYS_ADMIN)) |
509 | return -EPERM; | 484 | return -EPERM; |
510 | 485 | ||
511 | if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) | ||
512 | return -EINVAL; | ||
513 | |||
514 | if (kstrtoul(buf, 16, &val) < 0) | 486 | if (kstrtoul(buf, 16, &val) < 0) |
515 | return -EINVAL; | 487 | return -EINVAL; |
516 | 488 | ||
@@ -520,9 +492,92 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, | |||
520 | return count; | 492 | return count; |
521 | } | 493 | } |
522 | 494 | ||
523 | static struct _cache_attr subcaches = | 495 | static DEVICE_ATTR_RW(cache_disable_0); |
524 | __ATTR(subcaches, 0644, show_subcaches, store_subcaches); | 496 | static DEVICE_ATTR_RW(cache_disable_1); |
497 | static DEVICE_ATTR_RW(subcaches); | ||
498 | |||
499 | static umode_t | ||
500 | cache_private_attrs_is_visible(struct kobject *kobj, | ||
501 | struct attribute *attr, int unused) | ||
502 | { | ||
503 | struct device *dev = kobj_to_dev(kobj); | ||
504 | struct cacheinfo *this_leaf = dev_get_drvdata(dev); | ||
505 | umode_t mode = attr->mode; | ||
506 | |||
507 | if (!this_leaf->priv) | ||
508 | return 0; | ||
509 | |||
510 | if ((attr == &dev_attr_subcaches.attr) && | ||
511 | amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) | ||
512 | return mode; | ||
513 | |||
514 | if ((attr == &dev_attr_cache_disable_0.attr || | ||
515 | attr == &dev_attr_cache_disable_1.attr) && | ||
516 | amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) | ||
517 | return mode; | ||
518 | |||
519 | return 0; | ||
520 | } | ||
521 | |||
522 | static struct attribute_group cache_private_group = { | ||
523 | .is_visible = cache_private_attrs_is_visible, | ||
524 | }; | ||
525 | |||
526 | static void init_amd_l3_attrs(void) | ||
527 | { | ||
528 | int n = 1; | ||
529 | static struct attribute **amd_l3_attrs; | ||
530 | |||
531 | if (amd_l3_attrs) /* already initialized */ | ||
532 | return; | ||
533 | |||
534 | if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) | ||
535 | n += 2; | ||
536 | if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) | ||
537 | n += 1; | ||
538 | |||
539 | amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); | ||
540 | if (!amd_l3_attrs) | ||
541 | return; | ||
542 | |||
543 | n = 0; | ||
544 | if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { | ||
545 | amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; | ||
546 | amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; | ||
547 | } | ||
548 | if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) | ||
549 | amd_l3_attrs[n++] = &dev_attr_subcaches.attr; | ||
525 | 550 | ||
551 | cache_private_group.attrs = amd_l3_attrs; | ||
552 | } | ||
553 | |||
554 | const struct attribute_group * | ||
555 | cache_get_priv_group(struct cacheinfo *this_leaf) | ||
556 | { | ||
557 | struct amd_northbridge *nb = this_leaf->priv; | ||
558 | |||
559 | if (this_leaf->level < 3 || !nb) | ||
560 | return NULL; | ||
561 | |||
562 | if (nb && nb->l3_cache.indices) | ||
563 | init_amd_l3_attrs(); | ||
564 | |||
565 | return &cache_private_group; | ||
566 | } | ||
567 | |||
568 | static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) | ||
569 | { | ||
570 | int node; | ||
571 | |||
572 | /* only for L3, and not in virtualized environments */ | ||
573 | if (index < 3) | ||
574 | return; | ||
575 | |||
576 | node = amd_get_nb_id(smp_processor_id()); | ||
577 | this_leaf->nb = node_to_amd_nb(node); | ||
578 | if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) | ||
579 | amd_calc_l3_indices(this_leaf->nb); | ||
580 | } | ||
526 | #else | 581 | #else |
527 | #define amd_init_l3_cache(x, y) | 582 | #define amd_init_l3_cache(x, y) |
528 | #endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ | 583 | #endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ |
@@ -546,7 +601,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) | |||
546 | cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); | 601 | cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); |
547 | } | 602 | } |
548 | 603 | ||
549 | if (eax.split.type == CACHE_TYPE_NULL) | 604 | if (eax.split.type == CTYPE_NULL) |
550 | return -EIO; /* better error ? */ | 605 | return -EIO; /* better error ? */ |
551 | 606 | ||
552 | this_leaf->eax = eax; | 607 | this_leaf->eax = eax; |
@@ -575,7 +630,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) | |||
575 | /* Do cpuid(op) loop to find out num_cache_leaves */ | 630 | /* Do cpuid(op) loop to find out num_cache_leaves */ |
576 | cpuid_count(op, i, &eax, &ebx, &ecx, &edx); | 631 | cpuid_count(op, i, &eax, &ebx, &ecx, &edx); |
577 | cache_eax.full = eax; | 632 | cache_eax.full = eax; |
578 | } while (cache_eax.split.type != CACHE_TYPE_NULL); | 633 | } while (cache_eax.split.type != CTYPE_NULL); |
579 | return i; | 634 | return i; |
580 | } | 635 | } |
581 | 636 | ||
@@ -626,9 +681,9 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) | |||
626 | 681 | ||
627 | switch (this_leaf.eax.split.level) { | 682 | switch (this_leaf.eax.split.level) { |
628 | case 1: | 683 | case 1: |
629 | if (this_leaf.eax.split.type == CACHE_TYPE_DATA) | 684 | if (this_leaf.eax.split.type == CTYPE_DATA) |
630 | new_l1d = this_leaf.size/1024; | 685 | new_l1d = this_leaf.size/1024; |
631 | else if (this_leaf.eax.split.type == CACHE_TYPE_INST) | 686 | else if (this_leaf.eax.split.type == CTYPE_INST) |
632 | new_l1i = this_leaf.size/1024; | 687 | new_l1i = this_leaf.size/1024; |
633 | break; | 688 | break; |
634 | case 2: | 689 | case 2: |
@@ -747,55 +802,52 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) | |||
747 | return l2; | 802 | return l2; |
748 | } | 803 | } |
749 | 804 | ||
750 | #ifdef CONFIG_SYSFS | 805 | static int __cache_amd_cpumap_setup(unsigned int cpu, int index, |
751 | 806 | struct _cpuid4_info_regs *base) | |
752 | /* pointer to _cpuid4_info array (for each cache leaf) */ | ||
753 | static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); | ||
754 | #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) | ||
755 | |||
756 | #ifdef CONFIG_SMP | ||
757 | |||
758 | static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) | ||
759 | { | 807 | { |
760 | struct _cpuid4_info *this_leaf; | 808 | struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); |
809 | struct cacheinfo *this_leaf; | ||
761 | int i, sibling; | 810 | int i, sibling; |
762 | 811 | ||
763 | if (cpu_has_topoext) { | 812 | if (cpu_has_topoext) { |
764 | unsigned int apicid, nshared, first, last; | 813 | unsigned int apicid, nshared, first, last; |
765 | 814 | ||
766 | if (!per_cpu(ici_cpuid4_info, cpu)) | 815 | this_leaf = this_cpu_ci->info_list + index; |
767 | return 0; | 816 | nshared = base->eax.split.num_threads_sharing + 1; |
768 | |||
769 | this_leaf = CPUID4_INFO_IDX(cpu, index); | ||
770 | nshared = this_leaf->base.eax.split.num_threads_sharing + 1; | ||
771 | apicid = cpu_data(cpu).apicid; | 817 | apicid = cpu_data(cpu).apicid; |
772 | first = apicid - (apicid % nshared); | 818 | first = apicid - (apicid % nshared); |
773 | last = first + nshared - 1; | 819 | last = first + nshared - 1; |
774 | 820 | ||
775 | for_each_online_cpu(i) { | 821 | for_each_online_cpu(i) { |
822 | this_cpu_ci = get_cpu_cacheinfo(i); | ||
823 | if (!this_cpu_ci->info_list) | ||
824 | continue; | ||
825 | |||
776 | apicid = cpu_data(i).apicid; | 826 | apicid = cpu_data(i).apicid; |
777 | if ((apicid < first) || (apicid > last)) | 827 | if ((apicid < first) || (apicid > last)) |
778 | continue; | 828 | continue; |
779 | if (!per_cpu(ici_cpuid4_info, i)) | 829 | |
780 | continue; | 830 | this_leaf = this_cpu_ci->info_list + index; |
781 | this_leaf = CPUID4_INFO_IDX(i, index); | ||
782 | 831 | ||
783 | for_each_online_cpu(sibling) { | 832 | for_each_online_cpu(sibling) { |
784 | apicid = cpu_data(sibling).apicid; | 833 | apicid = cpu_data(sibling).apicid; |
785 | if ((apicid < first) || (apicid > last)) | 834 | if ((apicid < first) || (apicid > last)) |
786 | continue; | 835 | continue; |
787 | set_bit(sibling, this_leaf->shared_cpu_map); | 836 | cpumask_set_cpu(sibling, |
837 | &this_leaf->shared_cpu_map); | ||
788 | } | 838 | } |
789 | } | 839 | } |
790 | } else if (index == 3) { | 840 | } else if (index == 3) { |
791 | for_each_cpu(i, cpu_llc_shared_mask(cpu)) { | 841 | for_each_cpu(i, cpu_llc_shared_mask(cpu)) { |
792 | if (!per_cpu(ici_cpuid4_info, i)) | 842 | this_cpu_ci = get_cpu_cacheinfo(i); |
843 | if (!this_cpu_ci->info_list) | ||
793 | continue; | 844 | continue; |
794 | this_leaf = CPUID4_INFO_IDX(i, index); | 845 | this_leaf = this_cpu_ci->info_list + index; |
795 | for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { | 846 | for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { |
796 | if (!cpu_online(sibling)) | 847 | if (!cpu_online(sibling)) |
797 | continue; | 848 | continue; |
798 | set_bit(sibling, this_leaf->shared_cpu_map); | 849 | cpumask_set_cpu(sibling, |
850 | &this_leaf->shared_cpu_map); | ||
799 | } | 851 | } |
800 | } | 852 | } |
801 | } else | 853 | } else |
@@ -804,457 +856,86 @@ static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) | |||
804 | return 1; | 856 | return 1; |
805 | } | 857 | } |
806 | 858 | ||
807 | static void cache_shared_cpu_map_setup(unsigned int cpu, int index) | 859 | static void __cache_cpumap_setup(unsigned int cpu, int index, |
860 | struct _cpuid4_info_regs *base) | ||
808 | { | 861 | { |
809 | struct _cpuid4_info *this_leaf, *sibling_leaf; | 862 | struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); |
863 | struct cacheinfo *this_leaf, *sibling_leaf; | ||
810 | unsigned long num_threads_sharing; | 864 | unsigned long num_threads_sharing; |
811 | int index_msb, i; | 865 | int index_msb, i; |
812 | struct cpuinfo_x86 *c = &cpu_data(cpu); | 866 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
813 | 867 | ||
814 | if (c->x86_vendor == X86_VENDOR_AMD) { | 868 | if (c->x86_vendor == X86_VENDOR_AMD) { |
815 | if (cache_shared_amd_cpu_map_setup(cpu, index)) | 869 | if (__cache_amd_cpumap_setup(cpu, index, base)) |
816 | return; | 870 | return; |
817 | } | 871 | } |
818 | 872 | ||
819 | this_leaf = CPUID4_INFO_IDX(cpu, index); | 873 | this_leaf = this_cpu_ci->info_list + index; |
820 | num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; | 874 | num_threads_sharing = 1 + base->eax.split.num_threads_sharing; |
821 | 875 | ||
876 | cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); | ||
822 | if (num_threads_sharing == 1) | 877 | if (num_threads_sharing == 1) |
823 | cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); | 878 | return; |
824 | else { | ||
825 | index_msb = get_count_order(num_threads_sharing); | ||
826 | |||
827 | for_each_online_cpu(i) { | ||
828 | if (cpu_data(i).apicid >> index_msb == | ||
829 | c->apicid >> index_msb) { | ||
830 | cpumask_set_cpu(i, | ||
831 | to_cpumask(this_leaf->shared_cpu_map)); | ||
832 | if (i != cpu && per_cpu(ici_cpuid4_info, i)) { | ||
833 | sibling_leaf = | ||
834 | CPUID4_INFO_IDX(i, index); | ||
835 | cpumask_set_cpu(cpu, to_cpumask( | ||
836 | sibling_leaf->shared_cpu_map)); | ||
837 | } | ||
838 | } | ||
839 | } | ||
840 | } | ||
841 | } | ||
842 | static void cache_remove_shared_cpu_map(unsigned int cpu, int index) | ||
843 | { | ||
844 | struct _cpuid4_info *this_leaf, *sibling_leaf; | ||
845 | int sibling; | ||
846 | |||
847 | this_leaf = CPUID4_INFO_IDX(cpu, index); | ||
848 | for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) { | ||
849 | sibling_leaf = CPUID4_INFO_IDX(sibling, index); | ||
850 | cpumask_clear_cpu(cpu, | ||
851 | to_cpumask(sibling_leaf->shared_cpu_map)); | ||
852 | } | ||
853 | } | ||
854 | #else | ||
855 | static void cache_shared_cpu_map_setup(unsigned int cpu, int index) | ||
856 | { | ||
857 | } | ||
858 | |||
859 | static void cache_remove_shared_cpu_map(unsigned int cpu, int index) | ||
860 | { | ||
861 | } | ||
862 | #endif | ||
863 | |||
864 | static void free_cache_attributes(unsigned int cpu) | ||
865 | { | ||
866 | int i; | ||
867 | |||
868 | for (i = 0; i < num_cache_leaves; i++) | ||
869 | cache_remove_shared_cpu_map(cpu, i); | ||
870 | |||
871 | kfree(per_cpu(ici_cpuid4_info, cpu)); | ||
872 | per_cpu(ici_cpuid4_info, cpu) = NULL; | ||
873 | } | ||
874 | |||
875 | static void get_cpu_leaves(void *_retval) | ||
876 | { | ||
877 | int j, *retval = _retval, cpu = smp_processor_id(); | ||
878 | 879 | ||
879 | /* Do cpuid and store the results */ | 880 | index_msb = get_count_order(num_threads_sharing); |
880 | for (j = 0; j < num_cache_leaves; j++) { | ||
881 | struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j); | ||
882 | 881 | ||
883 | *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base); | 882 | for_each_online_cpu(i) |
884 | if (unlikely(*retval < 0)) { | 883 | if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) { |
885 | int i; | 884 | struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); |
886 | 885 | ||
887 | for (i = 0; i < j; i++) | 886 | if (i == cpu || !sib_cpu_ci->info_list) |
888 | cache_remove_shared_cpu_map(cpu, i); | 887 | continue;/* skip if itself or no cacheinfo */ |
889 | break; | 888 | sibling_leaf = sib_cpu_ci->info_list + index; |
889 | cpumask_set_cpu(i, &this_leaf->shared_cpu_map); | ||
890 | cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); | ||
890 | } | 891 | } |
891 | cache_shared_cpu_map_setup(cpu, j); | ||
892 | } | ||
893 | } | 892 | } |
894 | 893 | ||
895 | static int detect_cache_attributes(unsigned int cpu) | 894 | static void ci_leaf_init(struct cacheinfo *this_leaf, |
895 | struct _cpuid4_info_regs *base) | ||
896 | { | 896 | { |
897 | int retval; | 897 | this_leaf->level = base->eax.split.level; |
898 | 898 | this_leaf->type = cache_type_map[base->eax.split.type]; | |
899 | if (num_cache_leaves == 0) | 899 | this_leaf->coherency_line_size = |
900 | return -ENOENT; | 900 | base->ebx.split.coherency_line_size + 1; |
901 | 901 | this_leaf->ways_of_associativity = | |
902 | per_cpu(ici_cpuid4_info, cpu) = kzalloc( | 902 | base->ebx.split.ways_of_associativity + 1; |
903 | sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); | 903 | this_leaf->size = base->size; |
904 | if (per_cpu(ici_cpuid4_info, cpu) == NULL) | 904 | this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1; |
905 | return -ENOMEM; | 905 | this_leaf->physical_line_partition = |
906 | 906 | base->ebx.split.physical_line_partition + 1; | |
907 | smp_call_function_single(cpu, get_cpu_leaves, &retval, true); | 907 | this_leaf->priv = base->nb; |
908 | if (retval) { | ||
909 | kfree(per_cpu(ici_cpuid4_info, cpu)); | ||
910 | per_cpu(ici_cpuid4_info, cpu) = NULL; | ||
911 | } | ||
912 | |||
913 | return retval; | ||
914 | } | 908 | } |
915 | 909 | ||
916 | #include <linux/kobject.h> | 910 | static int __init_cache_level(unsigned int cpu) |
917 | #include <linux/sysfs.h> | ||
918 | #include <linux/cpu.h> | ||
919 | |||
920 | /* pointer to kobject for cpuX/cache */ | ||
921 | static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); | ||
922 | |||
923 | struct _index_kobject { | ||
924 | struct kobject kobj; | ||
925 | unsigned int cpu; | ||
926 | unsigned short index; | ||
927 | }; | ||
928 | |||
929 | /* pointer to array of kobjects for cpuX/cache/indexY */ | ||
930 | static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject); | ||
931 | #define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) | ||
932 | |||
933 | #define show_one_plus(file_name, object, val) \ | ||
934 | static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ | ||
935 | unsigned int cpu) \ | ||
936 | { \ | ||
937 | return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ | ||
938 | } | ||
939 | |||
940 | show_one_plus(level, base.eax.split.level, 0); | ||
941 | show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1); | ||
942 | show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1); | ||
943 | show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1); | ||
944 | show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1); | ||
945 | |||
946 | static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, | ||
947 | unsigned int cpu) | ||
948 | { | ||
949 | return sprintf(buf, "%luK\n", this_leaf->base.size / 1024); | ||
950 | } | ||
951 | |||
952 | static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, | ||
953 | int type, char *buf) | ||
954 | { | ||
955 | const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); | ||
956 | int ret; | ||
957 | |||
958 | if (type) | ||
959 | ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl", | ||
960 | cpumask_pr_args(mask)); | ||
961 | else | ||
962 | ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb", | ||
963 | cpumask_pr_args(mask)); | ||
964 | buf[ret++] = '\n'; | ||
965 | buf[ret] = '\0'; | ||
966 | return ret; | ||
967 | } | ||
968 | |||
969 | static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, | ||
970 | unsigned int cpu) | ||
971 | { | 911 | { |
972 | return show_shared_cpu_map_func(leaf, 0, buf); | 912 | struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); |
973 | } | ||
974 | |||
975 | static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, | ||
976 | unsigned int cpu) | ||
977 | { | ||
978 | return show_shared_cpu_map_func(leaf, 1, buf); | ||
979 | } | ||
980 | 913 | ||
981 | static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, | 914 | if (!num_cache_leaves) |
982 | unsigned int cpu) | ||
983 | { | ||
984 | switch (this_leaf->base.eax.split.type) { | ||
985 | case CACHE_TYPE_DATA: | ||
986 | return sprintf(buf, "Data\n"); | ||
987 | case CACHE_TYPE_INST: | ||
988 | return sprintf(buf, "Instruction\n"); | ||
989 | case CACHE_TYPE_UNIFIED: | ||
990 | return sprintf(buf, "Unified\n"); | ||
991 | default: | ||
992 | return sprintf(buf, "Unknown\n"); | ||
993 | } | ||
994 | } | ||
995 | |||
996 | #define to_object(k) container_of(k, struct _index_kobject, kobj) | ||
997 | #define to_attr(a) container_of(a, struct _cache_attr, attr) | ||
998 | |||
999 | #define define_one_ro(_name) \ | ||
1000 | static struct _cache_attr _name = \ | ||
1001 | __ATTR(_name, 0444, show_##_name, NULL) | ||
1002 | |||
1003 | define_one_ro(level); | ||
1004 | define_one_ro(type); | ||
1005 | define_one_ro(coherency_line_size); | ||
1006 | define_one_ro(physical_line_partition); | ||
1007 | define_one_ro(ways_of_associativity); | ||
1008 | define_one_ro(number_of_sets); | ||
1009 | define_one_ro(size); | ||
1010 | define_one_ro(shared_cpu_map); | ||
1011 | define_one_ro(shared_cpu_list); | ||
1012 | |||
1013 | static struct attribute *default_attrs[] = { | ||
1014 | &type.attr, | ||
1015 | &level.attr, | ||
1016 | &coherency_line_size.attr, | ||
1017 | &physical_line_partition.attr, | ||
1018 | &ways_of_associativity.attr, | ||
1019 | &number_of_sets.attr, | ||
1020 | &size.attr, | ||
1021 | &shared_cpu_map.attr, | ||
1022 | &shared_cpu_list.attr, | ||
1023 | NULL | ||
1024 | }; | ||
1025 | |||
1026 | #ifdef CONFIG_AMD_NB | ||
1027 | static struct attribute **amd_l3_attrs(void) | ||
1028 | { | ||
1029 | static struct attribute **attrs; | ||
1030 | int n; | ||
1031 | |||
1032 | if (attrs) | ||
1033 | return attrs; | ||
1034 | |||
1035 | n = ARRAY_SIZE(default_attrs); | ||
1036 | |||
1037 | if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) | ||
1038 | n += 2; | ||
1039 | |||
1040 | if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) | ||
1041 | n += 1; | ||
1042 | |||
1043 | attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); | ||
1044 | if (attrs == NULL) | ||
1045 | return attrs = default_attrs; | ||
1046 | |||
1047 | for (n = 0; default_attrs[n]; n++) | ||
1048 | attrs[n] = default_attrs[n]; | ||
1049 | |||
1050 | if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { | ||
1051 | attrs[n++] = &cache_disable_0.attr; | ||
1052 | attrs[n++] = &cache_disable_1.attr; | ||
1053 | } | ||
1054 | |||
1055 | if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) | ||
1056 | attrs[n++] = &subcaches.attr; | ||
1057 | |||
1058 | return attrs; | ||
1059 | } | ||
1060 | #endif | ||
1061 | |||
1062 | static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) | ||
1063 | { | ||
1064 | struct _cache_attr *fattr = to_attr(attr); | ||
1065 | struct _index_kobject *this_leaf = to_object(kobj); | ||
1066 | ssize_t ret; | ||
1067 | |||
1068 | ret = fattr->show ? | ||
1069 | fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), | ||
1070 | buf, this_leaf->cpu) : | ||
1071 | 0; | ||
1072 | return ret; | ||
1073 | } | ||
1074 | |||
1075 | static ssize_t store(struct kobject *kobj, struct attribute *attr, | ||
1076 | const char *buf, size_t count) | ||
1077 | { | ||
1078 | struct _cache_attr *fattr = to_attr(attr); | ||
1079 | struct _index_kobject *this_leaf = to_object(kobj); | ||
1080 | ssize_t ret; | ||
1081 | |||
1082 | ret = fattr->store ? | ||
1083 | fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), | ||
1084 | buf, count, this_leaf->cpu) : | ||
1085 | 0; | ||
1086 | return ret; | ||
1087 | } | ||
1088 | |||
1089 | static const struct sysfs_ops sysfs_ops = { | ||
1090 | .show = show, | ||
1091 | .store = store, | ||
1092 | }; | ||
1093 | |||
1094 | static struct kobj_type ktype_cache = { | ||
1095 | .sysfs_ops = &sysfs_ops, | ||
1096 | .default_attrs = default_attrs, | ||
1097 | }; | ||
1098 | |||
1099 | static struct kobj_type ktype_percpu_entry = { | ||
1100 | .sysfs_ops = &sysfs_ops, | ||
1101 | }; | ||
1102 | |||
1103 | static void cpuid4_cache_sysfs_exit(unsigned int cpu) | ||
1104 | { | ||
1105 | kfree(per_cpu(ici_cache_kobject, cpu)); | ||
1106 | kfree(per_cpu(ici_index_kobject, cpu)); | ||
1107 | per_cpu(ici_cache_kobject, cpu) = NULL; | ||
1108 | per_cpu(ici_index_kobject, cpu) = NULL; | ||
1109 | free_cache_attributes(cpu); | ||
1110 | } | ||
1111 | |||
1112 | static int cpuid4_cache_sysfs_init(unsigned int cpu) | ||
1113 | { | ||
1114 | int err; | ||
1115 | |||
1116 | if (num_cache_leaves == 0) | ||
1117 | return -ENOENT; | 915 | return -ENOENT; |
1118 | 916 | if (!this_cpu_ci) | |
1119 | err = detect_cache_attributes(cpu); | 917 | return -EINVAL; |
1120 | if (err) | 918 | this_cpu_ci->num_levels = 3; |
1121 | return err; | 919 | this_cpu_ci->num_leaves = num_cache_leaves; |
1122 | |||
1123 | /* Allocate all required memory */ | ||
1124 | per_cpu(ici_cache_kobject, cpu) = | ||
1125 | kzalloc(sizeof(struct kobject), GFP_KERNEL); | ||
1126 | if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL)) | ||
1127 | goto err_out; | ||
1128 | |||
1129 | per_cpu(ici_index_kobject, cpu) = kzalloc( | ||
1130 | sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); | ||
1131 | if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL)) | ||
1132 | goto err_out; | ||
1133 | |||
1134 | return 0; | 920 | return 0; |
1135 | |||
1136 | err_out: | ||
1137 | cpuid4_cache_sysfs_exit(cpu); | ||
1138 | return -ENOMEM; | ||
1139 | } | 921 | } |
1140 | 922 | ||
1141 | static DECLARE_BITMAP(cache_dev_map, NR_CPUS); | 923 | static int __populate_cache_leaves(unsigned int cpu) |
1142 | |||
1143 | /* Add/Remove cache interface for CPU device */ | ||
1144 | static int cache_add_dev(struct device *dev) | ||
1145 | { | 924 | { |
1146 | unsigned int cpu = dev->id; | 925 | unsigned int idx, ret; |
1147 | unsigned long i, j; | 926 | struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); |
1148 | struct _index_kobject *this_object; | 927 | struct cacheinfo *this_leaf = this_cpu_ci->info_list; |
1149 | struct _cpuid4_info *this_leaf; | 928 | struct _cpuid4_info_regs id4_regs = {}; |
1150 | int retval; | ||
1151 | |||
1152 | retval = cpuid4_cache_sysfs_init(cpu); | ||
1153 | if (unlikely(retval < 0)) | ||
1154 | return retval; | ||
1155 | |||
1156 | retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), | ||
1157 | &ktype_percpu_entry, | ||
1158 | &dev->kobj, "%s", "cache"); | ||
1159 | if (retval < 0) { | ||
1160 | cpuid4_cache_sysfs_exit(cpu); | ||
1161 | return retval; | ||
1162 | } | ||
1163 | 929 | ||
1164 | for (i = 0; i < num_cache_leaves; i++) { | 930 | for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { |
1165 | this_object = INDEX_KOBJECT_PTR(cpu, i); | 931 | ret = cpuid4_cache_lookup_regs(idx, &id4_regs); |
1166 | this_object->cpu = cpu; | 932 | if (ret) |
1167 | this_object->index = i; | 933 | return ret; |
1168 | 934 | ci_leaf_init(this_leaf++, &id4_regs); | |
1169 | this_leaf = CPUID4_INFO_IDX(cpu, i); | 935 | __cache_cpumap_setup(cpu, idx, &id4_regs); |
1170 | |||
1171 | ktype_cache.default_attrs = default_attrs; | ||
1172 | #ifdef CONFIG_AMD_NB | ||
1173 | if (this_leaf->base.nb) | ||
1174 | ktype_cache.default_attrs = amd_l3_attrs(); | ||
1175 | #endif | ||
1176 | retval = kobject_init_and_add(&(this_object->kobj), | ||
1177 | &ktype_cache, | ||
1178 | per_cpu(ici_cache_kobject, cpu), | ||
1179 | "index%1lu", i); | ||
1180 | if (unlikely(retval)) { | ||
1181 | for (j = 0; j < i; j++) | ||
1182 | kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); | ||
1183 | kobject_put(per_cpu(ici_cache_kobject, cpu)); | ||
1184 | cpuid4_cache_sysfs_exit(cpu); | ||
1185 | return retval; | ||
1186 | } | ||
1187 | kobject_uevent(&(this_object->kobj), KOBJ_ADD); | ||
1188 | } | 936 | } |
1189 | cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); | ||
1190 | |||
1191 | kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD); | ||
1192 | return 0; | 937 | return 0; |
1193 | } | 938 | } |
1194 | 939 | ||
1195 | static void cache_remove_dev(struct device *dev) | 940 | DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) |
1196 | { | 941 | DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) |
1197 | unsigned int cpu = dev->id; | ||
1198 | unsigned long i; | ||
1199 | |||
1200 | if (per_cpu(ici_cpuid4_info, cpu) == NULL) | ||
1201 | return; | ||
1202 | if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) | ||
1203 | return; | ||
1204 | cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); | ||
1205 | |||
1206 | for (i = 0; i < num_cache_leaves; i++) | ||
1207 | kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); | ||
1208 | kobject_put(per_cpu(ici_cache_kobject, cpu)); | ||
1209 | cpuid4_cache_sysfs_exit(cpu); | ||
1210 | } | ||
1211 | |||
1212 | static int cacheinfo_cpu_callback(struct notifier_block *nfb, | ||
1213 | unsigned long action, void *hcpu) | ||
1214 | { | ||
1215 | unsigned int cpu = (unsigned long)hcpu; | ||
1216 | struct device *dev; | ||
1217 | |||
1218 | dev = get_cpu_device(cpu); | ||
1219 | switch (action) { | ||
1220 | case CPU_ONLINE: | ||
1221 | case CPU_ONLINE_FROZEN: | ||
1222 | cache_add_dev(dev); | ||
1223 | break; | ||
1224 | case CPU_DEAD: | ||
1225 | case CPU_DEAD_FROZEN: | ||
1226 | cache_remove_dev(dev); | ||
1227 | break; | ||
1228 | } | ||
1229 | return NOTIFY_OK; | ||
1230 | } | ||
1231 | |||
1232 | static struct notifier_block cacheinfo_cpu_notifier = { | ||
1233 | .notifier_call = cacheinfo_cpu_callback, | ||
1234 | }; | ||
1235 | |||
1236 | static int __init cache_sysfs_init(void) | ||
1237 | { | ||
1238 | int i, err = 0; | ||
1239 | |||
1240 | if (num_cache_leaves == 0) | ||
1241 | return 0; | ||
1242 | |||
1243 | cpu_notifier_register_begin(); | ||
1244 | for_each_online_cpu(i) { | ||
1245 | struct device *dev = get_cpu_device(i); | ||
1246 | |||
1247 | err = cache_add_dev(dev); | ||
1248 | if (err) | ||
1249 | goto out; | ||
1250 | } | ||
1251 | __register_hotcpu_notifier(&cacheinfo_cpu_notifier); | ||
1252 | |||
1253 | out: | ||
1254 | cpu_notifier_register_done(); | ||
1255 | return err; | ||
1256 | } | ||
1257 | |||
1258 | device_initcall(cache_sysfs_init); | ||
1259 | |||
1260 | #endif | ||
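The rewritten __cache_cpumap_setup() above derives each leaf's shared_cpu_map from APIC IDs alone: CPUID reports how many threads share the cache, and two logical CPUs share the leaf when their APIC IDs are equal after shifting out the low index_msb bits. A minimal userspace sketch of that grouping rule, assuming made-up APIC IDs and sharing count, with ceil_log2() standing in for the kernel's get_count_order():

/* Illustrative only: models the APIC-ID grouping used by
 * __cache_cpumap_setup(); not kernel code. */
#include <stdio.h>

/* Stand-in for get_count_order(): smallest order with (1 << order) >= n. */
static int ceil_log2(unsigned int n)
{
        int order = 0;

        while ((1u << order) < n)
                order++;
        return order;
}

int main(void)
{
        /* Hypothetical APIC IDs for 8 logical CPUs (2 threads per core). */
        unsigned int apicid[] = { 0, 1, 2, 3, 4, 5, 6, 7 };
        unsigned int nshared = 2;       /* e.g. an L1 shared by 2 threads */
        int index_msb = ceil_log2(nshared);
        int i, j;

        for (i = 0; i < 8; i++) {
                printf("CPU%d shares the leaf with:", i);
                for (j = 0; j < 8; j++)
                        if (apicid[i] >> index_msb == apicid[j] >> index_msb)
                                printf(" %d", j);
                printf("\n");
        }
        return 0;
}

With nshared = 2 this groups CPUs {0,1}, {2,3}, {4,5}, {6,7}; the AMD topoext path in the patch reaches the same kind of grouping by computing a [first, last] APIC-ID window instead.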
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h new file mode 100644 index 000000000000..1c338b0eba05 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_pt.h | |||
@@ -0,0 +1,131 @@ | |||
1 | /* | ||
2 | * Intel(R) Processor Trace PMU driver for perf | ||
3 | * Copyright (c) 2013-2014, Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * Intel PT is specified in the Intel Architecture Instruction Set Extensions | ||
15 | * Programming Reference: | ||
16 | * http://software.intel.com/en-us/intel-isa-extensions | ||
17 | */ | ||
18 | |||
19 | #ifndef __INTEL_PT_H__ | ||
20 | #define __INTEL_PT_H__ | ||
21 | |||
22 | /* | ||
23 | * Single-entry ToPA: when this close to region boundary, switch | ||
24 | * buffers to avoid losing data. | ||
25 | */ | ||
26 | #define TOPA_PMI_MARGIN 512 | ||
27 | |||
28 | /* | ||
29 | * Table of Physical Addresses bits | ||
30 | */ | ||
31 | enum topa_sz { | ||
32 | TOPA_4K = 0, | ||
33 | TOPA_8K, | ||
34 | TOPA_16K, | ||
35 | TOPA_32K, | ||
36 | TOPA_64K, | ||
37 | TOPA_128K, | ||
38 | TOPA_256K, | ||
39 | TOPA_512K, | ||
40 | TOPA_1MB, | ||
41 | TOPA_2MB, | ||
42 | TOPA_4MB, | ||
43 | TOPA_8MB, | ||
44 | TOPA_16MB, | ||
45 | TOPA_32MB, | ||
46 | TOPA_64MB, | ||
47 | TOPA_128MB, | ||
48 | TOPA_SZ_END, | ||
49 | }; | ||
50 | |||
51 | static inline unsigned int sizes(enum topa_sz tsz) | ||
52 | { | ||
53 | return 1 << (tsz + 12); | ||
54 | }; | ||
55 | |||
56 | struct topa_entry { | ||
57 | u64 end : 1; | ||
58 | u64 rsvd0 : 1; | ||
59 | u64 intr : 1; | ||
60 | u64 rsvd1 : 1; | ||
61 | u64 stop : 1; | ||
62 | u64 rsvd2 : 1; | ||
63 | u64 size : 4; | ||
64 | u64 rsvd3 : 2; | ||
65 | u64 base : 36; | ||
66 | u64 rsvd4 : 16; | ||
67 | }; | ||
68 | |||
69 | #define TOPA_SHIFT 12 | ||
70 | #define PT_CPUID_LEAVES 2 | ||
71 | |||
72 | enum pt_capabilities { | ||
73 | PT_CAP_max_subleaf = 0, | ||
74 | PT_CAP_cr3_filtering, | ||
75 | PT_CAP_topa_output, | ||
76 | PT_CAP_topa_multiple_entries, | ||
77 | PT_CAP_payloads_lip, | ||
78 | }; | ||
79 | |||
80 | struct pt_pmu { | ||
81 | struct pmu pmu; | ||
82 | u32 caps[4 * PT_CPUID_LEAVES]; | ||
83 | }; | ||
84 | |||
85 | /** | ||
86 | * struct pt_buffer - buffer configuration; one buffer per task_struct or | ||
87 | * cpu, depending on perf event configuration | ||
88 | * @cpu: cpu for per-cpu allocation | ||
89 | * @tables: list of ToPA tables in this buffer | ||
90 | * @first: shorthand for first topa table | ||
91 | * @last: shorthand for last topa table | ||
92 | * @cur: current topa table | ||
93 | * @nr_pages: buffer size in pages | ||
94 | * @cur_idx: current output region's index within @cur table | ||
95 | * @output_off: offset within the current output region | ||
96 | * @data_size: running total of the amount of data in this buffer | ||
97 | * @lost: if data was lost/truncated | ||
98 | * @head: logical write offset inside the buffer | ||
99 | * @snapshot: if this is for a snapshot/overwrite counter | ||
100 | * @stop_pos: STOP topa entry in the buffer | ||
101 | * @intr_pos: INT topa entry in the buffer | ||
102 | * @data_pages: array of pages from perf | ||
103 | * @topa_index: table of topa entries indexed by page offset | ||
104 | */ | ||
105 | struct pt_buffer { | ||
106 | int cpu; | ||
107 | struct list_head tables; | ||
108 | struct topa *first, *last, *cur; | ||
109 | unsigned int cur_idx; | ||
110 | size_t output_off; | ||
111 | unsigned long nr_pages; | ||
112 | local_t data_size; | ||
113 | local_t lost; | ||
114 | local64_t head; | ||
115 | bool snapshot; | ||
116 | unsigned long stop_pos, intr_pos; | ||
117 | void **data_pages; | ||
118 | struct topa_entry *topa_index[0]; | ||
119 | }; | ||
120 | |||
121 | /** | ||
122 | * struct pt - per-cpu pt context | ||
123 | * @handle: perf output handle | ||
124 | * @handle_nmi: do handle PT PMI on this cpu, there's an active event | ||
125 | */ | ||
126 | struct pt { | ||
127 | struct perf_output_handle handle; | ||
128 | int handle_nmi; | ||
129 | }; | ||
130 | |||
131 | #endif /* __INTEL_PT_H__ */ | ||
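struct topa_entry above mirrors the hardware's Table of Physical Addresses format: the output region's physical address occupies the 36-bit base field starting at bit 12 (hence TOPA_SHIFT), and the 4-bit size field encodes the region size as 4 KiB << size, which is what the sizes() helper computes. A standalone sketch of that encoding, assuming made-up physical address and flag values:

/* Illustrative only: models the ToPA entry encoding described by
 * struct topa_entry; not the driver's code. */
#include <stdio.h>
#include <stdint.h>

#define TOPA_SHIFT 12

/* Region size for a given size field: 4 KiB << size. */
static uint64_t topa_region_size(unsigned int size_field)
{
        return 1ull << (size_field + TOPA_SHIFT);
}

static uint64_t topa_entry_pack(uint64_t phys, unsigned int size_field,
                                int intr, int stop, int end)
{
        uint64_t e = 0;

        e |= (uint64_t)(end  & 1) << 0;                         /* END flag   */
        e |= (uint64_t)(intr & 1) << 2;                         /* INT flag   */
        e |= (uint64_t)(stop & 1) << 4;                         /* STOP flag  */
        e |= (uint64_t)(size_field & 0xf) << 6;                 /* region size */
        e |= ((phys >> TOPA_SHIFT) & ((1ull << 36) - 1)) << 12; /* 36-bit base */
        return e;
}

int main(void)
{
        uint64_t entry = topa_entry_pack(0x12345000ull, 9 /* TOPA_2MB */, 1, 0, 0);

        printf("entry=0x%llx, region size=%llu bytes\n",
               (unsigned long long)entry,
               (unsigned long long)topa_region_size(9));
        return 0;
}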
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 10b46906767f..fe32074b865b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h | |||
@@ -14,6 +14,7 @@ enum severity_level { | |||
14 | }; | 14 | }; |
15 | 15 | ||
16 | #define ATTR_LEN 16 | 16 | #define ATTR_LEN 16 |
17 | #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ | ||
17 | 18 | ||
18 | /* One object for each MCE bank, shared by all CPUs */ | 19 | /* One object for each MCE bank, shared by all CPUs */ |
19 | struct mce_bank { | 20 | struct mce_bank { |
@@ -23,20 +24,20 @@ struct mce_bank { | |||
23 | char attrname[ATTR_LEN]; /* attribute name */ | 24 | char attrname[ATTR_LEN]; /* attribute name */ |
24 | }; | 25 | }; |
25 | 26 | ||
26 | int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); | 27 | extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); |
27 | struct dentry *mce_get_debugfs_dir(void); | 28 | struct dentry *mce_get_debugfs_dir(void); |
28 | 29 | ||
29 | extern struct mce_bank *mce_banks; | 30 | extern struct mce_bank *mce_banks; |
30 | extern mce_banks_t mce_banks_ce_disabled; | 31 | extern mce_banks_t mce_banks_ce_disabled; |
31 | 32 | ||
32 | #ifdef CONFIG_X86_MCE_INTEL | 33 | #ifdef CONFIG_X86_MCE_INTEL |
33 | unsigned long mce_intel_adjust_timer(unsigned long interval); | 34 | unsigned long cmci_intel_adjust_timer(unsigned long interval); |
34 | void mce_intel_cmci_poll(void); | 35 | bool mce_intel_cmci_poll(void); |
35 | void mce_intel_hcpu_update(unsigned long cpu); | 36 | void mce_intel_hcpu_update(unsigned long cpu); |
36 | void cmci_disable_bank(int bank); | 37 | void cmci_disable_bank(int bank); |
37 | #else | 38 | #else |
38 | # define mce_intel_adjust_timer mce_adjust_timer_default | 39 | # define cmci_intel_adjust_timer mce_adjust_timer_default |
39 | static inline void mce_intel_cmci_poll(void) { } | 40 | static inline bool mce_intel_cmci_poll(void) { return false; } |
40 | static inline void mce_intel_hcpu_update(unsigned long cpu) { } | 41 | static inline void mce_intel_hcpu_update(unsigned long cpu) { } |
41 | static inline void cmci_disable_bank(int bank) { } | 42 | static inline void cmci_disable_bank(int bank) { } |
42 | #endif | 43 | #endif |
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 8bb433043a7f..9c682c222071 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c | |||
@@ -186,7 +186,61 @@ static int error_context(struct mce *m) | |||
186 | return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; | 186 | return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; |
187 | } | 187 | } |
188 | 188 | ||
189 | int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) | 189 | /* |
190 | * See AMD Error Scope Hierarchy table in a newer BKDG. For example | ||
191 | * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" | ||
192 | */ | ||
193 | static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp) | ||
194 | { | ||
195 | enum context ctx = error_context(m); | ||
196 | |||
197 | /* Processor Context Corrupt, no need to fumble too much, die! */ | ||
198 | if (m->status & MCI_STATUS_PCC) | ||
199 | return MCE_PANIC_SEVERITY; | ||
200 | |||
201 | if (m->status & MCI_STATUS_UC) { | ||
202 | |||
203 | /* | ||
204 | * On older systems where overflow_recov flag is not present, we | ||
205 | * should simply panic if an error overflow occurs. If | ||
206 | * overflow_recov flag is present and set, then software can try | ||
207 | * to at least kill process to prolong system operation. | ||
208 | */ | ||
209 | if (mce_flags.overflow_recov) { | ||
210 | /* software can try to contain */ | ||
211 | if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL)) | ||
212 | return MCE_PANIC_SEVERITY; | ||
213 | |||
214 | /* kill current process */ | ||
215 | return MCE_AR_SEVERITY; | ||
216 | } else { | ||
217 | /* at least one error was not logged */ | ||
218 | if (m->status & MCI_STATUS_OVER) | ||
219 | return MCE_PANIC_SEVERITY; | ||
220 | } | ||
221 | |||
222 | /* | ||
223 | * For any other case, return MCE_UC_SEVERITY so that we log the | ||
224 | * error and exit #MC handler. | ||
225 | */ | ||
226 | return MCE_UC_SEVERITY; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * deferred error: poll handler catches these and adds to mce_ring so | ||
231 | * memory-failure can take recovery actions. | ||
232 | */ | ||
233 | if (m->status & MCI_STATUS_DEFERRED) | ||
234 | return MCE_DEFERRED_SEVERITY; | ||
235 | |||
236 | /* | ||
237 | * corrected error: poll handler catches these and passes responsibility | ||
238 | * of decoding the error to EDAC | ||
239 | */ | ||
240 | return MCE_KEEP_SEVERITY; | ||
241 | } | ||
242 | |||
243 | static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp) | ||
190 | { | 244 | { |
191 | enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); | 245 | enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); |
192 | enum context ctx = error_context(m); | 246 | enum context ctx = error_context(m); |
@@ -216,6 +270,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) | |||
216 | } | 270 | } |
217 | } | 271 | } |
218 | 272 | ||
273 | /* Default to mce_severity_intel */ | ||
274 | int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = | ||
275 | mce_severity_intel; | ||
276 | |||
277 | void __init mcheck_vendor_init_severity(void) | ||
278 | { | ||
279 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) | ||
280 | mce_severity = mce_severity_amd; | ||
281 | } | ||
282 | |||
219 | #ifdef CONFIG_DEBUG_FS | 283 | #ifdef CONFIG_DEBUG_FS |
220 | static void *s_start(struct seq_file *f, loff_t *pos) | 284 | static void *s_start(struct seq_file *f, loff_t *pos) |
221 | { | 285 | { |
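The decision order in mce_severity_amd() above can be read as a small grading function: PCC always panics, uncorrected errors are either contained or escalated depending on overflow_recov, and deferred or corrected errors are left to the poll handler. A userspace restatement of that order, using simplified stand-in fields rather than the kernel's MCi_STATUS bit definitions:

/* Illustrative only: restates the decision order of mce_severity_amd();
 * field names and severity values are simplified stand-ins. */
#include <stdbool.h>
#include <stdio.h>

enum sev { SEV_KEEP, SEV_DEFERRED, SEV_UC, SEV_AR, SEV_PANIC };

struct err {
        bool pcc;               /* processor context corrupt */
        bool uc;                /* uncorrected */
        bool over;              /* overflow: an error was lost */
        bool deferred;          /* deferred error */
        bool ripv;              /* return IP is valid */
        bool in_kernel;         /* error hit kernel context */
        bool overflow_recov;    /* CPU advertises overflow recovery */
};

static enum sev grade_amd(const struct err *e)
{
        if (e->pcc)
                return SEV_PANIC;               /* context corrupt: die */

        if (e->uc) {
                if (e->overflow_recov) {
                        /* software can try to contain the error */
                        if (!e->ripv && e->in_kernel)
                                return SEV_PANIC;
                        return SEV_AR;          /* kill the current process */
                }
                if (e->over)
                        return SEV_PANIC;       /* an error was not logged */
                return SEV_UC;                  /* log it and leave #MC */
        }

        if (e->deferred)
                return SEV_DEFERRED;            /* poll handler queues recovery */

        return SEV_KEEP;                        /* corrected: EDAC decodes it */
}

int main(void)
{
        struct err e = { .uc = true, .overflow_recov = true, .ripv = true };

        printf("severity class = %d\n", grade_amd(&e));
        return 0;
}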
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3c036cb4a370..e535533d5ab8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -60,11 +60,12 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); | |||
60 | #define CREATE_TRACE_POINTS | 60 | #define CREATE_TRACE_POINTS |
61 | #include <trace/events/mce.h> | 61 | #include <trace/events/mce.h> |
62 | 62 | ||
63 | #define SPINUNIT 100 /* 100ns */ | 63 | #define SPINUNIT 100 /* 100ns */ |
64 | 64 | ||
65 | DEFINE_PER_CPU(unsigned, mce_exception_count); | 65 | DEFINE_PER_CPU(unsigned, mce_exception_count); |
66 | 66 | ||
67 | struct mce_bank *mce_banks __read_mostly; | 67 | struct mce_bank *mce_banks __read_mostly; |
68 | struct mce_vendor_flags mce_flags __read_mostly; | ||
68 | 69 | ||
69 | struct mca_config mca_cfg __read_mostly = { | 70 | struct mca_config mca_cfg __read_mostly = { |
70 | .bootlog = -1, | 71 | .bootlog = -1, |
@@ -89,9 +90,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); | |||
89 | static DEFINE_PER_CPU(struct mce, mces_seen); | 90 | static DEFINE_PER_CPU(struct mce, mces_seen); |
90 | static int cpu_missing; | 91 | static int cpu_missing; |
91 | 92 | ||
92 | /* CMCI storm detection filter */ | ||
93 | static DEFINE_PER_CPU(unsigned long, mce_polled_error); | ||
94 | |||
95 | /* | 93 | /* |
96 | * MCA banks polled by the period polling timer for corrected events. | 94 | * MCA banks polled by the period polling timer for corrected events. |
97 | * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). | 95 | * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). |
@@ -622,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); | |||
622 | * is already totally * confused. In this case it's likely it will | 620 | * is already totally * confused. In this case it's likely it will |
623 | * not fully execute the machine check handler either. | 621 | * not fully execute the machine check handler either. |
624 | */ | 622 | */ |
625 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | 623 | bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) |
626 | { | 624 | { |
625 | bool error_logged = false; | ||
627 | struct mce m; | 626 | struct mce m; |
628 | int severity; | 627 | int severity; |
629 | int i; | 628 | int i; |
@@ -646,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
646 | if (!(m.status & MCI_STATUS_VAL)) | 645 | if (!(m.status & MCI_STATUS_VAL)) |
647 | continue; | 646 | continue; |
648 | 647 | ||
649 | this_cpu_write(mce_polled_error, 1); | 648 | |
650 | /* | 649 | /* |
651 | * Uncorrected or signalled events are handled by the exception | 650 | * Uncorrected or signalled events are handled by the exception |
652 | * handler when it is enabled, so don't process those here. | 651 | * handler when it is enabled, so don't process those here. |
@@ -679,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
679 | * Don't get the IP here because it's unlikely to | 678 | * Don't get the IP here because it's unlikely to |
680 | * have anything to do with the actual error location. | 679 | * have anything to do with the actual error location. |
681 | */ | 680 | */ |
682 | if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) | 681 | if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) { |
682 | error_logged = true; | ||
683 | mce_log(&m); | 683 | mce_log(&m); |
684 | } | ||
684 | 685 | ||
685 | /* | 686 | /* |
686 | * Clear state for this bank. | 687 | * Clear state for this bank. |
@@ -694,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
694 | */ | 695 | */ |
695 | 696 | ||
696 | sync_core(); | 697 | sync_core(); |
698 | |||
699 | return error_logged; | ||
697 | } | 700 | } |
698 | EXPORT_SYMBOL_GPL(machine_check_poll); | 701 | EXPORT_SYMBOL_GPL(machine_check_poll); |
699 | 702 | ||
@@ -813,7 +816,7 @@ static void mce_reign(void) | |||
813 | * other CPUs. | 816 | * other CPUs. |
814 | */ | 817 | */ |
815 | if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) | 818 | if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) |
816 | mce_panic("Fatal Machine check", m, msg); | 819 | mce_panic("Fatal machine check", m, msg); |
817 | 820 | ||
818 | /* | 821 | /* |
819 | * For UC somewhere we let the CPU who detects it handle it. | 822 | * For UC somewhere we let the CPU who detects it handle it. |
@@ -826,7 +829,7 @@ static void mce_reign(void) | |||
826 | * source or one CPU is hung. Panic. | 829 | * source or one CPU is hung. Panic. |
827 | */ | 830 | */ |
828 | if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) | 831 | if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) |
829 | mce_panic("Machine check from unknown source", NULL, NULL); | 832 | mce_panic("Fatal machine check from unknown source", NULL, NULL); |
830 | 833 | ||
831 | /* | 834 | /* |
832 | * Now clear all the mces_seen so that they don't reappear on | 835 | * Now clear all the mces_seen so that they don't reappear on |
@@ -1258,7 +1261,7 @@ void mce_log_therm_throt_event(__u64 status) | |||
1258 | * poller finds an MCE, poll 2x faster. When the poller finds no more | 1261 | * poller finds an MCE, poll 2x faster. When the poller finds no more |
1259 | * errors, poll 2x slower (up to check_interval seconds). | 1262 | * errors, poll 2x slower (up to check_interval seconds). |
1260 | */ | 1263 | */ |
1261 | static unsigned long check_interval = 5 * 60; /* 5 minutes */ | 1264 | static unsigned long check_interval = INITIAL_CHECK_INTERVAL; |
1262 | 1265 | ||
1263 | static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ | 1266 | static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ |
1264 | static DEFINE_PER_CPU(struct timer_list, mce_timer); | 1267 | static DEFINE_PER_CPU(struct timer_list, mce_timer); |
@@ -1268,49 +1271,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) | |||
1268 | return interval; | 1271 | return interval; |
1269 | } | 1272 | } |
1270 | 1273 | ||
1271 | static unsigned long (*mce_adjust_timer)(unsigned long interval) = | 1274 | static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; |
1272 | mce_adjust_timer_default; | ||
1273 | 1275 | ||
1274 | static int cmc_error_seen(void) | 1276 | static void __restart_timer(struct timer_list *t, unsigned long interval) |
1275 | { | 1277 | { |
1276 | unsigned long *v = this_cpu_ptr(&mce_polled_error); | 1278 | unsigned long when = jiffies + interval; |
1279 | unsigned long flags; | ||
1280 | |||
1281 | local_irq_save(flags); | ||
1277 | 1282 | ||
1278 | return test_and_clear_bit(0, v); | 1283 | if (timer_pending(t)) { |
1284 | if (time_before(when, t->expires)) | ||
1285 | mod_timer_pinned(t, when); | ||
1286 | } else { | ||
1287 | t->expires = round_jiffies(when); | ||
1288 | add_timer_on(t, smp_processor_id()); | ||
1289 | } | ||
1290 | |||
1291 | local_irq_restore(flags); | ||
1279 | } | 1292 | } |
1280 | 1293 | ||
1281 | static void mce_timer_fn(unsigned long data) | 1294 | static void mce_timer_fn(unsigned long data) |
1282 | { | 1295 | { |
1283 | struct timer_list *t = this_cpu_ptr(&mce_timer); | 1296 | struct timer_list *t = this_cpu_ptr(&mce_timer); |
1297 | int cpu = smp_processor_id(); | ||
1284 | unsigned long iv; | 1298 | unsigned long iv; |
1285 | int notify; | ||
1286 | 1299 | ||
1287 | WARN_ON(smp_processor_id() != data); | 1300 | WARN_ON(cpu != data); |
1301 | |||
1302 | iv = __this_cpu_read(mce_next_interval); | ||
1288 | 1303 | ||
1289 | if (mce_available(this_cpu_ptr(&cpu_info))) { | 1304 | if (mce_available(this_cpu_ptr(&cpu_info))) { |
1290 | machine_check_poll(MCP_TIMESTAMP, | 1305 | machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks)); |
1291 | this_cpu_ptr(&mce_poll_banks)); | 1306 | |
1292 | mce_intel_cmci_poll(); | 1307 | if (mce_intel_cmci_poll()) { |
1308 | iv = mce_adjust_timer(iv); | ||
1309 | goto done; | ||
1310 | } | ||
1293 | } | 1311 | } |
1294 | 1312 | ||
1295 | /* | 1313 | /* |
1296 | * Alert userspace if needed. If we logged an MCE, reduce the | 1314 | * Alert userspace if needed. If we logged an MCE, reduce the polling |
1297 | * polling interval, otherwise increase the polling interval. | 1315 | * interval, otherwise increase the polling interval. |
1298 | */ | 1316 | */ |
1299 | iv = __this_cpu_read(mce_next_interval); | 1317 | if (mce_notify_irq()) |
1300 | notify = mce_notify_irq(); | ||
1301 | notify |= cmc_error_seen(); | ||
1302 | if (notify) { | ||
1303 | iv = max(iv / 2, (unsigned long) HZ/100); | 1318 | iv = max(iv / 2, (unsigned long) HZ/100); |
1304 | } else { | 1319 | else |
1305 | iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); | 1320 | iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); |
1306 | iv = mce_adjust_timer(iv); | 1321 | |
1307 | } | 1322 | done: |
1308 | __this_cpu_write(mce_next_interval, iv); | 1323 | __this_cpu_write(mce_next_interval, iv); |
1309 | /* Might have become 0 after CMCI storm subsided */ | 1324 | __restart_timer(t, iv); |
1310 | if (iv) { | ||
1311 | t->expires = jiffies + iv; | ||
1312 | add_timer_on(t, smp_processor_id()); | ||
1313 | } | ||
1314 | } | 1325 | } |
1315 | 1326 | ||
1316 | /* | 1327 | /* |
@@ -1319,16 +1330,10 @@ static void mce_timer_fn(unsigned long data) | |||
1319 | void mce_timer_kick(unsigned long interval) | 1330 | void mce_timer_kick(unsigned long interval) |
1320 | { | 1331 | { |
1321 | struct timer_list *t = this_cpu_ptr(&mce_timer); | 1332 | struct timer_list *t = this_cpu_ptr(&mce_timer); |
1322 | unsigned long when = jiffies + interval; | ||
1323 | unsigned long iv = __this_cpu_read(mce_next_interval); | 1333 | unsigned long iv = __this_cpu_read(mce_next_interval); |
1324 | 1334 | ||
1325 | if (timer_pending(t)) { | 1335 | __restart_timer(t, interval); |
1326 | if (time_before(when, t->expires)) | 1336 | |
1327 | mod_timer_pinned(t, when); | ||
1328 | } else { | ||
1329 | t->expires = round_jiffies(when); | ||
1330 | add_timer_on(t, smp_processor_id()); | ||
1331 | } | ||
1332 | if (interval < iv) | 1337 | if (interval < iv) |
1333 | __this_cpu_write(mce_next_interval, interval); | 1338 | __this_cpu_write(mce_next_interval, interval); |
1334 | } | 1339 | } |
@@ -1525,45 +1530,46 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) | |||
1525 | * Various K7s with broken bank 0 around. Always disable | 1530 | * Various K7s with broken bank 0 around. Always disable |
1526 | * by default. | 1531 | * by default. |
1527 | */ | 1532 | */ |
1528 | if (c->x86 == 6 && cfg->banks > 0) | 1533 | if (c->x86 == 6 && cfg->banks > 0) |
1529 | mce_banks[0].ctl = 0; | 1534 | mce_banks[0].ctl = 0; |
1530 | 1535 | ||
1531 | /* | 1536 | /* |
1532 | * Turn off MC4_MISC thresholding banks on those models since | 1537 | * overflow_recov is supported for F15h Models 00h-0fh |
1533 | * they're not supported there. | 1538 | * even though we don't have a CPUID bit for it. |
1534 | */ | 1539 | */ |
1535 | if (c->x86 == 0x15 && | 1540 | if (c->x86 == 0x15 && c->x86_model <= 0xf) |
1536 | (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { | 1541 | mce_flags.overflow_recov = 1; |
1537 | int i; | 1542 | |
1538 | u64 val, hwcr; | 1543 | /* |
1539 | bool need_toggle; | 1544 | * Turn off MC4_MISC thresholding banks on those models since |
1540 | u32 msrs[] = { | 1545 | * they're not supported there. |
1546 | */ | ||
1547 | if (c->x86 == 0x15 && | ||
1548 | (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { | ||
1549 | int i; | ||
1550 | u64 hwcr; | ||
1551 | bool need_toggle; | ||
1552 | u32 msrs[] = { | ||
1541 | 0x00000413, /* MC4_MISC0 */ | 1553 | 0x00000413, /* MC4_MISC0 */ |
1542 | 0xc0000408, /* MC4_MISC1 */ | 1554 | 0xc0000408, /* MC4_MISC1 */ |
1543 | }; | 1555 | }; |
1544 | 1556 | ||
1545 | rdmsrl(MSR_K7_HWCR, hwcr); | 1557 | rdmsrl(MSR_K7_HWCR, hwcr); |
1546 | 1558 | ||
1547 | /* McStatusWrEn has to be set */ | 1559 | /* McStatusWrEn has to be set */ |
1548 | need_toggle = !(hwcr & BIT(18)); | 1560 | need_toggle = !(hwcr & BIT(18)); |
1549 | 1561 | ||
1550 | if (need_toggle) | 1562 | if (need_toggle) |
1551 | wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); | 1563 | wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); |
1552 | 1564 | ||
1553 | for (i = 0; i < ARRAY_SIZE(msrs); i++) { | 1565 | /* Clear CntP bit safely */ |
1554 | rdmsrl(msrs[i], val); | 1566 | for (i = 0; i < ARRAY_SIZE(msrs); i++) |
1567 | msr_clear_bit(msrs[i], 62); | ||
1555 | 1568 | ||
1556 | /* CntP bit set? */ | 1569 | /* restore old settings */ |
1557 | if (val & BIT_64(62)) { | 1570 | if (need_toggle) |
1558 | val &= ~BIT_64(62); | 1571 | wrmsrl(MSR_K7_HWCR, hwcr); |
1559 | wrmsrl(msrs[i], val); | 1572 | } |
1560 | } | ||
1561 | } | ||
1562 | |||
1563 | /* restore old settings */ | ||
1564 | if (need_toggle) | ||
1565 | wrmsrl(MSR_K7_HWCR, hwcr); | ||
1566 | } | ||
1567 | } | 1573 | } |
1568 | 1574 | ||
1569 | if (c->x86_vendor == X86_VENDOR_INTEL) { | 1575 | if (c->x86_vendor == X86_VENDOR_INTEL) { |
@@ -1629,10 +1635,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) | |||
1629 | switch (c->x86_vendor) { | 1635 | switch (c->x86_vendor) { |
1630 | case X86_VENDOR_INTEL: | 1636 | case X86_VENDOR_INTEL: |
1631 | mce_intel_feature_init(c); | 1637 | mce_intel_feature_init(c); |
1632 | mce_adjust_timer = mce_intel_adjust_timer; | 1638 | mce_adjust_timer = cmci_intel_adjust_timer; |
1633 | break; | 1639 | break; |
1634 | case X86_VENDOR_AMD: | 1640 | case X86_VENDOR_AMD: |
1635 | mce_amd_feature_init(c); | 1641 | mce_amd_feature_init(c); |
1642 | mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; | ||
1636 | break; | 1643 | break; |
1637 | default: | 1644 | default: |
1638 | break; | 1645 | break; |
@@ -2017,6 +2024,7 @@ __setup("mce", mcheck_enable); | |||
2017 | int __init mcheck_init(void) | 2024 | int __init mcheck_init(void) |
2018 | { | 2025 | { |
2019 | mcheck_intel_therm_init(); | 2026 | mcheck_intel_therm_init(); |
2027 | mcheck_vendor_init_severity(); | ||
2020 | 2028 | ||
2021 | return 0; | 2029 | return 0; |
2022 | } | 2030 | } |
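The reworked mce_timer_fn() above keeps the long-standing adaptive polling scheme: halve the interval (down to HZ/100) when an event was logged, double it (up to check_interval) when the poll stays quiet, with __restart_timer() re-arming the per-CPU timer; during a CMCI storm the Intel path instead pins the interval via cmci_intel_adjust_timer(). A small simulation of the halve/double adaptation, assuming a made-up HZ and event pattern and omitting the jiffy rounding:

/* Illustrative only: simulates the interval adaptation in mce_timer_fn(). */
#include <stdio.h>

#define HZ                      1000UL
#define INITIAL_CHECK_INTERVAL  (5 * 60)        /* seconds, as in the patch */

static unsigned long adjust(unsigned long iv, int saw_event)
{
        unsigned long min_iv = HZ / 100;
        unsigned long max_iv = INITIAL_CHECK_INTERVAL * HZ;

        if (saw_event)
                return iv / 2 > min_iv ? iv / 2 : min_iv;       /* poll 2x faster */
        return iv * 2 < max_iv ? iv * 2 : max_iv;               /* poll 2x slower */
}

int main(void)
{
        int events[] = { 1, 1, 1, 0, 0, 0, 0, 0 };      /* hypothetical pattern */
        unsigned long iv = INITIAL_CHECK_INTERVAL * HZ;
        unsigned int i;

        for (i = 0; i < sizeof(events) / sizeof(events[0]); i++) {
                iv = adjust(iv, events[i]);
                printf("tick %u: next poll in %lu jiffies\n", i, iv);
        }
        return 0;
}

The interval shrinks quickly during the burst and then climbs back toward the five-minute ceiling once the poller sees no further events.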
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f1c3769bbd64..55ad9b37cae8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c | |||
@@ -79,7 +79,7 @@ static inline bool is_shared_bank(int bank) | |||
79 | return (bank == 4); | 79 | return (bank == 4); |
80 | } | 80 | } |
81 | 81 | ||
82 | static const char * const bank4_names(struct threshold_block *b) | 82 | static const char *bank4_names(const struct threshold_block *b) |
83 | { | 83 | { |
84 | switch (b->address) { | 84 | switch (b->address) { |
85 | /* MSR4_MISC0 */ | 85 | /* MSR4_MISC0 */ |
@@ -250,6 +250,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
250 | if (!b.interrupt_capable) | 250 | if (!b.interrupt_capable) |
251 | goto init; | 251 | goto init; |
252 | 252 | ||
253 | b.interrupt_enable = 1; | ||
253 | new = (high & MASK_LVTOFF_HI) >> 20; | 254 | new = (high & MASK_LVTOFF_HI) >> 20; |
254 | offset = setup_APIC_mce(offset, new); | 255 | offset = setup_APIC_mce(offset, new); |
255 | 256 | ||
@@ -322,6 +323,8 @@ static void amd_threshold_interrupt(void) | |||
322 | log: | 323 | log: |
323 | mce_setup(&m); | 324 | mce_setup(&m); |
324 | rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); | 325 | rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); |
326 | if (!(m.status & MCI_STATUS_VAL)) | ||
327 | return; | ||
325 | m.misc = ((u64)high << 32) | low; | 328 | m.misc = ((u64)high << 32) | low; |
326 | m.bank = bank; | 329 | m.bank = bank; |
327 | mce_log(&m); | 330 | mce_log(&m); |
@@ -497,10 +500,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, | |||
497 | b->interrupt_capable = lvt_interrupt_supported(bank, high); | 500 | b->interrupt_capable = lvt_interrupt_supported(bank, high); |
498 | b->threshold_limit = THRESHOLD_MAX; | 501 | b->threshold_limit = THRESHOLD_MAX; |
499 | 502 | ||
500 | if (b->interrupt_capable) | 503 | if (b->interrupt_capable) { |
501 | threshold_ktype.default_attrs[2] = &interrupt_enable.attr; | 504 | threshold_ktype.default_attrs[2] = &interrupt_enable.attr; |
502 | else | 505 | b->interrupt_enable = 1; |
506 | } else { | ||
503 | threshold_ktype.default_attrs[2] = NULL; | 507 | threshold_ktype.default_attrs[2] = NULL; |
508 | } | ||
504 | 509 | ||
505 | INIT_LIST_HEAD(&b->miscj); | 510 | INIT_LIST_HEAD(&b->miscj); |
506 | 511 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index b3c97bafc123..b4a41cf030ed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c | |||
@@ -39,6 +39,15 @@ | |||
39 | static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); | 39 | static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); |
40 | 40 | ||
41 | /* | 41 | /* |
42 | * CMCI storm detection backoff counter | ||
43 | * | ||
44 | * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've | ||
45 | * encountered an error. If not, we decrement it by one. We signal the end of | ||
46 | * the CMCI storm when it reaches 0. | ||
47 | */ | ||
48 | static DEFINE_PER_CPU(int, cmci_backoff_cnt); | ||
49 | |||
50 | /* | ||
42 | * cmci_discover_lock protects against parallel discovery attempts | 51 | * cmci_discover_lock protects against parallel discovery attempts |
43 | * which could race against each other. | 52 | * which could race against each other. |
44 | */ | 53 | */ |
@@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); | |||
46 | 55 | ||
47 | #define CMCI_THRESHOLD 1 | 56 | #define CMCI_THRESHOLD 1 |
48 | #define CMCI_POLL_INTERVAL (30 * HZ) | 57 | #define CMCI_POLL_INTERVAL (30 * HZ) |
49 | #define CMCI_STORM_INTERVAL (1 * HZ) | 58 | #define CMCI_STORM_INTERVAL (HZ) |
50 | #define CMCI_STORM_THRESHOLD 15 | 59 | #define CMCI_STORM_THRESHOLD 15 |
51 | 60 | ||
52 | static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); | 61 | static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); |
@@ -82,11 +91,21 @@ static int cmci_supported(int *banks) | |||
82 | return !!(cap & MCG_CMCI_P); | 91 | return !!(cap & MCG_CMCI_P); |
83 | } | 92 | } |
84 | 93 | ||
85 | void mce_intel_cmci_poll(void) | 94 | bool mce_intel_cmci_poll(void) |
86 | { | 95 | { |
87 | if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) | 96 | if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) |
88 | return; | 97 | return false; |
89 | machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); | 98 | |
99 | /* | ||
100 | * Reset the counter if we've logged an error in the last poll | ||
101 | * during the storm. | ||
102 | */ | ||
103 | if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned))) | ||
104 | this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); | ||
105 | else | ||
106 | this_cpu_dec(cmci_backoff_cnt); | ||
107 | |||
108 | return true; | ||
90 | } | 109 | } |
91 | 110 | ||
92 | void mce_intel_hcpu_update(unsigned long cpu) | 111 | void mce_intel_hcpu_update(unsigned long cpu) |
@@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu) | |||
97 | per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; | 116 | per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; |
98 | } | 117 | } |
99 | 118 | ||
100 | unsigned long mce_intel_adjust_timer(unsigned long interval) | 119 | unsigned long cmci_intel_adjust_timer(unsigned long interval) |
101 | { | 120 | { |
102 | int r; | 121 | if ((this_cpu_read(cmci_backoff_cnt) > 0) && |
103 | 122 | (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) { | |
104 | if (interval < CMCI_POLL_INTERVAL) | 123 | mce_notify_irq(); |
105 | return interval; | 124 | return CMCI_STORM_INTERVAL; |
125 | } | ||
106 | 126 | ||
107 | switch (__this_cpu_read(cmci_storm_state)) { | 127 | switch (__this_cpu_read(cmci_storm_state)) { |
108 | case CMCI_STORM_ACTIVE: | 128 | case CMCI_STORM_ACTIVE: |
129 | |||
109 | /* | 130 | /* |
110 | * We switch back to interrupt mode once the poll timer has | 131 | * We switch back to interrupt mode once the poll timer has |
111 | * silenced itself. That means no events recorded and the | 132 | * silenced itself. That means no events recorded and the timer |
112 | * timer interval is back to our poll interval. | 133 | * interval is back to our poll interval. |
113 | */ | 134 | */ |
114 | __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); | 135 | __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); |
115 | r = atomic_sub_return(1, &cmci_storm_on_cpus); | 136 | if (!atomic_sub_return(1, &cmci_storm_on_cpus)) |
116 | if (r == 0) | ||
117 | pr_notice("CMCI storm subsided: switching to interrupt mode\n"); | 137 | pr_notice("CMCI storm subsided: switching to interrupt mode\n"); |
138 | |||
118 | /* FALLTHROUGH */ | 139 | /* FALLTHROUGH */ |
119 | 140 | ||
120 | case CMCI_STORM_SUBSIDED: | 141 | case CMCI_STORM_SUBSIDED: |
121 | /* | 142 | /* |
122 | * We wait for all cpus to go back to SUBSIDED | 143 | * We wait for all CPUs to go back to SUBSIDED state. When that |
123 | * state. When that happens we switch back to | 144 | * happens we switch back to interrupt mode. |
124 | * interrupt mode. | ||
125 | */ | 145 | */ |
126 | if (!atomic_read(&cmci_storm_on_cpus)) { | 146 | if (!atomic_read(&cmci_storm_on_cpus)) { |
127 | __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); | 147 | __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); |
@@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval) | |||
130 | } | 150 | } |
131 | return CMCI_POLL_INTERVAL; | 151 | return CMCI_POLL_INTERVAL; |
132 | default: | 152 | default: |
133 | /* | 153 | |
134 | * We have shiny weather. Let the poll do whatever it | 154 | /* We have shiny weather. Let the poll do whatever it thinks. */ |
135 | * thinks. | ||
136 | */ | ||
137 | return interval; | 155 | return interval; |
138 | } | 156 | } |
139 | } | 157 | } |
@@ -178,7 +196,8 @@ static bool cmci_storm_detect(void) | |||
178 | cmci_storm_disable_banks(); | 196 | cmci_storm_disable_banks(); |
179 | __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); | 197 | __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); |
180 | r = atomic_add_return(1, &cmci_storm_on_cpus); | 198 | r = atomic_add_return(1, &cmci_storm_on_cpus); |
181 | mce_timer_kick(CMCI_POLL_INTERVAL); | 199 | mce_timer_kick(CMCI_STORM_INTERVAL); |
200 | this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); | ||
182 | 201 | ||
183 | if (r == 1) | 202 | if (r == 1) |
184 | pr_notice("CMCI storm detected: switching to poll mode\n"); | 203 | pr_notice("CMCI storm detected: switching to poll mode\n"); |
@@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void) | |||
195 | { | 214 | { |
196 | if (cmci_storm_detect()) | 215 | if (cmci_storm_detect()) |
197 | return; | 216 | return; |
217 | |||
198 | machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); | 218 | machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); |
199 | mce_notify_irq(); | 219 | mce_notify_irq(); |
200 | } | 220 | } |
@@ -286,6 +306,7 @@ void cmci_recheck(void) | |||
286 | 306 | ||
287 | if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) | 307 | if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) |
288 | return; | 308 | return; |
309 | |||
289 | local_irq_save(flags); | 310 | local_irq_save(flags); |
290 | machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); | 311 | machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); |
291 | local_irq_restore(flags); | 312 | local_irq_restore(flags); |
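Editor's note: the backoff counter added above is what keeps the timer in storm mode — a poll that logs an error re-arms it, a quiet poll decays it, and only a drained counter lets cmci_intel_adjust_timer() fall through to the storm-subsiding state machine. A minimal userspace sketch of just that counter rule (the value of INITIAL_CHECK_INTERVAL and the storm handling are simplified stand-ins, not the kernel's definitions):

#include <stdbool.h>
#include <stdio.h>

#define INITIAL_CHECK_INTERVAL 3	/* stand-in for the kernel's constant */

struct cmci_state {
	bool storm_active;	/* CMCI_STORM_ACTIVE in the kernel */
	int  backoff_cnt;	/* per-CPU cmci_backoff_cnt */
};

/* Mirrors the shape of mce_intel_cmci_poll(): returns true if the poll ran. */
static bool cmci_poll(struct cmci_state *s, bool error_logged)
{
	if (!s->storm_active)
		return false;

	if (error_logged)
		s->backoff_cnt = INITIAL_CHECK_INTERVAL;	/* re-arm */
	else
		s->backoff_cnt--;				/* decay */

	return true;
}

/* Mirrors cmci_intel_adjust_timer(): keep the 1s cadence while still armed. */
static int adjust_timer(struct cmci_state *s, int interval)
{
	if (s->backoff_cnt > 0 && s->storm_active)
		return 1;		/* 1 second: the storm-mode cadence */
	return interval;		/* fall back to the caller's interval */
}

int main(void)
{
	struct cmci_state s = { .storm_active = true,
				.backoff_cnt  = INITIAL_CHECK_INTERVAL };
	bool errors[] = { true, false, false, false };

	for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
		cmci_poll(&s, errors[i]);
		printf("poll %u: backoff=%d next=%ds\n",
		       i, s.backoff_cnt, adjust_timer(&s, 30));
	}
	return 0;
}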
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index bfbbe6195e2d..12829c3ced3c 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c | |||
@@ -21,7 +21,6 @@ | |||
21 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 21 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
22 | 22 | ||
23 | #include <linux/firmware.h> | 23 | #include <linux/firmware.h> |
24 | #include <linux/pci_ids.h> | ||
25 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
26 | #include <linux/vmalloc.h> | 25 | #include <linux/vmalloc.h> |
27 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index d45df4bd16ab..a413a69cbd74 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c | |||
@@ -23,57 +23,6 @@ | |||
23 | #include <asm/processor.h> | 23 | #include <asm/processor.h> |
24 | #include <asm/cmdline.h> | 24 | #include <asm/cmdline.h> |
25 | 25 | ||
26 | #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) | ||
27 | #define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') | ||
28 | #define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') | ||
29 | #define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') | ||
30 | #define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') | ||
31 | #define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') | ||
32 | #define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') | ||
33 | |||
34 | #define CPUID_IS(a, b, c, ebx, ecx, edx) \ | ||
35 | (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) | ||
36 | |||
37 | /* | ||
38 | * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. | ||
39 | * x86_vendor() gets vendor id for BSP. | ||
40 | * | ||
41 | * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify | ||
42 | * coding, we still use x86_vendor() to get vendor id for AP. | ||
43 | * | ||
44 | * x86_vendor() gets vendor information directly through cpuid. | ||
45 | */ | ||
46 | static int x86_vendor(void) | ||
47 | { | ||
48 | u32 eax = 0x00000000; | ||
49 | u32 ebx, ecx = 0, edx; | ||
50 | |||
51 | native_cpuid(&eax, &ebx, &ecx, &edx); | ||
52 | |||
53 | if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) | ||
54 | return X86_VENDOR_INTEL; | ||
55 | |||
56 | if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) | ||
57 | return X86_VENDOR_AMD; | ||
58 | |||
59 | return X86_VENDOR_UNKNOWN; | ||
60 | } | ||
61 | |||
62 | static int x86_family(void) | ||
63 | { | ||
64 | u32 eax = 0x00000001; | ||
65 | u32 ebx, ecx = 0, edx; | ||
66 | int x86; | ||
67 | |||
68 | native_cpuid(&eax, &ebx, &ecx, &edx); | ||
69 | |||
70 | x86 = (eax >> 8) & 0xf; | ||
71 | if (x86 == 15) | ||
72 | x86 += (eax >> 20) & 0xff; | ||
73 | |||
74 | return x86; | ||
75 | } | ||
76 | |||
77 | static bool __init check_loader_disabled_bsp(void) | 26 | static bool __init check_loader_disabled_bsp(void) |
78 | { | 27 | { |
79 | #ifdef CONFIG_X86_32 | 28 | #ifdef CONFIG_X86_32 |
@@ -96,7 +45,7 @@ static bool __init check_loader_disabled_bsp(void) | |||
96 | 45 | ||
97 | void __init load_ucode_bsp(void) | 46 | void __init load_ucode_bsp(void) |
98 | { | 47 | { |
99 | int vendor, x86; | 48 | int vendor, family; |
100 | 49 | ||
101 | if (check_loader_disabled_bsp()) | 50 | if (check_loader_disabled_bsp()) |
102 | return; | 51 | return; |
@@ -105,15 +54,15 @@ void __init load_ucode_bsp(void) | |||
105 | return; | 54 | return; |
106 | 55 | ||
107 | vendor = x86_vendor(); | 56 | vendor = x86_vendor(); |
108 | x86 = x86_family(); | 57 | family = x86_family(); |
109 | 58 | ||
110 | switch (vendor) { | 59 | switch (vendor) { |
111 | case X86_VENDOR_INTEL: | 60 | case X86_VENDOR_INTEL: |
112 | if (x86 >= 6) | 61 | if (family >= 6) |
113 | load_ucode_intel_bsp(); | 62 | load_ucode_intel_bsp(); |
114 | break; | 63 | break; |
115 | case X86_VENDOR_AMD: | 64 | case X86_VENDOR_AMD: |
116 | if (x86 >= 0x10) | 65 | if (family >= 0x10) |
117 | load_ucode_amd_bsp(); | 66 | load_ucode_amd_bsp(); |
118 | break; | 67 | break; |
119 | default: | 68 | default: |
@@ -132,7 +81,7 @@ static bool check_loader_disabled_ap(void) | |||
132 | 81 | ||
133 | void load_ucode_ap(void) | 82 | void load_ucode_ap(void) |
134 | { | 83 | { |
135 | int vendor, x86; | 84 | int vendor, family; |
136 | 85 | ||
137 | if (check_loader_disabled_ap()) | 86 | if (check_loader_disabled_ap()) |
138 | return; | 87 | return; |
@@ -141,15 +90,15 @@ void load_ucode_ap(void) | |||
141 | return; | 90 | return; |
142 | 91 | ||
143 | vendor = x86_vendor(); | 92 | vendor = x86_vendor(); |
144 | x86 = x86_family(); | 93 | family = x86_family(); |
145 | 94 | ||
146 | switch (vendor) { | 95 | switch (vendor) { |
147 | case X86_VENDOR_INTEL: | 96 | case X86_VENDOR_INTEL: |
148 | if (x86 >= 6) | 97 | if (family >= 6) |
149 | load_ucode_intel_ap(); | 98 | load_ucode_intel_ap(); |
150 | break; | 99 | break; |
151 | case X86_VENDOR_AMD: | 100 | case X86_VENDOR_AMD: |
152 | if (x86 >= 0x10) | 101 | if (family >= 0x10) |
153 | load_ucode_amd_ap(); | 102 | load_ucode_amd_ap(); |
154 | break; | 103 | break; |
155 | default: | 104 | default: |
@@ -179,18 +128,18 @@ int __init save_microcode_in_initrd(void) | |||
179 | 128 | ||
180 | void reload_early_microcode(void) | 129 | void reload_early_microcode(void) |
181 | { | 130 | { |
182 | int vendor, x86; | 131 | int vendor, family; |
183 | 132 | ||
184 | vendor = x86_vendor(); | 133 | vendor = x86_vendor(); |
185 | x86 = x86_family(); | 134 | family = x86_family(); |
186 | 135 | ||
187 | switch (vendor) { | 136 | switch (vendor) { |
188 | case X86_VENDOR_INTEL: | 137 | case X86_VENDOR_INTEL: |
189 | if (x86 >= 6) | 138 | if (family >= 6) |
190 | reload_ucode_intel(); | 139 | reload_ucode_intel(); |
191 | break; | 140 | break; |
192 | case X86_VENDOR_AMD: | 141 | case X86_VENDOR_AMD: |
193 | if (x86 >= 0x10) | 142 | if (family >= 0x10) |
194 | reload_ucode_amd(); | 143 | reload_ucode_amd(); |
195 | break; | 144 | break; |
196 | default: | 145 | default: |
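Editor's note: the CPUID helpers deleted above were duplicated elsewhere, and the loader now calls the shared x86_vendor()/x86_family() routines, which perform the same arithmetic. A standalone sketch of that vendor/family/model extraction, operating on already-fetched CPUID register values rather than issuing the instruction itself:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum vendor { VENDOR_UNKNOWN, VENDOR_INTEL, VENDOR_AMD };

/* Vendor from CPUID(0): the 12-byte string lives in EBX, EDX, ECX order. */
static enum vendor vendor_from_regs(uint32_t ebx, uint32_t ecx, uint32_t edx)
{
	char id[13];

	memcpy(id + 0, &ebx, 4);
	memcpy(id + 4, &edx, 4);
	memcpy(id + 8, &ecx, 4);
	id[12] = '\0';

	if (!strcmp(id, "GenuineIntel"))
		return VENDOR_INTEL;
	if (!strcmp(id, "AuthenticAMD"))
		return VENDOR_AMD;
	return VENDOR_UNKNOWN;
}

/* Family from the CPUID(1) signature; extended family is added for 0xf. */
static unsigned int family_from_sig(uint32_t sig)
{
	unsigned int fam = (sig >> 8) & 0xf;

	if (fam == 0xf)
		fam += (sig >> 20) & 0xff;
	return fam;
}

/* Model, with the extended model nibble folded in for families 6 and 0xf. */
static unsigned int model_from_sig(uint32_t sig)
{
	unsigned int fam = family_from_sig(sig);
	unsigned int model = (sig >> 4) & 0xf;

	if (fam == 0x6 || fam == 0xf)
		model += ((sig >> 16) & 0xf) << 4;
	return model;
}

int main(void)
{
	uint32_t sig = 0x000306c3;	/* example signature: family 6, model 0x3c */

	printf("family 0x%x, model 0x%x\n",
	       family_from_sig(sig), model_from_sig(sig));
	(void)vendor_from_regs;
	return 0;
}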
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 746e7fd08aad..a41beadb3db9 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c | |||
@@ -124,7 +124,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) | |||
124 | cpf = cpu_sig.pf; | 124 | cpf = cpu_sig.pf; |
125 | crev = cpu_sig.rev; | 125 | crev = cpu_sig.rev; |
126 | 126 | ||
127 | return get_matching_microcode(csig, cpf, mc_intel, crev); | 127 | return get_matching_microcode(csig, cpf, crev, mc_intel); |
128 | } | 128 | } |
129 | 129 | ||
130 | static int apply_microcode_intel(int cpu) | 130 | static int apply_microcode_intel(int cpu) |
@@ -226,7 +226,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, | |||
226 | 226 | ||
227 | csig = uci->cpu_sig.sig; | 227 | csig = uci->cpu_sig.sig; |
228 | cpf = uci->cpu_sig.pf; | 228 | cpf = uci->cpu_sig.pf; |
229 | if (get_matching_microcode(csig, cpf, mc, new_rev)) { | 229 | if (get_matching_microcode(csig, cpf, new_rev, mc)) { |
230 | vfree(new_mc); | 230 | vfree(new_mc); |
231 | new_rev = mc_header.rev; | 231 | new_rev = mc_header.rev; |
232 | new_mc = mc; | 232 | new_mc = mc; |
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 420eb933189c..2f49ab4ac0ae 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c | |||
@@ -16,6 +16,14 @@ | |||
16 | * as published by the Free Software Foundation; either version | 16 | * as published by the Free Software Foundation; either version |
17 | * 2 of the License, or (at your option) any later version. | 17 | * 2 of the License, or (at your option) any later version. |
18 | */ | 18 | */ |
19 | |||
20 | /* | ||
21 | * This needs to be before all headers so that pr_debug in printk.h doesn't turn | ||
22 | * printk calls into no_printk(). | ||
23 | * | ||
24 | *#define DEBUG | ||
25 | */ | ||
26 | |||
19 | #include <linux/module.h> | 27 | #include <linux/module.h> |
20 | #include <linux/mm.h> | 28 | #include <linux/mm.h> |
21 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
@@ -28,6 +36,9 @@ | |||
28 | #include <asm/tlbflush.h> | 36 | #include <asm/tlbflush.h> |
29 | #include <asm/setup.h> | 37 | #include <asm/setup.h> |
30 | 38 | ||
39 | #undef pr_fmt | ||
40 | #define pr_fmt(fmt) "microcode: " fmt | ||
41 | |||
31 | static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; | 42 | static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; |
32 | static struct mc_saved_data { | 43 | static struct mc_saved_data { |
33 | unsigned int mc_saved_count; | 44 | unsigned int mc_saved_count; |
@@ -35,50 +46,45 @@ static struct mc_saved_data { | |||
35 | } mc_saved_data; | 46 | } mc_saved_data; |
36 | 47 | ||
37 | static enum ucode_state | 48 | static enum ucode_state |
38 | generic_load_microcode_early(struct microcode_intel **mc_saved_p, | 49 | load_microcode_early(struct microcode_intel **saved, |
39 | unsigned int mc_saved_count, | 50 | unsigned int num_saved, struct ucode_cpu_info *uci) |
40 | struct ucode_cpu_info *uci) | ||
41 | { | 51 | { |
42 | struct microcode_intel *ucode_ptr, *new_mc = NULL; | 52 | struct microcode_intel *ucode_ptr, *new_mc = NULL; |
43 | int new_rev = uci->cpu_sig.rev; | 53 | struct microcode_header_intel *mc_hdr; |
44 | enum ucode_state state = UCODE_OK; | 54 | int new_rev, ret, i; |
45 | unsigned int mc_size; | ||
46 | struct microcode_header_intel *mc_header; | ||
47 | unsigned int csig = uci->cpu_sig.sig; | ||
48 | unsigned int cpf = uci->cpu_sig.pf; | ||
49 | int i; | ||
50 | 55 | ||
51 | for (i = 0; i < mc_saved_count; i++) { | 56 | new_rev = uci->cpu_sig.rev; |
52 | ucode_ptr = mc_saved_p[i]; | ||
53 | 57 | ||
54 | mc_header = (struct microcode_header_intel *)ucode_ptr; | 58 | for (i = 0; i < num_saved; i++) { |
55 | mc_size = get_totalsize(mc_header); | 59 | ucode_ptr = saved[i]; |
56 | if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) { | 60 | mc_hdr = (struct microcode_header_intel *)ucode_ptr; |
57 | new_rev = mc_header->rev; | ||
58 | new_mc = ucode_ptr; | ||
59 | } | ||
60 | } | ||
61 | 61 | ||
62 | if (!new_mc) { | 62 | ret = get_matching_microcode(uci->cpu_sig.sig, |
63 | state = UCODE_NFOUND; | 63 | uci->cpu_sig.pf, |
64 | goto out; | 64 | new_rev, |
65 | ucode_ptr); | ||
66 | if (!ret) | ||
67 | continue; | ||
68 | |||
69 | new_rev = mc_hdr->rev; | ||
70 | new_mc = ucode_ptr; | ||
65 | } | 71 | } |
66 | 72 | ||
73 | if (!new_mc) | ||
74 | return UCODE_NFOUND; | ||
75 | |||
67 | uci->mc = (struct microcode_intel *)new_mc; | 76 | uci->mc = (struct microcode_intel *)new_mc; |
68 | out: | 77 | return UCODE_OK; |
69 | return state; | ||
70 | } | 78 | } |
71 | 79 | ||
72 | static void | 80 | static inline void |
73 | microcode_pointer(struct microcode_intel **mc_saved, | 81 | copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, |
74 | unsigned long *mc_saved_in_initrd, | 82 | unsigned long off, int num_saved) |
75 | unsigned long initrd_start, int mc_saved_count) | ||
76 | { | 83 | { |
77 | int i; | 84 | int i; |
78 | 85 | ||
79 | for (i = 0; i < mc_saved_count; i++) | 86 | for (i = 0; i < num_saved; i++) |
80 | mc_saved[i] = (struct microcode_intel *) | 87 | mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); |
81 | (mc_saved_in_initrd[i] + initrd_start); | ||
82 | } | 88 | } |
83 | 89 | ||
84 | #ifdef CONFIG_X86_32 | 90 | #ifdef CONFIG_X86_32 |
@@ -102,55 +108,27 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, | |||
102 | #endif | 108 | #endif |
103 | 109 | ||
104 | static enum ucode_state | 110 | static enum ucode_state |
105 | load_microcode(struct mc_saved_data *mc_saved_data, | 111 | load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, |
106 | unsigned long *mc_saved_in_initrd, | 112 | unsigned long initrd_start, struct ucode_cpu_info *uci) |
107 | unsigned long initrd_start, | ||
108 | struct ucode_cpu_info *uci) | ||
109 | { | 113 | { |
110 | struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; | 114 | struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; |
111 | unsigned int count = mc_saved_data->mc_saved_count; | 115 | unsigned int count = mc_saved_data->mc_saved_count; |
112 | 116 | ||
113 | if (!mc_saved_data->mc_saved) { | 117 | if (!mc_saved_data->mc_saved) { |
114 | microcode_pointer(mc_saved_tmp, mc_saved_in_initrd, | 118 | copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); |
115 | initrd_start, count); | ||
116 | 119 | ||
117 | return generic_load_microcode_early(mc_saved_tmp, count, uci); | 120 | return load_microcode_early(mc_saved_tmp, count, uci); |
118 | } else { | 121 | } else { |
119 | #ifdef CONFIG_X86_32 | 122 | #ifdef CONFIG_X86_32 |
120 | microcode_phys(mc_saved_tmp, mc_saved_data); | 123 | microcode_phys(mc_saved_tmp, mc_saved_data); |
121 | return generic_load_microcode_early(mc_saved_tmp, count, uci); | 124 | return load_microcode_early(mc_saved_tmp, count, uci); |
122 | #else | 125 | #else |
123 | return generic_load_microcode_early(mc_saved_data->mc_saved, | 126 | return load_microcode_early(mc_saved_data->mc_saved, |
124 | count, uci); | 127 | count, uci); |
125 | #endif | 128 | #endif |
126 | } | 129 | } |
127 | } | 130 | } |
128 | 131 | ||
129 | static u8 get_x86_family(unsigned long sig) | ||
130 | { | ||
131 | u8 x86; | ||
132 | |||
133 | x86 = (sig >> 8) & 0xf; | ||
134 | |||
135 | if (x86 == 0xf) | ||
136 | x86 += (sig >> 20) & 0xff; | ||
137 | |||
138 | return x86; | ||
139 | } | ||
140 | |||
141 | static u8 get_x86_model(unsigned long sig) | ||
142 | { | ||
143 | u8 x86, x86_model; | ||
144 | |||
145 | x86 = get_x86_family(sig); | ||
146 | x86_model = (sig >> 4) & 0xf; | ||
147 | |||
148 | if (x86 == 0x6 || x86 == 0xf) | ||
149 | x86_model += ((sig >> 16) & 0xf) << 4; | ||
150 | |||
151 | return x86_model; | ||
152 | } | ||
153 | |||
154 | /* | 132 | /* |
155 | * Given CPU signature and a microcode patch, this function finds if the | 133 | * Given CPU signature and a microcode patch, this function finds if the |
156 | * microcode patch has matching family and model with the CPU. | 134 | * microcode patch has matching family and model with the CPU. |
@@ -159,42 +137,40 @@ static enum ucode_state | |||
159 | matching_model_microcode(struct microcode_header_intel *mc_header, | 137 | matching_model_microcode(struct microcode_header_intel *mc_header, |
160 | unsigned long sig) | 138 | unsigned long sig) |
161 | { | 139 | { |
162 | u8 x86, x86_model; | 140 | unsigned int fam, model; |
163 | u8 x86_ucode, x86_model_ucode; | 141 | unsigned int fam_ucode, model_ucode; |
164 | struct extended_sigtable *ext_header; | 142 | struct extended_sigtable *ext_header; |
165 | unsigned long total_size = get_totalsize(mc_header); | 143 | unsigned long total_size = get_totalsize(mc_header); |
166 | unsigned long data_size = get_datasize(mc_header); | 144 | unsigned long data_size = get_datasize(mc_header); |
167 | int ext_sigcount, i; | 145 | int ext_sigcount, i; |
168 | struct extended_signature *ext_sig; | 146 | struct extended_signature *ext_sig; |
169 | 147 | ||
170 | x86 = get_x86_family(sig); | 148 | fam = __x86_family(sig); |
171 | x86_model = get_x86_model(sig); | 149 | model = x86_model(sig); |
172 | 150 | ||
173 | x86_ucode = get_x86_family(mc_header->sig); | 151 | fam_ucode = __x86_family(mc_header->sig); |
174 | x86_model_ucode = get_x86_model(mc_header->sig); | 152 | model_ucode = x86_model(mc_header->sig); |
175 | 153 | ||
176 | if (x86 == x86_ucode && x86_model == x86_model_ucode) | 154 | if (fam == fam_ucode && model == model_ucode) |
177 | return UCODE_OK; | 155 | return UCODE_OK; |
178 | 156 | ||
179 | /* Look for ext. headers: */ | 157 | /* Look for ext. headers: */ |
180 | if (total_size <= data_size + MC_HEADER_SIZE) | 158 | if (total_size <= data_size + MC_HEADER_SIZE) |
181 | return UCODE_NFOUND; | 159 | return UCODE_NFOUND; |
182 | 160 | ||
183 | ext_header = (struct extended_sigtable *) | 161 | ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; |
184 | mc_header + data_size + MC_HEADER_SIZE; | 162 | ext_sig = (void *)ext_header + EXT_HEADER_SIZE; |
185 | ext_sigcount = ext_header->count; | 163 | ext_sigcount = ext_header->count; |
186 | ext_sig = (void *)ext_header + EXT_HEADER_SIZE; | ||
187 | 164 | ||
188 | for (i = 0; i < ext_sigcount; i++) { | 165 | for (i = 0; i < ext_sigcount; i++) { |
189 | x86_ucode = get_x86_family(ext_sig->sig); | 166 | fam_ucode = __x86_family(ext_sig->sig); |
190 | x86_model_ucode = get_x86_model(ext_sig->sig); | 167 | model_ucode = x86_model(ext_sig->sig); |
191 | 168 | ||
192 | if (x86 == x86_ucode && x86_model == x86_model_ucode) | 169 | if (fam == fam_ucode && model == model_ucode) |
193 | return UCODE_OK; | 170 | return UCODE_OK; |
194 | 171 | ||
195 | ext_sig++; | 172 | ext_sig++; |
196 | } | 173 | } |
197 | |||
198 | return UCODE_NFOUND; | 174 | return UCODE_NFOUND; |
199 | } | 175 | } |
200 | 176 | ||
@@ -204,7 +180,7 @@ save_microcode(struct mc_saved_data *mc_saved_data, | |||
204 | unsigned int mc_saved_count) | 180 | unsigned int mc_saved_count) |
205 | { | 181 | { |
206 | int i, j; | 182 | int i, j; |
207 | struct microcode_intel **mc_saved_p; | 183 | struct microcode_intel **saved_ptr; |
208 | int ret; | 184 | int ret; |
209 | 185 | ||
210 | if (!mc_saved_count) | 186 | if (!mc_saved_count) |
@@ -213,39 +189,45 @@ save_microcode(struct mc_saved_data *mc_saved_data, | |||
213 | /* | 189 | /* |
214 | * Copy new microcode data. | 190 | * Copy new microcode data. |
215 | */ | 191 | */ |
216 | mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *), | 192 | saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); |
217 | GFP_KERNEL); | 193 | if (!saved_ptr) |
218 | if (!mc_saved_p) | ||
219 | return -ENOMEM; | 194 | return -ENOMEM; |
220 | 195 | ||
221 | for (i = 0; i < mc_saved_count; i++) { | 196 | for (i = 0; i < mc_saved_count; i++) { |
222 | struct microcode_intel *mc = mc_saved_src[i]; | 197 | struct microcode_header_intel *mc_hdr; |
223 | struct microcode_header_intel *mc_header = &mc->hdr; | 198 | struct microcode_intel *mc; |
224 | unsigned long mc_size = get_totalsize(mc_header); | 199 | unsigned long size; |
225 | mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL); | 200 | |
226 | if (!mc_saved_p[i]) { | ||
227 | ret = -ENOMEM; | ||
228 | goto err; | ||
229 | } | ||
230 | if (!mc_saved_src[i]) { | 201 | if (!mc_saved_src[i]) { |
231 | ret = -EINVAL; | 202 | ret = -EINVAL; |
232 | goto err; | 203 | goto err; |
233 | } | 204 | } |
234 | memcpy(mc_saved_p[i], mc, mc_size); | 205 | |
206 | mc = mc_saved_src[i]; | ||
207 | mc_hdr = &mc->hdr; | ||
208 | size = get_totalsize(mc_hdr); | ||
209 | |||
210 | saved_ptr[i] = kmalloc(size, GFP_KERNEL); | ||
211 | if (!saved_ptr[i]) { | ||
212 | ret = -ENOMEM; | ||
213 | goto err; | ||
214 | } | ||
215 | |||
216 | memcpy(saved_ptr[i], mc, size); | ||
235 | } | 217 | } |
236 | 218 | ||
237 | /* | 219 | /* |
238 | * Point to newly saved microcode. | 220 | * Point to newly saved microcode. |
239 | */ | 221 | */ |
240 | mc_saved_data->mc_saved = mc_saved_p; | 222 | mc_saved_data->mc_saved = saved_ptr; |
241 | mc_saved_data->mc_saved_count = mc_saved_count; | 223 | mc_saved_data->mc_saved_count = mc_saved_count; |
242 | 224 | ||
243 | return 0; | 225 | return 0; |
244 | 226 | ||
245 | err: | 227 | err: |
246 | for (j = 0; j <= i; j++) | 228 | for (j = 0; j <= i; j++) |
247 | kfree(mc_saved_p[j]); | 229 | kfree(saved_ptr[j]); |
248 | kfree(mc_saved_p); | 230 | kfree(saved_ptr); |
249 | 231 | ||
250 | return ret; | 232 | return ret; |
251 | } | 233 | } |
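Editor's note: save_microcode() above now allocates the pointer array with kcalloc() and validates each source entry before sizing and copying it, unwinding everything on failure. A userspace sketch of that copy-with-cleanup pattern (malloc/free stand in for kmalloc/kfree, and the patch size is just a stored field here rather than get_totalsize()):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct blob {
	size_t size;
	unsigned char data[];	/* flexible array, like a microcode image */
};

/* Deep-copy @count blobs; on any failure, free what was copied so far. */
static int save_blobs(struct blob ***out, struct blob **src, unsigned int count)
{
	struct blob **saved;
	unsigned int i, j;
	int ret;

	saved = calloc(count, sizeof(*saved));
	if (!saved)
		return -ENOMEM;

	for (i = 0; i < count; i++) {
		size_t total;

		if (!src[i]) {			/* reject holes in the source array */
			ret = -EINVAL;
			goto err;
		}

		total = sizeof(struct blob) + src[i]->size;
		saved[i] = malloc(total);
		if (!saved[i]) {
			ret = -ENOMEM;
			goto err;
		}
		memcpy(saved[i], src[i], total);
	}

	*out = saved;
	return 0;

err:
	for (j = 0; j < i; j++)		/* saved[i] was never set on this path */
		free(saved[j]);
	free(saved);
	return ret;
}

int main(void)
{
	struct blob *a = malloc(sizeof(*a) + 4);
	struct blob **copies = NULL;

	a->size = 4;
	memcpy(a->data, "\x01\x02\x03\x04", 4);

	if (!save_blobs(&copies, &a, 1))
		printf("copied blob of %zu bytes\n", copies[0]->size);
	return 0;
}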
@@ -257,48 +239,45 @@ err: | |||
257 | * - or if it is a newly discovered microcode patch. | 239 | * - or if it is a newly discovered microcode patch. |
258 | * | 240 | * |
259 | * The microcode patch should have matching model with CPU. | 241 | * The microcode patch should have matching model with CPU. |
242 | * | ||
243 | * Returns: The updated number @num_saved of saved microcode patches. | ||
260 | */ | 244 | */ |
261 | static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, | 245 | static unsigned int _save_mc(struct microcode_intel **mc_saved, |
262 | unsigned int *mc_saved_count_p) | 246 | u8 *ucode_ptr, unsigned int num_saved) |
263 | { | 247 | { |
264 | int i; | 248 | struct microcode_header_intel *mc_hdr, *mc_saved_hdr; |
265 | int found = 0; | 249 | unsigned int sig, pf, new_rev; |
266 | unsigned int mc_saved_count = *mc_saved_count_p; | 250 | int found = 0, i; |
267 | struct microcode_header_intel *mc_header; | 251 | |
252 | mc_hdr = (struct microcode_header_intel *)ucode_ptr; | ||
253 | |||
254 | for (i = 0; i < num_saved; i++) { | ||
255 | mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; | ||
256 | sig = mc_saved_hdr->sig; | ||
257 | pf = mc_saved_hdr->pf; | ||
258 | new_rev = mc_hdr->rev; | ||
259 | |||
260 | if (!get_matching_sig(sig, pf, new_rev, ucode_ptr)) | ||
261 | continue; | ||
262 | |||
263 | found = 1; | ||
264 | |||
265 | if (!revision_is_newer(mc_hdr, new_rev)) | ||
266 | continue; | ||
268 | 267 | ||
269 | mc_header = (struct microcode_header_intel *)ucode_ptr; | ||
270 | for (i = 0; i < mc_saved_count; i++) { | ||
271 | unsigned int sig, pf; | ||
272 | unsigned int new_rev; | ||
273 | struct microcode_header_intel *mc_saved_header = | ||
274 | (struct microcode_header_intel *)mc_saved[i]; | ||
275 | sig = mc_saved_header->sig; | ||
276 | pf = mc_saved_header->pf; | ||
277 | new_rev = mc_header->rev; | ||
278 | |||
279 | if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) { | ||
280 | found = 1; | ||
281 | if (update_match_revision(mc_header, new_rev)) { | ||
282 | /* | ||
283 | * Found an older ucode saved before. | ||
284 | * Replace the older one with this newer | ||
285 | * one. | ||
286 | */ | ||
287 | mc_saved[i] = | ||
288 | (struct microcode_intel *)ucode_ptr; | ||
289 | break; | ||
290 | } | ||
291 | } | ||
292 | } | ||
293 | if (i >= mc_saved_count && !found) | ||
294 | /* | 268 | /* |
295 | * This ucode is first time discovered in ucode file. | 269 | * Found an older ucode saved earlier. Replace it with |
296 | * Save it to memory. | 270 | * this newer one. |
297 | */ | 271 | */ |
298 | mc_saved[mc_saved_count++] = | 272 | mc_saved[i] = (struct microcode_intel *)ucode_ptr; |
299 | (struct microcode_intel *)ucode_ptr; | 273 | break; |
274 | } | ||
275 | |||
276 | /* Newly detected microcode, save it to memory. */ | ||
277 | if (i >= num_saved && !found) | ||
278 | mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; | ||
300 | 279 | ||
301 | *mc_saved_count_p = mc_saved_count; | 280 | return num_saved; |
302 | } | 281 | } |
303 | 282 | ||
304 | /* | 283 | /* |
@@ -346,7 +325,7 @@ get_matching_model_microcode(int cpu, unsigned long start, | |||
346 | continue; | 325 | continue; |
347 | } | 326 | } |
348 | 327 | ||
349 | _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count); | 328 | mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); |
350 | 329 | ||
351 | ucode_ptr += mc_size; | 330 | ucode_ptr += mc_size; |
352 | } | 331 | } |
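Editor's note: _save_mc() now returns the updated count instead of writing it back through a pointer, which is what the call site above picks up. A compact sketch of the replace-or-append rule it implements, with plain structs standing in for the Intel microcode headers and the sig/pf matching reduced to an equality check:

#include <stdio.h>

struct mc_patch {
	unsigned int sig;	/* CPU signature the patch applies to */
	unsigned int pf;	/* platform flags */
	unsigned int rev;	/* patch revision */
};

/*
 * Save @mc into @saved[] if it is newer than an already-saved patch for the
 * same sig/pf, or if no patch for that sig/pf has been seen yet.
 * Returns the updated number of saved patches.
 */
static unsigned int save_mc(struct mc_patch **saved, struct mc_patch *mc,
			    unsigned int num_saved)
{
	unsigned int i;
	int found = 0;

	for (i = 0; i < num_saved; i++) {
		if (saved[i]->sig != mc->sig || saved[i]->pf != mc->pf)
			continue;

		found = 1;
		if (mc->rev <= saved[i]->rev)	/* not newer, keep the old one */
			continue;

		saved[i] = mc;			/* replace with the newer patch */
		break;
	}

	if (i >= num_saved && !found)		/* first patch for this sig/pf */
		saved[num_saved++] = mc;

	return num_saved;
}

int main(void)
{
	struct mc_patch a = { 0x306c3, 0x2, 0x10 };
	struct mc_patch b = { 0x306c3, 0x2, 0x1c };	/* newer rev, same sig/pf */
	struct mc_patch *saved[8];
	unsigned int n = 0;

	n = save_mc(saved, &a, n);
	n = save_mc(saved, &b, n);
	printf("saved %u patch(es), rev of first: 0x%x\n", n, saved[0]->rev);
	return 0;
}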
@@ -372,7 +351,7 @@ out: | |||
372 | static int collect_cpu_info_early(struct ucode_cpu_info *uci) | 351 | static int collect_cpu_info_early(struct ucode_cpu_info *uci) |
373 | { | 352 | { |
374 | unsigned int val[2]; | 353 | unsigned int val[2]; |
375 | u8 x86, x86_model; | 354 | unsigned int family, model; |
376 | struct cpu_signature csig; | 355 | struct cpu_signature csig; |
377 | unsigned int eax, ebx, ecx, edx; | 356 | unsigned int eax, ebx, ecx, edx; |
378 | 357 | ||
@@ -387,10 +366,10 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) | |||
387 | native_cpuid(&eax, &ebx, &ecx, &edx); | 366 | native_cpuid(&eax, &ebx, &ecx, &edx); |
388 | csig.sig = eax; | 367 | csig.sig = eax; |
389 | 368 | ||
390 | x86 = get_x86_family(csig.sig); | 369 | family = __x86_family(csig.sig); |
391 | x86_model = get_x86_model(csig.sig); | 370 | model = x86_model(csig.sig); |
392 | 371 | ||
393 | if ((x86_model >= 5) || (x86 > 6)) { | 372 | if ((model >= 5) || (family > 6)) { |
394 | /* get processor flags from MSR 0x17 */ | 373 | /* get processor flags from MSR 0x17 */ |
395 | native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); | 374 | native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); |
396 | csig.pf = 1 << ((val[1] >> 18) & 7); | 375 | csig.pf = 1 << ((val[1] >> 18) & 7); |
@@ -429,8 +408,7 @@ static void __ref show_saved_mc(void) | |||
429 | sig = uci.cpu_sig.sig; | 408 | sig = uci.cpu_sig.sig; |
430 | pf = uci.cpu_sig.pf; | 409 | pf = uci.cpu_sig.pf; |
431 | rev = uci.cpu_sig.rev; | 410 | rev = uci.cpu_sig.rev; |
432 | pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n", | 411 | pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); |
433 | smp_processor_id(), sig, pf, rev); | ||
434 | 412 | ||
435 | for (i = 0; i < mc_saved_data.mc_saved_count; i++) { | 413 | for (i = 0; i < mc_saved_data.mc_saved_count; i++) { |
436 | struct microcode_header_intel *mc_saved_header; | 414 | struct microcode_header_intel *mc_saved_header; |
@@ -457,8 +435,7 @@ static void __ref show_saved_mc(void) | |||
457 | if (total_size <= data_size + MC_HEADER_SIZE) | 435 | if (total_size <= data_size + MC_HEADER_SIZE) |
458 | continue; | 436 | continue; |
459 | 437 | ||
460 | ext_header = (struct extended_sigtable *) | 438 | ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; |
461 | mc_saved_header + data_size + MC_HEADER_SIZE; | ||
462 | ext_sigcount = ext_header->count; | 439 | ext_sigcount = ext_header->count; |
463 | ext_sig = (void *)ext_header + EXT_HEADER_SIZE; | 440 | ext_sig = (void *)ext_header + EXT_HEADER_SIZE; |
464 | 441 | ||
@@ -515,8 +492,7 @@ int save_mc_for_early(u8 *mc) | |||
515 | * Save the microcode patch mc in mc_save_tmp structure if it's a newer | 492 | * Save the microcode patch mc in mc_save_tmp structure if it's a newer |
516 | * version. | 493 | * version. |
517 | */ | 494 | */ |
518 | 495 | mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); | |
519 | _save_mc(mc_saved_tmp, mc, &mc_saved_count); | ||
520 | 496 | ||
521 | /* | 497 | /* |
522 | * Save the mc_save_tmp in global mc_saved_data. | 498 | * Save the mc_save_tmp in global mc_saved_data. |
@@ -548,12 +524,10 @@ EXPORT_SYMBOL_GPL(save_mc_for_early); | |||
548 | 524 | ||
549 | static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; | 525 | static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; |
550 | static __init enum ucode_state | 526 | static __init enum ucode_state |
551 | scan_microcode(unsigned long start, unsigned long end, | 527 | scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, |
552 | struct mc_saved_data *mc_saved_data, | 528 | unsigned long start, unsigned long size, |
553 | unsigned long *mc_saved_in_initrd, | 529 | struct ucode_cpu_info *uci) |
554 | struct ucode_cpu_info *uci) | ||
555 | { | 530 | { |
556 | unsigned int size = end - start + 1; | ||
557 | struct cpio_data cd; | 531 | struct cpio_data cd; |
558 | long offset = 0; | 532 | long offset = 0; |
559 | #ifdef CONFIG_X86_32 | 533 | #ifdef CONFIG_X86_32 |
@@ -569,10 +543,8 @@ scan_microcode(unsigned long start, unsigned long end, | |||
569 | if (!cd.data) | 543 | if (!cd.data) |
570 | return UCODE_ERROR; | 544 | return UCODE_ERROR; |
571 | 545 | ||
572 | |||
573 | return get_matching_model_microcode(0, start, cd.data, cd.size, | 546 | return get_matching_model_microcode(0, start, cd.data, cd.size, |
574 | mc_saved_data, mc_saved_in_initrd, | 547 | mc_saved_data, initrd, uci); |
575 | uci); | ||
576 | } | 548 | } |
577 | 549 | ||
578 | /* | 550 | /* |
@@ -704,7 +676,7 @@ int __init save_microcode_in_initrd_intel(void) | |||
704 | if (count == 0) | 676 | if (count == 0) |
705 | return ret; | 677 | return ret; |
706 | 678 | ||
707 | microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); | 679 | copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); |
708 | ret = save_microcode(&mc_saved_data, mc_saved, count); | 680 | ret = save_microcode(&mc_saved_data, mc_saved, count); |
709 | if (ret) | 681 | if (ret) |
710 | pr_err("Cannot save microcode patches from initrd.\n"); | 682 | pr_err("Cannot save microcode patches from initrd.\n"); |
@@ -716,52 +688,44 @@ int __init save_microcode_in_initrd_intel(void) | |||
716 | 688 | ||
717 | static void __init | 689 | static void __init |
718 | _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, | 690 | _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, |
719 | unsigned long *mc_saved_in_initrd, | 691 | unsigned long *initrd, |
720 | unsigned long initrd_start_early, | 692 | unsigned long start, unsigned long size) |
721 | unsigned long initrd_end_early, | ||
722 | struct ucode_cpu_info *uci) | ||
723 | { | 693 | { |
694 | struct ucode_cpu_info uci; | ||
724 | enum ucode_state ret; | 695 | enum ucode_state ret; |
725 | 696 | ||
726 | collect_cpu_info_early(uci); | 697 | collect_cpu_info_early(&uci); |
727 | scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, | ||
728 | mc_saved_in_initrd, uci); | ||
729 | 698 | ||
730 | ret = load_microcode(mc_saved_data, mc_saved_in_initrd, | 699 | ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); |
731 | initrd_start_early, uci); | 700 | if (ret != UCODE_OK) |
701 | return; | ||
732 | 702 | ||
733 | if (ret == UCODE_OK) | 703 | ret = load_microcode(mc_saved_data, initrd, start, &uci); |
734 | apply_microcode_early(uci, true); | 704 | if (ret != UCODE_OK) |
705 | return; | ||
706 | |||
707 | apply_microcode_early(&uci, true); | ||
735 | } | 708 | } |
736 | 709 | ||
737 | void __init | 710 | void __init load_ucode_intel_bsp(void) |
738 | load_ucode_intel_bsp(void) | ||
739 | { | 711 | { |
740 | u64 ramdisk_image, ramdisk_size; | 712 | u64 start, size; |
741 | unsigned long initrd_start_early, initrd_end_early; | ||
742 | struct ucode_cpu_info uci; | ||
743 | #ifdef CONFIG_X86_32 | 713 | #ifdef CONFIG_X86_32 |
744 | struct boot_params *boot_params_p; | 714 | struct boot_params *p; |
745 | 715 | ||
746 | boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params); | 716 | p = (struct boot_params *)__pa_nodebug(&boot_params); |
747 | ramdisk_image = boot_params_p->hdr.ramdisk_image; | 717 | start = p->hdr.ramdisk_image; |
748 | ramdisk_size = boot_params_p->hdr.ramdisk_size; | 718 | size = p->hdr.ramdisk_size; |
749 | initrd_start_early = ramdisk_image; | ||
750 | initrd_end_early = initrd_start_early + ramdisk_size; | ||
751 | 719 | ||
752 | _load_ucode_intel_bsp( | 720 | _load_ucode_intel_bsp( |
753 | (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), | 721 | (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), |
754 | (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), | 722 | (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), |
755 | initrd_start_early, initrd_end_early, &uci); | 723 | start, size); |
756 | #else | 724 | #else |
757 | ramdisk_image = boot_params.hdr.ramdisk_image; | 725 | start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; |
758 | ramdisk_size = boot_params.hdr.ramdisk_size; | 726 | size = boot_params.hdr.ramdisk_size; |
759 | initrd_start_early = ramdisk_image + PAGE_OFFSET; | 727 | |
760 | initrd_end_early = initrd_start_early + ramdisk_size; | 728 | _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); |
761 | |||
762 | _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, | ||
763 | initrd_start_early, initrd_end_early, | ||
764 | &uci); | ||
765 | #endif | 729 | #endif |
766 | } | 730 | } |
767 | 731 | ||
@@ -771,6 +735,7 @@ void load_ucode_intel_ap(void) | |||
771 | struct ucode_cpu_info uci; | 735 | struct ucode_cpu_info uci; |
772 | unsigned long *mc_saved_in_initrd_p; | 736 | unsigned long *mc_saved_in_initrd_p; |
773 | unsigned long initrd_start_addr; | 737 | unsigned long initrd_start_addr; |
738 | enum ucode_state ret; | ||
774 | #ifdef CONFIG_X86_32 | 739 | #ifdef CONFIG_X86_32 |
775 | unsigned long *initrd_start_p; | 740 | unsigned long *initrd_start_p; |
776 | 741 | ||
@@ -793,8 +758,12 @@ void load_ucode_intel_ap(void) | |||
793 | return; | 758 | return; |
794 | 759 | ||
795 | collect_cpu_info_early(&uci); | 760 | collect_cpu_info_early(&uci); |
796 | load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, | 761 | ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, |
797 | initrd_start_addr, &uci); | 762 | initrd_start_addr, &uci); |
763 | |||
764 | if (ret != UCODE_OK) | ||
765 | return; | ||
766 | |||
798 | apply_microcode_early(&uci, true); | 767 | apply_microcode_early(&uci, true); |
799 | } | 768 | } |
800 | 769 | ||
@@ -808,8 +777,8 @@ void reload_ucode_intel(void) | |||
808 | 777 | ||
809 | collect_cpu_info_early(&uci); | 778 | collect_cpu_info_early(&uci); |
810 | 779 | ||
811 | ret = generic_load_microcode_early(mc_saved_data.mc_saved, | 780 | ret = load_microcode_early(mc_saved_data.mc_saved, |
812 | mc_saved_data.mc_saved_count, &uci); | 781 | mc_saved_data.mc_saved_count, &uci); |
813 | if (ret != UCODE_OK) | 782 | if (ret != UCODE_OK) |
814 | return; | 783 | return; |
815 | 784 | ||
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index ce69320d0179..cd47a510a3f1 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c | |||
@@ -38,12 +38,6 @@ update_match_cpu(unsigned int csig, unsigned int cpf, | |||
38 | return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; | 38 | return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; |
39 | } | 39 | } |
40 | 40 | ||
41 | int | ||
42 | update_match_revision(struct microcode_header_intel *mc_header, int rev) | ||
43 | { | ||
44 | return (mc_header->rev <= rev) ? 0 : 1; | ||
45 | } | ||
46 | |||
47 | int microcode_sanity_check(void *mc, int print_err) | 41 | int microcode_sanity_check(void *mc, int print_err) |
48 | { | 42 | { |
49 | unsigned long total_size, data_size, ext_table_size; | 43 | unsigned long total_size, data_size, ext_table_size; |
@@ -128,10 +122,9 @@ int microcode_sanity_check(void *mc, int print_err) | |||
128 | EXPORT_SYMBOL_GPL(microcode_sanity_check); | 122 | EXPORT_SYMBOL_GPL(microcode_sanity_check); |
129 | 123 | ||
130 | /* | 124 | /* |
131 | * return 0 - no update found | 125 | * Returns 1 if an update has been found, 0 otherwise. |
132 | * return 1 - found update | ||
133 | */ | 126 | */ |
134 | int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) | 127 | int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc) |
135 | { | 128 | { |
136 | struct microcode_header_intel *mc_header = mc; | 129 | struct microcode_header_intel *mc_header = mc; |
137 | struct extended_sigtable *ext_header; | 130 | struct extended_sigtable *ext_header; |
@@ -159,16 +152,15 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) | |||
159 | } | 152 | } |
160 | 153 | ||
161 | /* | 154 | /* |
162 | * return 0 - no update found | 155 | * Returns 1 if an update has been found, 0 otherwise. |
163 | * return 1 - found update | ||
164 | */ | 156 | */ |
165 | int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev) | 157 | int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc) |
166 | { | 158 | { |
167 | struct microcode_header_intel *mc_header = mc; | 159 | struct microcode_header_intel *mc_hdr = mc; |
168 | 160 | ||
169 | if (!update_match_revision(mc_header, rev)) | 161 | if (!revision_is_newer(mc_hdr, rev)) |
170 | return 0; | 162 | return 0; |
171 | 163 | ||
172 | return get_matching_sig(csig, cpf, mc, rev); | 164 | return get_matching_sig(csig, cpf, rev, mc); |
173 | } | 165 | } |
174 | EXPORT_SYMBOL_GPL(get_matching_microcode); | 166 | EXPORT_SYMBOL_GPL(get_matching_microcode); |
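Editor's note: update_match_revision() is gone and the comparison now goes through revision_is_newer(), presumably a header helper that simply checks mc_header->rev > rev; the rev argument also moves ahead of the patch pointer in both matching routines. A sketch of the resulting two-step match, with the signature check reduced to a plain comparison since sigmatch() itself is not shown in this hunk:

#include <stdio.h>

struct mc_header {
	unsigned int sig;
	unsigned int pf;
	int rev;
};

/* Assumed equivalent of revision_is_newer(): patch rev strictly newer than CPU's. */
static int revision_is_newer(const struct mc_header *hdr, int rev)
{
	return hdr->rev > rev;
}

/* Reduced stand-in for get_matching_sig(): exact sig plus overlapping pf bits. */
static int sig_matches(unsigned int csig, unsigned int cpf,
		       const struct mc_header *hdr)
{
	return hdr->sig == csig && (hdr->pf & cpf);
}

/* Shape of get_matching_microcode(csig, cpf, rev, mc) after the reordering. */
static int patch_matches(unsigned int csig, unsigned int cpf, int rev,
			 const struct mc_header *hdr)
{
	if (!revision_is_newer(hdr, rev))
		return 0;
	return sig_matches(csig, cpf, hdr);
}

int main(void)
{
	struct mc_header patch = { .sig = 0x306c3, .pf = 0x2, .rev = 0x1c };

	printf("matches: %d\n", patch_matches(0x306c3, 0x2, 0x10, &patch));
	return 0;
}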
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 36d99a337b49..3f20710a5b23 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh | |||
@@ -6,7 +6,7 @@ | |||
6 | IN=$1 | 6 | IN=$1 |
7 | OUT=$2 | 7 | OUT=$2 |
8 | 8 | ||
9 | function dump_array() | 9 | dump_array() |
10 | { | 10 | { |
11 | ARRAY=$1 | 11 | ARRAY=$1 |
12 | SIZE=$2 | 12 | SIZE=$2 |
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index a041e094b8b9..d76f13d6d8d6 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c | |||
@@ -404,11 +404,10 @@ static const struct file_operations mtrr_fops = { | |||
404 | static int mtrr_seq_show(struct seq_file *seq, void *offset) | 404 | static int mtrr_seq_show(struct seq_file *seq, void *offset) |
405 | { | 405 | { |
406 | char factor; | 406 | char factor; |
407 | int i, max, len; | 407 | int i, max; |
408 | mtrr_type type; | 408 | mtrr_type type; |
409 | unsigned long base, size; | 409 | unsigned long base, size; |
410 | 410 | ||
411 | len = 0; | ||
412 | max = num_var_ranges; | 411 | max = num_var_ranges; |
413 | for (i = 0; i < max; i++) { | 412 | for (i = 0; i < max; i++) { |
414 | mtrr_if->get(i, &base, &size, &type); | 413 | mtrr_if->get(i, &base, &size, &type); |
@@ -425,11 +424,10 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) | |||
425 | size >>= 20 - PAGE_SHIFT; | 424 | size >>= 20 - PAGE_SHIFT; |
426 | } | 425 | } |
427 | /* Base can be > 32bit */ | 426 | /* Base can be > 32bit */ |
428 | len += seq_printf(seq, "reg%02i: base=0x%06lx000 " | 427 | seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", |
429 | "(%5luMB), size=%5lu%cB, count=%d: %s\n", | 428 | i, base, base >> (20 - PAGE_SHIFT), |
430 | i, base, base >> (20 - PAGE_SHIFT), size, | 429 | size, factor, |
431 | factor, mtrr_usage_table[i], | 430 | mtrr_usage_table[i], mtrr_attrib_to_str(type)); |
432 | mtrr_attrib_to_str(type)); | ||
433 | } | 431 | } |
434 | return 0; | 432 | return 0; |
435 | } | 433 | } |
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b71a7f86d68a..87848ebe2bb7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c | |||
@@ -263,6 +263,14 @@ static void hw_perf_event_destroy(struct perf_event *event) | |||
263 | } | 263 | } |
264 | } | 264 | } |
265 | 265 | ||
266 | void hw_perf_lbr_event_destroy(struct perf_event *event) | ||
267 | { | ||
268 | hw_perf_event_destroy(event); | ||
269 | |||
270 | /* undo the lbr/bts event accounting */ | ||
271 | x86_del_exclusive(x86_lbr_exclusive_lbr); | ||
272 | } | ||
273 | |||
266 | static inline int x86_pmu_initialized(void) | 274 | static inline int x86_pmu_initialized(void) |
267 | { | 275 | { |
268 | return x86_pmu.handle_irq != NULL; | 276 | return x86_pmu.handle_irq != NULL; |
@@ -302,6 +310,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) | |||
302 | return x86_pmu_extra_regs(val, event); | 310 | return x86_pmu_extra_regs(val, event); |
303 | } | 311 | } |
304 | 312 | ||
313 | /* | ||
314 | * Check if we can create an event of a certain type (i.e. that no conflicting | ||
315 | * events are present). | ||
316 | */ | ||
317 | int x86_add_exclusive(unsigned int what) | ||
318 | { | ||
319 | int ret = -EBUSY, i; | ||
320 | |||
321 | if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) | ||
322 | return 0; | ||
323 | |||
324 | mutex_lock(&pmc_reserve_mutex); | ||
325 | for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) | ||
326 | if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) | ||
327 | goto out; | ||
328 | |||
329 | atomic_inc(&x86_pmu.lbr_exclusive[what]); | ||
330 | ret = 0; | ||
331 | |||
332 | out: | ||
333 | mutex_unlock(&pmc_reserve_mutex); | ||
334 | return ret; | ||
335 | } | ||
336 | |||
337 | void x86_del_exclusive(unsigned int what) | ||
338 | { | ||
339 | atomic_dec(&x86_pmu.lbr_exclusive[what]); | ||
340 | } | ||
341 | |||
305 | int x86_setup_perfctr(struct perf_event *event) | 342 | int x86_setup_perfctr(struct perf_event *event) |
306 | { | 343 | { |
307 | struct perf_event_attr *attr = &event->attr; | 344 | struct perf_event_attr *attr = &event->attr; |
@@ -346,6 +383,12 @@ int x86_setup_perfctr(struct perf_event *event) | |||
346 | /* BTS is currently only allowed for user-mode. */ | 383 | /* BTS is currently only allowed for user-mode. */ |
347 | if (!attr->exclude_kernel) | 384 | if (!attr->exclude_kernel) |
348 | return -EOPNOTSUPP; | 385 | return -EOPNOTSUPP; |
386 | |||
387 | /* disallow bts if conflicting events are present */ | ||
388 | if (x86_add_exclusive(x86_lbr_exclusive_lbr)) | ||
389 | return -EBUSY; | ||
390 | |||
391 | event->destroy = hw_perf_lbr_event_destroy; | ||
349 | } | 392 | } |
350 | 393 | ||
351 | hwc->config |= config; | 394 | hwc->config |= config; |
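Editor's note: x86_add_exclusive()/x86_del_exclusive() above implement a small reference-count scheme — grabbing one "slot" succeeds only if every other slot is unused, and the fast path skips the mutex when the same kind of event is already active, which is what the BTS setup path relies on. A userspace sketch of that scheme using C11 atomics and a pthread mutex in place of the kernel primitives (the two-slot layout is illustrative, not the kernel's):

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum { EXCL_LBR, EXCL_BTS, EXCL_MAX };	/* illustrative slot layout */

static atomic_int excl_cnt[EXCL_MAX];
static pthread_mutex_t excl_lock = PTHREAD_MUTEX_INITIALIZER;

/* Take a reference on @what, but only if no conflicting slot is in use. */
static int add_exclusive(int what)
{
	int ret = -EBUSY, i, cur;

	/* Fast path: this slot already has users, just bump its count. */
	cur = atomic_load(&excl_cnt[what]);
	while (cur > 0) {
		if (atomic_compare_exchange_weak(&excl_cnt[what], &cur, cur + 1))
			return 0;
	}

	pthread_mutex_lock(&excl_lock);
	for (i = 0; i < EXCL_MAX; i++)
		if (i != what && atomic_load(&excl_cnt[i]))
			goto out;

	atomic_fetch_add(&excl_cnt[what], 1);
	ret = 0;
out:
	pthread_mutex_unlock(&excl_lock);
	return ret;
}

static void del_exclusive(int what)
{
	atomic_fetch_sub(&excl_cnt[what], 1);
}

int main(void)
{
	printf("lbr: %d\n", add_exclusive(EXCL_LBR));	/* 0: granted */
	printf("bts: %d\n", add_exclusive(EXCL_BTS));	/* -EBUSY: conflicts */
	del_exclusive(EXCL_LBR);
	printf("bts: %d\n", add_exclusive(EXCL_BTS));	/* 0: granted now */
	return 0;
}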
@@ -399,39 +442,41 @@ int x86_pmu_hw_config(struct perf_event *event) | |||
399 | 442 | ||
400 | if (event->attr.precise_ip > precise) | 443 | if (event->attr.precise_ip > precise) |
401 | return -EOPNOTSUPP; | 444 | return -EOPNOTSUPP; |
402 | /* | 445 | } |
403 | * check that PEBS LBR correction does not conflict with | 446 | /* |
404 | * whatever the user is asking with attr->branch_sample_type | 447 | * check that PEBS LBR correction does not conflict with |
405 | */ | 448 | * whatever the user is asking with attr->branch_sample_type |
406 | if (event->attr.precise_ip > 1 && | 449 | */ |
407 | x86_pmu.intel_cap.pebs_format < 2) { | 450 | if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { |
408 | u64 *br_type = &event->attr.branch_sample_type; | 451 | u64 *br_type = &event->attr.branch_sample_type; |
409 | 452 | ||
410 | if (has_branch_stack(event)) { | 453 | if (has_branch_stack(event)) { |
411 | if (!precise_br_compat(event)) | 454 | if (!precise_br_compat(event)) |
412 | return -EOPNOTSUPP; | 455 | return -EOPNOTSUPP; |
413 | 456 | ||
414 | /* branch_sample_type is compatible */ | 457 | /* branch_sample_type is compatible */ |
415 | 458 | ||
416 | } else { | 459 | } else { |
417 | /* | 460 | /* |
418 | * user did not specify branch_sample_type | 461 | * user did not specify branch_sample_type |
419 | * | 462 | * |
420 | * For PEBS fixups, we capture all | 463 | * For PEBS fixups, we capture all |
421 | * the branches at the priv level of the | 464 | * the branches at the priv level of the |
422 | * event. | 465 | * event. |
423 | */ | 466 | */ |
424 | *br_type = PERF_SAMPLE_BRANCH_ANY; | 467 | *br_type = PERF_SAMPLE_BRANCH_ANY; |
425 | 468 | ||
426 | if (!event->attr.exclude_user) | 469 | if (!event->attr.exclude_user) |
427 | *br_type |= PERF_SAMPLE_BRANCH_USER; | 470 | *br_type |= PERF_SAMPLE_BRANCH_USER; |
428 | 471 | ||
429 | if (!event->attr.exclude_kernel) | 472 | if (!event->attr.exclude_kernel) |
430 | *br_type |= PERF_SAMPLE_BRANCH_KERNEL; | 473 | *br_type |= PERF_SAMPLE_BRANCH_KERNEL; |
431 | } | ||
432 | } | 474 | } |
433 | } | 475 | } |
434 | 476 | ||
477 | if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) | ||
478 | event->attach_state |= PERF_ATTACH_TASK_DATA; | ||
479 | |||
435 | /* | 480 | /* |
436 | * Generate PMC IRQs: | 481 | * Generate PMC IRQs: |
437 | * (keep 'enabled' bit clear for now) | 482 | * (keep 'enabled' bit clear for now) |
@@ -449,6 +494,12 @@ int x86_pmu_hw_config(struct perf_event *event) | |||
449 | if (event->attr.type == PERF_TYPE_RAW) | 494 | if (event->attr.type == PERF_TYPE_RAW) |
450 | event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; | 495 | event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; |
451 | 496 | ||
497 | if (event->attr.sample_period && x86_pmu.limit_period) { | ||
498 | if (x86_pmu.limit_period(event, event->attr.sample_period) > | ||
499 | event->attr.sample_period) | ||
500 | return -EINVAL; | ||
501 | } | ||
502 | |||
452 | return x86_setup_perfctr(event); | 503 | return x86_setup_perfctr(event); |
453 | } | 504 | } |
454 | 505 | ||
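Editor's note: the new limit_period hook is used twice — at event-creation time to reject a fixed sample_period the PMU would silently have to raise, and at set-period time to clamp whatever value was computed. A small sketch of those two uses around a quirk that rounds the period up to some granularity (the quirk itself is made up purely for illustration):

#include <stdio.h>

#define MIN_PERIOD 128ULL	/* hypothetical hardware granularity */

/* Hypothetical quirk: round the period up to the supported minimum. */
static unsigned long long limit_period(unsigned long long period)
{
	return period < MIN_PERIOD ? MIN_PERIOD : period;
}

/* hw_config-time check: a fixed period the quirk would enlarge is rejected. */
static int validate_period(unsigned long long requested)
{
	if (limit_period(requested) > requested)
		return -1;	/* -EINVAL in the kernel */
	return 0;
}

/* set-period-time clamp: whatever "left" was computed gets adjusted. */
static unsigned long long program_period(unsigned long long left)
{
	return limit_period(left);
}

int main(void)
{
	printf("period 64:   validate=%d\n", validate_period(64));
	printf("period 4096: validate=%d, programmed=%llu\n",
	       validate_period(4096), program_period(4096));
	return 0;
}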
@@ -728,14 +779,17 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) | |||
728 | struct event_constraint *c; | 779 | struct event_constraint *c; |
729 | unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; | 780 | unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; |
730 | struct perf_event *e; | 781 | struct perf_event *e; |
731 | int i, wmin, wmax, num = 0; | 782 | int i, wmin, wmax, unsched = 0; |
732 | struct hw_perf_event *hwc; | 783 | struct hw_perf_event *hwc; |
733 | 784 | ||
734 | bitmap_zero(used_mask, X86_PMC_IDX_MAX); | 785 | bitmap_zero(used_mask, X86_PMC_IDX_MAX); |
735 | 786 | ||
787 | if (x86_pmu.start_scheduling) | ||
788 | x86_pmu.start_scheduling(cpuc); | ||
789 | |||
736 | for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { | 790 | for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { |
737 | hwc = &cpuc->event_list[i]->hw; | 791 | hwc = &cpuc->event_list[i]->hw; |
738 | c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); | 792 | c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); |
739 | hwc->constraint = c; | 793 | hwc->constraint = c; |
740 | 794 | ||
741 | wmin = min(wmin, c->weight); | 795 | wmin = min(wmin, c->weight); |
@@ -768,24 +822,30 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) | |||
768 | 822 | ||
769 | /* slow path */ | 823 | /* slow path */ |
770 | if (i != n) | 824 | if (i != n) |
771 | num = perf_assign_events(cpuc->event_list, n, wmin, | 825 | unsched = perf_assign_events(cpuc->event_list, n, wmin, |
772 | wmax, assign); | 826 | wmax, assign); |
773 | 827 | ||
774 | /* | 828 | /* |
775 | * Mark the event as committed, so we do not put_constraint() | 829 | * In case of success (unsched = 0), mark events as committed, |
776 | * in case new events are added and fail scheduling. | 830 | * so we do not put_constraint() in case new events are added |
831 | * and fail to be scheduled | ||
832 | * | ||
833 | * We invoke the lower level commit callback to lock the resource | ||
834 | * | ||
835 | * We do not need to do all of this in case we are called to | ||
836 | * validate an event group (assign == NULL) | ||
777 | */ | 837 | */ |
778 | if (!num && assign) { | 838 | if (!unsched && assign) { |
779 | for (i = 0; i < n; i++) { | 839 | for (i = 0; i < n; i++) { |
780 | e = cpuc->event_list[i]; | 840 | e = cpuc->event_list[i]; |
781 | e->hw.flags |= PERF_X86_EVENT_COMMITTED; | 841 | e->hw.flags |= PERF_X86_EVENT_COMMITTED; |
842 | if (x86_pmu.commit_scheduling) | ||
843 | x86_pmu.commit_scheduling(cpuc, e, assign[i]); | ||
782 | } | 844 | } |
783 | } | 845 | } |
784 | /* | 846 | |
785 | * scheduling failed or is just a simulation, | 847 | if (!assign || unsched) { |
786 | * free resources if necessary | 848 | |
787 | */ | ||
788 | if (!assign || num) { | ||
789 | for (i = 0; i < n; i++) { | 849 | for (i = 0; i < n; i++) { |
790 | e = cpuc->event_list[i]; | 850 | e = cpuc->event_list[i]; |
791 | /* | 851 | /* |
@@ -795,11 +855,18 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) | |||
795 | if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) | 855 | if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) |
796 | continue; | 856 | continue; |
797 | 857 | ||
858 | /* | ||
859 | * release events that failed scheduling | ||
860 | */ | ||
798 | if (x86_pmu.put_event_constraints) | 861 | if (x86_pmu.put_event_constraints) |
799 | x86_pmu.put_event_constraints(cpuc, e); | 862 | x86_pmu.put_event_constraints(cpuc, e); |
800 | } | 863 | } |
801 | } | 864 | } |
802 | return num ? -EINVAL : 0; | 865 | |
866 | if (x86_pmu.stop_scheduling) | ||
867 | x86_pmu.stop_scheduling(cpuc); | ||
868 | |||
869 | return unsched ? -EINVAL : 0; | ||
803 | } | 870 | } |
804 | 871 | ||
805 | /* | 872 | /* |
@@ -986,6 +1053,9 @@ int x86_perf_event_set_period(struct perf_event *event) | |||
986 | if (left > x86_pmu.max_period) | 1053 | if (left > x86_pmu.max_period) |
987 | left = x86_pmu.max_period; | 1054 | left = x86_pmu.max_period; |
988 | 1055 | ||
1056 | if (x86_pmu.limit_period) | ||
1057 | left = x86_pmu.limit_period(event, left); | ||
1058 | |||
989 | per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; | 1059 | per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; |
990 | 1060 | ||
991 | /* | 1061 | /* |
@@ -1033,7 +1103,6 @@ static int x86_pmu_add(struct perf_event *event, int flags) | |||
1033 | 1103 | ||
1034 | hwc = &event->hw; | 1104 | hwc = &event->hw; |
1035 | 1105 | ||
1036 | perf_pmu_disable(event->pmu); | ||
1037 | n0 = cpuc->n_events; | 1106 | n0 = cpuc->n_events; |
1038 | ret = n = collect_events(cpuc, event, false); | 1107 | ret = n = collect_events(cpuc, event, false); |
1039 | if (ret < 0) | 1108 | if (ret < 0) |
@@ -1071,7 +1140,6 @@ done_collect: | |||
1071 | 1140 | ||
1072 | ret = 0; | 1141 | ret = 0; |
1073 | out: | 1142 | out: |
1074 | perf_pmu_enable(event->pmu); | ||
1075 | return ret; | 1143 | return ret; |
1076 | } | 1144 | } |
1077 | 1145 | ||
@@ -1103,7 +1171,7 @@ static void x86_pmu_start(struct perf_event *event, int flags) | |||
1103 | void perf_event_print_debug(void) | 1171 | void perf_event_print_debug(void) |
1104 | { | 1172 | { |
1105 | u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; | 1173 | u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; |
1106 | u64 pebs; | 1174 | u64 pebs, debugctl; |
1107 | struct cpu_hw_events *cpuc; | 1175 | struct cpu_hw_events *cpuc; |
1108 | unsigned long flags; | 1176 | unsigned long flags; |
1109 | int cpu, idx; | 1177 | int cpu, idx; |
@@ -1121,14 +1189,20 @@ void perf_event_print_debug(void) | |||
1121 | rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); | 1189 | rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); |
1122 | rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); | 1190 | rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); |
1123 | rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); | 1191 | rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); |
1124 | rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); | ||
1125 | 1192 | ||
1126 | pr_info("\n"); | 1193 | pr_info("\n"); |
1127 | pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); | 1194 | pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); |
1128 | pr_info("CPU#%d: status: %016llx\n", cpu, status); | 1195 | pr_info("CPU#%d: status: %016llx\n", cpu, status); |
1129 | pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); | 1196 | pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); |
1130 | pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); | 1197 | pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); |
1131 | pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); | 1198 | if (x86_pmu.pebs_constraints) { |
1199 | rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); | ||
1200 | pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); | ||
1201 | } | ||
1202 | if (x86_pmu.lbr_nr) { | ||
1203 | rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); | ||
1204 | pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); | ||
1205 | } | ||
1132 | } | 1206 | } |
1133 | pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); | 1207 | pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); |
1134 | 1208 | ||
@@ -1321,11 +1395,12 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) | |||
1321 | { | 1395 | { |
1322 | unsigned int cpu = (long)hcpu; | 1396 | unsigned int cpu = (long)hcpu; |
1323 | struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); | 1397 | struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); |
1324 | int ret = NOTIFY_OK; | 1398 | int i, ret = NOTIFY_OK; |
1325 | 1399 | ||
1326 | switch (action & ~CPU_TASKS_FROZEN) { | 1400 | switch (action & ~CPU_TASKS_FROZEN) { |
1327 | case CPU_UP_PREPARE: | 1401 | case CPU_UP_PREPARE: |
1328 | cpuc->kfree_on_online = NULL; | 1402 | for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) |
1403 | cpuc->kfree_on_online[i] = NULL; | ||
1329 | if (x86_pmu.cpu_prepare) | 1404 | if (x86_pmu.cpu_prepare) |
1330 | ret = x86_pmu.cpu_prepare(cpu); | 1405 | ret = x86_pmu.cpu_prepare(cpu); |
1331 | break; | 1406 | break; |
@@ -1336,7 +1411,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) | |||
1336 | break; | 1411 | break; |
1337 | 1412 | ||
1338 | case CPU_ONLINE: | 1413 | case CPU_ONLINE: |
1339 | kfree(cpuc->kfree_on_online); | 1414 | for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { |
1415 | kfree(cpuc->kfree_on_online[i]); | ||
1416 | cpuc->kfree_on_online[i] = NULL; | ||
1417 | } | ||
1340 | break; | 1418 | break; |
1341 | 1419 | ||
1342 | case CPU_DYING: | 1420 | case CPU_DYING: |
@@ -1712,7 +1790,7 @@ static int validate_event(struct perf_event *event) | |||
1712 | if (IS_ERR(fake_cpuc)) | 1790 | if (IS_ERR(fake_cpuc)) |
1713 | return PTR_ERR(fake_cpuc); | 1791 | return PTR_ERR(fake_cpuc); |
1714 | 1792 | ||
1715 | c = x86_pmu.get_event_constraints(fake_cpuc, event); | 1793 | c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); |
1716 | 1794 | ||
1717 | if (!c || !c->weight) | 1795 | if (!c || !c->weight) |
1718 | ret = -EINVAL; | 1796 | ret = -EINVAL; |
@@ -1914,10 +1992,10 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { | |||
1914 | NULL, | 1992 | NULL, |
1915 | }; | 1993 | }; |
1916 | 1994 | ||
1917 | static void x86_pmu_flush_branch_stack(void) | 1995 | static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) |
1918 | { | 1996 | { |
1919 | if (x86_pmu.flush_branch_stack) | 1997 | if (x86_pmu.sched_task) |
1920 | x86_pmu.flush_branch_stack(); | 1998 | x86_pmu.sched_task(ctx, sched_in); |
1921 | } | 1999 | } |
1922 | 2000 | ||
1923 | void perf_check_microcode(void) | 2001 | void perf_check_microcode(void) |
@@ -1949,7 +2027,8 @@ static struct pmu pmu = { | |||
1949 | .commit_txn = x86_pmu_commit_txn, | 2027 | .commit_txn = x86_pmu_commit_txn, |
1950 | 2028 | ||
1951 | .event_idx = x86_pmu_event_idx, | 2029 | .event_idx = x86_pmu_event_idx, |
1952 | .flush_branch_stack = x86_pmu_flush_branch_stack, | 2030 | .sched_task = x86_pmu_sched_task, |
2031 | .task_ctx_size = sizeof(struct x86_perf_task_context), | ||
1953 | }; | 2032 | }; |
1954 | 2033 | ||
1955 | void arch_perf_update_userpage(struct perf_event *event, | 2034 | void arch_perf_update_userpage(struct perf_event *event, |
@@ -1968,13 +2047,23 @@ void arch_perf_update_userpage(struct perf_event *event, | |||
1968 | 2047 | ||
1969 | data = cyc2ns_read_begin(); | 2048 | data = cyc2ns_read_begin(); |
1970 | 2049 | ||
2050 | /* | ||
2051 | * Internal timekeeping for enabled/running/stopped times | ||
2052 | * is always in the local_clock domain. | ||
2053 | */ | ||
1971 | userpg->cap_user_time = 1; | 2054 | userpg->cap_user_time = 1; |
1972 | userpg->time_mult = data->cyc2ns_mul; | 2055 | userpg->time_mult = data->cyc2ns_mul; |
1973 | userpg->time_shift = data->cyc2ns_shift; | 2056 | userpg->time_shift = data->cyc2ns_shift; |
1974 | userpg->time_offset = data->cyc2ns_offset - now; | 2057 | userpg->time_offset = data->cyc2ns_offset - now; |
1975 | 2058 | ||
1976 | userpg->cap_user_time_zero = 1; | 2059 | /* |
1977 | userpg->time_zero = data->cyc2ns_offset; | 2060 | * cap_user_time_zero doesn't make sense when we're using a different |
2061 | * time base for the records. | ||
2062 | */ | ||
2063 | if (event->clock == &local_clock) { | ||
2064 | userpg->cap_user_time_zero = 1; | ||
2065 | userpg->time_zero = data->cyc2ns_offset; | ||
2066 | } | ||
1978 | 2067 | ||
1979 | cyc2ns_read_end(data); | 2068 | cyc2ns_read_end(data); |
1980 | } | 2069 | } |
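For context on how these exported fields are meant to be consumed: a minimal user-space sketch of the TSC-to-nanosecond conversion implied by cap_user_time. Field names follow the perf mmap-page ABI; the helper itself is illustrative and not part of the patch.

#include <stdint.h>

/* Illustrative only: combine the time_mult/time_shift/time_offset fields
 * exported above to turn a raw TSC value into nanoseconds.  The multiply is
 * split into quotient and remainder so the 64x32 product cannot overflow,
 * mirroring the convention documented for the perf mmap page. */
static uint64_t tsc_to_ns(uint64_t cyc, uint32_t time_mult,
			  uint16_t time_shift, uint64_t time_offset)
{
	uint64_t quot = cyc >> time_shift;
	uint64_t rem  = cyc & (((uint64_t)1 << time_shift) - 1);

	return time_offset + quot * time_mult + ((rem * time_mult) >> time_shift);
}

When cap_user_time_zero is also set (after this change, only for events using local_clock), time_zero is roughly the nanosecond value corresponding to TSC == 0, so sample timestamps can be reconstructed the same way.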
@@ -2147,24 +2236,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) | |||
2147 | static unsigned long code_segment_base(struct pt_regs *regs) | 2236 | static unsigned long code_segment_base(struct pt_regs *regs) |
2148 | { | 2237 | { |
2149 | /* | 2238 | /* |
2239 | * For IA32 we look at the GDT/LDT segment base to convert the | ||
2240 | * effective IP to a linear address. | ||
2241 | */ | ||
2242 | |||
2243 | #ifdef CONFIG_X86_32 | ||
2244 | /* | ||
2150 | * If we are in VM86 mode, add the segment offset to convert to a | 2245 | * If we are in VM86 mode, add the segment offset to convert to a |
2151 | * linear address. | 2246 | * linear address. |
2152 | */ | 2247 | */ |
2153 | if (regs->flags & X86_VM_MASK) | 2248 | if (regs->flags & X86_VM_MASK) |
2154 | return 0x10 * regs->cs; | 2249 | return 0x10 * regs->cs; |
2155 | 2250 | ||
2156 | /* | ||
2157 | * For IA32 we look at the GDT/LDT segment base to convert the | ||
2158 | * effective IP to a linear address. | ||
2159 | */ | ||
2160 | #ifdef CONFIG_X86_32 | ||
2161 | if (user_mode(regs) && regs->cs != __USER_CS) | 2251 | if (user_mode(regs) && regs->cs != __USER_CS) |
2162 | return get_segment_base(regs->cs); | 2252 | return get_segment_base(regs->cs); |
2163 | #else | 2253 | #else |
2164 | if (test_thread_flag(TIF_IA32)) { | 2254 | if (user_mode(regs) && !user_64bit_mode(regs) && |
2165 | if (user_mode(regs) && regs->cs != __USER32_CS) | 2255 | regs->cs != __USER32_CS) |
2166 | return get_segment_base(regs->cs); | 2256 | return get_segment_base(regs->cs); |
2167 | } | ||
2168 | #endif | 2257 | #endif |
2169 | return 0; | 2258 | return 0; |
2170 | } | 2259 | } |
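A small usage sketch (the wrapper name is hypothetical): the linear sample IP is simply the effective IP plus the base returned above, which is 0 in the common flat-segment case.

/* hypothetical wrapper: linear instruction pointer for a perf sample */
static unsigned long sample_linear_ip(struct pt_regs *regs)
{
	return regs->ip + code_segment_base(regs);
}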
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index df525d2be1e8..6ac5cb7a9e14 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h | |||
@@ -65,13 +65,15 @@ struct event_constraint { | |||
65 | /* | 65 | /* |
66 | * struct hw_perf_event.flags flags | 66 | * struct hw_perf_event.flags flags |
67 | */ | 67 | */ |
68 | #define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */ | 68 | #define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ |
69 | #define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */ | 69 | #define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ |
70 | #define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style datala, store */ | 70 | #define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ |
71 | #define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ | 71 | #define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */ |
72 | #define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ | 72 | #define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */ |
73 | #define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ | 73 | #define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */ |
74 | #define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */ | 74 | #define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ |
75 | #define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ | ||
76 | #define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ | ||
75 | 77 | ||
76 | 78 | ||
77 | struct amd_nb { | 79 | struct amd_nb { |
@@ -123,8 +125,37 @@ struct intel_shared_regs { | |||
123 | unsigned core_id; /* per-core: core id */ | 125 | unsigned core_id; /* per-core: core id */ |
124 | }; | 126 | }; |
125 | 127 | ||
128 | enum intel_excl_state_type { | ||
129 | INTEL_EXCL_UNUSED = 0, /* counter is unused */ | ||
130 | INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */ | ||
131 | INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */ | ||
132 | }; | ||
133 | |||
134 | struct intel_excl_states { | ||
135 | enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; | ||
136 | enum intel_excl_state_type state[X86_PMC_IDX_MAX]; | ||
137 | int num_alloc_cntrs;/* #counters allocated */ | ||
138 | int max_alloc_cntrs;/* max #counters allowed */ | ||
139 | bool sched_started; /* true if scheduling has started */ | ||
140 | }; | ||
141 | |||
142 | struct intel_excl_cntrs { | ||
143 | raw_spinlock_t lock; | ||
144 | |||
145 | struct intel_excl_states states[2]; | ||
146 | |||
147 | int refcnt; /* per-core: #HT threads */ | ||
148 | unsigned core_id; /* per-core: core id */ | ||
149 | }; | ||
150 | |||
126 | #define MAX_LBR_ENTRIES 16 | 151 | #define MAX_LBR_ENTRIES 16 |
127 | 152 | ||
153 | enum { | ||
154 | X86_PERF_KFREE_SHARED = 0, | ||
155 | X86_PERF_KFREE_EXCL = 1, | ||
156 | X86_PERF_KFREE_MAX | ||
157 | }; | ||
158 | |||
128 | struct cpu_hw_events { | 159 | struct cpu_hw_events { |
129 | /* | 160 | /* |
130 | * Generic x86 PMC bits | 161 | * Generic x86 PMC bits |
@@ -179,6 +210,12 @@ struct cpu_hw_events { | |||
179 | * used on Intel NHM/WSM/SNB | 210 | * used on Intel NHM/WSM/SNB |
180 | */ | 211 | */ |
181 | struct intel_shared_regs *shared_regs; | 212 | struct intel_shared_regs *shared_regs; |
213 | /* | ||
214 | * manage exclusive counter access between hyperthreads | ||
215 | */ | ||
216 | struct event_constraint *constraint_list; /* in enable order */ | ||
217 | struct intel_excl_cntrs *excl_cntrs; | ||
218 | int excl_thread_id; /* 0 or 1 */ | ||
182 | 219 | ||
183 | /* | 220 | /* |
184 | * AMD specific bits | 221 | * AMD specific bits |
@@ -187,7 +224,7 @@ struct cpu_hw_events { | |||
187 | /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ | 224 | /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ |
188 | u64 perf_ctr_virt_mask; | 225 | u64 perf_ctr_virt_mask; |
189 | 226 | ||
190 | void *kfree_on_online; | 227 | void *kfree_on_online[X86_PERF_KFREE_MAX]; |
191 | }; | 228 | }; |
192 | 229 | ||
193 | #define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ | 230 | #define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ |
@@ -202,6 +239,10 @@ struct cpu_hw_events { | |||
202 | #define EVENT_CONSTRAINT(c, n, m) \ | 239 | #define EVENT_CONSTRAINT(c, n, m) \ |
203 | __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) | 240 | __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) |
204 | 241 | ||
242 | #define INTEL_EXCLEVT_CONSTRAINT(c, n) \ | ||
243 | __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ | ||
244 | 0, PERF_X86_EVENT_EXCL) | ||
245 | |||
205 | /* | 246 | /* |
206 | * The overlap flag marks event constraints with overlapping counter | 247 | * The overlap flag marks event constraints with overlapping counter |
207 | * masks. This is the case if the counter mask of such an event is not | 248 | * masks. This is the case if the counter mask of such an event is not |
@@ -259,6 +300,10 @@ struct cpu_hw_events { | |||
259 | #define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ | 300 | #define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ |
260 | EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) | 301 | EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) |
261 | 302 | ||
303 | #define INTEL_EXCLUEVT_CONSTRAINT(c, n) \ | ||
304 | __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ | ||
305 | HWEIGHT(n), 0, PERF_X86_EVENT_EXCL) | ||
306 | |||
262 | #define INTEL_PLD_CONSTRAINT(c, n) \ | 307 | #define INTEL_PLD_CONSTRAINT(c, n) \ |
263 | __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ | 308 | __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ |
264 | HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) | 309 | HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) |
@@ -283,22 +328,40 @@ struct cpu_hw_events { | |||
283 | 328 | ||
284 | /* Check flags and event code, and set the HSW load flag */ | 329 | /* Check flags and event code, and set the HSW load flag */ |
285 | #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \ | 330 | #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \ |
286 | __EVENT_CONSTRAINT(code, n, \ | 331 | __EVENT_CONSTRAINT(code, n, \ |
287 | ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ | 332 | ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ |
288 | HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) | 333 | HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) |
289 | 334 | ||
335 | #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \ | ||
336 | __EVENT_CONSTRAINT(code, n, \ | ||
337 | ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ | ||
338 | HWEIGHT(n), 0, \ | ||
339 | PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) | ||
340 | |||
290 | /* Check flags and event code/umask, and set the HSW store flag */ | 341 | /* Check flags and event code/umask, and set the HSW store flag */ |
291 | #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \ | 342 | #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \ |
292 | __EVENT_CONSTRAINT(code, n, \ | 343 | __EVENT_CONSTRAINT(code, n, \ |
293 | INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ | 344 | INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ |
294 | HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) | 345 | HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) |
295 | 346 | ||
347 | #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \ | ||
348 | __EVENT_CONSTRAINT(code, n, \ | ||
349 | INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ | ||
350 | HWEIGHT(n), 0, \ | ||
351 | PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL) | ||
352 | |||
296 | /* Check flags and event code/umask, and set the HSW load flag */ | 353 | /* Check flags and event code/umask, and set the HSW load flag */ |
297 | #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \ | 354 | #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \ |
298 | __EVENT_CONSTRAINT(code, n, \ | 355 | __EVENT_CONSTRAINT(code, n, \ |
299 | INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ | 356 | INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ |
300 | HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) | 357 | HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) |
301 | 358 | ||
359 | #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \ | ||
360 | __EVENT_CONSTRAINT(code, n, \ | ||
361 | INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ | ||
362 | HWEIGHT(n), 0, \ | ||
363 | PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) | ||
364 | |||
302 | /* Check flags and event code/umask, and set the HSW N/A flag */ | 365 | /* Check flags and event code/umask, and set the HSW N/A flag */ |
303 | #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ | 366 | #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ |
304 | __EVENT_CONSTRAINT(code, n, \ | 367 | __EVENT_CONSTRAINT(code, n, \ |
@@ -408,6 +471,13 @@ union x86_pmu_config { | |||
408 | 471 | ||
409 | #define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value | 472 | #define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value |
410 | 473 | ||
474 | enum { | ||
475 | x86_lbr_exclusive_lbr, | ||
476 | x86_lbr_exclusive_bts, | ||
477 | x86_lbr_exclusive_pt, | ||
478 | x86_lbr_exclusive_max, | ||
479 | }; | ||
480 | |||
411 | /* | 481 | /* |
412 | * struct x86_pmu - generic x86 pmu | 482 | * struct x86_pmu - generic x86 pmu |
413 | */ | 483 | */ |
@@ -443,14 +513,25 @@ struct x86_pmu { | |||
443 | u64 max_period; | 513 | u64 max_period; |
444 | struct event_constraint * | 514 | struct event_constraint * |
445 | (*get_event_constraints)(struct cpu_hw_events *cpuc, | 515 | (*get_event_constraints)(struct cpu_hw_events *cpuc, |
516 | int idx, | ||
446 | struct perf_event *event); | 517 | struct perf_event *event); |
447 | 518 | ||
448 | void (*put_event_constraints)(struct cpu_hw_events *cpuc, | 519 | void (*put_event_constraints)(struct cpu_hw_events *cpuc, |
449 | struct perf_event *event); | 520 | struct perf_event *event); |
521 | |||
522 | void (*commit_scheduling)(struct cpu_hw_events *cpuc, | ||
523 | struct perf_event *event, | ||
524 | int cntr); | ||
525 | |||
526 | void (*start_scheduling)(struct cpu_hw_events *cpuc); | ||
527 | |||
528 | void (*stop_scheduling)(struct cpu_hw_events *cpuc); | ||
529 | |||
450 | struct event_constraint *event_constraints; | 530 | struct event_constraint *event_constraints; |
451 | struct x86_pmu_quirk *quirks; | 531 | struct x86_pmu_quirk *quirks; |
452 | int perfctr_second_write; | 532 | int perfctr_second_write; |
453 | bool late_ack; | 533 | bool late_ack; |
534 | unsigned (*limit_period)(struct perf_event *event, unsigned l); | ||
454 | 535 | ||
455 | /* | 536 | /* |
456 | * sysfs attrs | 537 | * sysfs attrs |
@@ -472,7 +553,8 @@ struct x86_pmu { | |||
472 | void (*cpu_dead)(int cpu); | 553 | void (*cpu_dead)(int cpu); |
473 | 554 | ||
474 | void (*check_microcode)(void); | 555 | void (*check_microcode)(void); |
475 | void (*flush_branch_stack)(void); | 556 | void (*sched_task)(struct perf_event_context *ctx, |
557 | bool sched_in); | ||
476 | 558 | ||
477 | /* | 559 | /* |
478 | * Intel Arch Perfmon v2+ | 560 | * Intel Arch Perfmon v2+ |
@@ -504,10 +586,15 @@ struct x86_pmu { | |||
504 | bool lbr_double_abort; /* duplicated lbr aborts */ | 586 | bool lbr_double_abort; /* duplicated lbr aborts */ |
505 | 587 | ||
506 | /* | 588 | /* |
589 | * Intel PT/LBR/BTS are exclusive | ||
590 | */ | ||
591 | atomic_t lbr_exclusive[x86_lbr_exclusive_max]; | ||
592 | |||
593 | /* | ||
507 | * Extra registers for events | 594 | * Extra registers for events |
508 | */ | 595 | */ |
509 | struct extra_reg *extra_regs; | 596 | struct extra_reg *extra_regs; |
510 | unsigned int er_flags; | 597 | unsigned int flags; |
511 | 598 | ||
512 | /* | 599 | /* |
513 | * Intel host/guest support (KVM) | 600 | * Intel host/guest support (KVM) |
@@ -515,6 +602,13 @@ struct x86_pmu { | |||
515 | struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); | 602 | struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); |
516 | }; | 603 | }; |
517 | 604 | ||
605 | struct x86_perf_task_context { | ||
606 | u64 lbr_from[MAX_LBR_ENTRIES]; | ||
607 | u64 lbr_to[MAX_LBR_ENTRIES]; | ||
608 | int lbr_callstack_users; | ||
609 | int lbr_stack_state; | ||
610 | }; | ||
611 | |||
518 | #define x86_add_quirk(func_) \ | 612 | #define x86_add_quirk(func_) \ |
519 | do { \ | 613 | do { \ |
520 | static struct x86_pmu_quirk __quirk __initdata = { \ | 614 | static struct x86_pmu_quirk __quirk __initdata = { \ |
@@ -524,8 +618,13 @@ do { \ | |||
524 | x86_pmu.quirks = &__quirk; \ | 618 | x86_pmu.quirks = &__quirk; \ |
525 | } while (0) | 619 | } while (0) |
526 | 620 | ||
527 | #define ERF_NO_HT_SHARING 1 | 621 | /* |
528 | #define ERF_HAS_RSP_1 2 | 622 | * x86_pmu flags |
623 | */ | ||
624 | #define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */ | ||
625 | #define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ | ||
626 | #define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ | ||
627 | #define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ | ||
529 | 628 | ||
530 | #define EVENT_VAR(_id) event_attr_##_id | 629 | #define EVENT_VAR(_id) event_attr_##_id |
531 | #define EVENT_PTR(_id) &event_attr_##_id.attr.attr | 630 | #define EVENT_PTR(_id) &event_attr_##_id.attr.attr |
@@ -546,6 +645,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \ | |||
546 | 645 | ||
547 | extern struct x86_pmu x86_pmu __read_mostly; | 646 | extern struct x86_pmu x86_pmu __read_mostly; |
548 | 647 | ||
648 | static inline bool x86_pmu_has_lbr_callstack(void) | ||
649 | { | ||
650 | return x86_pmu.lbr_sel_map && | ||
651 | x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0; | ||
652 | } | ||
653 | |||
549 | DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); | 654 | DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); |
550 | 655 | ||
551 | int x86_perf_event_set_period(struct perf_event *event); | 656 | int x86_perf_event_set_period(struct perf_event *event); |
@@ -588,6 +693,12 @@ static inline int x86_pmu_rdpmc_index(int index) | |||
588 | return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; | 693 | return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; |
589 | } | 694 | } |
590 | 695 | ||
696 | int x86_add_exclusive(unsigned int what); | ||
697 | |||
698 | void x86_del_exclusive(unsigned int what); | ||
699 | |||
700 | void hw_perf_lbr_event_destroy(struct perf_event *event); | ||
701 | |||
591 | int x86_setup_perfctr(struct perf_event *event); | 702 | int x86_setup_perfctr(struct perf_event *event); |
592 | 703 | ||
593 | int x86_pmu_hw_config(struct perf_event *event); | 704 | int x86_pmu_hw_config(struct perf_event *event); |
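The two helpers declared above arbitrate ownership of the shared branch-tracing hardware between LBR, BTS and PT (see the x86_lbr_exclusive enum earlier in this header). A hedged usage sketch, assuming the conventional zero-on-success return style, which this hunk does not show:

/* sketch: claim the LBR side of the shared facility before enabling it,
 * and release it again when the event is destroyed */
static int example_lbr_event_init(struct perf_event *event)
{
	if (x86_add_exclusive(x86_lbr_exclusive_lbr))
		return -EBUSY;	/* assumed: non-zero means BTS or PT owns it */

	/* hw_perf_lbr_event_destroy() is assumed to call x86_del_exclusive() */
	event->destroy = hw_perf_lbr_event_destroy;
	return 0;
}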
@@ -674,10 +785,34 @@ static inline int amd_pmu_init(void) | |||
674 | 785 | ||
675 | #ifdef CONFIG_CPU_SUP_INTEL | 786 | #ifdef CONFIG_CPU_SUP_INTEL |
676 | 787 | ||
788 | static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) | ||
789 | { | ||
790 | /* user explicitly requested branch sampling */ | ||
791 | if (has_branch_stack(event)) | ||
792 | return true; | ||
793 | |||
794 | /* implicit branch sampling to correct PEBS skid */ | ||
795 | if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && | ||
796 | x86_pmu.intel_cap.pebs_format < 2) | ||
797 | return true; | ||
798 | |||
799 | return false; | ||
800 | } | ||
801 | |||
802 | static inline bool intel_pmu_has_bts(struct perf_event *event) | ||
803 | { | ||
804 | if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && | ||
805 | !event->attr.freq && event->hw.sample_period == 1) | ||
806 | return true; | ||
807 | |||
808 | return false; | ||
809 | } | ||
810 | |||
677 | int intel_pmu_save_and_restart(struct perf_event *event); | 811 | int intel_pmu_save_and_restart(struct perf_event *event); |
678 | 812 | ||
679 | struct event_constraint * | 813 | struct event_constraint * |
680 | x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event); | 814 | x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, |
815 | struct perf_event *event); | ||
681 | 816 | ||
682 | struct intel_shared_regs *allocate_shared_regs(int cpu); | 817 | struct intel_shared_regs *allocate_shared_regs(int cpu); |
683 | 818 | ||
@@ -727,13 +862,15 @@ void intel_pmu_pebs_disable_all(void); | |||
727 | 862 | ||
728 | void intel_ds_init(void); | 863 | void intel_ds_init(void); |
729 | 864 | ||
865 | void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); | ||
866 | |||
730 | void intel_pmu_lbr_reset(void); | 867 | void intel_pmu_lbr_reset(void); |
731 | 868 | ||
732 | void intel_pmu_lbr_enable(struct perf_event *event); | 869 | void intel_pmu_lbr_enable(struct perf_event *event); |
733 | 870 | ||
734 | void intel_pmu_lbr_disable(struct perf_event *event); | 871 | void intel_pmu_lbr_disable(struct perf_event *event); |
735 | 872 | ||
736 | void intel_pmu_lbr_enable_all(void); | 873 | void intel_pmu_lbr_enable_all(bool pmi); |
737 | 874 | ||
738 | void intel_pmu_lbr_disable_all(void); | 875 | void intel_pmu_lbr_disable_all(void); |
739 | 876 | ||
@@ -747,8 +884,18 @@ void intel_pmu_lbr_init_atom(void); | |||
747 | 884 | ||
748 | void intel_pmu_lbr_init_snb(void); | 885 | void intel_pmu_lbr_init_snb(void); |
749 | 886 | ||
887 | void intel_pmu_lbr_init_hsw(void); | ||
888 | |||
750 | int intel_pmu_setup_lbr_filter(struct perf_event *event); | 889 | int intel_pmu_setup_lbr_filter(struct perf_event *event); |
751 | 890 | ||
891 | void intel_pt_interrupt(void); | ||
892 | |||
893 | int intel_bts_interrupt(void); | ||
894 | |||
895 | void intel_bts_enable_local(void); | ||
896 | |||
897 | void intel_bts_disable_local(void); | ||
898 | |||
752 | int p4_pmu_init(void); | 899 | int p4_pmu_init(void); |
753 | 900 | ||
754 | int p6_pmu_init(void); | 901 | int p6_pmu_init(void); |
@@ -758,6 +905,10 @@ int knc_pmu_init(void); | |||
758 | ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, | 905 | ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, |
759 | char *page); | 906 | char *page); |
760 | 907 | ||
908 | static inline int is_ht_workaround_enabled(void) | ||
909 | { | ||
910 | return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); | ||
911 | } | ||
761 | #else /* CONFIG_CPU_SUP_INTEL */ | 912 | #else /* CONFIG_CPU_SUP_INTEL */ |
762 | 913 | ||
763 | static inline void reserve_ds_buffers(void) | 914 | static inline void reserve_ds_buffers(void) |
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 28926311aac1..1cee5d2d7ece 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c | |||
@@ -382,6 +382,7 @@ static int amd_pmu_cpu_prepare(int cpu) | |||
382 | static void amd_pmu_cpu_starting(int cpu) | 382 | static void amd_pmu_cpu_starting(int cpu) |
383 | { | 383 | { |
384 | struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); | 384 | struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); |
385 | void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; | ||
385 | struct amd_nb *nb; | 386 | struct amd_nb *nb; |
386 | int i, nb_id; | 387 | int i, nb_id; |
387 | 388 | ||
@@ -399,7 +400,7 @@ static void amd_pmu_cpu_starting(int cpu) | |||
399 | continue; | 400 | continue; |
400 | 401 | ||
401 | if (nb->nb_id == nb_id) { | 402 | if (nb->nb_id == nb_id) { |
402 | cpuc->kfree_on_online = cpuc->amd_nb; | 403 | *onln = cpuc->amd_nb; |
403 | cpuc->amd_nb = nb; | 404 | cpuc->amd_nb = nb; |
404 | break; | 405 | break; |
405 | } | 406 | } |
@@ -429,7 +430,8 @@ static void amd_pmu_cpu_dead(int cpu) | |||
429 | } | 430 | } |
430 | 431 | ||
431 | static struct event_constraint * | 432 | static struct event_constraint * |
432 | amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) | 433 | amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, |
434 | struct perf_event *event) | ||
433 | { | 435 | { |
434 | /* | 436 | /* |
435 | * if not NB event or no NB, then no constraints | 437 | * if not NB event or no NB, then no constraints |
@@ -537,7 +539,8 @@ static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); | |||
537 | static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); | 539 | static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); |
538 | 540 | ||
539 | static struct event_constraint * | 541 | static struct event_constraint * |
540 | amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) | 542 | amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx, |
543 | struct perf_event *event) | ||
541 | { | 544 | { |
542 | struct hw_perf_event *hwc = &event->hw; | 545 | struct hw_perf_event *hwc = &event->hw; |
543 | unsigned int event_code = amd_get_event_code(hwc); | 546 | unsigned int event_code = amd_get_event_code(hwc); |
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index a61f5c6911da..989d3c215d2b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c | |||
@@ -796,7 +796,7 @@ static int setup_ibs_ctl(int ibs_eilvt_off) | |||
796 | * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that | 796 | * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that |
797 | * is using the new offset. | 797 | * is using the new offset. |
798 | */ | 798 | */ |
799 | static int force_ibs_eilvt_setup(void) | 799 | static void force_ibs_eilvt_setup(void) |
800 | { | 800 | { |
801 | int offset; | 801 | int offset; |
802 | int ret; | 802 | int ret; |
@@ -811,26 +811,24 @@ static int force_ibs_eilvt_setup(void) | |||
811 | 811 | ||
812 | if (offset == APIC_EILVT_NR_MAX) { | 812 | if (offset == APIC_EILVT_NR_MAX) { |
813 | printk(KERN_DEBUG "No EILVT entry available\n"); | 813 | printk(KERN_DEBUG "No EILVT entry available\n"); |
814 | return -EBUSY; | 814 | return; |
815 | } | 815 | } |
816 | 816 | ||
817 | ret = setup_ibs_ctl(offset); | 817 | ret = setup_ibs_ctl(offset); |
818 | if (ret) | 818 | if (ret) |
819 | goto out; | 819 | goto out; |
820 | 820 | ||
821 | if (!ibs_eilvt_valid()) { | 821 | if (!ibs_eilvt_valid()) |
822 | ret = -EFAULT; | ||
823 | goto out; | 822 | goto out; |
824 | } | ||
825 | 823 | ||
826 | pr_info("IBS: LVT offset %d assigned\n", offset); | 824 | pr_info("IBS: LVT offset %d assigned\n", offset); |
827 | 825 | ||
828 | return 0; | 826 | return; |
829 | out: | 827 | out: |
830 | preempt_disable(); | 828 | preempt_disable(); |
831 | put_eilvt(offset); | 829 | put_eilvt(offset); |
832 | preempt_enable(); | 830 | preempt_enable(); |
833 | return ret; | 831 | return; |
834 | } | 832 | } |
835 | 833 | ||
836 | static void ibs_eilvt_setup(void) | 834 | static void ibs_eilvt_setup(void) |
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 258990688a5e..219d3fb423a1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
15 | #include <linux/watchdog.h> | ||
15 | 16 | ||
16 | #include <asm/cpufeature.h> | 17 | #include <asm/cpufeature.h> |
17 | #include <asm/hardirq.h> | 18 | #include <asm/hardirq.h> |
@@ -113,6 +114,12 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = | |||
113 | INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ | 114 | INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ |
114 | INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ | 115 | INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ |
115 | INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ | 116 | INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ |
117 | |||
118 | INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ | ||
119 | INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ | ||
120 | INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ | ||
121 | INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ | ||
122 | |||
116 | EVENT_CONSTRAINT_END | 123 | EVENT_CONSTRAINT_END |
117 | }; | 124 | }; |
118 | 125 | ||
@@ -131,15 +138,12 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = | |||
131 | INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ | 138 | INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ |
132 | INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ | 139 | INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ |
133 | INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ | 140 | INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ |
134 | /* | 141 | |
135 | * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT | 142 | INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ |
136 | * siblings; disable these events because they can corrupt unrelated | 143 | INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ |
137 | * counters. | 144 | INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ |
138 | */ | 145 | INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ |
139 | INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */ | 146 | |
140 | INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */ | ||
141 | INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ | ||
142 | INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ | ||
143 | EVENT_CONSTRAINT_END | 147 | EVENT_CONSTRAINT_END |
144 | }; | 148 | }; |
145 | 149 | ||
@@ -217,6 +221,21 @@ static struct event_constraint intel_hsw_event_constraints[] = { | |||
217 | INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), | 221 | INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), |
218 | /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ | 222 | /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ |
219 | INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), | 223 | INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), |
224 | |||
225 | INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ | ||
226 | INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ | ||
227 | INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ | ||
228 | INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ | ||
229 | |||
230 | EVENT_CONSTRAINT_END | ||
231 | }; | ||
232 | |||
233 | struct event_constraint intel_bdw_event_constraints[] = { | ||
234 | FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ | ||
235 | FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ | ||
236 | FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ | ||
237 | INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ | ||
238 | INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */ | ||
220 | EVENT_CONSTRAINT_END | 239 | EVENT_CONSTRAINT_END |
221 | }; | 240 | }; |
222 | 241 | ||
@@ -415,6 +434,202 @@ static __initconst const u64 snb_hw_cache_event_ids | |||
415 | 434 | ||
416 | }; | 435 | }; |
417 | 436 | ||
437 | /* | ||
438 | * Notes on the events: | ||
439 | * - data reads do not include code reads (comparable to earlier tables) | ||
440 | * - data counts include speculative execution (except L1 write, dtlb, bpu) | ||
441 | * - remote node access includes remote memory, remote cache, remote mmio. | ||
442 | * - prefetches are not included in the counts because they are not | ||
443 | * reliably counted. | ||
444 | */ | ||
445 | |||
446 | #define HSW_DEMAND_DATA_RD BIT_ULL(0) | ||
447 | #define HSW_DEMAND_RFO BIT_ULL(1) | ||
448 | #define HSW_ANY_RESPONSE BIT_ULL(16) | ||
449 | #define HSW_SUPPLIER_NONE BIT_ULL(17) | ||
450 | #define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22) | ||
451 | #define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27) | ||
452 | #define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28) | ||
453 | #define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29) | ||
454 | #define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \ | ||
455 | HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ | ||
456 | HSW_L3_MISS_REMOTE_HOP2P) | ||
457 | #define HSW_SNOOP_NONE BIT_ULL(31) | ||
458 | #define HSW_SNOOP_NOT_NEEDED BIT_ULL(32) | ||
459 | #define HSW_SNOOP_MISS BIT_ULL(33) | ||
460 | #define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34) | ||
461 | #define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35) | ||
462 | #define HSW_SNOOP_HITM BIT_ULL(36) | ||
463 | #define HSW_SNOOP_NON_DRAM BIT_ULL(37) | ||
464 | #define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \ | ||
465 | HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \ | ||
466 | HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \ | ||
467 | HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM) | ||
468 | #define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM) | ||
469 | #define HSW_DEMAND_READ HSW_DEMAND_DATA_RD | ||
470 | #define HSW_DEMAND_WRITE HSW_DEMAND_RFO | ||
471 | #define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\ | ||
472 | HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P) | ||
473 | #define HSW_LLC_ACCESS HSW_ANY_RESPONSE | ||
474 | |||
475 | #define BDW_L3_MISS_LOCAL BIT(26) | ||
476 | #define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \ | ||
477 | HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ | ||
478 | HSW_L3_MISS_REMOTE_HOP2P) | ||
479 | |||
480 | |||
481 | static __initconst const u64 hsw_hw_cache_event_ids | ||
482 | [PERF_COUNT_HW_CACHE_MAX] | ||
483 | [PERF_COUNT_HW_CACHE_OP_MAX] | ||
484 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | ||
485 | { | ||
486 | [ C(L1D ) ] = { | ||
487 | [ C(OP_READ) ] = { | ||
488 | [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ | ||
489 | [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ | ||
490 | }, | ||
491 | [ C(OP_WRITE) ] = { | ||
492 | [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ | ||
493 | [ C(RESULT_MISS) ] = 0x0, | ||
494 | }, | ||
495 | [ C(OP_PREFETCH) ] = { | ||
496 | [ C(RESULT_ACCESS) ] = 0x0, | ||
497 | [ C(RESULT_MISS) ] = 0x0, | ||
498 | }, | ||
499 | }, | ||
500 | [ C(L1I ) ] = { | ||
501 | [ C(OP_READ) ] = { | ||
502 | [ C(RESULT_ACCESS) ] = 0x0, | ||
503 | [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */ | ||
504 | }, | ||
505 | [ C(OP_WRITE) ] = { | ||
506 | [ C(RESULT_ACCESS) ] = -1, | ||
507 | [ C(RESULT_MISS) ] = -1, | ||
508 | }, | ||
509 | [ C(OP_PREFETCH) ] = { | ||
510 | [ C(RESULT_ACCESS) ] = 0x0, | ||
511 | [ C(RESULT_MISS) ] = 0x0, | ||
512 | }, | ||
513 | }, | ||
514 | [ C(LL ) ] = { | ||
515 | [ C(OP_READ) ] = { | ||
516 | [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ | ||
517 | [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ | ||
518 | }, | ||
519 | [ C(OP_WRITE) ] = { | ||
520 | [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ | ||
521 | [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ | ||
522 | }, | ||
523 | [ C(OP_PREFETCH) ] = { | ||
524 | [ C(RESULT_ACCESS) ] = 0x0, | ||
525 | [ C(RESULT_MISS) ] = 0x0, | ||
526 | }, | ||
527 | }, | ||
528 | [ C(DTLB) ] = { | ||
529 | [ C(OP_READ) ] = { | ||
530 | [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ | ||
531 | [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */ | ||
532 | }, | ||
533 | [ C(OP_WRITE) ] = { | ||
534 | [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ | ||
535 | [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ | ||
536 | }, | ||
537 | [ C(OP_PREFETCH) ] = { | ||
538 | [ C(RESULT_ACCESS) ] = 0x0, | ||
539 | [ C(RESULT_MISS) ] = 0x0, | ||
540 | }, | ||
541 | }, | ||
542 | [ C(ITLB) ] = { | ||
543 | [ C(OP_READ) ] = { | ||
544 | [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */ | ||
545 | [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */ | ||
546 | }, | ||
547 | [ C(OP_WRITE) ] = { | ||
548 | [ C(RESULT_ACCESS) ] = -1, | ||
549 | [ C(RESULT_MISS) ] = -1, | ||
550 | }, | ||
551 | [ C(OP_PREFETCH) ] = { | ||
552 | [ C(RESULT_ACCESS) ] = -1, | ||
553 | [ C(RESULT_MISS) ] = -1, | ||
554 | }, | ||
555 | }, | ||
556 | [ C(BPU ) ] = { | ||
557 | [ C(OP_READ) ] = { | ||
558 | [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ | ||
559 | [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ | ||
560 | }, | ||
561 | [ C(OP_WRITE) ] = { | ||
562 | [ C(RESULT_ACCESS) ] = -1, | ||
563 | [ C(RESULT_MISS) ] = -1, | ||
564 | }, | ||
565 | [ C(OP_PREFETCH) ] = { | ||
566 | [ C(RESULT_ACCESS) ] = -1, | ||
567 | [ C(RESULT_MISS) ] = -1, | ||
568 | }, | ||
569 | }, | ||
570 | [ C(NODE) ] = { | ||
571 | [ C(OP_READ) ] = { | ||
572 | [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ | ||
573 | [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ | ||
574 | }, | ||
575 | [ C(OP_WRITE) ] = { | ||
576 | [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ | ||
577 | [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ | ||
578 | }, | ||
579 | [ C(OP_PREFETCH) ] = { | ||
580 | [ C(RESULT_ACCESS) ] = 0x0, | ||
581 | [ C(RESULT_MISS) ] = 0x0, | ||
582 | }, | ||
583 | }, | ||
584 | }; | ||
585 | |||
586 | static __initconst const u64 hsw_hw_cache_extra_regs | ||
587 | [PERF_COUNT_HW_CACHE_MAX] | ||
588 | [PERF_COUNT_HW_CACHE_OP_MAX] | ||
589 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | ||
590 | { | ||
591 | [ C(LL ) ] = { | ||
592 | [ C(OP_READ) ] = { | ||
593 | [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| | ||
594 | HSW_LLC_ACCESS, | ||
595 | [ C(RESULT_MISS) ] = HSW_DEMAND_READ| | ||
596 | HSW_L3_MISS|HSW_ANY_SNOOP, | ||
597 | }, | ||
598 | [ C(OP_WRITE) ] = { | ||
599 | [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| | ||
600 | HSW_LLC_ACCESS, | ||
601 | [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| | ||
602 | HSW_L3_MISS|HSW_ANY_SNOOP, | ||
603 | }, | ||
604 | [ C(OP_PREFETCH) ] = { | ||
605 | [ C(RESULT_ACCESS) ] = 0x0, | ||
606 | [ C(RESULT_MISS) ] = 0x0, | ||
607 | }, | ||
608 | }, | ||
609 | [ C(NODE) ] = { | ||
610 | [ C(OP_READ) ] = { | ||
611 | [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| | ||
612 | HSW_L3_MISS_LOCAL_DRAM| | ||
613 | HSW_SNOOP_DRAM, | ||
614 | [ C(RESULT_MISS) ] = HSW_DEMAND_READ| | ||
615 | HSW_L3_MISS_REMOTE| | ||
616 | HSW_SNOOP_DRAM, | ||
617 | }, | ||
618 | [ C(OP_WRITE) ] = { | ||
619 | [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| | ||
620 | HSW_L3_MISS_LOCAL_DRAM| | ||
621 | HSW_SNOOP_DRAM, | ||
622 | [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| | ||
623 | HSW_L3_MISS_REMOTE| | ||
624 | HSW_SNOOP_DRAM, | ||
625 | }, | ||
626 | [ C(OP_PREFETCH) ] = { | ||
627 | [ C(RESULT_ACCESS) ] = 0x0, | ||
628 | [ C(RESULT_MISS) ] = 0x0, | ||
629 | }, | ||
630 | }, | ||
631 | }; | ||
632 | |||
418 | static __initconst const u64 westmere_hw_cache_event_ids | 633 | static __initconst const u64 westmere_hw_cache_event_ids |
419 | [PERF_COUNT_HW_CACHE_MAX] | 634 | [PERF_COUNT_HW_CACHE_MAX] |
420 | [PERF_COUNT_HW_CACHE_OP_MAX] | 635 | [PERF_COUNT_HW_CACHE_OP_MAX] |
@@ -1029,21 +1244,10 @@ static __initconst const u64 slm_hw_cache_event_ids | |||
1029 | }, | 1244 | }, |
1030 | }; | 1245 | }; |
1031 | 1246 | ||
1032 | static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) | 1247 | /* |
1033 | { | 1248 | * Use from PMIs where the LBRs are already disabled. |
1034 | /* user explicitly requested branch sampling */ | 1249 | */ |
1035 | if (has_branch_stack(event)) | 1250 | static void __intel_pmu_disable_all(void) |
1036 | return true; | ||
1037 | |||
1038 | /* implicit branch sampling to correct PEBS skid */ | ||
1039 | if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && | ||
1040 | x86_pmu.intel_cap.pebs_format < 2) | ||
1041 | return true; | ||
1042 | |||
1043 | return false; | ||
1044 | } | ||
1045 | |||
1046 | static void intel_pmu_disable_all(void) | ||
1047 | { | 1251 | { |
1048 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); | 1252 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
1049 | 1253 | ||
@@ -1051,17 +1255,24 @@ static void intel_pmu_disable_all(void) | |||
1051 | 1255 | ||
1052 | if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) | 1256 | if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) |
1053 | intel_pmu_disable_bts(); | 1257 | intel_pmu_disable_bts(); |
1258 | else | ||
1259 | intel_bts_disable_local(); | ||
1054 | 1260 | ||
1055 | intel_pmu_pebs_disable_all(); | 1261 | intel_pmu_pebs_disable_all(); |
1262 | } | ||
1263 | |||
1264 | static void intel_pmu_disable_all(void) | ||
1265 | { | ||
1266 | __intel_pmu_disable_all(); | ||
1056 | intel_pmu_lbr_disable_all(); | 1267 | intel_pmu_lbr_disable_all(); |
1057 | } | 1268 | } |
1058 | 1269 | ||
1059 | static void intel_pmu_enable_all(int added) | 1270 | static void __intel_pmu_enable_all(int added, bool pmi) |
1060 | { | 1271 | { |
1061 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); | 1272 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
1062 | 1273 | ||
1063 | intel_pmu_pebs_enable_all(); | 1274 | intel_pmu_pebs_enable_all(); |
1064 | intel_pmu_lbr_enable_all(); | 1275 | intel_pmu_lbr_enable_all(pmi); |
1065 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, | 1276 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, |
1066 | x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); | 1277 | x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); |
1067 | 1278 | ||
@@ -1073,7 +1284,13 @@ static void intel_pmu_enable_all(int added) | |||
1073 | return; | 1284 | return; |
1074 | 1285 | ||
1075 | intel_pmu_enable_bts(event->hw.config); | 1286 | intel_pmu_enable_bts(event->hw.config); |
1076 | } | 1287 | } else |
1288 | intel_bts_enable_local(); | ||
1289 | } | ||
1290 | |||
1291 | static void intel_pmu_enable_all(int added) | ||
1292 | { | ||
1293 | __intel_pmu_enable_all(added, false); | ||
1077 | } | 1294 | } |
1078 | 1295 | ||
1079 | /* | 1296 | /* |
@@ -1207,7 +1424,7 @@ static void intel_pmu_disable_event(struct perf_event *event) | |||
1207 | * must disable before any actual event | 1424 | * must disable before any actual event |
1208 | * because any event may be combined with LBR | 1425 | * because any event may be combined with LBR |
1209 | */ | 1426 | */ |
1210 | if (intel_pmu_needs_lbr_smpl(event)) | 1427 | if (needs_branch_stack(event)) |
1211 | intel_pmu_lbr_disable(event); | 1428 | intel_pmu_lbr_disable(event); |
1212 | 1429 | ||
1213 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { | 1430 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { |
@@ -1268,7 +1485,7 @@ static void intel_pmu_enable_event(struct perf_event *event) | |||
1268 | * must enabled before any actual event | 1485 | * must enabled before any actual event |
1269 | * because any event may be combined with LBR | 1486 | * because any event may be combined with LBR |
1270 | */ | 1487 | */ |
1271 | if (intel_pmu_needs_lbr_smpl(event)) | 1488 | if (needs_branch_stack(event)) |
1272 | intel_pmu_lbr_enable(event); | 1489 | intel_pmu_lbr_enable(event); |
1273 | 1490 | ||
1274 | if (event->attr.exclude_host) | 1491 | if (event->attr.exclude_host) |
@@ -1334,6 +1551,18 @@ static void intel_pmu_reset(void) | |||
1334 | if (ds) | 1551 | if (ds) |
1335 | ds->bts_index = ds->bts_buffer_base; | 1552 | ds->bts_index = ds->bts_buffer_base; |
1336 | 1553 | ||
1554 | /* Ack all overflows and disable fixed counters */ | ||
1555 | if (x86_pmu.version >= 2) { | ||
1556 | intel_pmu_ack_status(intel_pmu_get_status()); | ||
1557 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); | ||
1558 | } | ||
1559 | |||
1560 | /* Reset LBRs and LBR freezing */ | ||
1561 | if (x86_pmu.lbr_nr) { | ||
1562 | update_debugctlmsr(get_debugctlmsr() & | ||
1563 | ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR)); | ||
1564 | } | ||
1565 | |||
1337 | local_irq_restore(flags); | 1566 | local_irq_restore(flags); |
1338 | } | 1567 | } |
1339 | 1568 | ||
@@ -1357,8 +1586,9 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) | |||
1357 | */ | 1586 | */ |
1358 | if (!x86_pmu.late_ack) | 1587 | if (!x86_pmu.late_ack) |
1359 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 1588 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
1360 | intel_pmu_disable_all(); | 1589 | __intel_pmu_disable_all(); |
1361 | handled = intel_pmu_drain_bts_buffer(); | 1590 | handled = intel_pmu_drain_bts_buffer(); |
1591 | handled += intel_bts_interrupt(); | ||
1362 | status = intel_pmu_get_status(); | 1592 | status = intel_pmu_get_status(); |
1363 | if (!status) | 1593 | if (!status) |
1364 | goto done; | 1594 | goto done; |
@@ -1399,6 +1629,14 @@ again: | |||
1399 | } | 1629 | } |
1400 | 1630 | ||
1401 | /* | 1631 | /* |
1632 | * Intel PT | ||
1633 | */ | ||
1634 | if (__test_and_clear_bit(55, (unsigned long *)&status)) { | ||
1635 | handled++; | ||
1636 | intel_pt_interrupt(); | ||
1637 | } | ||
1638 | |||
1639 | /* | ||
1402 | * Checkpointed counters can lead to 'spurious' PMIs because the | 1640 | * Checkpointed counters can lead to 'spurious' PMIs because the |
1403 | * rollback caused by the PMI will have cleared the overflow status | 1641 | * rollback caused by the PMI will have cleared the overflow status |
1404 | * bit. Therefore always force probe these counters. | 1642 | * bit. Therefore always force probe these counters. |
@@ -1433,7 +1671,7 @@ again: | |||
1433 | goto again; | 1671 | goto again; |
1434 | 1672 | ||
1435 | done: | 1673 | done: |
1436 | intel_pmu_enable_all(0); | 1674 | __intel_pmu_enable_all(0, true); |
1437 | /* | 1675 | /* |
1438 | * Only unmask the NMI after the overflow counters | 1676 | * Only unmask the NMI after the overflow counters |
1439 | * have been reset. This avoids spurious NMIs on | 1677 | * have been reset. This avoids spurious NMIs on |
@@ -1464,7 +1702,7 @@ intel_bts_constraints(struct perf_event *event) | |||
1464 | 1702 | ||
1465 | static int intel_alt_er(int idx) | 1703 | static int intel_alt_er(int idx) |
1466 | { | 1704 | { |
1467 | if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) | 1705 | if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) |
1468 | return idx; | 1706 | return idx; |
1469 | 1707 | ||
1470 | if (idx == EXTRA_REG_RSP_0) | 1708 | if (idx == EXTRA_REG_RSP_0) |
@@ -1624,7 +1862,8 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc, | |||
1624 | } | 1862 | } |
1625 | 1863 | ||
1626 | struct event_constraint * | 1864 | struct event_constraint * |
1627 | x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) | 1865 | x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, |
1866 | struct perf_event *event) | ||
1628 | { | 1867 | { |
1629 | struct event_constraint *c; | 1868 | struct event_constraint *c; |
1630 | 1869 | ||
@@ -1641,7 +1880,8 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) | |||
1641 | } | 1880 | } |
1642 | 1881 | ||
1643 | static struct event_constraint * | 1882 | static struct event_constraint * |
1644 | intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) | 1883 | __intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, |
1884 | struct perf_event *event) | ||
1645 | { | 1885 | { |
1646 | struct event_constraint *c; | 1886 | struct event_constraint *c; |
1647 | 1887 | ||
@@ -1657,7 +1897,278 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event | |||
1657 | if (c) | 1897 | if (c) |
1658 | return c; | 1898 | return c; |
1659 | 1899 | ||
1660 | return x86_get_event_constraints(cpuc, event); | 1900 | return x86_get_event_constraints(cpuc, idx, event); |
1901 | } | ||
1902 | |||
1903 | static void | ||
1904 | intel_start_scheduling(struct cpu_hw_events *cpuc) | ||
1905 | { | ||
1906 | struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; | ||
1907 | struct intel_excl_states *xl, *xlo; | ||
1908 | int tid = cpuc->excl_thread_id; | ||
1909 | int o_tid = 1 - tid; /* sibling thread */ | ||
1910 | |||
1911 | /* | ||
1912 | * nothing needed if in group validation mode | ||
1913 | */ | ||
1914 | if (cpuc->is_fake || !is_ht_workaround_enabled()) | ||
1915 | return; | ||
1916 | |||
1917 | /* | ||
1918 | * no exclusion needed | ||
1919 | */ | ||
1920 | if (!excl_cntrs) | ||
1921 | return; | ||
1922 | |||
1923 | xlo = &excl_cntrs->states[o_tid]; | ||
1924 | xl = &excl_cntrs->states[tid]; | ||
1925 | |||
1926 | xl->sched_started = true; | ||
1927 | xl->num_alloc_cntrs = 0; | ||
1928 | /* | ||
1929 | * lock the shared state until we are done scheduling, | ||
1930 | * in intel_stop_scheduling(); this makes the whole | ||
1931 | * scheduling pass appear as a transaction | ||
1932 | */ | ||
1933 | WARN_ON_ONCE(!irqs_disabled()); | ||
1934 | raw_spin_lock(&excl_cntrs->lock); | ||
1935 | |||
1936 | /* | ||
1937 | * save initial state of sibling thread | ||
1938 | */ | ||
1939 | memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state)); | ||
1940 | } | ||
1941 | |||
1942 | static void | ||
1943 | intel_stop_scheduling(struct cpu_hw_events *cpuc) | ||
1944 | { | ||
1945 | struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; | ||
1946 | struct intel_excl_states *xl, *xlo; | ||
1947 | int tid = cpuc->excl_thread_id; | ||
1948 | int o_tid = 1 - tid; /* sibling thread */ | ||
1949 | |||
1950 | /* | ||
1951 | * nothing needed if in group validation mode | ||
1952 | */ | ||
1953 | if (cpuc->is_fake || !is_ht_workaround_enabled()) | ||
1954 | return; | ||
1955 | /* | ||
1956 | * no exclusion needed | ||
1957 | */ | ||
1958 | if (!excl_cntrs) | ||
1959 | return; | ||
1960 | |||
1961 | xlo = &excl_cntrs->states[o_tid]; | ||
1962 | xl = &excl_cntrs->states[tid]; | ||
1963 | |||
1964 | /* | ||
1965 | * make new sibling thread state visible | ||
1966 | */ | ||
1967 | memcpy(xlo->state, xlo->init_state, sizeof(xlo->state)); | ||
1968 | |||
1969 | xl->sched_started = false; | ||
1970 | /* | ||
1971 | * release shared state lock (acquired in intel_start_scheduling()) | ||
1972 | */ | ||
1973 | raw_spin_unlock(&excl_cntrs->lock); | ||
1974 | } | ||
1975 | |||
1976 | static struct event_constraint * | ||
1977 | intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, | ||
1978 | int idx, struct event_constraint *c) | ||
1979 | { | ||
1980 | struct event_constraint *cx; | ||
1981 | struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; | ||
1982 | struct intel_excl_states *xl, *xlo; | ||
1983 | int is_excl, i; | ||
1984 | int tid = cpuc->excl_thread_id; | ||
1985 | int o_tid = 1 - tid; /* alternate */ | ||
1986 | |||
1987 | /* | ||
1988 | * validating a group does not require | ||
1989 | * enforcing cross-thread exclusion | ||
1990 | */ | ||
1991 | if (cpuc->is_fake || !is_ht_workaround_enabled()) | ||
1992 | return c; | ||
1993 | |||
1994 | /* | ||
1995 | * no exclusion needed | ||
1996 | */ | ||
1997 | if (!excl_cntrs) | ||
1998 | return c; | ||
1999 | /* | ||
2000 | * event requires exclusive counter access | ||
2001 | * across HT threads | ||
2002 | */ | ||
2003 | is_excl = c->flags & PERF_X86_EVENT_EXCL; | ||
2004 | |||
2005 | /* | ||
2006 | * xl = state of current HT | ||
2007 | * xlo = state of sibling HT | ||
2008 | */ | ||
2009 | xl = &excl_cntrs->states[tid]; | ||
2010 | xlo = &excl_cntrs->states[o_tid]; | ||
2011 | |||
2012 | /* | ||
2013 | * do not allow scheduling of more than max_alloc_cntrs | ||
2014 | * which is set to half the available generic counters. | ||
2015 | * this helps avoid counter starvation of the sibling | ||
2016 | * thread by ensuring that at most half of the counters | ||
2017 | * can be in exclusive mode. There are no designated | ||
2018 | * counters for the limit; any N/2 counters can be used. | ||
2019 | * This helps with events with specific counter constraints | ||
2020 | */ | ||
2021 | if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs) | ||
2022 | return &emptyconstraint; | ||
2023 | |||
2024 | cx = c; | ||
2025 | |||
2026 | /* | ||
2027 | * because we modify the constraint, we need | ||
2028 | * to make a copy. Static constraints come | ||
2029 | * from static const tables. | ||
2030 | * | ||
2031 | * only needed when constraint has not yet | ||
2032 | * been cloned (marked dynamic) | ||
2033 | */ | ||
2034 | if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { | ||
2035 | |||
2036 | /* sanity check */ | ||
2037 | if (idx < 0) | ||
2038 | return &emptyconstraint; | ||
2039 | |||
2040 | /* | ||
2041 | * grab pre-allocated constraint entry | ||
2042 | */ | ||
2043 | cx = &cpuc->constraint_list[idx]; | ||
2044 | |||
2045 | /* | ||
2046 | * initialize dynamic constraint | ||
2047 | * with static constraint | ||
2048 | */ | ||
2049 | memcpy(cx, c, sizeof(*cx)); | ||
2050 | |||
2051 | /* | ||
2052 | * mark constraint as dynamic, so we | ||
2053 | * can free it later on | ||
2054 | */ | ||
2055 | cx->flags |= PERF_X86_EVENT_DYNAMIC; | ||
2056 | } | ||
2057 | |||
2058 | /* | ||
2059 | * From here on, the constraint is dynamic. | ||
2060 | * Either it was just allocated above, or it | ||
2061 | * was allocated during an earlier invocation | ||
2062 | * of this function | ||
2063 | */ | ||
2064 | |||
2065 | /* | ||
2066 | * Modify static constraint with current dynamic | ||
2067 | * state of thread | ||
2068 | * | ||
2069 | * EXCLUSIVE: sibling counter measuring exclusive event | ||
2070 | * SHARED : sibling counter measuring non-exclusive event | ||
2071 | * UNUSED : sibling counter unused | ||
2072 | */ | ||
2073 | for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) { | ||
2074 | /* | ||
2075 | * exclusive event in sibling counter | ||
2076 | * our corresponding counter cannot be used | ||
2077 | * regardless of our event | ||
2078 | */ | ||
2079 | if (xl->state[i] == INTEL_EXCL_EXCLUSIVE) | ||
2080 | __clear_bit(i, cx->idxmsk); | ||
2081 | /* | ||
2082 | * if we are measuring an exclusive event and the | ||
2083 | * sibling is measuring a non-exclusive one, then the | ||
2084 | * counter cannot be used | ||
2085 | */ | ||
2086 | if (is_excl && xl->state[i] == INTEL_EXCL_SHARED) | ||
2087 | __clear_bit(i, cx->idxmsk); | ||
2088 | } | ||
2089 | |||
2090 | /* | ||
2091 | * recompute actual bit weight for scheduling algorithm | ||
2092 | */ | ||
2093 | cx->weight = hweight64(cx->idxmsk64); | ||
2094 | |||
2095 | /* | ||
2096 | * if we return an empty mask, then switch | ||
2097 | * back to static empty constraint to avoid | ||
2098 | * the cost of freeing later on | ||
2099 | */ | ||
2100 | if (cx->weight == 0) | ||
2101 | cx = &emptyconstraint; | ||
2102 | |||
2103 | return cx; | ||
2104 | } | ||
2105 | |||
2106 | static struct event_constraint * | ||
2107 | intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, | ||
2108 | struct perf_event *event) | ||
2109 | { | ||
2110 | struct event_constraint *c1 = event->hw.constraint; | ||
2111 | struct event_constraint *c2; | ||
2112 | |||
2113 | /* | ||
2114 | * first time only | ||
2115 | * - static constraint: no change across incremental scheduling calls | ||
2116 | * - dynamic constraint: handled by intel_get_excl_constraints() | ||
2117 | */ | ||
2118 | c2 = __intel_get_event_constraints(cpuc, idx, event); | ||
2119 | if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) { | ||
2120 | bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX); | ||
2121 | c1->weight = c2->weight; | ||
2122 | c2 = c1; | ||
2123 | } | ||
2124 | |||
2125 | if (cpuc->excl_cntrs) | ||
2126 | return intel_get_excl_constraints(cpuc, event, idx, c2); | ||
2127 | |||
2128 | return c2; | ||
2129 | } | ||
2130 | |||
2131 | static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, | ||
2132 | struct perf_event *event) | ||
2133 | { | ||
2134 | struct hw_perf_event *hwc = &event->hw; | ||
2135 | struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; | ||
2136 | struct intel_excl_states *xlo, *xl; | ||
2137 | unsigned long flags = 0; /* keep compiler happy */ | ||
2138 | int tid = cpuc->excl_thread_id; | ||
2139 | int o_tid = 1 - tid; | ||
2140 | |||
2141 | /* | ||
2142 | * nothing needed if in group validation mode | ||
2143 | */ | ||
2144 | if (cpuc->is_fake) | ||
2145 | return; | ||
2146 | |||
2147 | WARN_ON_ONCE(!excl_cntrs); | ||
2148 | |||
2149 | if (!excl_cntrs) | ||
2150 | return; | ||
2151 | |||
2152 | xl = &excl_cntrs->states[tid]; | ||
2153 | xlo = &excl_cntrs->states[o_tid]; | ||
2154 | |||
2155 | /* | ||
2156 | * put_constraint may be called from x86_schedule_events() | ||
2157 | * which already has the lock held so here make locking | ||
2158 | * conditional | ||
2159 | */ | ||
2160 | if (!xl->sched_started) | ||
2161 | raw_spin_lock_irqsave(&excl_cntrs->lock, flags); | ||
2162 | |||
2163 | /* | ||
2164 | * if event was actually assigned, then mark the | ||
2165 | * counter state as unused now | ||
2166 | */ | ||
2167 | if (hwc->idx >= 0) | ||
2168 | xlo->state[hwc->idx] = INTEL_EXCL_UNUSED; | ||
2169 | |||
2170 | if (!xl->sched_started) | ||
2171 | raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags); | ||
1661 | } | 2172 | } |
1662 | 2173 | ||
1663 | static void | 2174 | static void |
@@ -1678,7 +2189,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, | |||
1678 | static void intel_put_event_constraints(struct cpu_hw_events *cpuc, | 2189 | static void intel_put_event_constraints(struct cpu_hw_events *cpuc, |
1679 | struct perf_event *event) | 2190 | struct perf_event *event) |
1680 | { | 2191 | { |
2192 | struct event_constraint *c = event->hw.constraint; | ||
2193 | |||
1681 | intel_put_shared_regs_event_constraints(cpuc, event); | 2194 | intel_put_shared_regs_event_constraints(cpuc, event); |
2195 | |||
2196 | /* | ||
2197 | * if the PMU has exclusive counter restrictions, then | ||
2198 | * all events are subject to them and must call the | ||
2199 | * put_excl_constraints() routine | ||
2200 | */ | ||
2201 | if (c && cpuc->excl_cntrs) | ||
2202 | intel_put_excl_constraints(cpuc, event); | ||
2203 | |||
2204 | /* cleanup dynamic constraint */ | ||
2205 | if (c && (c->flags & PERF_X86_EVENT_DYNAMIC)) | ||
2206 | event->hw.constraint = NULL; | ||
2207 | } | ||
2208 | |||
2209 | static void intel_commit_scheduling(struct cpu_hw_events *cpuc, | ||
2210 | struct perf_event *event, int cntr) | ||
2211 | { | ||
2212 | struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; | ||
2213 | struct event_constraint *c = event->hw.constraint; | ||
2214 | struct intel_excl_states *xlo, *xl; | ||
2215 | int tid = cpuc->excl_thread_id; | ||
2216 | int o_tid = 1 - tid; | ||
2217 | int is_excl; | ||
2218 | |||
2219 | if (cpuc->is_fake || !c) | ||
2220 | return; | ||
2221 | |||
2222 | is_excl = c->flags & PERF_X86_EVENT_EXCL; | ||
2223 | |||
2224 | if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) | ||
2225 | return; | ||
2226 | |||
2227 | WARN_ON_ONCE(!excl_cntrs); | ||
2228 | |||
2229 | if (!excl_cntrs) | ||
2230 | return; | ||
2231 | |||
2232 | xl = &excl_cntrs->states[tid]; | ||
2233 | xlo = &excl_cntrs->states[o_tid]; | ||
2234 | |||
2235 | WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock)); | ||
2236 | |||
2237 | if (cntr >= 0) { | ||
2238 | if (is_excl) | ||
2239 | xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE; | ||
2240 | else | ||
2241 | xlo->init_state[cntr] = INTEL_EXCL_SHARED; | ||
2242 | } | ||
1682 | } | 2243 | } |
1683 | 2244 | ||
1684 | static void intel_pebs_aliases_core2(struct perf_event *event) | 2245 | static void intel_pebs_aliases_core2(struct perf_event *event) |
@@ -1747,10 +2308,21 @@ static int intel_pmu_hw_config(struct perf_event *event) | |||
1747 | if (event->attr.precise_ip && x86_pmu.pebs_aliases) | 2308 | if (event->attr.precise_ip && x86_pmu.pebs_aliases) |
1748 | x86_pmu.pebs_aliases(event); | 2309 | x86_pmu.pebs_aliases(event); |
1749 | 2310 | ||
1750 | if (intel_pmu_needs_lbr_smpl(event)) { | 2311 | if (needs_branch_stack(event)) { |
1751 | ret = intel_pmu_setup_lbr_filter(event); | 2312 | ret = intel_pmu_setup_lbr_filter(event); |
1752 | if (ret) | 2313 | if (ret) |
1753 | return ret; | 2314 | return ret; |
2315 | |||
2316 | /* | ||
2317 | * BTS is set up earlier in this path, so don't account twice | ||
2318 | */ | ||
2319 | if (!intel_pmu_has_bts(event)) { | ||
2320 | /* disallow lbr if conflicting events are present */ | ||
2321 | if (x86_add_exclusive(x86_lbr_exclusive_lbr)) | ||
2322 | return -EBUSY; | ||
2323 | |||
2324 | event->destroy = hw_perf_lbr_event_destroy; | ||
2325 | } | ||
1754 | } | 2326 | } |
1755 | 2327 | ||
1756 | if (event->attr.type != PERF_TYPE_RAW) | 2328 | if (event->attr.type != PERF_TYPE_RAW) |
@@ -1891,9 +2463,12 @@ static struct event_constraint counter2_constraint = | |||
1891 | EVENT_CONSTRAINT(0, 0x4, 0); | 2463 | EVENT_CONSTRAINT(0, 0x4, 0); |
1892 | 2464 | ||
1893 | static struct event_constraint * | 2465 | static struct event_constraint * |
1894 | hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) | 2466 | hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, |
2467 | struct perf_event *event) | ||
1895 | { | 2468 | { |
1896 | struct event_constraint *c = intel_get_event_constraints(cpuc, event); | 2469 | struct event_constraint *c; |
2470 | |||
2471 | c = intel_get_event_constraints(cpuc, idx, event); | ||
1897 | 2472 | ||
1898 | /* Handle special quirk on in_tx_checkpointed only in counter 2 */ | 2473 | /* Handle special quirk on in_tx_checkpointed only in counter 2 */ |
1899 | if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { | 2474 | if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { |
@@ -1905,6 +2480,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) | |||
1905 | return c; | 2480 | return c; |
1906 | } | 2481 | } |
1907 | 2482 | ||
2483 | /* | ||
2484 | * Broadwell: | ||
2485 | * | ||
2486 | * The INST_RETIRED.ALL period always needs to have the lowest 6 bits cleared | ||
2487 | * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine | ||
2488 | * the two to enforce a minimum period of 128 (the smallest value that has bits | ||
2489 | * 0-5 cleared and >= 100). | ||
2490 | * | ||
2491 | * Because of how the code in x86_perf_event_set_period() works, the truncation | ||
2492 | * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period | ||
2493 | * to make up for the 'lost' events due to carrying the 'error' in period_left. | ||
2494 | * | ||
2495 | * Therefore the effective (average) period matches the requested period, | ||
2496 | * despite coarser hardware granularity. | ||
2497 | */ | ||
2498 | static unsigned bdw_limit_period(struct perf_event *event, unsigned left) | ||
2499 | { | ||
2500 | if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == | ||
2501 | X86_CONFIG(.event=0xc0, .umask=0x01)) { | ||
2502 | if (left < 128) | ||
2503 | left = 128; | ||
2504 | left &= ~0x3fu; | ||
2505 | } | ||
2506 | return left; | ||
2507 | } | ||
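
To make the clamp concrete, here is a small stand-alone sketch of the same arithmetic (not the kernel function itself), showing that a requested period of 100 becomes 128 and one of 200 becomes 192:

#include <stdio.h>

/* Same arithmetic as bdw_limit_period() for INST_RETIRED.ALL:
 * enforce a floor of 128, then clear the low 6 bits (BDM11 + BDM55). */
static unsigned bdw_clamp(unsigned left)
{
        if (left < 128)
                left = 128;
        return left & ~0x3fu;
}

int main(void)
{
        printf("%u %u %u\n", bdw_clamp(100), bdw_clamp(128), bdw_clamp(200));
        /* prints "128 128 192": 100 is raised to 128, 200 is truncated to 192 */
        return 0;
}
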
2508 | |||
1908 | PMU_FORMAT_ATTR(event, "config:0-7" ); | 2509 | PMU_FORMAT_ATTR(event, "config:0-7" ); |
1909 | PMU_FORMAT_ATTR(umask, "config:8-15" ); | 2510 | PMU_FORMAT_ATTR(umask, "config:8-15" ); |
1910 | PMU_FORMAT_ATTR(edge, "config:18" ); | 2511 | PMU_FORMAT_ATTR(edge, "config:18" ); |
@@ -1979,16 +2580,52 @@ struct intel_shared_regs *allocate_shared_regs(int cpu) | |||
1979 | return regs; | 2580 | return regs; |
1980 | } | 2581 | } |
1981 | 2582 | ||
2583 | static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) | ||
2584 | { | ||
2585 | struct intel_excl_cntrs *c; | ||
2586 | int i; | ||
2587 | |||
2588 | c = kzalloc_node(sizeof(struct intel_excl_cntrs), | ||
2589 | GFP_KERNEL, cpu_to_node(cpu)); | ||
2590 | if (c) { | ||
2591 | raw_spin_lock_init(&c->lock); | ||
2592 | for (i = 0; i < X86_PMC_IDX_MAX; i++) { | ||
2593 | c->states[0].state[i] = INTEL_EXCL_UNUSED; | ||
2594 | c->states[0].init_state[i] = INTEL_EXCL_UNUSED; | ||
2595 | |||
2596 | c->states[1].state[i] = INTEL_EXCL_UNUSED; | ||
2597 | c->states[1].init_state[i] = INTEL_EXCL_UNUSED; | ||
2598 | } | ||
2599 | c->core_id = -1; | ||
2600 | } | ||
2601 | return c; | ||
2602 | } | ||
2603 | |||
1982 | static int intel_pmu_cpu_prepare(int cpu) | 2604 | static int intel_pmu_cpu_prepare(int cpu) |
1983 | { | 2605 | { |
1984 | struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); | 2606 | struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); |
1985 | 2607 | ||
1986 | if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map)) | 2608 | if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { |
1987 | return NOTIFY_OK; | 2609 | cpuc->shared_regs = allocate_shared_regs(cpu); |
2610 | if (!cpuc->shared_regs) | ||
2611 | return NOTIFY_BAD; | ||
2612 | } | ||
1988 | 2613 | ||
1989 | cpuc->shared_regs = allocate_shared_regs(cpu); | 2614 | if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { |
1990 | if (!cpuc->shared_regs) | 2615 | size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); |
1991 | return NOTIFY_BAD; | 2616 | |
2617 | cpuc->constraint_list = kzalloc(sz, GFP_KERNEL); | ||
2618 | if (!cpuc->constraint_list) | ||
2619 | return NOTIFY_BAD; | ||
2620 | |||
2621 | cpuc->excl_cntrs = allocate_excl_cntrs(cpu); | ||
2622 | if (!cpuc->excl_cntrs) { | ||
2623 | kfree(cpuc->constraint_list); | ||
2624 | kfree(cpuc->shared_regs); | ||
2625 | return NOTIFY_BAD; | ||
2626 | } | ||
2627 | cpuc->excl_thread_id = 0; | ||
2628 | } | ||
1992 | 2629 | ||
1993 | return NOTIFY_OK; | 2630 | return NOTIFY_OK; |
1994 | } | 2631 | } |
@@ -2010,13 +2647,15 @@ static void intel_pmu_cpu_starting(int cpu) | |||
2010 | if (!cpuc->shared_regs) | 2647 | if (!cpuc->shared_regs) |
2011 | return; | 2648 | return; |
2012 | 2649 | ||
2013 | if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { | 2650 | if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { |
2651 | void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; | ||
2652 | |||
2014 | for_each_cpu(i, topology_thread_cpumask(cpu)) { | 2653 | for_each_cpu(i, topology_thread_cpumask(cpu)) { |
2015 | struct intel_shared_regs *pc; | 2654 | struct intel_shared_regs *pc; |
2016 | 2655 | ||
2017 | pc = per_cpu(cpu_hw_events, i).shared_regs; | 2656 | pc = per_cpu(cpu_hw_events, i).shared_regs; |
2018 | if (pc && pc->core_id == core_id) { | 2657 | if (pc && pc->core_id == core_id) { |
2019 | cpuc->kfree_on_online = cpuc->shared_regs; | 2658 | *onln = cpuc->shared_regs; |
2020 | cpuc->shared_regs = pc; | 2659 | cpuc->shared_regs = pc; |
2021 | break; | 2660 | break; |
2022 | } | 2661 | } |
@@ -2027,6 +2666,44 @@ static void intel_pmu_cpu_starting(int cpu) | |||
2027 | 2666 | ||
2028 | if (x86_pmu.lbr_sel_map) | 2667 | if (x86_pmu.lbr_sel_map) |
2029 | cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; | 2668 | cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; |
2669 | |||
2670 | if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { | ||
2671 | int h = x86_pmu.num_counters >> 1; | ||
2672 | |||
2673 | for_each_cpu(i, topology_thread_cpumask(cpu)) { | ||
2674 | struct intel_excl_cntrs *c; | ||
2675 | |||
2676 | c = per_cpu(cpu_hw_events, i).excl_cntrs; | ||
2677 | if (c && c->core_id == core_id) { | ||
2678 | cpuc->kfree_on_online[1] = cpuc->excl_cntrs; | ||
2679 | cpuc->excl_cntrs = c; | ||
2680 | cpuc->excl_thread_id = 1; | ||
2681 | break; | ||
2682 | } | ||
2683 | } | ||
2684 | cpuc->excl_cntrs->core_id = core_id; | ||
2685 | cpuc->excl_cntrs->refcnt++; | ||
2686 | /* | ||
2687 | * set hard limit to half the number of generic counters | ||
2688 | */ | ||
2689 | cpuc->excl_cntrs->states[0].max_alloc_cntrs = h; | ||
2690 | cpuc->excl_cntrs->states[1].max_alloc_cntrs = h; | ||
2691 | } | ||
2692 | } | ||
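
The core_id hand-off above decides which sibling keeps its freshly allocated structure and which one adopts the other's. The toy model below (user space, illustrative names, with an immediate free() standing in for the deferred kfree_on_online slot) reproduces just that decision for a two-thread core:

#include <stdio.h>
#include <stdlib.h>

/* Toy model of the core_id hand-off: each CPU allocates a private
 * structure in prepare(), and in starting() adopts a sibling's structure
 * if one with the same core_id already exists (illustrative only). */
struct excl { int core_id; int refcnt; };

static struct excl *percpu[2];          /* two SMT siblings of one core */
static int thread_id[2];

static void starting(int cpu, int core_id)
{
        for (int i = 0; i < 2; i++) {
                if (i != cpu && percpu[i] && percpu[i]->core_id == core_id) {
                        free(percpu[cpu]);      /* stands in for kfree_on_online */
                        percpu[cpu] = percpu[i];
                        thread_id[cpu] = 1;     /* second sibling gets tid 1 */
                        break;
                }
        }
        percpu[cpu]->core_id = core_id;
        percpu[cpu]->refcnt++;
}

int main(void)
{
        percpu[0] = calloc(1, sizeof(struct excl));
        percpu[1] = calloc(1, sizeof(struct excl));
        percpu[0]->core_id = percpu[1]->core_id = -1;

        starting(0, 7);
        starting(1, 7);
        printf("shared=%d refcnt=%d tids=%d,%d\n",
               percpu[0] == percpu[1], percpu[0]->refcnt,
               thread_id[0], thread_id[1]);    /* shared=1 refcnt=2 tids=0,1 */
        return 0;
}
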
2693 | |||
2694 | static void free_excl_cntrs(int cpu) | ||
2695 | { | ||
2696 | struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); | ||
2697 | struct intel_excl_cntrs *c; | ||
2698 | |||
2699 | c = cpuc->excl_cntrs; | ||
2700 | if (c) { | ||
2701 | if (c->core_id == -1 || --c->refcnt == 0) | ||
2702 | kfree(c); | ||
2703 | cpuc->excl_cntrs = NULL; | ||
2704 | kfree(cpuc->constraint_list); | ||
2705 | cpuc->constraint_list = NULL; | ||
2706 | } | ||
2030 | } | 2707 | } |
2031 | 2708 | ||
2032 | static void intel_pmu_cpu_dying(int cpu) | 2709 | static void intel_pmu_cpu_dying(int cpu) |
@@ -2041,19 +2718,9 @@ static void intel_pmu_cpu_dying(int cpu) | |||
2041 | cpuc->shared_regs = NULL; | 2718 | cpuc->shared_regs = NULL; |
2042 | } | 2719 | } |
2043 | 2720 | ||
2044 | fini_debug_store_on_cpu(cpu); | 2721 | free_excl_cntrs(cpu); |
2045 | } | ||
2046 | 2722 | ||
2047 | static void intel_pmu_flush_branch_stack(void) | 2723 | fini_debug_store_on_cpu(cpu); |
2048 | { | ||
2049 | /* | ||
2050 | * Intel LBR does not tag entries with the | ||
2051 | * PID of the current task, then we need to | ||
2052 | * flush it on ctxsw | ||
2053 | * For now, we simply reset it | ||
2054 | */ | ||
2055 | if (x86_pmu.lbr_nr) | ||
2056 | intel_pmu_lbr_reset(); | ||
2057 | } | 2724 | } |
2058 | 2725 | ||
2059 | PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); | 2726 | PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); |
@@ -2107,7 +2774,7 @@ static __initconst const struct x86_pmu intel_pmu = { | |||
2107 | .cpu_starting = intel_pmu_cpu_starting, | 2774 | .cpu_starting = intel_pmu_cpu_starting, |
2108 | .cpu_dying = intel_pmu_cpu_dying, | 2775 | .cpu_dying = intel_pmu_cpu_dying, |
2109 | .guest_get_msrs = intel_guest_get_msrs, | 2776 | .guest_get_msrs = intel_guest_get_msrs, |
2110 | .flush_branch_stack = intel_pmu_flush_branch_stack, | 2777 | .sched_task = intel_pmu_lbr_sched_task, |
2111 | }; | 2778 | }; |
2112 | 2779 | ||
2113 | static __init void intel_clovertown_quirk(void) | 2780 | static __init void intel_clovertown_quirk(void) |
@@ -2264,6 +2931,27 @@ static __init void intel_nehalem_quirk(void) | |||
2264 | } | 2931 | } |
2265 | } | 2932 | } |
2266 | 2933 | ||
2934 | /* | ||
2935 | * enable software workaround for errata: | ||
2936 | * SNB: BJ122 | ||
2937 | * IVB: BV98 | ||
2938 | * HSW: HSD29 | ||
2939 | * | ||
2940 | * Only needed when HT is enabled. However, detecting | ||
2941 | * whether HT is enabled is difficult (model specific). So instead, | ||
2942 | * we enable the workaround during early boot, and verify whether | ||
2943 | * it is needed in a later initcall phase, once we have valid | ||
2944 | * topology information to check if HT is actually enabled. | ||
2945 | */ | ||
2946 | static __init void intel_ht_bug(void) | ||
2947 | { | ||
2948 | x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED; | ||
2949 | |||
2950 | x86_pmu.commit_scheduling = intel_commit_scheduling; | ||
2951 | x86_pmu.start_scheduling = intel_start_scheduling; | ||
2952 | x86_pmu.stop_scheduling = intel_stop_scheduling; | ||
2953 | } | ||
2954 | |||
2267 | EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); | 2955 | EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); |
2268 | EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") | 2956 | EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") |
2269 | 2957 | ||
@@ -2443,7 +3131,7 @@ __init int intel_pmu_init(void) | |||
2443 | x86_pmu.event_constraints = intel_slm_event_constraints; | 3131 | x86_pmu.event_constraints = intel_slm_event_constraints; |
2444 | x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; | 3132 | x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; |
2445 | x86_pmu.extra_regs = intel_slm_extra_regs; | 3133 | x86_pmu.extra_regs = intel_slm_extra_regs; |
2446 | x86_pmu.er_flags |= ERF_HAS_RSP_1; | 3134 | x86_pmu.flags |= PMU_FL_HAS_RSP_1; |
2447 | pr_cont("Silvermont events, "); | 3135 | pr_cont("Silvermont events, "); |
2448 | break; | 3136 | break; |
2449 | 3137 | ||
@@ -2461,7 +3149,7 @@ __init int intel_pmu_init(void) | |||
2461 | x86_pmu.enable_all = intel_pmu_nhm_enable_all; | 3149 | x86_pmu.enable_all = intel_pmu_nhm_enable_all; |
2462 | x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; | 3150 | x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; |
2463 | x86_pmu.extra_regs = intel_westmere_extra_regs; | 3151 | x86_pmu.extra_regs = intel_westmere_extra_regs; |
2464 | x86_pmu.er_flags |= ERF_HAS_RSP_1; | 3152 | x86_pmu.flags |= PMU_FL_HAS_RSP_1; |
2465 | 3153 | ||
2466 | x86_pmu.cpu_events = nhm_events_attrs; | 3154 | x86_pmu.cpu_events = nhm_events_attrs; |
2467 | 3155 | ||
@@ -2478,6 +3166,7 @@ __init int intel_pmu_init(void) | |||
2478 | case 42: /* 32nm SandyBridge */ | 3166 | case 42: /* 32nm SandyBridge */ |
2479 | case 45: /* 32nm SandyBridge-E/EN/EP */ | 3167 | case 45: /* 32nm SandyBridge-E/EN/EP */ |
2480 | x86_add_quirk(intel_sandybridge_quirk); | 3168 | x86_add_quirk(intel_sandybridge_quirk); |
3169 | x86_add_quirk(intel_ht_bug); | ||
2481 | memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, | 3170 | memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, |
2482 | sizeof(hw_cache_event_ids)); | 3171 | sizeof(hw_cache_event_ids)); |
2483 | memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, | 3172 | memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, |
@@ -2492,9 +3181,11 @@ __init int intel_pmu_init(void) | |||
2492 | x86_pmu.extra_regs = intel_snbep_extra_regs; | 3181 | x86_pmu.extra_regs = intel_snbep_extra_regs; |
2493 | else | 3182 | else |
2494 | x86_pmu.extra_regs = intel_snb_extra_regs; | 3183 | x86_pmu.extra_regs = intel_snb_extra_regs; |
3184 | |||
3185 | |||
2495 | /* all extra regs are per-cpu when HT is on */ | 3186 | /* all extra regs are per-cpu when HT is on */ |
2496 | x86_pmu.er_flags |= ERF_HAS_RSP_1; | 3187 | x86_pmu.flags |= PMU_FL_HAS_RSP_1; |
2497 | x86_pmu.er_flags |= ERF_NO_HT_SHARING; | 3188 | x86_pmu.flags |= PMU_FL_NO_HT_SHARING; |
2498 | 3189 | ||
2499 | x86_pmu.cpu_events = snb_events_attrs; | 3190 | x86_pmu.cpu_events = snb_events_attrs; |
2500 | 3191 | ||
@@ -2510,6 +3201,7 @@ __init int intel_pmu_init(void) | |||
2510 | 3201 | ||
2511 | case 58: /* 22nm IvyBridge */ | 3202 | case 58: /* 22nm IvyBridge */ |
2512 | case 62: /* 22nm IvyBridge-EP/EX */ | 3203 | case 62: /* 22nm IvyBridge-EP/EX */ |
3204 | x86_add_quirk(intel_ht_bug); | ||
2513 | memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, | 3205 | memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, |
2514 | sizeof(hw_cache_event_ids)); | 3206 | sizeof(hw_cache_event_ids)); |
2515 | /* dTLB-load-misses on IVB is different than SNB */ | 3207 | /* dTLB-load-misses on IVB is different than SNB */ |
@@ -2528,8 +3220,8 @@ __init int intel_pmu_init(void) | |||
2528 | else | 3220 | else |
2529 | x86_pmu.extra_regs = intel_snb_extra_regs; | 3221 | x86_pmu.extra_regs = intel_snb_extra_regs; |
2530 | /* all extra regs are per-cpu when HT is on */ | 3222 | /* all extra regs are per-cpu when HT is on */ |
2531 | x86_pmu.er_flags |= ERF_HAS_RSP_1; | 3223 | x86_pmu.flags |= PMU_FL_HAS_RSP_1; |
2532 | x86_pmu.er_flags |= ERF_NO_HT_SHARING; | 3224 | x86_pmu.flags |= PMU_FL_NO_HT_SHARING; |
2533 | 3225 | ||
2534 | x86_pmu.cpu_events = snb_events_attrs; | 3226 | x86_pmu.cpu_events = snb_events_attrs; |
2535 | 3227 | ||
@@ -2545,19 +3237,20 @@ __init int intel_pmu_init(void) | |||
2545 | case 63: /* 22nm Haswell Server */ | 3237 | case 63: /* 22nm Haswell Server */ |
2546 | case 69: /* 22nm Haswell ULT */ | 3238 | case 69: /* 22nm Haswell ULT */ |
2547 | case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ | 3239 | case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ |
3240 | x86_add_quirk(intel_ht_bug); | ||
2548 | x86_pmu.late_ack = true; | 3241 | x86_pmu.late_ack = true; |
2549 | memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); | 3242 | memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); |
2550 | memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); | 3243 | memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); |
2551 | 3244 | ||
2552 | intel_pmu_lbr_init_snb(); | 3245 | intel_pmu_lbr_init_hsw(); |
2553 | 3246 | ||
2554 | x86_pmu.event_constraints = intel_hsw_event_constraints; | 3247 | x86_pmu.event_constraints = intel_hsw_event_constraints; |
2555 | x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; | 3248 | x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; |
2556 | x86_pmu.extra_regs = intel_snbep_extra_regs; | 3249 | x86_pmu.extra_regs = intel_snbep_extra_regs; |
2557 | x86_pmu.pebs_aliases = intel_pebs_aliases_snb; | 3250 | x86_pmu.pebs_aliases = intel_pebs_aliases_snb; |
2558 | /* all extra regs are per-cpu when HT is on */ | 3251 | /* all extra regs are per-cpu when HT is on */ |
2559 | x86_pmu.er_flags |= ERF_HAS_RSP_1; | 3252 | x86_pmu.flags |= PMU_FL_HAS_RSP_1; |
2560 | x86_pmu.er_flags |= ERF_NO_HT_SHARING; | 3253 | x86_pmu.flags |= PMU_FL_NO_HT_SHARING; |
2561 | 3254 | ||
2562 | x86_pmu.hw_config = hsw_hw_config; | 3255 | x86_pmu.hw_config = hsw_hw_config; |
2563 | x86_pmu.get_event_constraints = hsw_get_event_constraints; | 3256 | x86_pmu.get_event_constraints = hsw_get_event_constraints; |
@@ -2566,6 +3259,39 @@ __init int intel_pmu_init(void) | |||
2566 | pr_cont("Haswell events, "); | 3259 | pr_cont("Haswell events, "); |
2567 | break; | 3260 | break; |
2568 | 3261 | ||
3262 | case 61: /* 14nm Broadwell Core-M */ | ||
3263 | case 86: /* 14nm Broadwell Xeon D */ | ||
3264 | x86_pmu.late_ack = true; | ||
3265 | memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); | ||
3266 | memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); | ||
3267 | |||
3268 | /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */ | ||
3269 | hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ | | ||
3270 | BDW_L3_MISS|HSW_SNOOP_DRAM; | ||
3271 | hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS| | ||
3272 | HSW_SNOOP_DRAM; | ||
3273 | hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ| | ||
3274 | BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; | ||
3275 | hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE| | ||
3276 | BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; | ||
3277 | |||
3278 | intel_pmu_lbr_init_hsw(); | ||
3279 | |||
3280 | x86_pmu.event_constraints = intel_bdw_event_constraints; | ||
3281 | x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; | ||
3282 | x86_pmu.extra_regs = intel_snbep_extra_regs; | ||
3283 | x86_pmu.pebs_aliases = intel_pebs_aliases_snb; | ||
3284 | /* all extra regs are per-cpu when HT is on */ | ||
3285 | x86_pmu.flags |= PMU_FL_HAS_RSP_1; | ||
3286 | x86_pmu.flags |= PMU_FL_NO_HT_SHARING; | ||
3287 | |||
3288 | x86_pmu.hw_config = hsw_hw_config; | ||
3289 | x86_pmu.get_event_constraints = hsw_get_event_constraints; | ||
3290 | x86_pmu.cpu_events = hsw_events_attrs; | ||
3291 | x86_pmu.limit_period = bdw_limit_period; | ||
3292 | pr_cont("Broadwell events, "); | ||
3293 | break; | ||
3294 | |||
2569 | default: | 3295 | default: |
2570 | switch (x86_pmu.version) { | 3296 | switch (x86_pmu.version) { |
2571 | case 1: | 3297 | case 1: |
@@ -2651,3 +3377,47 @@ __init int intel_pmu_init(void) | |||
2651 | 3377 | ||
2652 | return 0; | 3378 | return 0; |
2653 | } | 3379 | } |
3380 | |||
3381 | /* | ||
3382 | * HT bug: phase 2 init | ||
3383 | * Called once we have valid topology information to check | ||
3384 | * whether or not HT is enabled | ||
3385 | * If HT is off, then we disable the workaround | ||
3386 | */ | ||
3387 | static __init int fixup_ht_bug(void) | ||
3388 | { | ||
3389 | int cpu = smp_processor_id(); | ||
3390 | int w, c; | ||
3391 | /* | ||
3392 | * problem not present on this CPU model, nothing to do | ||
3393 | */ | ||
3394 | if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) | ||
3395 | return 0; | ||
3396 | |||
3397 | w = cpumask_weight(topology_thread_cpumask(cpu)); | ||
3398 | if (w > 1) { | ||
3399 | pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); | ||
3400 | return 0; | ||
3401 | } | ||
3402 | |||
3403 | watchdog_nmi_disable_all(); | ||
3404 | |||
3405 | x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); | ||
3406 | |||
3407 | x86_pmu.commit_scheduling = NULL; | ||
3408 | x86_pmu.start_scheduling = NULL; | ||
3409 | x86_pmu.stop_scheduling = NULL; | ||
3410 | |||
3411 | watchdog_nmi_enable_all(); | ||
3412 | |||
3413 | get_online_cpus(); | ||
3414 | |||
3415 | for_each_online_cpu(c) { | ||
3416 | free_excl_cntrs(c); | ||
3417 | } | ||
3418 | |||
3419 | put_online_cpus(); | ||
3420 | pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); | ||
3421 | return 0; | ||
3422 | } | ||
3423 | subsys_initcall(fixup_ht_bug) | ||
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c new file mode 100644 index 000000000000..ac1f0c55f379 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c | |||
@@ -0,0 +1,525 @@ | |||
1 | /* | ||
2 | * BTS PMU driver for perf | ||
3 | * Copyright (c) 2013-2014, Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #undef DEBUG | ||
16 | |||
17 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
18 | |||
19 | #include <linux/bitops.h> | ||
20 | #include <linux/types.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/debugfs.h> | ||
23 | #include <linux/device.h> | ||
24 | #include <linux/coredump.h> | ||
25 | |||
26 | #include <asm-generic/sizes.h> | ||
27 | #include <asm/perf_event.h> | ||
28 | |||
29 | #include "perf_event.h" | ||
30 | |||
31 | struct bts_ctx { | ||
32 | struct perf_output_handle handle; | ||
33 | struct debug_store ds_back; | ||
34 | int started; | ||
35 | }; | ||
36 | |||
37 | static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); | ||
38 | |||
39 | #define BTS_RECORD_SIZE 24 | ||
40 | #define BTS_SAFETY_MARGIN 4080 | ||
41 | |||
42 | struct bts_phys { | ||
43 | struct page *page; | ||
44 | unsigned long size; | ||
45 | unsigned long offset; | ||
46 | unsigned long displacement; | ||
47 | }; | ||
48 | |||
49 | struct bts_buffer { | ||
50 | size_t real_size; /* multiple of BTS_RECORD_SIZE */ | ||
51 | unsigned int nr_pages; | ||
52 | unsigned int nr_bufs; | ||
53 | unsigned int cur_buf; | ||
54 | bool snapshot; | ||
55 | local_t data_size; | ||
56 | local_t lost; | ||
57 | local_t head; | ||
58 | unsigned long end; | ||
59 | void **data_pages; | ||
60 | struct bts_phys buf[0]; | ||
61 | }; | ||
62 | |||
63 | struct pmu bts_pmu; | ||
64 | |||
65 | void intel_pmu_enable_bts(u64 config); | ||
66 | void intel_pmu_disable_bts(void); | ||
67 | |||
68 | static size_t buf_size(struct page *page) | ||
69 | { | ||
70 | return 1 << (PAGE_SHIFT + page_private(page)); | ||
71 | } | ||
72 | |||
73 | static void * | ||
74 | bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) | ||
75 | { | ||
76 | struct bts_buffer *buf; | ||
77 | struct page *page; | ||
78 | int node = (cpu == -1) ? cpu : cpu_to_node(cpu); | ||
79 | unsigned long offset; | ||
80 | size_t size = nr_pages << PAGE_SHIFT; | ||
81 | int pg, nbuf, pad; | ||
82 | |||
83 | /* count all the high order buffers */ | ||
84 | for (pg = 0, nbuf = 0; pg < nr_pages;) { | ||
85 | page = virt_to_page(pages[pg]); | ||
86 | if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1)) | ||
87 | return NULL; | ||
88 | pg += 1 << page_private(page); | ||
89 | nbuf++; | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * to avoid interrupts in overwrite mode, only allow one physical buffer | ||
94 | */ | ||
95 | if (overwrite && nbuf > 1) | ||
96 | return NULL; | ||
97 | |||
98 | buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node); | ||
99 | if (!buf) | ||
100 | return NULL; | ||
101 | |||
102 | buf->nr_pages = nr_pages; | ||
103 | buf->nr_bufs = nbuf; | ||
104 | buf->snapshot = overwrite; | ||
105 | buf->data_pages = pages; | ||
106 | buf->real_size = size - size % BTS_RECORD_SIZE; | ||
107 | |||
108 | for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { | ||
109 | unsigned int __nr_pages; | ||
110 | |||
111 | page = virt_to_page(pages[pg]); | ||
112 | __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1; | ||
113 | buf->buf[nbuf].page = page; | ||
114 | buf->buf[nbuf].offset = offset; | ||
115 | buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); | ||
116 | buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; | ||
117 | pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; | ||
118 | buf->buf[nbuf].size -= pad; | ||
119 | |||
120 | pg += __nr_pages; | ||
121 | offset += __nr_pages << PAGE_SHIFT; | ||
122 | } | ||
123 | |||
124 | return buf; | ||
125 | } | ||
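
The first loop above sizes the buffer array by walking the page list in high-order chunks. A stand-alone sketch of that walk, with an order[] array standing in for page_private(), is shown below (illustrative only):

#include <stdio.h>

/* Toy model of the first loop in bts_buffer_setup_aux(): each entry in
 * order[] plays the role of page_private(page), i.e. the log2 size of a
 * high-order allocation, and the walk counts how many physical chunks
 * cover nr_pages. */
static int count_bufs(const int *order, int nr_pages)
{
        int pg = 0, nbuf = 0;

        while (pg < nr_pages) {
                pg += 1 << order[pg];
                nbuf++;
        }
        return nbuf;
}

int main(void)
{
        /* an order-2 chunk (4 pages) followed by two single pages */
        int order[6] = { 2, 0, 0, 0, 0, 0 };

        printf("%d\n", count_bufs(order, 6));   /* prints 3 */
        return 0;
}
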
126 | |||
127 | static void bts_buffer_free_aux(void *data) | ||
128 | { | ||
129 | kfree(data); | ||
130 | } | ||
131 | |||
132 | static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) | ||
133 | { | ||
134 | return buf->buf[idx].offset + buf->buf[idx].displacement; | ||
135 | } | ||
136 | |||
137 | static void | ||
138 | bts_config_buffer(struct bts_buffer *buf) | ||
139 | { | ||
140 | int cpu = raw_smp_processor_id(); | ||
141 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
142 | struct bts_phys *phys = &buf->buf[buf->cur_buf]; | ||
143 | unsigned long index, thresh = 0, end = phys->size; | ||
144 | struct page *page = phys->page; | ||
145 | |||
146 | index = local_read(&buf->head); | ||
147 | |||
148 | if (!buf->snapshot) { | ||
149 | if (buf->end < phys->offset + buf_size(page)) | ||
150 | end = buf->end - phys->offset - phys->displacement; | ||
151 | |||
152 | index -= phys->offset + phys->displacement; | ||
153 | |||
154 | if (end - index > BTS_SAFETY_MARGIN) | ||
155 | thresh = end - BTS_SAFETY_MARGIN; | ||
156 | else if (end - index > BTS_RECORD_SIZE) | ||
157 | thresh = end - BTS_RECORD_SIZE; | ||
158 | else | ||
159 | thresh = end; | ||
160 | } | ||
161 | |||
162 | ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement; | ||
163 | ds->bts_index = ds->bts_buffer_base + index; | ||
164 | ds->bts_absolute_maximum = ds->bts_buffer_base + end; | ||
165 | ds->bts_interrupt_threshold = !buf->snapshot | ||
166 | ? ds->bts_buffer_base + thresh | ||
167 | : ds->bts_absolute_maximum + BTS_RECORD_SIZE; | ||
168 | } | ||
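
The threshold selection above picks where the hardware raises a PMI. The following stand-alone sketch repeats that three-way choice with concrete numbers (not the kernel code):

#include <stdio.h>

#define BTS_RECORD_SIZE   24
#define BTS_SAFETY_MARGIN 4080

/* Same threshold choice as bts_config_buffer() in non-snapshot mode:
 * interrupt a safety margin before the end when possible, otherwise one
 * record before the end, otherwise right at the end. */
static unsigned long pick_thresh(unsigned long index, unsigned long end)
{
        if (end - index > BTS_SAFETY_MARGIN)
                return end - BTS_SAFETY_MARGIN;
        if (end - index > BTS_RECORD_SIZE)
                return end - BTS_RECORD_SIZE;
        return end;
}

int main(void)
{
        /* plenty of room: threshold sits one margin before the end */
        printf("%lu\n", pick_thresh(0, 16384));     /* 12304 */
        /* almost full: threshold is one record before the end */
        printf("%lu\n", pick_thresh(16300, 16384)); /* 16360 */
        return 0;
}
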
169 | |||
170 | static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) | ||
171 | { | ||
172 | unsigned long index = head - phys->offset; | ||
173 | |||
174 | memset(page_address(phys->page) + index, 0, phys->size - index); | ||
175 | } | ||
176 | |||
177 | static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts) | ||
178 | { | ||
179 | if (buf->snapshot) | ||
180 | return false; | ||
181 | |||
182 | if (local_read(&buf->data_size) >= bts->handle.size || | ||
183 | bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE) | ||
184 | return true; | ||
185 | |||
186 | return false; | ||
187 | } | ||
188 | |||
189 | static void bts_update(struct bts_ctx *bts) | ||
190 | { | ||
191 | int cpu = raw_smp_processor_id(); | ||
192 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
193 | struct bts_buffer *buf = perf_get_aux(&bts->handle); | ||
194 | unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head; | ||
195 | |||
196 | if (!buf) | ||
197 | return; | ||
198 | |||
199 | head = index + bts_buffer_offset(buf, buf->cur_buf); | ||
200 | old = local_xchg(&buf->head, head); | ||
201 | |||
202 | if (!buf->snapshot) { | ||
203 | if (old == head) | ||
204 | return; | ||
205 | |||
206 | if (ds->bts_index >= ds->bts_absolute_maximum) | ||
207 | local_inc(&buf->lost); | ||
208 | |||
209 | /* | ||
210 | * old and head are always in the same physical buffer, so we | ||
211 | * can subtract them to get the data size. | ||
212 | */ | ||
213 | local_add(head - old, &buf->data_size); | ||
214 | } else { | ||
215 | local_set(&buf->data_size, head); | ||
216 | } | ||
217 | } | ||
218 | |||
219 | static void __bts_event_start(struct perf_event *event) | ||
220 | { | ||
221 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); | ||
222 | struct bts_buffer *buf = perf_get_aux(&bts->handle); | ||
223 | u64 config = 0; | ||
224 | |||
225 | if (!buf || bts_buffer_is_full(buf, bts)) | ||
226 | return; | ||
227 | |||
228 | event->hw.state = 0; | ||
229 | |||
230 | if (!buf->snapshot) | ||
231 | config |= ARCH_PERFMON_EVENTSEL_INT; | ||
232 | if (!event->attr.exclude_kernel) | ||
233 | config |= ARCH_PERFMON_EVENTSEL_OS; | ||
234 | if (!event->attr.exclude_user) | ||
235 | config |= ARCH_PERFMON_EVENTSEL_USR; | ||
236 | |||
237 | bts_config_buffer(buf); | ||
238 | |||
239 | /* | ||
240 | * local barrier to make sure that ds configuration made it | ||
241 | * before we enable BTS | ||
242 | */ | ||
243 | wmb(); | ||
244 | |||
245 | intel_pmu_enable_bts(config); | ||
246 | } | ||
247 | |||
248 | static void bts_event_start(struct perf_event *event, int flags) | ||
249 | { | ||
250 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); | ||
251 | |||
252 | __bts_event_start(event); | ||
253 | |||
254 | /* PMI handler: this counter is running and likely generating PMIs */ | ||
255 | ACCESS_ONCE(bts->started) = 1; | ||
256 | } | ||
257 | |||
258 | static void __bts_event_stop(struct perf_event *event) | ||
259 | { | ||
260 | /* | ||
261 | * No extra synchronization is mandated by the documentation to have | ||
262 | * BTS data stores globally visible. | ||
263 | */ | ||
264 | intel_pmu_disable_bts(); | ||
265 | |||
266 | if (event->hw.state & PERF_HES_STOPPED) | ||
267 | return; | ||
268 | |||
269 | ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED; | ||
270 | } | ||
271 | |||
272 | static void bts_event_stop(struct perf_event *event, int flags) | ||
273 | { | ||
274 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); | ||
275 | |||
276 | /* PMI handler: don't restart this counter */ | ||
277 | ACCESS_ONCE(bts->started) = 0; | ||
278 | |||
279 | __bts_event_stop(event); | ||
280 | |||
281 | if (flags & PERF_EF_UPDATE) | ||
282 | bts_update(bts); | ||
283 | } | ||
284 | |||
285 | void intel_bts_enable_local(void) | ||
286 | { | ||
287 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); | ||
288 | |||
289 | if (bts->handle.event && bts->started) | ||
290 | __bts_event_start(bts->handle.event); | ||
291 | } | ||
292 | |||
293 | void intel_bts_disable_local(void) | ||
294 | { | ||
295 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); | ||
296 | |||
297 | if (bts->handle.event) | ||
298 | __bts_event_stop(bts->handle.event); | ||
299 | } | ||
300 | |||
301 | static int | ||
302 | bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) | ||
303 | { | ||
304 | unsigned long head, space, next_space, pad, gap, skip, wakeup; | ||
305 | unsigned int next_buf; | ||
306 | struct bts_phys *phys, *next_phys; | ||
307 | int ret; | ||
308 | |||
309 | if (buf->snapshot) | ||
310 | return 0; | ||
311 | |||
312 | head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); | ||
313 | if (WARN_ON_ONCE(head != local_read(&buf->head))) | ||
314 | return -EINVAL; | ||
315 | |||
316 | phys = &buf->buf[buf->cur_buf]; | ||
317 | space = phys->offset + phys->displacement + phys->size - head; | ||
318 | pad = space; | ||
319 | if (space > handle->size) { | ||
320 | space = handle->size; | ||
321 | space -= space % BTS_RECORD_SIZE; | ||
322 | } | ||
323 | if (space <= BTS_SAFETY_MARGIN) { | ||
324 | /* See if next phys buffer has more space */ | ||
325 | next_buf = buf->cur_buf + 1; | ||
326 | if (next_buf >= buf->nr_bufs) | ||
327 | next_buf = 0; | ||
328 | next_phys = &buf->buf[next_buf]; | ||
329 | gap = buf_size(phys->page) - phys->displacement - phys->size + | ||
330 | next_phys->displacement; | ||
331 | skip = pad + gap; | ||
332 | if (handle->size >= skip) { | ||
333 | next_space = next_phys->size; | ||
334 | if (next_space + skip > handle->size) { | ||
335 | next_space = handle->size - skip; | ||
336 | next_space -= next_space % BTS_RECORD_SIZE; | ||
337 | } | ||
338 | if (next_space > space || !space) { | ||
339 | if (pad) | ||
340 | bts_buffer_pad_out(phys, head); | ||
341 | ret = perf_aux_output_skip(handle, skip); | ||
342 | if (ret) | ||
343 | return ret; | ||
344 | /* Advance to next phys buffer */ | ||
345 | phys = next_phys; | ||
346 | space = next_space; | ||
347 | head = phys->offset + phys->displacement; | ||
348 | /* | ||
349 | * After this, cur_buf and head won't match ds | ||
350 | * anymore, so we must not be racing with | ||
351 | * bts_update(). | ||
352 | */ | ||
353 | buf->cur_buf = next_buf; | ||
354 | local_set(&buf->head, head); | ||
355 | } | ||
356 | } | ||
357 | } | ||
358 | |||
359 | /* Don't go far beyond wakeup watermark */ | ||
360 | wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup - | ||
361 | handle->head; | ||
362 | if (space > wakeup) { | ||
363 | space = wakeup; | ||
364 | space -= space % BTS_RECORD_SIZE; | ||
365 | } | ||
366 | |||
367 | buf->end = head + space; | ||
368 | |||
369 | /* | ||
370 | * If we have no space, the lost notification would have been sent when | ||
371 | * we hit absolute_maximum - see bts_update() | ||
372 | */ | ||
373 | if (!space) | ||
374 | return -ENOSPC; | ||
375 | |||
376 | return 0; | ||
377 | } | ||
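
The final clamp above keeps the hardware stop point near the AUX ring's wakeup watermark and record-aligned. A small stand-alone sketch with illustrative numbers:

#include <stdio.h>

#define BTS_RECORD_SIZE   24
#define BTS_SAFETY_MARGIN 4080

/* Sketch of the last clamp in bts_buffer_reset(): never extend the
 * hardware stop point much past the ring buffer's wakeup watermark, and
 * keep it a whole number of BTS records. */
static unsigned long clamp_to_wakeup(unsigned long space,
                                     unsigned long head,
                                     unsigned long wakeup_pos)
{
        unsigned long wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE +
                               wakeup_pos - head;

        if (space > wakeup) {
                space = wakeup;
                space -= space % BTS_RECORD_SIZE;
        }
        return space;
}

int main(void)
{
        /* 64 KiB of space, wakeup 4 KiB ahead of head: stop after ~8 KiB */
        printf("%lu\n", clamp_to_wakeup(65536, 0, 4096));  /* prints 8184 */
        return 0;
}
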
378 | |||
379 | int intel_bts_interrupt(void) | ||
380 | { | ||
381 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); | ||
382 | struct perf_event *event = bts->handle.event; | ||
383 | struct bts_buffer *buf; | ||
384 | s64 old_head; | ||
385 | int err; | ||
386 | |||
387 | if (!event || !bts->started) | ||
388 | return 0; | ||
389 | |||
390 | buf = perf_get_aux(&bts->handle); | ||
391 | /* | ||
392 | * Skip snapshot counters: they don't use the interrupt, but | ||
393 | * there's no other way of telling, because the pointer will | ||
394 | * keep moving | ||
395 | */ | ||
396 | if (!buf || buf->snapshot) | ||
397 | return 0; | ||
398 | |||
399 | old_head = local_read(&buf->head); | ||
400 | bts_update(bts); | ||
401 | |||
402 | /* no new data */ | ||
403 | if (old_head == local_read(&buf->head)) | ||
404 | return 0; | ||
405 | |||
406 | perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), | ||
407 | !!local_xchg(&buf->lost, 0)); | ||
408 | |||
409 | buf = perf_aux_output_begin(&bts->handle, event); | ||
410 | if (!buf) | ||
411 | return 1; | ||
412 | |||
413 | err = bts_buffer_reset(buf, &bts->handle); | ||
414 | if (err) | ||
415 | perf_aux_output_end(&bts->handle, 0, false); | ||
416 | |||
417 | return 1; | ||
418 | } | ||
419 | |||
420 | static void bts_event_del(struct perf_event *event, int mode) | ||
421 | { | ||
422 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); | ||
423 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); | ||
424 | struct bts_buffer *buf = perf_get_aux(&bts->handle); | ||
425 | |||
426 | bts_event_stop(event, PERF_EF_UPDATE); | ||
427 | |||
428 | if (buf) { | ||
429 | if (buf->snapshot) | ||
430 | bts->handle.head = | ||
431 | local_xchg(&buf->data_size, | ||
432 | buf->nr_pages << PAGE_SHIFT); | ||
433 | perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), | ||
434 | !!local_xchg(&buf->lost, 0)); | ||
435 | } | ||
436 | |||
437 | cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; | ||
438 | cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; | ||
439 | cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; | ||
440 | cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; | ||
441 | } | ||
442 | |||
443 | static int bts_event_add(struct perf_event *event, int mode) | ||
444 | { | ||
445 | struct bts_buffer *buf; | ||
446 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); | ||
447 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); | ||
448 | struct hw_perf_event *hwc = &event->hw; | ||
449 | int ret = -EBUSY; | ||
450 | |||
451 | event->hw.state = PERF_HES_STOPPED; | ||
452 | |||
453 | if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) | ||
454 | return -EBUSY; | ||
455 | |||
456 | if (bts->handle.event) | ||
457 | return -EBUSY; | ||
458 | |||
459 | buf = perf_aux_output_begin(&bts->handle, event); | ||
460 | if (!buf) | ||
461 | return -EINVAL; | ||
462 | |||
463 | ret = bts_buffer_reset(buf, &bts->handle); | ||
464 | if (ret) { | ||
465 | perf_aux_output_end(&bts->handle, 0, false); | ||
466 | return ret; | ||
467 | } | ||
468 | |||
469 | bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; | ||
470 | bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; | ||
471 | bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; | ||
472 | |||
473 | if (mode & PERF_EF_START) { | ||
474 | bts_event_start(event, 0); | ||
475 | if (hwc->state & PERF_HES_STOPPED) { | ||
476 | bts_event_del(event, 0); | ||
477 | return -EBUSY; | ||
478 | } | ||
479 | } | ||
480 | |||
481 | return 0; | ||
482 | } | ||
483 | |||
484 | static void bts_event_destroy(struct perf_event *event) | ||
485 | { | ||
486 | x86_del_exclusive(x86_lbr_exclusive_bts); | ||
487 | } | ||
488 | |||
489 | static int bts_event_init(struct perf_event *event) | ||
490 | { | ||
491 | if (event->attr.type != bts_pmu.type) | ||
492 | return -ENOENT; | ||
493 | |||
494 | if (x86_add_exclusive(x86_lbr_exclusive_bts)) | ||
495 | return -EBUSY; | ||
496 | |||
497 | event->destroy = bts_event_destroy; | ||
498 | |||
499 | return 0; | ||
500 | } | ||
501 | |||
502 | static void bts_event_read(struct perf_event *event) | ||
503 | { | ||
504 | } | ||
505 | |||
506 | static __init int bts_init(void) | ||
507 | { | ||
508 | if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) | ||
509 | return -ENODEV; | ||
510 | |||
511 | bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE; | ||
512 | bts_pmu.task_ctx_nr = perf_sw_context; | ||
513 | bts_pmu.event_init = bts_event_init; | ||
514 | bts_pmu.add = bts_event_add; | ||
515 | bts_pmu.del = bts_event_del; | ||
516 | bts_pmu.start = bts_event_start; | ||
517 | bts_pmu.stop = bts_event_stop; | ||
518 | bts_pmu.read = bts_event_read; | ||
519 | bts_pmu.setup_aux = bts_buffer_setup_aux; | ||
520 | bts_pmu.free_aux = bts_buffer_free_aux; | ||
521 | |||
522 | return perf_pmu_register(&bts_pmu, "intel_bts", -1); | ||
523 | } | ||
524 | |||
525 | module_init(bts_init); | ||
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c new file mode 100644 index 000000000000..e4d1b8b738fa --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c | |||
@@ -0,0 +1,1379 @@ | |||
1 | /* | ||
2 | * Intel Cache Quality-of-Service Monitoring (CQM) support. | ||
3 | * | ||
4 | * Based very, very heavily on work by Peter Zijlstra. | ||
5 | */ | ||
6 | |||
7 | #include <linux/perf_event.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <asm/cpu_device_id.h> | ||
10 | #include "perf_event.h" | ||
11 | |||
12 | #define MSR_IA32_PQR_ASSOC 0x0c8f | ||
13 | #define MSR_IA32_QM_CTR 0x0c8e | ||
14 | #define MSR_IA32_QM_EVTSEL 0x0c8d | ||
15 | |||
16 | static unsigned int cqm_max_rmid = -1; | ||
17 | static unsigned int cqm_l3_scale; /* supposedly cacheline size */ | ||
18 | |||
19 | struct intel_cqm_state { | ||
20 | raw_spinlock_t lock; | ||
21 | int rmid; | ||
22 | int cnt; | ||
23 | }; | ||
24 | |||
25 | static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state); | ||
26 | |||
27 | /* | ||
28 | * Protects cache_groups, cqm_rmid_free_lru and cqm_rmid_limbo_lru. | ||
29 | * Also protects event->hw.cqm_rmid | ||
30 | * | ||
31 | * Hold either for stability, both for modification of ->hw.cqm_rmid. | ||
32 | */ | ||
33 | static DEFINE_MUTEX(cache_mutex); | ||
34 | static DEFINE_RAW_SPINLOCK(cache_lock); | ||
35 | |||
36 | /* | ||
37 | * Groups of events that have the same target(s), one RMID per group. | ||
38 | */ | ||
39 | static LIST_HEAD(cache_groups); | ||
40 | |||
41 | /* | ||
42 | * Mask of CPUs for reading CQM values. We only need one per-socket. | ||
43 | */ | ||
44 | static cpumask_t cqm_cpumask; | ||
45 | |||
46 | #define RMID_VAL_ERROR (1ULL << 63) | ||
47 | #define RMID_VAL_UNAVAIL (1ULL << 62) | ||
48 | |||
49 | #define QOS_L3_OCCUP_EVENT_ID (1 << 0) | ||
50 | |||
51 | #define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID | ||
52 | |||
53 | /* | ||
54 | * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). | ||
55 | * | ||
56 | * This rmid is always free and is guaranteed to have an associated | ||
57 | * near-zero occupancy value, i.e. no cachelines are tagged with this | ||
58 | * RMID, once __intel_cqm_rmid_rotate() returns. | ||
59 | */ | ||
60 | static unsigned int intel_cqm_rotation_rmid; | ||
61 | |||
62 | #define INVALID_RMID (-1) | ||
63 | |||
64 | /* | ||
65 | * Is @rmid valid for programming the hardware? | ||
66 | * | ||
67 | * rmid 0 is reserved by the hardware for all non-monitored tasks, which | ||
68 | * means that we should never come across an rmid with that value. | ||
69 | * Likewise, an rmid value of -1 is used to indicate "no rmid currently | ||
70 | * assigned" and is used as part of the rotation code. | ||
71 | */ | ||
72 | static inline bool __rmid_valid(unsigned int rmid) | ||
73 | { | ||
74 | if (!rmid || rmid == INVALID_RMID) | ||
75 | return false; | ||
76 | |||
77 | return true; | ||
78 | } | ||
79 | |||
80 | static u64 __rmid_read(unsigned int rmid) | ||
81 | { | ||
82 | u64 val; | ||
83 | |||
84 | /* | ||
85 | * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, | ||
86 | * it just says that to increase confusion. | ||
87 | */ | ||
88 | wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); | ||
89 | rdmsrl(MSR_IA32_QM_CTR, val); | ||
90 | |||
91 | /* | ||
92 | * Aside from the ERROR and UNAVAIL bits, assume this thing returns | ||
93 | * the number of cachelines tagged with @rmid. | ||
94 | */ | ||
95 | return val; | ||
96 | } | ||
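
A raw MSR_IA32_QM_CTR value still has to be interpreted: the top two bits are status flags and the remainder is a count in hardware units. The sketch below is a hypothetical user-space helper, assuming the unit is the CPUID-reported upscaling factor that the driver exposes as the event's scale (64 bytes here is only an example value):

#include <stdint.h>
#include <stdio.h>

#define RMID_VAL_ERROR   (1ULL << 63)
#define RMID_VAL_UNAVAIL (1ULL << 62)

/* Interpret a raw occupancy counter value: bits 63/62 flag error or
 * unavailable data, the rest counts units scaled by the reported
 * factor (assumed to be bytes per unit here). */
static long long occupancy_bytes(uint64_t val, unsigned int l3_scale)
{
        if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
                return -1;                      /* no usable reading */
        return (long long)(val * l3_scale);
}

int main(void)
{
        printf("%lld\n", occupancy_bytes(1000, 64));             /* 64000 */
        printf("%lld\n", occupancy_bytes(RMID_VAL_UNAVAIL, 64)); /* -1 */
        return 0;
}
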
97 | |||
98 | enum rmid_recycle_state { | ||
99 | RMID_YOUNG = 0, | ||
100 | RMID_AVAILABLE, | ||
101 | RMID_DIRTY, | ||
102 | }; | ||
103 | |||
104 | struct cqm_rmid_entry { | ||
105 | unsigned int rmid; | ||
106 | enum rmid_recycle_state state; | ||
107 | struct list_head list; | ||
108 | unsigned long queue_time; | ||
109 | }; | ||
110 | |||
111 | /* | ||
112 | * cqm_rmid_free_lru - A least recently used list of RMIDs. | ||
113 | * | ||
114 | * Oldest entry at the head, newest (most recently used) entry at the | ||
115 | * tail. This list is never traversed, it's only used to keep track of | ||
116 | * the lru order. That is, we only pick entries off the head or insert | ||
117 | * them on the tail. | ||
118 | * | ||
119 | * All entries on the list are 'free', and their RMIDs are not currently | ||
120 | * in use. To mark an RMID as in use, remove its entry from the lru | ||
121 | * list. | ||
122 | * | ||
123 | * | ||
124 | * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. | ||
125 | * | ||
126 | * This list contains RMIDs that no one is currently using but that | ||
127 | * may have a non-zero occupancy value associated with them. The | ||
128 | * rotation worker moves RMIDs from the limbo list to the free list once | ||
129 | * the occupancy value drops below __intel_cqm_threshold. | ||
130 | * | ||
131 | * Both lists are protected by cache_mutex. | ||
132 | */ | ||
133 | static LIST_HEAD(cqm_rmid_free_lru); | ||
134 | static LIST_HEAD(cqm_rmid_limbo_lru); | ||
135 | |||
136 | /* | ||
137 | * We use a simple array of pointers so that we can lookup a struct | ||
138 | * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() | ||
139 | * and __put_rmid() from having to worry about dealing with struct | ||
140 | * cqm_rmid_entry - they just deal with rmids, i.e. integers. | ||
141 | * | ||
142 | * Once this array is initialized it is read-only. No locks are required | ||
143 | * to access it. | ||
144 | * | ||
145 | * All entries for all RMIDs can be looked up in this array at all | ||
146 | * times. | ||
147 | */ | ||
148 | static struct cqm_rmid_entry **cqm_rmid_ptrs; | ||
149 | |||
150 | static inline struct cqm_rmid_entry *__rmid_entry(int rmid) | ||
151 | { | ||
152 | struct cqm_rmid_entry *entry; | ||
153 | |||
154 | entry = cqm_rmid_ptrs[rmid]; | ||
155 | WARN_ON(entry->rmid != rmid); | ||
156 | |||
157 | return entry; | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Returns < 0 on fail. | ||
162 | * | ||
163 | * We expect to be called with cache_mutex held. | ||
164 | */ | ||
165 | static int __get_rmid(void) | ||
166 | { | ||
167 | struct cqm_rmid_entry *entry; | ||
168 | |||
169 | lockdep_assert_held(&cache_mutex); | ||
170 | |||
171 | if (list_empty(&cqm_rmid_free_lru)) | ||
172 | return INVALID_RMID; | ||
173 | |||
174 | entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); | ||
175 | list_del(&entry->list); | ||
176 | |||
177 | return entry->rmid; | ||
178 | } | ||
179 | |||
180 | static void __put_rmid(unsigned int rmid) | ||
181 | { | ||
182 | struct cqm_rmid_entry *entry; | ||
183 | |||
184 | lockdep_assert_held(&cache_mutex); | ||
185 | |||
186 | WARN_ON(!__rmid_valid(rmid)); | ||
187 | entry = __rmid_entry(rmid); | ||
188 | |||
189 | entry->queue_time = jiffies; | ||
190 | entry->state = RMID_YOUNG; | ||
191 | |||
192 | list_add_tail(&entry->list, &cqm_rmid_limbo_lru); | ||
193 | } | ||
194 | |||
195 | static int intel_cqm_setup_rmid_cache(void) | ||
196 | { | ||
197 | struct cqm_rmid_entry *entry; | ||
198 | unsigned int nr_rmids; | ||
199 | int r = 0; | ||
200 | |||
201 | nr_rmids = cqm_max_rmid + 1; | ||
202 | cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * | ||
203 | nr_rmids, GFP_KERNEL); | ||
204 | if (!cqm_rmid_ptrs) | ||
205 | return -ENOMEM; | ||
206 | |||
207 | for (; r <= cqm_max_rmid; r++) { | ||
208 | struct cqm_rmid_entry *entry; | ||
209 | |||
210 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | ||
211 | if (!entry) | ||
212 | goto fail; | ||
213 | |||
214 | INIT_LIST_HEAD(&entry->list); | ||
215 | entry->rmid = r; | ||
216 | cqm_rmid_ptrs[r] = entry; | ||
217 | |||
218 | list_add_tail(&entry->list, &cqm_rmid_free_lru); | ||
219 | } | ||
220 | |||
221 | /* | ||
222 | * RMID 0 is special and is always allocated. It's used for all | ||
223 | * tasks that are not monitored. | ||
224 | */ | ||
225 | entry = __rmid_entry(0); | ||
226 | list_del(&entry->list); | ||
227 | |||
228 | mutex_lock(&cache_mutex); | ||
229 | intel_cqm_rotation_rmid = __get_rmid(); | ||
230 | mutex_unlock(&cache_mutex); | ||
231 | |||
232 | return 0; | ||
233 | fail: | ||
234 | while (r--) | ||
235 | kfree(cqm_rmid_ptrs[r]); | ||
236 | |||
237 | kfree(cqm_rmid_ptrs); | ||
238 | return -ENOMEM; | ||
239 | } | ||
240 | |||
241 | /* | ||
242 | * Determine if @a and @b measure the same set of tasks. | ||
243 | * | ||
244 | * If @a and @b measure the same set of tasks then we want to share a | ||
245 | * single RMID. | ||
246 | */ | ||
247 | static bool __match_event(struct perf_event *a, struct perf_event *b) | ||
248 | { | ||
249 | /* Per-cpu and task events don't mix */ | ||
250 | if ((a->attach_state & PERF_ATTACH_TASK) != | ||
251 | (b->attach_state & PERF_ATTACH_TASK)) | ||
252 | return false; | ||
253 | |||
254 | #ifdef CONFIG_CGROUP_PERF | ||
255 | if (a->cgrp != b->cgrp) | ||
256 | return false; | ||
257 | #endif | ||
258 | |||
259 | /* If not task event, we're machine wide */ | ||
260 | if (!(b->attach_state & PERF_ATTACH_TASK)) | ||
261 | return true; | ||
262 | |||
263 | /* | ||
264 | * Events that target same task are placed into the same cache group. | ||
265 | */ | ||
266 | if (a->hw.target == b->hw.target) | ||
267 | return true; | ||
268 | |||
269 | /* | ||
270 | * Are we an inherited event? | ||
271 | */ | ||
272 | if (b->parent == a) | ||
273 | return true; | ||
274 | |||
275 | return false; | ||
276 | } | ||
277 | |||
278 | #ifdef CONFIG_CGROUP_PERF | ||
279 | static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) | ||
280 | { | ||
281 | if (event->attach_state & PERF_ATTACH_TASK) | ||
282 | return perf_cgroup_from_task(event->hw.target); | ||
283 | |||
284 | return event->cgrp; | ||
285 | } | ||
286 | #endif | ||
287 | |||
288 | /* | ||
289 | * Determine if @a's tasks intersect with @b's tasks | ||
290 | * | ||
291 | * There are combinations of events that we explicitly prohibit, | ||
292 | * | ||
293 | * PROHIBITS | ||
294 | * system-wide -> cgroup and task | ||
295 | * cgroup -> system-wide | ||
296 | * -> task in cgroup | ||
297 | * task -> system-wide | ||
298 | * -> task in cgroup | ||
299 | * | ||
300 | * Call this function before allocating an RMID. | ||
301 | */ | ||
302 | static bool __conflict_event(struct perf_event *a, struct perf_event *b) | ||
303 | { | ||
304 | #ifdef CONFIG_CGROUP_PERF | ||
305 | /* | ||
306 | * We can have any number of cgroups but only one system-wide | ||
307 | * event at a time. | ||
308 | */ | ||
309 | if (a->cgrp && b->cgrp) { | ||
310 | struct perf_cgroup *ac = a->cgrp; | ||
311 | struct perf_cgroup *bc = b->cgrp; | ||
312 | |||
313 | /* | ||
314 | * This condition should have been caught in | ||
315 | * __match_event() and we should be sharing an RMID. | ||
316 | */ | ||
317 | WARN_ON_ONCE(ac == bc); | ||
318 | |||
319 | if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || | ||
320 | cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) | ||
321 | return true; | ||
322 | |||
323 | return false; | ||
324 | } | ||
325 | |||
326 | if (a->cgrp || b->cgrp) { | ||
327 | struct perf_cgroup *ac, *bc; | ||
328 | |||
329 | /* | ||
330 | * cgroup and system-wide events are mutually exclusive | ||
331 | */ | ||
332 | if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || | ||
333 | (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) | ||
334 | return true; | ||
335 | |||
336 | /* | ||
337 | * Ensure neither event is part of the other's cgroup | ||
338 | */ | ||
339 | ac = event_to_cgroup(a); | ||
340 | bc = event_to_cgroup(b); | ||
341 | if (ac == bc) | ||
342 | return true; | ||
343 | |||
344 | /* | ||
345 | * Must have cgroup and non-intersecting task events. | ||
346 | */ | ||
347 | if (!ac || !bc) | ||
348 | return false; | ||
349 | |||
350 | /* | ||
351 | * We have cgroup and task events, and the task belongs | ||
352 | * to a cgroup. Check for overlap. | ||
353 | */ | ||
354 | if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || | ||
355 | cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) | ||
356 | return true; | ||
357 | |||
358 | return false; | ||
359 | } | ||
360 | #endif | ||
361 | /* | ||
362 | * If one of them is not a task, same story as above with cgroups. | ||
363 | */ | ||
364 | if (!(a->attach_state & PERF_ATTACH_TASK) || | ||
365 | !(b->attach_state & PERF_ATTACH_TASK)) | ||
366 | return true; | ||
367 | |||
368 | /* | ||
369 | * Must be non-overlapping. | ||
370 | */ | ||
371 | return false; | ||
372 | } | ||
373 | |||
374 | struct rmid_read { | ||
375 | unsigned int rmid; | ||
376 | atomic64_t value; | ||
377 | }; | ||
378 | |||
379 | static void __intel_cqm_event_count(void *info); | ||
380 | |||
381 | /* | ||
382 | * Exchange the RMID of a group of events. | ||
383 | */ | ||
384 | static unsigned int | ||
385 | intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid) | ||
386 | { | ||
387 | struct perf_event *event; | ||
388 | unsigned int old_rmid = group->hw.cqm_rmid; | ||
389 | struct list_head *head = &group->hw.cqm_group_entry; | ||
390 | |||
391 | lockdep_assert_held(&cache_mutex); | ||
392 | |||
393 | /* | ||
394 | * If our RMID is being deallocated, perform a read now. | ||
395 | */ | ||
396 | if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { | ||
397 | struct rmid_read rr = { | ||
398 | .value = ATOMIC64_INIT(0), | ||
399 | .rmid = old_rmid, | ||
400 | }; | ||
401 | |||
402 | on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, | ||
403 | &rr, 1); | ||
404 | local64_set(&group->count, atomic64_read(&rr.value)); | ||
405 | } | ||
406 | |||
407 | raw_spin_lock_irq(&cache_lock); | ||
408 | |||
409 | group->hw.cqm_rmid = rmid; | ||
410 | list_for_each_entry(event, head, hw.cqm_group_entry) | ||
411 | event->hw.cqm_rmid = rmid; | ||
412 | |||
413 | raw_spin_unlock_irq(&cache_lock); | ||
414 | |||
415 | return old_rmid; | ||
416 | } | ||
417 | |||
418 | /* | ||
419 | * If we fail to assign a new RMID for intel_cqm_rotation_rmid because | ||
420 | * cachelines are still tagged with RMIDs in limbo, we progressively | ||
421 | * increment the threshold until we find an RMID in limbo with <= | ||
422 | * __intel_cqm_threshold lines tagged. This is designed to mitigate the | ||
423 | * problem where cachelines tagged with an RMID are not steadily being | ||
424 | * evicted. | ||
425 | * | ||
426 | * On successful rotations we decrease the threshold back towards zero. | ||
427 | * | ||
428 | * __intel_cqm_max_threshold provides an upper bound on the threshold, | ||
429 | * and is measured in bytes because it's exposed to userland. | ||
430 | */ | ||
431 | static unsigned int __intel_cqm_threshold; | ||
432 | static unsigned int __intel_cqm_max_threshold; | ||
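
Since __intel_cqm_max_threshold is kept in bytes for userland while the __rmid_read() comparisons happen in hardware units, a conversion by the upscaling factor is implied. The helper below is hypothetical and only illustrates that unit change (assuming a 64-byte factor):

#include <stdio.h>

/* Hypothetical helper showing the unit conversion implied above: the
 * user-visible maximum is in bytes, while the per-read threshold works
 * in counter units, so the byte value is divided by the scale factor. */
static unsigned int bytes_to_counter_units(unsigned int bytes,
                                           unsigned int l3_scale)
{
        return bytes / l3_scale;
}

int main(void)
{
        /* a 1 MiB recycling limit corresponds to 16384 counter units */
        printf("%u\n", bytes_to_counter_units(1024 * 1024, 64));
        return 0;
}
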
433 | |||
434 | /* | ||
435 | * Test whether an RMID has a zero occupancy value on this cpu. | ||
436 | */ | ||
437 | static void intel_cqm_stable(void *arg) | ||
438 | { | ||
439 | struct cqm_rmid_entry *entry; | ||
440 | |||
441 | list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { | ||
442 | if (entry->state != RMID_AVAILABLE) | ||
443 | break; | ||
444 | |||
445 | if (__rmid_read(entry->rmid) > __intel_cqm_threshold) | ||
446 | entry->state = RMID_DIRTY; | ||
447 | } | ||
448 | } | ||
449 | |||
450 | /* | ||
451 | * If we have group events waiting for an RMID that don't conflict with | ||
452 | * events already running, assign @rmid. | ||
453 | */ | ||
454 | static bool intel_cqm_sched_in_event(unsigned int rmid) | ||
455 | { | ||
456 | struct perf_event *leader, *event; | ||
457 | |||
458 | lockdep_assert_held(&cache_mutex); | ||
459 | |||
460 | leader = list_first_entry(&cache_groups, struct perf_event, | ||
461 | hw.cqm_groups_entry); | ||
462 | event = leader; | ||
463 | |||
464 | list_for_each_entry_continue(event, &cache_groups, | ||
465 | hw.cqm_groups_entry) { | ||
466 | if (__rmid_valid(event->hw.cqm_rmid)) | ||
467 | continue; | ||
468 | |||
469 | if (__conflict_event(event, leader)) | ||
470 | continue; | ||
471 | |||
472 | intel_cqm_xchg_rmid(event, rmid); | ||
473 | return true; | ||
474 | } | ||
475 | |||
476 | return false; | ||
477 | } | ||
478 | |||
479 | /* | ||
480 | * Initially use this constant for both the limbo queue time and the | ||
481 | * rotation timer interval, pmu::hrtimer_interval_ms. | ||
482 | * | ||
483 | * They don't need to be the same, but the two are related since if you | ||
484 | * rotate faster than you recycle RMIDs, you may run out of available | ||
485 | * RMIDs. | ||
486 | */ | ||
487 | #define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ | ||
488 | |||
489 | static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; | ||
490 | |||
491 | /* | ||
492 | * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list | ||
493 | * @available: number of freeable RMIDs on the limbo list | ||
494 | * | ||
495 | * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no | ||
496 | * cachelines are tagged with those RMIDs. After this we can reuse them | ||
497 | * and know that the current set of active RMIDs is stable. | ||
498 | * | ||
499 | * Return %true or %false depending on whether stabilization needs to be | ||
500 | * reattempted. | ||
501 | * | ||
502 | * If we return %true then @available is updated to indicate the | ||
503 | * number of RMIDs on the limbo list that have been queued for the | ||
504 | * minimum queue time (RMID_AVAILABLE), but whose data occupancy values | ||
505 | * are above __intel_cqm_threshold. | ||
506 | */ | ||
507 | static bool intel_cqm_rmid_stabilize(unsigned int *available) | ||
508 | { | ||
509 | struct cqm_rmid_entry *entry, *tmp; | ||
510 | |||
511 | lockdep_assert_held(&cache_mutex); | ||
512 | |||
513 | *available = 0; | ||
514 | list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { | ||
515 | unsigned long min_queue_time; | ||
516 | unsigned long now = jiffies; | ||
517 | |||
518 | /* | ||
519 | * We hold RMIDs placed into limbo for a minimum queue | ||
520 | * time. Before the minimum queue time has elapsed we do | ||
521 | * not recycle RMIDs. | ||
522 | * | ||
523 | * The reasoning is that until a sufficient time has | ||
524 | * passed since we stopped using an RMID, any RMID | ||
525 | * placed onto the limbo list will likely still have | ||
526 | * data tagged in the cache, which means we'll probably | ||
527 | * fail to recycle it anyway. | ||
528 | * | ||
529 | * We can save ourselves an expensive IPI by skipping | ||
530 | * any RMIDs that have not been queued for the minimum | ||
531 | * time. | ||
532 | */ | ||
533 | min_queue_time = entry->queue_time + | ||
534 | msecs_to_jiffies(__rmid_queue_time_ms); | ||
535 | |||
536 | if (time_after(min_queue_time, now)) | ||
537 | break; | ||
538 | |||
539 | entry->state = RMID_AVAILABLE; | ||
540 | (*available)++; | ||
541 | } | ||
542 | |||
543 | /* | ||
544 | * Fast return if none of the RMIDs on the limbo list have been | ||
545 | * sitting on the queue for the minimum queue time. | ||
546 | */ | ||
547 | if (!*available) | ||
548 | return false; | ||
549 | |||
550 | /* | ||
551 | * Test whether an RMID is free for each package. | ||
552 | */ | ||
553 | on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); | ||
554 | |||
555 | list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { | ||
556 | /* | ||
557 | * Exhausted all RMIDs that have waited min queue time. | ||
558 | */ | ||
559 | if (entry->state == RMID_YOUNG) | ||
560 | break; | ||
561 | |||
562 | if (entry->state == RMID_DIRTY) | ||
563 | continue; | ||
564 | |||
565 | list_del(&entry->list); /* remove from limbo */ | ||
566 | |||
567 | /* | ||
568 | * The rotation RMID gets priority if it's | ||
569 | * currently invalid, in which case we skip adding | ||
570 | * the RMID to the free lru. | ||
571 | */ | ||
572 | if (!__rmid_valid(intel_cqm_rotation_rmid)) { | ||
573 | intel_cqm_rotation_rmid = entry->rmid; | ||
574 | continue; | ||
575 | } | ||
576 | |||
577 | /* | ||
578 | * If we have groups waiting for RMIDs, hand | ||
579 | * them one now provided they don't conflict. | ||
580 | */ | ||
581 | if (intel_cqm_sched_in_event(entry->rmid)) | ||
582 | continue; | ||
583 | |||
584 | /* | ||
585 | * Otherwise place it onto the free list. | ||
586 | */ | ||
587 | list_add_tail(&entry->list, &cqm_rmid_free_lru); | ||
588 | } | ||
589 | |||
590 | |||
591 | return __rmid_valid(intel_cqm_rotation_rmid); | ||
592 | } | ||
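Putting the pieces above together, a limbo entry is effectively classified in two steps: first by age against the minimum queue time, then by occupancy against the threshold. The helper below is a minimal sketch of that classification on one CPU (the helper name is hypothetical; the fields, constants and accessors are the ones already used in this file).

/* Minimal sketch, not driver code: classify one limbo entry on this CPU. */
static int cqm_classify_limbo_entry(struct cqm_rmid_entry *entry)
{
	unsigned long min_queue_time = entry->queue_time +
				       msecs_to_jiffies(__rmid_queue_time_ms);

	if (time_after(min_queue_time, jiffies))
		return RMID_YOUNG;	/* minimum queue time not yet elapsed */

	if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
		return RMID_DIRTY;	/* cachelines still tagged with this RMID */

	return RMID_AVAILABLE;		/* candidate for reuse */
}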
593 | |||
594 | /* | ||
595 | * Pick a victim group and move it to the tail of the group list. | ||
596 | * @next: The first group without an RMID | ||
597 | */ | ||
598 | static void __intel_cqm_pick_and_rotate(struct perf_event *next) | ||
599 | { | ||
600 | struct perf_event *rotor; | ||
601 | unsigned int rmid; | ||
602 | |||
603 | lockdep_assert_held(&cache_mutex); | ||
604 | |||
605 | rotor = list_first_entry(&cache_groups, struct perf_event, | ||
606 | hw.cqm_groups_entry); | ||
607 | |||
608 | /* | ||
609 | * The group at the front of the list should always have a valid | ||
610 | * RMID. If it doesn't then no groups have RMIDs assigned and we | ||
611 | * don't need to rotate the list. | ||
612 | */ | ||
613 | if (next == rotor) | ||
614 | return; | ||
615 | |||
616 | rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); | ||
617 | __put_rmid(rmid); | ||
618 | |||
619 | list_rotate_left(&cache_groups); | ||
620 | } | ||
621 | |||
622 | /* | ||
623 | * Deallocate the RMIDs from any events that conflict with @event, and | ||
624 | * place them on the back of the group list. | ||
625 | */ | ||
626 | static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) | ||
627 | { | ||
628 | struct perf_event *group, *g; | ||
629 | unsigned int rmid; | ||
630 | |||
631 | lockdep_assert_held(&cache_mutex); | ||
632 | |||
633 | list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { | ||
634 | if (group == event) | ||
635 | continue; | ||
636 | |||
637 | rmid = group->hw.cqm_rmid; | ||
638 | |||
639 | /* | ||
640 | * Skip events that don't have a valid RMID. | ||
641 | */ | ||
642 | if (!__rmid_valid(rmid)) | ||
643 | continue; | ||
644 | |||
645 | /* | ||
646 | * No conflict? No problem! Leave the event alone. | ||
647 | */ | ||
648 | if (!__conflict_event(group, event)) | ||
649 | continue; | ||
650 | |||
651 | intel_cqm_xchg_rmid(group, INVALID_RMID); | ||
652 | __put_rmid(rmid); | ||
653 | } | ||
654 | } | ||
655 | |||
656 | /* | ||
657 | * Attempt to rotate the groups and assign new RMIDs. | ||
658 | * | ||
659 | * We rotate for two reasons, | ||
660 | * 1. To handle the scheduling of conflicting events | ||
661 | * 2. To recycle RMIDs | ||
662 | * | ||
663 | * Rotating RMIDs is complicated because the hardware doesn't give us | ||
664 | * any clues. | ||
665 | * | ||
666 | * There are problems with the hardware interface; when you change the | ||
667 | * task:RMID map cachelines retain their 'old' tags, giving a skewed | ||
668 | * picture. In order to work around this, we must always keep one free | ||
669 | * RMID - intel_cqm_rotation_rmid. | ||
670 | * | ||
671 | * Rotation works by taking away an RMID from a group (the old RMID), | ||
672 | * and assigning the free RMID to another group (the new RMID). We must | ||
673 | * then wait for the old RMID to not be used (no cachelines tagged). | ||
674 | * This ensures that all cachelines are tagged with 'active' RMIDs. At | ||
675 | * this point we can start reading values for the new RMID and treat the | ||
676 | * old RMID as the free RMID for the next rotation. | ||
677 | * | ||
678 | * Return %true or %false depending on whether we did any rotating. | ||
679 | */ | ||
680 | static bool __intel_cqm_rmid_rotate(void) | ||
681 | { | ||
682 | struct perf_event *group, *start = NULL; | ||
683 | unsigned int threshold_limit; | ||
684 | unsigned int nr_needed = 0; | ||
685 | unsigned int nr_available; | ||
686 | bool rotated = false; | ||
687 | |||
688 | mutex_lock(&cache_mutex); | ||
689 | |||
690 | again: | ||
691 | /* | ||
692 | * Fast path through this function if there are no groups and no | ||
693 | * RMIDs that need cleaning. | ||
694 | */ | ||
695 | if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) | ||
696 | goto out; | ||
697 | |||
698 | list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { | ||
699 | if (!__rmid_valid(group->hw.cqm_rmid)) { | ||
700 | if (!start) | ||
701 | start = group; | ||
702 | nr_needed++; | ||
703 | } | ||
704 | } | ||
705 | |||
706 | /* | ||
707 | * We have some event groups, but they all have RMIDs assigned | ||
708 | * and no RMIDs need cleaning. | ||
709 | */ | ||
710 | if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) | ||
711 | goto out; | ||
712 | |||
713 | if (!nr_needed) | ||
714 | goto stabilize; | ||
715 | |||
716 | /* | ||
717 | * We have more event groups without RMIDs than available RMIDs, | ||
718 | * or we have event groups that conflict with the ones currently | ||
719 | * scheduled. | ||
720 | * | ||
721 | * We force deallocate the rmid of the group at the head of | ||
722 | * cache_groups. The first event group without an RMID then gets | ||
723 | * assigned intel_cqm_rotation_rmid. This ensures we always make | ||
724 | * forward progress. | ||
725 | * | ||
726 | * Rotate the cache_groups list so the previous head is now the | ||
727 | * tail. | ||
728 | */ | ||
729 | __intel_cqm_pick_and_rotate(start); | ||
730 | |||
731 | /* | ||
732 | * If the rotation is going to succeed, reduce the threshold so | ||
733 | * that we don't needlessly reuse dirty RMIDs. | ||
734 | */ | ||
735 | if (__rmid_valid(intel_cqm_rotation_rmid)) { | ||
736 | intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); | ||
737 | intel_cqm_rotation_rmid = __get_rmid(); | ||
738 | |||
739 | intel_cqm_sched_out_conflicting_events(start); | ||
740 | |||
741 | if (__intel_cqm_threshold) | ||
742 | __intel_cqm_threshold--; | ||
743 | } | ||
744 | |||
745 | rotated = true; | ||
746 | |||
747 | stabilize: | ||
748 | /* | ||
749 | * We now need to stabilize the RMID we freed above (if any) to | ||
750 | * ensure that the next time we rotate we have an RMID with zero | ||
751 | * occupancy value. | ||
752 | * | ||
753 | * Alternatively, if we didn't need to perform any rotation, | ||
754 | * we'll have a bunch of RMIDs in limbo that need stabilizing. | ||
755 | */ | ||
756 | threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; | ||
757 | |||
758 | while (intel_cqm_rmid_stabilize(&nr_available) && | ||
759 | __intel_cqm_threshold < threshold_limit) { | ||
760 | unsigned int steal_limit; | ||
761 | |||
762 | /* | ||
763 | * Don't spin if nobody is actively waiting for an RMID; | ||
764 | * the rotation worker will be kicked as soon as an | ||
765 | * event needs an RMID anyway. | ||
766 | */ | ||
767 | if (!nr_needed) | ||
768 | break; | ||
769 | |||
770 | /* Allow max 25% of RMIDs to be in limbo. */ | ||
771 | steal_limit = (cqm_max_rmid + 1) / 4; | ||
772 | |||
773 | /* | ||
774 | * We failed to stabilize any RMIDs, so our rotation | ||
775 | * logic is now stuck. In order to make forward progress | ||
776 | * we have a few options: | ||
777 | * | ||
778 | * 1. rotate ("steal") another RMID | ||
779 | * 2. increase the threshold | ||
780 | * 3. do nothing | ||
781 | * | ||
782 | * We do both of 1. and 2. until we hit the steal limit. | ||
783 | * | ||
784 | * The steal limit prevents all RMIDs ending up on the | ||
785 | * limbo list. This can happen if every RMID has a | ||
786 | * non-zero occupancy above threshold_limit, and the | ||
787 | * occupancy values aren't dropping fast enough. | ||
788 | * | ||
789 | * Note that there is prioritisation at work here - we'd | ||
790 | * rather increase the number of RMIDs on the limbo list | ||
791 | * than increase the threshold, because increasing the | ||
792 | * threshold skews the event data (because we reuse | ||
793 | * dirty RMIDs) - threshold bumps are a last resort. | ||
794 | */ | ||
795 | if (nr_available < steal_limit) | ||
796 | goto again; | ||
797 | |||
798 | __intel_cqm_threshold++; | ||
799 | } | ||
800 | |||
801 | out: | ||
802 | mutex_unlock(&cache_mutex); | ||
803 | return rotated; | ||
804 | } | ||
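As a worked example of the steal limit used above (numbers purely illustrative): on a part reporting cqm_max_rmid = 55, i.e. 56 RMIDs, at most (55 + 1) / 4 = 14 RMIDs may sit in limbo before the loop stops stealing and falls back to bumping __intel_cqm_threshold instead.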
805 | |||
806 | static void intel_cqm_rmid_rotate(struct work_struct *work); | ||
807 | |||
808 | static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); | ||
809 | |||
810 | static struct pmu intel_cqm_pmu; | ||
811 | |||
812 | static void intel_cqm_rmid_rotate(struct work_struct *work) | ||
813 | { | ||
814 | unsigned long delay; | ||
815 | |||
816 | __intel_cqm_rmid_rotate(); | ||
817 | |||
818 | delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); | ||
819 | schedule_delayed_work(&intel_cqm_rmid_work, delay); | ||
820 | } | ||
821 | |||
822 | /* | ||
823 | * Find a group and set up the RMID. | ||
824 | * | ||
825 | * If we're part of a group, we use the group's RMID. | ||
826 | */ | ||
827 | static void intel_cqm_setup_event(struct perf_event *event, | ||
828 | struct perf_event **group) | ||
829 | { | ||
830 | struct perf_event *iter; | ||
831 | unsigned int rmid; | ||
832 | bool conflict = false; | ||
833 | |||
834 | list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { | ||
835 | rmid = iter->hw.cqm_rmid; | ||
836 | |||
837 | if (__match_event(iter, event)) { | ||
838 | /* All tasks in a group share an RMID */ | ||
839 | event->hw.cqm_rmid = rmid; | ||
840 | *group = iter; | ||
841 | return; | ||
842 | } | ||
843 | |||
844 | /* | ||
845 | * We only care about conflicts for events that are | ||
846 | * actually scheduled in (and hence have a valid RMID). | ||
847 | */ | ||
848 | if (__conflict_event(iter, event) && __rmid_valid(rmid)) | ||
849 | conflict = true; | ||
850 | } | ||
851 | |||
852 | if (conflict) | ||
853 | rmid = INVALID_RMID; | ||
854 | else | ||
855 | rmid = __get_rmid(); | ||
856 | |||
857 | event->hw.cqm_rmid = rmid; | ||
858 | } | ||
859 | |||
860 | static void intel_cqm_event_read(struct perf_event *event) | ||
861 | { | ||
862 | unsigned long flags; | ||
863 | unsigned int rmid; | ||
864 | u64 val; | ||
865 | |||
866 | /* | ||
867 | * Task events are handled by intel_cqm_event_count(). | ||
868 | */ | ||
869 | if (event->cpu == -1) | ||
870 | return; | ||
871 | |||
872 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
873 | rmid = event->hw.cqm_rmid; | ||
874 | |||
875 | if (!__rmid_valid(rmid)) | ||
876 | goto out; | ||
877 | |||
878 | val = __rmid_read(rmid); | ||
879 | |||
880 | /* | ||
881 | * Ignore this reading on error states and do not update the value. | ||
882 | */ | ||
883 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | ||
884 | goto out; | ||
885 | |||
886 | local64_set(&event->count, val); | ||
887 | out: | ||
888 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
889 | } | ||
890 | |||
891 | static void __intel_cqm_event_count(void *info) | ||
892 | { | ||
893 | struct rmid_read *rr = info; | ||
894 | u64 val; | ||
895 | |||
896 | val = __rmid_read(rr->rmid); | ||
897 | |||
898 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | ||
899 | return; | ||
900 | |||
901 | atomic64_add(val, &rr->value); | ||
902 | } | ||
903 | |||
904 | static inline bool cqm_group_leader(struct perf_event *event) | ||
905 | { | ||
906 | return !list_empty(&event->hw.cqm_groups_entry); | ||
907 | } | ||
908 | |||
909 | static u64 intel_cqm_event_count(struct perf_event *event) | ||
910 | { | ||
911 | unsigned long flags; | ||
912 | struct rmid_read rr = { | ||
913 | .value = ATOMIC64_INIT(0), | ||
914 | }; | ||
915 | |||
916 | /* | ||
917 | * We only need to worry about task events. System-wide events | ||
918 | * are handled like usual, i.e. entirely with | ||
919 | * intel_cqm_event_read(). | ||
920 | */ | ||
921 | if (event->cpu != -1) | ||
922 | return __perf_event_count(event); | ||
923 | |||
924 | /* | ||
925 | * Only the group leader gets to report values. This stops us | ||
926 | * reporting duplicate values to userspace, and gives us a clear | ||
927 | * rule for which task gets to report the values. | ||
928 | * | ||
929 | * Note that it is impossible to attribute these values to | ||
930 | * specific packages - we forfeit that ability when we create | ||
931 | * task events. | ||
932 | */ | ||
933 | if (!cqm_group_leader(event)) | ||
934 | return 0; | ||
935 | |||
936 | /* | ||
937 | * Notice that we don't perform the reading of an RMID | ||
938 | * atomically, because we can't hold a spin lock across the | ||
939 | * IPIs. | ||
940 | * | ||
941 | * Speculatively perform the read, since @event might be | ||
942 | * assigned a different (possibly invalid) RMID while we're | ||
943 | * busy performing the IPI calls. It's therefore necessary to | ||
944 | * check @event's RMID afterwards, and if it has changed, | ||
945 | * discard the result of the read. | ||
946 | */ | ||
947 | rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); | ||
948 | |||
949 | if (!__rmid_valid(rr.rmid)) | ||
950 | goto out; | ||
951 | |||
952 | on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); | ||
953 | |||
954 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
955 | if (event->hw.cqm_rmid == rr.rmid) | ||
956 | local64_set(&event->count, atomic64_read(&rr.value)); | ||
957 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
958 | out: | ||
959 | return __perf_event_count(event); | ||
960 | } | ||
961 | |||
962 | static void intel_cqm_event_start(struct perf_event *event, int mode) | ||
963 | { | ||
964 | struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); | ||
965 | unsigned int rmid = event->hw.cqm_rmid; | ||
966 | unsigned long flags; | ||
967 | |||
968 | if (!(event->hw.cqm_state & PERF_HES_STOPPED)) | ||
969 | return; | ||
970 | |||
971 | event->hw.cqm_state &= ~PERF_HES_STOPPED; | ||
972 | |||
973 | raw_spin_lock_irqsave(&state->lock, flags); | ||
974 | |||
975 | if (state->cnt++) | ||
976 | WARN_ON_ONCE(state->rmid != rmid); | ||
977 | else | ||
978 | WARN_ON_ONCE(state->rmid); | ||
979 | |||
980 | state->rmid = rmid; | ||
981 | wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid); | ||
982 | |||
983 | raw_spin_unlock_irqrestore(&state->lock, flags); | ||
984 | } | ||
985 | |||
986 | static void intel_cqm_event_stop(struct perf_event *event, int mode) | ||
987 | { | ||
988 | struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); | ||
989 | unsigned long flags; | ||
990 | |||
991 | if (event->hw.cqm_state & PERF_HES_STOPPED) | ||
992 | return; | ||
993 | |||
994 | event->hw.cqm_state |= PERF_HES_STOPPED; | ||
995 | |||
996 | raw_spin_lock_irqsave(&state->lock, flags); | ||
997 | intel_cqm_event_read(event); | ||
998 | |||
999 | if (!--state->cnt) { | ||
1000 | state->rmid = 0; | ||
1001 | wrmsrl(MSR_IA32_PQR_ASSOC, 0); | ||
1002 | } else { | ||
1003 | WARN_ON_ONCE(!state->rmid); | ||
1004 | } | ||
1005 | |||
1006 | raw_spin_unlock_irqrestore(&state->lock, flags); | ||
1007 | } | ||
1008 | |||
1009 | static int intel_cqm_event_add(struct perf_event *event, int mode) | ||
1010 | { | ||
1011 | unsigned long flags; | ||
1012 | unsigned int rmid; | ||
1013 | |||
1014 | raw_spin_lock_irqsave(&cache_lock, flags); | ||
1015 | |||
1016 | event->hw.cqm_state = PERF_HES_STOPPED; | ||
1017 | rmid = event->hw.cqm_rmid; | ||
1018 | |||
1019 | if (__rmid_valid(rmid) && (mode & PERF_EF_START)) | ||
1020 | intel_cqm_event_start(event, mode); | ||
1021 | |||
1022 | raw_spin_unlock_irqrestore(&cache_lock, flags); | ||
1023 | |||
1024 | return 0; | ||
1025 | } | ||
1026 | |||
1027 | static void intel_cqm_event_del(struct perf_event *event, int mode) | ||
1028 | { | ||
1029 | intel_cqm_event_stop(event, mode); | ||
1030 | } | ||
1031 | |||
1032 | static void intel_cqm_event_destroy(struct perf_event *event) | ||
1033 | { | ||
1034 | struct perf_event *group_other = NULL; | ||
1035 | |||
1036 | mutex_lock(&cache_mutex); | ||
1037 | |||
1038 | /* | ||
1039 | * If there's another event in this group... | ||
1040 | */ | ||
1041 | if (!list_empty(&event->hw.cqm_group_entry)) { | ||
1042 | group_other = list_first_entry(&event->hw.cqm_group_entry, | ||
1043 | struct perf_event, | ||
1044 | hw.cqm_group_entry); | ||
1045 | list_del(&event->hw.cqm_group_entry); | ||
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | * And if we're the group leader... | ||
1050 | */ | ||
1051 | if (cqm_group_leader(event)) { | ||
1052 | /* | ||
1053 | * If there was a group_other, make that leader, otherwise | ||
1054 | * destroy the group and return the RMID. | ||
1055 | */ | ||
1056 | if (group_other) { | ||
1057 | list_replace(&event->hw.cqm_groups_entry, | ||
1058 | &group_other->hw.cqm_groups_entry); | ||
1059 | } else { | ||
1060 | unsigned int rmid = event->hw.cqm_rmid; | ||
1061 | |||
1062 | if (__rmid_valid(rmid)) | ||
1063 | __put_rmid(rmid); | ||
1064 | list_del(&event->hw.cqm_groups_entry); | ||
1065 | } | ||
1066 | } | ||
1067 | |||
1068 | mutex_unlock(&cache_mutex); | ||
1069 | } | ||
1070 | |||
1071 | static int intel_cqm_event_init(struct perf_event *event) | ||
1072 | { | ||
1073 | struct perf_event *group = NULL; | ||
1074 | bool rotate = false; | ||
1075 | |||
1076 | if (event->attr.type != intel_cqm_pmu.type) | ||
1077 | return -ENOENT; | ||
1078 | |||
1079 | if (event->attr.config & ~QOS_EVENT_MASK) | ||
1080 | return -EINVAL; | ||
1081 | |||
1082 | /* unsupported modes and filters */ | ||
1083 | if (event->attr.exclude_user || | ||
1084 | event->attr.exclude_kernel || | ||
1085 | event->attr.exclude_hv || | ||
1086 | event->attr.exclude_idle || | ||
1087 | event->attr.exclude_host || | ||
1088 | event->attr.exclude_guest || | ||
1089 | event->attr.sample_period) /* no sampling */ | ||
1090 | return -EINVAL; | ||
1091 | |||
1092 | INIT_LIST_HEAD(&event->hw.cqm_group_entry); | ||
1093 | INIT_LIST_HEAD(&event->hw.cqm_groups_entry); | ||
1094 | |||
1095 | event->destroy = intel_cqm_event_destroy; | ||
1096 | |||
1097 | mutex_lock(&cache_mutex); | ||
1098 | |||
1099 | /* Will also set rmid */ | ||
1100 | intel_cqm_setup_event(event, &group); | ||
1101 | |||
1102 | if (group) { | ||
1103 | list_add_tail(&event->hw.cqm_group_entry, | ||
1104 | &group->hw.cqm_group_entry); | ||
1105 | } else { | ||
1106 | list_add_tail(&event->hw.cqm_groups_entry, | ||
1107 | &cache_groups); | ||
1108 | |||
1109 | /* | ||
1110 | * All RMIDs are either in use or have recently been | ||
1111 | * used. Kick the rotation worker to clean/free some. | ||
1112 | * | ||
1113 | * We only do this for the group leader, rather than for | ||
1114 | * every event in a group to save on needless work. | ||
1115 | */ | ||
1116 | if (!__rmid_valid(event->hw.cqm_rmid)) | ||
1117 | rotate = true; | ||
1118 | } | ||
1119 | |||
1120 | mutex_unlock(&cache_mutex); | ||
1121 | |||
1122 | if (rotate) | ||
1123 | schedule_delayed_work(&intel_cqm_rmid_work, 0); | ||
1124 | |||
1125 | return 0; | ||
1126 | } | ||
1127 | |||
1128 | EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); | ||
1129 | EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); | ||
1130 | EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); | ||
1131 | EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); | ||
1132 | EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); | ||
1133 | |||
1134 | static struct attribute *intel_cqm_events_attr[] = { | ||
1135 | EVENT_PTR(intel_cqm_llc), | ||
1136 | EVENT_PTR(intel_cqm_llc_pkg), | ||
1137 | EVENT_PTR(intel_cqm_llc_unit), | ||
1138 | EVENT_PTR(intel_cqm_llc_scale), | ||
1139 | EVENT_PTR(intel_cqm_llc_snapshot), | ||
1140 | NULL, | ||
1141 | }; | ||
1142 | |||
1143 | static struct attribute_group intel_cqm_events_group = { | ||
1144 | .name = "events", | ||
1145 | .attrs = intel_cqm_events_attr, | ||
1146 | }; | ||
1147 | |||
1148 | PMU_FORMAT_ATTR(event, "config:0-7"); | ||
1149 | static struct attribute *intel_cqm_formats_attr[] = { | ||
1150 | &format_attr_event.attr, | ||
1151 | NULL, | ||
1152 | }; | ||
1153 | |||
1154 | static struct attribute_group intel_cqm_format_group = { | ||
1155 | .name = "format", | ||
1156 | .attrs = intel_cqm_formats_attr, | ||
1157 | }; | ||
1158 | |||
1159 | static ssize_t | ||
1160 | max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, | ||
1161 | char *page) | ||
1162 | { | ||
1163 | ssize_t rv; | ||
1164 | |||
1165 | mutex_lock(&cache_mutex); | ||
1166 | rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); | ||
1167 | mutex_unlock(&cache_mutex); | ||
1168 | |||
1169 | return rv; | ||
1170 | } | ||
1171 | |||
1172 | static ssize_t | ||
1173 | max_recycle_threshold_store(struct device *dev, | ||
1174 | struct device_attribute *attr, | ||
1175 | const char *buf, size_t count) | ||
1176 | { | ||
1177 | unsigned int bytes, cachelines; | ||
1178 | int ret; | ||
1179 | |||
1180 | ret = kstrtouint(buf, 0, &bytes); | ||
1181 | if (ret) | ||
1182 | return ret; | ||
1183 | |||
1184 | mutex_lock(&cache_mutex); | ||
1185 | |||
1186 | __intel_cqm_max_threshold = bytes; | ||
1187 | cachelines = bytes / cqm_l3_scale; | ||
1188 | |||
1189 | /* | ||
1190 | * The new maximum takes effect immediately. | ||
1191 | */ | ||
1192 | if (__intel_cqm_threshold > cachelines) | ||
1193 | __intel_cqm_threshold = cachelines; | ||
1194 | |||
1195 | mutex_unlock(&cache_mutex); | ||
1196 | |||
1197 | return count; | ||
1198 | } | ||
1199 | |||
1200 | static DEVICE_ATTR_RW(max_recycle_threshold); | ||
1201 | |||
1202 | static struct attribute *intel_cqm_attrs[] = { | ||
1203 | &dev_attr_max_recycle_threshold.attr, | ||
1204 | NULL, | ||
1205 | }; | ||
1206 | |||
1207 | static const struct attribute_group intel_cqm_group = { | ||
1208 | .attrs = intel_cqm_attrs, | ||
1209 | }; | ||
1210 | |||
1211 | static const struct attribute_group *intel_cqm_attr_groups[] = { | ||
1212 | &intel_cqm_events_group, | ||
1213 | &intel_cqm_format_group, | ||
1214 | &intel_cqm_group, | ||
1215 | NULL, | ||
1216 | }; | ||
1217 | |||
1218 | static struct pmu intel_cqm_pmu = { | ||
1219 | .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, | ||
1220 | .attr_groups = intel_cqm_attr_groups, | ||
1221 | .task_ctx_nr = perf_sw_context, | ||
1222 | .event_init = intel_cqm_event_init, | ||
1223 | .add = intel_cqm_event_add, | ||
1224 | .del = intel_cqm_event_del, | ||
1225 | .start = intel_cqm_event_start, | ||
1226 | .stop = intel_cqm_event_stop, | ||
1227 | .read = intel_cqm_event_read, | ||
1228 | .count = intel_cqm_event_count, | ||
1229 | }; | ||
1230 | |||
1231 | static inline void cqm_pick_event_reader(int cpu) | ||
1232 | { | ||
1233 | int phys_id = topology_physical_package_id(cpu); | ||
1234 | int i; | ||
1235 | |||
1236 | for_each_cpu(i, &cqm_cpumask) { | ||
1237 | if (phys_id == topology_physical_package_id(i)) | ||
1238 | return; /* already got reader for this socket */ | ||
1239 | } | ||
1240 | |||
1241 | cpumask_set_cpu(cpu, &cqm_cpumask); | ||
1242 | } | ||
1243 | |||
1244 | static void intel_cqm_cpu_prepare(unsigned int cpu) | ||
1245 | { | ||
1246 | struct intel_cqm_state *state = &per_cpu(cqm_state, cpu); | ||
1247 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
1248 | |||
1249 | raw_spin_lock_init(&state->lock); | ||
1250 | state->rmid = 0; | ||
1251 | state->cnt = 0; | ||
1252 | |||
1253 | WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); | ||
1254 | WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); | ||
1255 | } | ||
1256 | |||
1257 | static void intel_cqm_cpu_exit(unsigned int cpu) | ||
1258 | { | ||
1259 | int phys_id = topology_physical_package_id(cpu); | ||
1260 | int i; | ||
1261 | |||
1262 | /* | ||
1263 | * Is @cpu a designated cqm reader? | ||
1264 | */ | ||
1265 | if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) | ||
1266 | return; | ||
1267 | |||
1268 | for_each_online_cpu(i) { | ||
1269 | if (i == cpu) | ||
1270 | continue; | ||
1271 | |||
1272 | if (phys_id == topology_physical_package_id(i)) { | ||
1273 | cpumask_set_cpu(i, &cqm_cpumask); | ||
1274 | break; | ||
1275 | } | ||
1276 | } | ||
1277 | } | ||
1278 | |||
1279 | static int intel_cqm_cpu_notifier(struct notifier_block *nb, | ||
1280 | unsigned long action, void *hcpu) | ||
1281 | { | ||
1282 | unsigned int cpu = (unsigned long)hcpu; | ||
1283 | |||
1284 | switch (action & ~CPU_TASKS_FROZEN) { | ||
1285 | case CPU_UP_PREPARE: | ||
1286 | intel_cqm_cpu_prepare(cpu); | ||
1287 | break; | ||
1288 | case CPU_DOWN_PREPARE: | ||
1289 | intel_cqm_cpu_exit(cpu); | ||
1290 | break; | ||
1291 | case CPU_STARTING: | ||
1292 | cqm_pick_event_reader(cpu); | ||
1293 | break; | ||
1294 | } | ||
1295 | |||
1296 | return NOTIFY_OK; | ||
1297 | } | ||
1298 | |||
1299 | static const struct x86_cpu_id intel_cqm_match[] = { | ||
1300 | { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, | ||
1301 | {} | ||
1302 | }; | ||
1303 | |||
1304 | static int __init intel_cqm_init(void) | ||
1305 | { | ||
1306 | char *str, scale[20]; | ||
1307 | int i, cpu, ret; | ||
1308 | |||
1309 | if (!x86_match_cpu(intel_cqm_match)) | ||
1310 | return -ENODEV; | ||
1311 | |||
1312 | cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; | ||
1313 | |||
1314 | /* | ||
1315 | * It's possible that not all resources support the same number | ||
1316 | * of RMIDs. Instead of making scheduling much more complicated | ||
1317 | * (where we have to match a task's RMID to a cpu that supports | ||
1318 | * that many RMIDs), just find the minimum number of RMIDs supported | ||
1319 | * across all cpus. | ||
1320 | * | ||
1321 | * Also, check that the scales match on all cpus. | ||
1322 | */ | ||
1323 | cpu_notifier_register_begin(); | ||
1324 | |||
1325 | for_each_online_cpu(cpu) { | ||
1326 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
1327 | |||
1328 | if (c->x86_cache_max_rmid < cqm_max_rmid) | ||
1329 | cqm_max_rmid = c->x86_cache_max_rmid; | ||
1330 | |||
1331 | if (c->x86_cache_occ_scale != cqm_l3_scale) { | ||
1332 | pr_err("Multiple LLC scale values, disabling\n"); | ||
1333 | ret = -EINVAL; | ||
1334 | goto out; | ||
1335 | } | ||
1336 | } | ||
1337 | |||
1338 | /* | ||
1339 | * A reasonable upper limit on the max threshold is the number | ||
1340 | * of lines tagged per RMID if all RMIDs have the same number of | ||
1341 | * lines tagged in the LLC. | ||
1342 | * | ||
1343 | * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. | ||
1344 | */ | ||
1345 | __intel_cqm_max_threshold = | ||
1346 | boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); | ||
1347 | |||
1348 | snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); | ||
1349 | str = kstrdup(scale, GFP_KERNEL); | ||
1350 | if (!str) { | ||
1351 | ret = -ENOMEM; | ||
1352 | goto out; | ||
1353 | } | ||
1354 | |||
1355 | event_attr_intel_cqm_llc_scale.event_str = str; | ||
1356 | |||
1357 | ret = intel_cqm_setup_rmid_cache(); | ||
1358 | if (ret) | ||
1359 | goto out; | ||
1360 | |||
1361 | for_each_online_cpu(i) { | ||
1362 | intel_cqm_cpu_prepare(i); | ||
1363 | cqm_pick_event_reader(i); | ||
1364 | } | ||
1365 | |||
1366 | __perf_cpu_notifier(intel_cqm_cpu_notifier); | ||
1367 | |||
1368 | ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); | ||
1369 | if (ret) | ||
1370 | pr_err("Intel CQM perf registration failed: %d\n", ret); | ||
1371 | else | ||
1372 | pr_info("Intel CQM monitoring enabled\n"); | ||
1373 | |||
1374 | out: | ||
1375 | cpu_notifier_register_done(); | ||
1376 | |||
1377 | return ret; | ||
1378 | } | ||
1379 | device_initcall(intel_cqm_init); | ||
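For the figures quoted in the comment inside intel_cqm_init() (a 35 MB LLC, i.e. x86_cache_size = 35840 KB, and 56 RMIDs), the default upper bound works out as 35840 * 1024 / 56 = 655360 bytes, i.e. 640 KB per RMID, or 1/56 ≈ 1.8% of the LLC. These numbers are only the illustrative ones from that comment, not values read from hardware.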
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 073983398364..813f75d71175 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c | |||
@@ -461,7 +461,8 @@ void intel_pmu_enable_bts(u64 config) | |||
461 | 461 | ||
462 | debugctlmsr |= DEBUGCTLMSR_TR; | 462 | debugctlmsr |= DEBUGCTLMSR_TR; |
463 | debugctlmsr |= DEBUGCTLMSR_BTS; | 463 | debugctlmsr |= DEBUGCTLMSR_BTS; |
464 | debugctlmsr |= DEBUGCTLMSR_BTINT; | 464 | if (config & ARCH_PERFMON_EVENTSEL_INT) |
465 | debugctlmsr |= DEBUGCTLMSR_BTINT; | ||
465 | 466 | ||
466 | if (!(config & ARCH_PERFMON_EVENTSEL_OS)) | 467 | if (!(config & ARCH_PERFMON_EVENTSEL_OS)) |
467 | debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; | 468 | debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; |
@@ -557,6 +558,8 @@ struct event_constraint intel_core2_pebs_event_constraints[] = { | |||
557 | INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ | 558 | INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ |
558 | INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */ | 559 | INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */ |
559 | INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ | 560 | INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ |
561 | /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ | ||
562 | INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), | ||
560 | EVENT_CONSTRAINT_END | 563 | EVENT_CONSTRAINT_END |
561 | }; | 564 | }; |
562 | 565 | ||
@@ -564,6 +567,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { | |||
564 | INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ | 567 | INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ |
565 | INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ | 568 | INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ |
566 | INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ | 569 | INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ |
570 | /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ | ||
571 | INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), | ||
567 | EVENT_CONSTRAINT_END | 572 | EVENT_CONSTRAINT_END |
568 | }; | 573 | }; |
569 | 574 | ||
@@ -587,6 +592,8 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = { | |||
587 | INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ | 592 | INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ |
588 | INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ | 593 | INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ |
589 | INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ | 594 | INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ |
595 | /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ | ||
596 | INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), | ||
590 | EVENT_CONSTRAINT_END | 597 | EVENT_CONSTRAINT_END |
591 | }; | 598 | }; |
592 | 599 | ||
@@ -602,6 +609,8 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = { | |||
602 | INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ | 609 | INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ |
603 | INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ | 610 | INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ |
604 | INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ | 611 | INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ |
612 | /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ | ||
613 | INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), | ||
605 | EVENT_CONSTRAINT_END | 614 | EVENT_CONSTRAINT_END |
606 | }; | 615 | }; |
607 | 616 | ||
@@ -611,6 +620,10 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { | |||
611 | INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ | 620 | INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ |
612 | /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ | 621 | /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ |
613 | INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), | 622 | INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), |
623 | INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ | ||
624 | INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ | ||
625 | INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ | ||
626 | INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ | ||
614 | /* Allow all events as PEBS with no flags */ | 627 | /* Allow all events as PEBS with no flags */ |
615 | INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), | 628 | INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), |
616 | EVENT_CONSTRAINT_END | 629 | EVENT_CONSTRAINT_END |
@@ -622,6 +635,10 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { | |||
622 | INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ | 635 | INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ |
623 | /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ | 636 | /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ |
624 | INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), | 637 | INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), |
638 | INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ | ||
639 | INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ | ||
640 | INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ | ||
641 | INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ | ||
625 | /* Allow all events as PEBS with no flags */ | 642 | /* Allow all events as PEBS with no flags */ |
626 | INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), | 643 | INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), |
627 | EVENT_CONSTRAINT_END | 644 | EVENT_CONSTRAINT_END |
@@ -633,16 +650,16 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { | |||
633 | /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ | 650 | /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ |
634 | INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), | 651 | INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), |
635 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ | 652 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ |
636 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ | 653 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ |
637 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ | 654 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ |
638 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ | 655 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ |
639 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ | 656 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ |
640 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ | 657 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ |
641 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ | 658 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ |
642 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ | 659 | INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ |
643 | INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ | 660 | INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ |
644 | INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ | 661 | INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ |
645 | INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ | 662 | INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ |
646 | /* Allow all events as PEBS with no flags */ | 663 | /* Allow all events as PEBS with no flags */ |
647 | INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), | 664 | INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), |
648 | EVENT_CONSTRAINT_END | 665 | EVENT_CONSTRAINT_END |
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 58f1a94beaf0..94e5b506caa6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c | |||
@@ -39,6 +39,7 @@ static enum { | |||
39 | #define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ | 39 | #define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ |
40 | #define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ | 40 | #define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ |
41 | #define LBR_FAR_BIT 8 /* do not capture far branches */ | 41 | #define LBR_FAR_BIT 8 /* do not capture far branches */ |
42 | #define LBR_CALL_STACK_BIT 9 /* enable call stack */ | ||
42 | 43 | ||
43 | #define LBR_KERNEL (1 << LBR_KERNEL_BIT) | 44 | #define LBR_KERNEL (1 << LBR_KERNEL_BIT) |
44 | #define LBR_USER (1 << LBR_USER_BIT) | 45 | #define LBR_USER (1 << LBR_USER_BIT) |
@@ -49,6 +50,7 @@ static enum { | |||
49 | #define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) | 50 | #define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) |
50 | #define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) | 51 | #define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) |
51 | #define LBR_FAR (1 << LBR_FAR_BIT) | 52 | #define LBR_FAR (1 << LBR_FAR_BIT) |
53 | #define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT) | ||
52 | 54 | ||
53 | #define LBR_PLM (LBR_KERNEL | LBR_USER) | 55 | #define LBR_PLM (LBR_KERNEL | LBR_USER) |
54 | 56 | ||
@@ -69,33 +71,31 @@ static enum { | |||
69 | #define LBR_FROM_FLAG_IN_TX (1ULL << 62) | 71 | #define LBR_FROM_FLAG_IN_TX (1ULL << 62) |
70 | #define LBR_FROM_FLAG_ABORT (1ULL << 61) | 72 | #define LBR_FROM_FLAG_ABORT (1ULL << 61) |
71 | 73 | ||
72 | #define for_each_branch_sample_type(x) \ | ||
73 | for ((x) = PERF_SAMPLE_BRANCH_USER; \ | ||
74 | (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) | ||
75 | |||
76 | /* | 74 | /* |
77 | * x86control flow change classification | 75 | * x86control flow change classification |
77 | * x86 control flow change classification | 75 | * x86 control flow change classification |
78 | * x86 control flow changes include branches, interrupts, traps, faults | 76 | * x86 control flow changes include branches, interrupts, traps, faults |
80 | enum { | 78 | enum { |
81 | X86_BR_NONE = 0, /* unknown */ | 79 | X86_BR_NONE = 0, /* unknown */ |
82 | 80 | ||
83 | X86_BR_USER = 1 << 0, /* branch target is user */ | 81 | X86_BR_USER = 1 << 0, /* branch target is user */ |
84 | X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ | 82 | X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ |
85 | 83 | ||
86 | X86_BR_CALL = 1 << 2, /* call */ | 84 | X86_BR_CALL = 1 << 2, /* call */ |
87 | X86_BR_RET = 1 << 3, /* return */ | 85 | X86_BR_RET = 1 << 3, /* return */ |
88 | X86_BR_SYSCALL = 1 << 4, /* syscall */ | 86 | X86_BR_SYSCALL = 1 << 4, /* syscall */ |
89 | X86_BR_SYSRET = 1 << 5, /* syscall return */ | 87 | X86_BR_SYSRET = 1 << 5, /* syscall return */ |
90 | X86_BR_INT = 1 << 6, /* sw interrupt */ | 88 | X86_BR_INT = 1 << 6, /* sw interrupt */ |
91 | X86_BR_IRET = 1 << 7, /* return from interrupt */ | 89 | X86_BR_IRET = 1 << 7, /* return from interrupt */ |
92 | X86_BR_JCC = 1 << 8, /* conditional */ | 90 | X86_BR_JCC = 1 << 8, /* conditional */ |
93 | X86_BR_JMP = 1 << 9, /* jump */ | 91 | X86_BR_JMP = 1 << 9, /* jump */ |
94 | X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ | 92 | X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ |
95 | X86_BR_IND_CALL = 1 << 11,/* indirect calls */ | 93 | X86_BR_IND_CALL = 1 << 11,/* indirect calls */ |
96 | X86_BR_ABORT = 1 << 12,/* transaction abort */ | 94 | X86_BR_ABORT = 1 << 12,/* transaction abort */ |
97 | X86_BR_IN_TX = 1 << 13,/* in transaction */ | 95 | X86_BR_IN_TX = 1 << 13,/* in transaction */ |
98 | X86_BR_NO_TX = 1 << 14,/* not in transaction */ | 96 | X86_BR_NO_TX = 1 << 14,/* not in transaction */ |
97 | X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ | ||
98 | X86_BR_CALL_STACK = 1 << 16,/* call stack */ | ||
99 | }; | 99 | }; |
100 | 100 | ||
101 | #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) | 101 | #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) |
@@ -112,13 +112,15 @@ enum { | |||
112 | X86_BR_JMP |\ | 112 | X86_BR_JMP |\ |
113 | X86_BR_IRQ |\ | 113 | X86_BR_IRQ |\ |
114 | X86_BR_ABORT |\ | 114 | X86_BR_ABORT |\ |
115 | X86_BR_IND_CALL) | 115 | X86_BR_IND_CALL |\ |
116 | X86_BR_ZERO_CALL) | ||
116 | 117 | ||
117 | #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) | 118 | #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) |
118 | 119 | ||
119 | #define X86_BR_ANY_CALL \ | 120 | #define X86_BR_ANY_CALL \ |
120 | (X86_BR_CALL |\ | 121 | (X86_BR_CALL |\ |
121 | X86_BR_IND_CALL |\ | 122 | X86_BR_IND_CALL |\ |
123 | X86_BR_ZERO_CALL |\ | ||
122 | X86_BR_SYSCALL |\ | 124 | X86_BR_SYSCALL |\ |
123 | X86_BR_IRQ |\ | 125 | X86_BR_IRQ |\ |
124 | X86_BR_INT) | 126 | X86_BR_INT) |
@@ -130,17 +132,32 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); | |||
130 | * otherwise it becomes near impossible to get a reliable stack. | 132 | * otherwise it becomes near impossible to get a reliable stack. |
131 | */ | 133 | */ |
132 | 134 | ||
133 | static void __intel_pmu_lbr_enable(void) | 135 | static void __intel_pmu_lbr_enable(bool pmi) |
134 | { | 136 | { |
135 | u64 debugctl; | ||
136 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); | 137 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
138 | u64 debugctl, lbr_select = 0, orig_debugctl; | ||
137 | 139 | ||
138 | if (cpuc->lbr_sel) | 140 | /* |
139 | wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config); | 141 | * No need to reprogram LBR_SELECT in a PMI, as it |
142 | * did not change. | ||
143 | */ | ||
144 | if (cpuc->lbr_sel && !pmi) { | ||
145 | lbr_select = cpuc->lbr_sel->config; | ||
146 | wrmsrl(MSR_LBR_SELECT, lbr_select); | ||
147 | } | ||
140 | 148 | ||
141 | rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); | 149 | rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); |
142 | debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); | 150 | orig_debugctl = debugctl; |
143 | wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); | 151 | debugctl |= DEBUGCTLMSR_LBR; |
152 | /* | ||
153 | * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. | ||
154 | * If FREEZE_LBRS_ON_PMI is set, a PMI near call/return instructions | ||
155 | * may cause a superfluous increase/decrease of LBR_TOS. | ||
156 | */ | ||
157 | if (!(lbr_select & LBR_CALL_STACK)) | ||
158 | debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; | ||
159 | if (orig_debugctl != debugctl) | ||
160 | wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); | ||
144 | } | 161 | } |
145 | 162 | ||
146 | static void __intel_pmu_lbr_disable(void) | 163 | static void __intel_pmu_lbr_disable(void) |
@@ -181,9 +198,116 @@ void intel_pmu_lbr_reset(void) | |||
181 | intel_pmu_lbr_reset_64(); | 198 | intel_pmu_lbr_reset_64(); |
182 | } | 199 | } |
183 | 200 | ||
201 | /* | ||
202 | * TOS = most recently recorded branch | ||
203 | */ | ||
204 | static inline u64 intel_pmu_lbr_tos(void) | ||
205 | { | ||
206 | u64 tos; | ||
207 | |||
208 | rdmsrl(x86_pmu.lbr_tos, tos); | ||
209 | return tos; | ||
210 | } | ||
211 | |||
212 | enum { | ||
213 | LBR_NONE, | ||
214 | LBR_VALID, | ||
215 | }; | ||
216 | |||
217 | static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) | ||
218 | { | ||
219 | int i; | ||
220 | unsigned lbr_idx, mask; | ||
221 | u64 tos; | ||
222 | |||
223 | if (task_ctx->lbr_callstack_users == 0 || | ||
224 | task_ctx->lbr_stack_state == LBR_NONE) { | ||
225 | intel_pmu_lbr_reset(); | ||
226 | return; | ||
227 | } | ||
228 | |||
229 | mask = x86_pmu.lbr_nr - 1; | ||
230 | tos = intel_pmu_lbr_tos(); | ||
231 | for (i = 0; i < x86_pmu.lbr_nr; i++) { | ||
232 | lbr_idx = (tos - i) & mask; | ||
233 | wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); | ||
234 | wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); | ||
235 | } | ||
236 | task_ctx->lbr_stack_state = LBR_NONE; | ||
237 | } | ||
238 | |||
239 | static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) | ||
240 | { | ||
241 | int i; | ||
242 | unsigned lbr_idx, mask; | ||
243 | u64 tos; | ||
244 | |||
245 | if (task_ctx->lbr_callstack_users == 0) { | ||
246 | task_ctx->lbr_stack_state = LBR_NONE; | ||
247 | return; | ||
248 | } | ||
249 | |||
250 | mask = x86_pmu.lbr_nr - 1; | ||
251 | tos = intel_pmu_lbr_tos(); | ||
252 | for (i = 0; i < x86_pmu.lbr_nr; i++) { | ||
253 | lbr_idx = (tos - i) & mask; | ||
254 | rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); | ||
255 | rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); | ||
256 | } | ||
257 | task_ctx->lbr_stack_state = LBR_VALID; | ||
258 | } | ||
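Both the save and restore loops above walk the LBR stack as a ring buffer: the entry i logical positions behind the top-of-stack lives at physical slot (tos - i) & (lbr_nr - 1). A minimal sketch, assuming a power-of-two lbr_nr such as 16 (the helper is hypothetical, not part of the driver):

/* Minimal sketch, not driver code: logical offset -> physical LBR slot. */
static inline unsigned int lbr_slot(u64 tos, int i, int lbr_nr)
{
	/* e.g. lbr_nr = 16, tos = 3, i = 5  ->  (3 - 5) & 15 == 14 */
	return (tos - i) & (lbr_nr - 1);
}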
259 | |||
260 | void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) | ||
261 | { | ||
262 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); | ||
263 | struct x86_perf_task_context *task_ctx; | ||
264 | |||
265 | if (!x86_pmu.lbr_nr) | ||
266 | return; | ||
267 | |||
268 | /* | ||
269 | * If the LBR callstack feature is enabled and the stack was saved when | ||
270 | * the task was scheduled out, restore the stack. Otherwise flush | ||
271 | * the LBR stack. | ||
272 | */ | ||
273 | task_ctx = ctx ? ctx->task_ctx_data : NULL; | ||
274 | if (task_ctx) { | ||
275 | if (sched_in) { | ||
276 | __intel_pmu_lbr_restore(task_ctx); | ||
277 | cpuc->lbr_context = ctx; | ||
278 | } else { | ||
279 | __intel_pmu_lbr_save(task_ctx); | ||
280 | } | ||
281 | return; | ||
282 | } | ||
283 | |||
284 | /* | ||
285 | * When sampling the branch stack in system-wide mode, it may be | ||
286 | * necessary to flush the stack on context switch. This happens | ||
287 | * when the branch stack does not tag its entries with the pid | ||
288 | * of the current task. Otherwise it becomes impossible to | ||
289 | * associate a branch entry with a task. This ambiguity is more | ||
290 | * likely to appear when the branch stack supports priv level | ||
291 | * filtering and the user sets it to monitor only at the user | ||
292 | * level (which could be a useful measurement in system-wide | ||
293 | * mode). In that case, the risk is high of having a branch | ||
294 | * stack with branch from multiple tasks. | ||
295 | */ | ||
296 | if (sched_in) { | ||
297 | intel_pmu_lbr_reset(); | ||
298 | cpuc->lbr_context = ctx; | ||
299 | } | ||
300 | } | ||
301 | |||
302 | static inline bool branch_user_callstack(unsigned br_sel) | ||
303 | { | ||
304 | return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK); | ||
305 | } | ||
306 | |||
184 | void intel_pmu_lbr_enable(struct perf_event *event) | 307 | void intel_pmu_lbr_enable(struct perf_event *event) |
185 | { | 308 | { |
186 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); | 309 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
310 | struct x86_perf_task_context *task_ctx; | ||
187 | 311 | ||
188 | if (!x86_pmu.lbr_nr) | 312 | if (!x86_pmu.lbr_nr) |
189 | return; | 313 | return; |
@@ -198,18 +322,33 @@ void intel_pmu_lbr_enable(struct perf_event *event) | |||
198 | } | 322 | } |
199 | cpuc->br_sel = event->hw.branch_reg.reg; | 323 | cpuc->br_sel = event->hw.branch_reg.reg; |
200 | 324 | ||
325 | if (branch_user_callstack(cpuc->br_sel) && event->ctx && | ||
326 | event->ctx->task_ctx_data) { | ||
327 | task_ctx = event->ctx->task_ctx_data; | ||
328 | task_ctx->lbr_callstack_users++; | ||
329 | } | ||
330 | |||
201 | cpuc->lbr_users++; | 331 | cpuc->lbr_users++; |
332 | perf_sched_cb_inc(event->ctx->pmu); | ||
202 | } | 333 | } |
203 | 334 | ||
204 | void intel_pmu_lbr_disable(struct perf_event *event) | 335 | void intel_pmu_lbr_disable(struct perf_event *event) |
205 | { | 336 | { |
206 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); | 337 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
338 | struct x86_perf_task_context *task_ctx; | ||
207 | 339 | ||
208 | if (!x86_pmu.lbr_nr) | 340 | if (!x86_pmu.lbr_nr) |
209 | return; | 341 | return; |
210 | 342 | ||
343 | if (branch_user_callstack(cpuc->br_sel) && event->ctx && | ||
344 | event->ctx->task_ctx_data) { | ||
345 | task_ctx = event->ctx->task_ctx_data; | ||
346 | task_ctx->lbr_callstack_users--; | ||
347 | } | ||
348 | |||
211 | cpuc->lbr_users--; | 349 | cpuc->lbr_users--; |
212 | WARN_ON_ONCE(cpuc->lbr_users < 0); | 350 | WARN_ON_ONCE(cpuc->lbr_users < 0); |
351 | perf_sched_cb_dec(event->ctx->pmu); | ||
213 | 352 | ||
214 | if (cpuc->enabled && !cpuc->lbr_users) { | 353 | if (cpuc->enabled && !cpuc->lbr_users) { |
215 | __intel_pmu_lbr_disable(); | 354 | __intel_pmu_lbr_disable(); |
@@ -218,12 +357,12 @@ void intel_pmu_lbr_disable(struct perf_event *event) | |||
218 | } | 357 | } |
219 | } | 358 | } |
220 | 359 | ||
221 | void intel_pmu_lbr_enable_all(void) | 360 | void intel_pmu_lbr_enable_all(bool pmi) |
222 | { | 361 | { |
223 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); | 362 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
224 | 363 | ||
225 | if (cpuc->lbr_users) | 364 | if (cpuc->lbr_users) |
226 | __intel_pmu_lbr_enable(); | 365 | __intel_pmu_lbr_enable(pmi); |
227 | } | 366 | } |
228 | 367 | ||
229 | void intel_pmu_lbr_disable_all(void) | 368 | void intel_pmu_lbr_disable_all(void) |
@@ -234,18 +373,6 @@ void intel_pmu_lbr_disable_all(void) | |||
234 | __intel_pmu_lbr_disable(); | 373 | __intel_pmu_lbr_disable(); |
235 | } | 374 | } |
236 | 375 | ||
237 | /* | ||
238 | * TOS = most recently recorded branch | ||
239 | */ | ||
240 | static inline u64 intel_pmu_lbr_tos(void) | ||
241 | { | ||
242 | u64 tos; | ||
243 | |||
244 | rdmsrl(x86_pmu.lbr_tos, tos); | ||
245 | |||
246 | return tos; | ||
247 | } | ||
248 | |||
249 | static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) | 376 | static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) |
250 | { | 377 | { |
251 | unsigned long mask = x86_pmu.lbr_nr - 1; | 378 | unsigned long mask = x86_pmu.lbr_nr - 1; |
@@ -350,7 +477,7 @@ void intel_pmu_lbr_read(void) | |||
350 | * - in case there is no HW filter | 477 | * - in case there is no HW filter |
351 | * - in case the HW filter has errata or limitations | 478 | * - in case the HW filter has errata or limitations |
352 | */ | 479 | */ |
353 | static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) | 480 | static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) |
354 | { | 481 | { |
355 | u64 br_type = event->attr.branch_sample_type; | 482 | u64 br_type = event->attr.branch_sample_type; |
356 | int mask = 0; | 483 | int mask = 0; |
@@ -387,11 +514,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) | |||
387 | if (br_type & PERF_SAMPLE_BRANCH_COND) | 514 | if (br_type & PERF_SAMPLE_BRANCH_COND) |
388 | mask |= X86_BR_JCC; | 515 | mask |= X86_BR_JCC; |
389 | 516 | ||
517 | if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) { | ||
518 | if (!x86_pmu_has_lbr_callstack()) | ||
519 | return -EOPNOTSUPP; | ||
520 | if (mask & ~(X86_BR_USER | X86_BR_KERNEL)) | ||
521 | return -EINVAL; | ||
522 | mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET | | ||
523 | X86_BR_CALL_STACK; | ||
524 | } | ||
525 | |||
390 | /* | 526 | /* |
391 | * stash the actual user request into reg; it may | 527 | * stash the actual user request into reg; it may |
392 | * be used by fixup code for some CPU | 528 | * be used by fixup code for some CPU |
393 | */ | 529 | */ |
394 | event->hw.branch_reg.reg = mask; | 530 | event->hw.branch_reg.reg = mask; |
531 | return 0; | ||
395 | } | 532 | } |
396 | 533 | ||
397 | /* | 534 | /* |
@@ -403,14 +540,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) | |||
403 | { | 540 | { |
404 | struct hw_perf_event_extra *reg; | 541 | struct hw_perf_event_extra *reg; |
405 | u64 br_type = event->attr.branch_sample_type; | 542 | u64 br_type = event->attr.branch_sample_type; |
406 | u64 mask = 0, m; | 543 | u64 mask = 0, v; |
407 | u64 v; | 544 | int i; |
408 | 545 | ||
409 | for_each_branch_sample_type(m) { | 546 | for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { |
410 | if (!(br_type & m)) | 547 | if (!(br_type & (1ULL << i))) |
411 | continue; | 548 | continue; |
412 | 549 | ||
413 | v = x86_pmu.lbr_sel_map[m]; | 550 | v = x86_pmu.lbr_sel_map[i]; |
414 | if (v == LBR_NOT_SUPP) | 551 | if (v == LBR_NOT_SUPP) |
415 | return -EOPNOTSUPP; | 552 | return -EOPNOTSUPP; |
416 | 553 | ||
@@ -420,8 +557,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) | |||
420 | reg = &event->hw.branch_reg; | 557 | reg = &event->hw.branch_reg; |
421 | reg->idx = EXTRA_REG_LBR; | 558 | reg->idx = EXTRA_REG_LBR; |
422 | 559 | ||
423 | /* LBR_SELECT operates in suppress mode so invert mask */ | 560 | /* |
424 | reg->config = ~mask & x86_pmu.lbr_sel_mask; | 561 | * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate |
562 | * in suppress mode. So LBR_SELECT should be set to | ||
563 | * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) | ||
564 | */ | ||
565 | reg->config = mask ^ x86_pmu.lbr_sel_mask; | ||
425 | 566 | ||
426 | return 0; | 567 | return 0; |
427 | } | 568 | } |
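The XOR above relies on a small identity: for any mask, (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) == mask ^ LBR_SEL_MASK, so the low suppress-mode bits are inverted while higher control bits (such as the call-stack enable) pass through unchanged. A stand-alone sketch, assuming the 9-bit LBR_SEL_MASK value of 0x1ff that the comment describes:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long sel_mask = 0x1ff;	/* low 9 suppress-mode bits */
		unsigned long long mask = 0x244;	/* example with a bit above bit 8 set */

		/* prints 1: the two formulations are equivalent */
		printf("%d\n",
		       (((~mask & sel_mask) | (mask & ~sel_mask)) ==
			(mask ^ sel_mask)));
		return 0;
	}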
@@ -439,7 +580,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) | |||
439 | /* | 580 | /* |
440 | * setup SW LBR filter | 581 | * setup SW LBR filter |
441 | */ | 582 | */ |
442 | intel_pmu_setup_sw_lbr_filter(event); | 583 | ret = intel_pmu_setup_sw_lbr_filter(event); |
584 | if (ret) | ||
585 | return ret; | ||
443 | 586 | ||
444 | /* | 587 | /* |
445 | * setup HW LBR filter, if any | 588 | * setup HW LBR filter, if any |
@@ -568,6 +711,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort) | |||
568 | ret = X86_BR_INT; | 711 | ret = X86_BR_INT; |
569 | break; | 712 | break; |
570 | case 0xe8: /* call near rel */ | 713 | case 0xe8: /* call near rel */ |
714 | insn_get_immediate(&insn); | ||
715 | if (insn.immediate1.value == 0) { | ||
716 | /* zero length call */ | ||
717 | ret = X86_BR_ZERO_CALL; | ||
718 | break; | ||
719 | } | ||
571 | case 0x9a: /* call far absolute */ | 720 | case 0x9a: /* call far absolute */ |
572 | ret = X86_BR_CALL; | 721 | ret = X86_BR_CALL; |
573 | break; | 722 | break; |
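For reference, the "zero length call" filtered out above is the five-byte encoding e8 00 00 00 00: a near call whose target is the immediately following instruction. 32-bit PIC code commonly emits it as "call 1f; 1: pop %reg" purely to read the current instruction pointer, so counting it as a real call would distort call/return statistics; hence the separate X86_BR_ZERO_CALL class.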
@@ -678,35 +827,49 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) | |||
678 | /* | 827 | /* |
679 | * Map interface branch filters onto LBR filters | 828 | * Map interface branch filters onto LBR filters |
680 | */ | 829 | */ |
681 | static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { | 830 | static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { |
682 | [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, | 831 | [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, |
683 | [PERF_SAMPLE_BRANCH_USER] = LBR_USER, | 832 | [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, |
684 | [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, | 833 | [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, |
685 | [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, | 834 | [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, |
686 | [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP | 835 | [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP |
687 | | LBR_IND_JMP | LBR_FAR, | 836 | | LBR_IND_JMP | LBR_FAR, |
688 | /* | 837 | /* |
689 | * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches | 838 | * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches |
690 | */ | 839 | */ |
691 | [PERF_SAMPLE_BRANCH_ANY_CALL] = | 840 | [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = |
692 | LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, | 841 | LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, |
693 | /* | 842 | /* |
694 | * NHM/WSM erratum: must include IND_JMP to capture IND_CALL | 843 | * NHM/WSM erratum: must include IND_JMP to capture IND_CALL |
695 | */ | 844 | */ |
696 | [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, | 845 | [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, |
697 | [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, | 846 | [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, |
698 | }; | 847 | }; |
699 | 848 | ||
700 | static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { | 849 | static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { |
701 | [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, | 850 | [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, |
702 | [PERF_SAMPLE_BRANCH_USER] = LBR_USER, | 851 | [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, |
703 | [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, | 852 | [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, |
704 | [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, | 853 | [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, |
705 | [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, | 854 | [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, |
706 | [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL | 855 | [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL |
707 | | LBR_FAR, | 856 | | LBR_FAR, |
708 | [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, | 857 | [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, |
709 | [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, | 858 | [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, |
859 | }; | ||
860 | |||
861 | static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { | ||
862 | [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, | ||
863 | [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, | ||
864 | [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, | ||
865 | [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, | ||
866 | [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, | ||
867 | [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | ||
868 | | LBR_FAR, | ||
869 | [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, | ||
870 | [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, | ||
871 | [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | ||
872 | | LBR_RETURN | LBR_CALL_STACK, | ||
710 | }; | 873 | }; |
711 | 874 | ||
712 | /* core */ | 875 | /* core */ |
@@ -765,6 +928,20 @@ void __init intel_pmu_lbr_init_snb(void) | |||
765 | pr_cont("16-deep LBR, "); | 928 | pr_cont("16-deep LBR, "); |
766 | } | 929 | } |
767 | 930 | ||
931 | /* haswell */ | ||
932 | void intel_pmu_lbr_init_hsw(void) | ||
933 | { | ||
934 | x86_pmu.lbr_nr = 16; | ||
935 | x86_pmu.lbr_tos = MSR_LBR_TOS; | ||
936 | x86_pmu.lbr_from = MSR_LBR_NHM_FROM; | ||
937 | x86_pmu.lbr_to = MSR_LBR_NHM_TO; | ||
938 | |||
939 | x86_pmu.lbr_sel_mask = LBR_SEL_MASK; | ||
940 | x86_pmu.lbr_sel_map = hsw_lbr_sel_map; | ||
941 | |||
942 | pr_cont("16-deep LBR, "); | ||
943 | } | ||
944 | |||
768 | /* atom */ | 945 | /* atom */ |
769 | void __init intel_pmu_lbr_init_atom(void) | 946 | void __init intel_pmu_lbr_init_atom(void) |
770 | { | 947 | { |
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c new file mode 100644 index 000000000000..ffe666c2c6b5 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c | |||
@@ -0,0 +1,1100 @@ | |||
1 | /* | ||
2 | * Intel(R) Processor Trace PMU driver for perf | ||
3 | * Copyright (c) 2013-2014, Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * Intel PT is specified in the Intel Architecture Instruction Set Extensions | ||
15 | * Programming Reference: | ||
16 | * http://software.intel.com/en-us/intel-isa-extensions | ||
17 | */ | ||
18 | |||
19 | #undef DEBUG | ||
20 | |||
21 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
22 | |||
23 | #include <linux/types.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/device.h> | ||
26 | |||
27 | #include <asm/perf_event.h> | ||
28 | #include <asm/insn.h> | ||
29 | #include <asm/io.h> | ||
30 | |||
31 | #include "perf_event.h" | ||
32 | #include "intel_pt.h" | ||
33 | |||
34 | static DEFINE_PER_CPU(struct pt, pt_ctx); | ||
35 | |||
36 | static struct pt_pmu pt_pmu; | ||
37 | |||
38 | enum cpuid_regs { | ||
39 | CR_EAX = 0, | ||
40 | CR_ECX, | ||
41 | CR_EDX, | ||
42 | CR_EBX | ||
43 | }; | ||
44 | |||
45 | /* | ||
46 | * Capabilities of Intel PT hardware, such as number of address bits or | ||
47 | * supported output schemes, are cached and exported to userspace as "caps" | ||
48 | * attribute group of pt pmu device | ||
49 | * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store | ||
50 | * relevant bits together with intel_pt traces. | ||
51 | * | ||
52 | * These are necessary for both trace decoding (payloads_lip, contains address | ||
53 | * width encoded in IP-related packets), and event configuration (bitmasks with | ||
54 | * permitted values for certain bit fields). | ||
55 | */ | ||
56 | #define PT_CAP(_n, _l, _r, _m) \ | ||
57 | [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \ | ||
58 | .reg = _r, .mask = _m } | ||
59 | |||
60 | static struct pt_cap_desc { | ||
61 | const char *name; | ||
62 | u32 leaf; | ||
63 | u8 reg; | ||
64 | u32 mask; | ||
65 | } pt_caps[] = { | ||
66 | PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), | ||
67 | PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), | ||
68 | PT_CAP(topa_output, 0, CR_ECX, BIT(0)), | ||
69 | PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), | ||
70 | PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), | ||
71 | }; | ||
72 | |||
73 | static u32 pt_cap_get(enum pt_capabilities cap) | ||
74 | { | ||
75 | struct pt_cap_desc *cd = &pt_caps[cap]; | ||
76 | u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg]; | ||
77 | unsigned int shift = __ffs(cd->mask); | ||
78 | |||
79 | return (c & cd->mask) >> shift; | ||
80 | } | ||
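The extraction above is a plain mask-and-shift: __ffs() returns the position of the lowest set bit of the capability mask, so shifting by it right-aligns the field. A user-space rendition for illustration (using __builtin_ctz() in place of __ffs(); the register value is made up):

	#include <stdio.h>
	#include <stdint.h>

	static uint32_t extract_cap(uint32_t reg, uint32_t mask)
	{
		return (reg & mask) >> __builtin_ctz(mask);
	}

	int main(void)
	{
		/* a BIT(31) mask such as payloads_lip yields 0 or 1 */
		printf("%u\n", extract_cap(0x80000005u, 1u << 31));	/* 1 */
		return 0;
	}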
81 | |||
82 | static ssize_t pt_cap_show(struct device *cdev, | ||
83 | struct device_attribute *attr, | ||
84 | char *buf) | ||
85 | { | ||
86 | struct dev_ext_attribute *ea = | ||
87 | container_of(attr, struct dev_ext_attribute, attr); | ||
88 | enum pt_capabilities cap = (long)ea->var; | ||
89 | |||
90 | return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); | ||
91 | } | ||
92 | |||
93 | static struct attribute_group pt_cap_group = { | ||
94 | .name = "caps", | ||
95 | }; | ||
96 | |||
97 | PMU_FORMAT_ATTR(tsc, "config:10" ); | ||
98 | PMU_FORMAT_ATTR(noretcomp, "config:11" ); | ||
99 | |||
100 | static struct attribute *pt_formats_attr[] = { | ||
101 | &format_attr_tsc.attr, | ||
102 | &format_attr_noretcomp.attr, | ||
103 | NULL, | ||
104 | }; | ||
105 | |||
106 | static struct attribute_group pt_format_group = { | ||
107 | .name = "format", | ||
108 | .attrs = pt_formats_attr, | ||
109 | }; | ||
110 | |||
111 | static const struct attribute_group *pt_attr_groups[] = { | ||
112 | &pt_cap_group, | ||
113 | &pt_format_group, | ||
114 | NULL, | ||
115 | }; | ||
116 | |||
117 | static int __init pt_pmu_hw_init(void) | ||
118 | { | ||
119 | struct dev_ext_attribute *de_attrs; | ||
120 | struct attribute **attrs; | ||
121 | size_t size; | ||
122 | int ret; | ||
123 | long i; | ||
124 | |||
125 | attrs = NULL; | ||
126 | ret = -ENODEV; | ||
127 | if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) | ||
128 | goto fail; | ||
129 | |||
130 | for (i = 0; i < PT_CPUID_LEAVES; i++) { | ||
131 | cpuid_count(20, i, | ||
132 | &pt_pmu.caps[CR_EAX + i*4], | ||
133 | &pt_pmu.caps[CR_EBX + i*4], | ||
134 | &pt_pmu.caps[CR_ECX + i*4], | ||
135 | &pt_pmu.caps[CR_EDX + i*4]); | ||
136 | } | ||
137 | |||
138 | ret = -ENOMEM; | ||
139 | size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1); | ||
140 | attrs = kzalloc(size, GFP_KERNEL); | ||
141 | if (!attrs) | ||
142 | goto fail; | ||
143 | |||
144 | size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1); | ||
145 | de_attrs = kzalloc(size, GFP_KERNEL); | ||
146 | if (!de_attrs) | ||
147 | goto fail; | ||
148 | |||
149 | for (i = 0; i < ARRAY_SIZE(pt_caps); i++) { | ||
150 | struct dev_ext_attribute *de_attr = de_attrs + i; | ||
151 | |||
152 | de_attr->attr.attr.name = pt_caps[i].name; | ||
153 | |||
154 | sysfs_attr_init(&de_attr->attr.attr); | ||
155 | |||
156 | de_attr->attr.attr.mode = S_IRUGO; | ||
157 | de_attr->attr.show = pt_cap_show; | ||
158 | de_attr->var = (void *)i; | ||
159 | |||
160 | attrs[i] = &de_attr->attr.attr; | ||
161 | } | ||
162 | |||
163 | pt_cap_group.attrs = attrs; | ||
164 | |||
165 | return 0; | ||
166 | |||
167 | fail: | ||
168 | kfree(attrs); | ||
169 | |||
170 | return ret; | ||
171 | } | ||
172 | |||
173 | #define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC) | ||
174 | |||
175 | static bool pt_event_valid(struct perf_event *event) | ||
176 | { | ||
177 | u64 config = event->attr.config; | ||
178 | |||
179 | if ((config & PT_CONFIG_MASK) != config) | ||
180 | return false; | ||
181 | |||
182 | return true; | ||
183 | } | ||
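In other words, only the two format bits advertised above ("tsc" at config:10 and "noretcomp" at config:11) may be set; any other bit fails validation. A small illustration (a sketch, with the bit positions taken from the format attributes above):

	#include <stdbool.h>
	#include <stdint.h>

	#define PT_TSC_BIT		(1ULL << 10)	/* "tsc" format bit */
	#define PT_NORETCOMP_BIT	(1ULL << 11)	/* "noretcomp" format bit */

	static bool pt_config_would_pass(uint64_t config)
	{
		return (config & (PT_TSC_BIT | PT_NORETCOMP_BIT)) == config;
	}
	/* pt_config_would_pass(PT_TSC_BIT | PT_NORETCOMP_BIT) -> true
	 * pt_config_would_pass(1ULL << 12)                    -> false */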
184 | |||
185 | /* | ||
186 | * PT configuration helpers | ||
187 | * These all are cpu affine and operate on a local PT | ||
188 | */ | ||
189 | |||
190 | static bool pt_is_running(void) | ||
191 | { | ||
192 | u64 ctl; | ||
193 | |||
194 | rdmsrl(MSR_IA32_RTIT_CTL, ctl); | ||
195 | |||
196 | return !!(ctl & RTIT_CTL_TRACEEN); | ||
197 | } | ||
198 | |||
199 | static void pt_config(struct perf_event *event) | ||
200 | { | ||
201 | u64 reg; | ||
202 | |||
203 | reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; | ||
204 | |||
205 | if (!event->attr.exclude_kernel) | ||
206 | reg |= RTIT_CTL_OS; | ||
207 | if (!event->attr.exclude_user) | ||
208 | reg |= RTIT_CTL_USR; | ||
209 | |||
210 | reg |= (event->attr.config & PT_CONFIG_MASK); | ||
211 | |||
212 | wrmsrl(MSR_IA32_RTIT_CTL, reg); | ||
213 | } | ||
214 | |||
215 | static void pt_config_start(bool start) | ||
216 | { | ||
217 | u64 ctl; | ||
218 | |||
219 | rdmsrl(MSR_IA32_RTIT_CTL, ctl); | ||
220 | if (start) | ||
221 | ctl |= RTIT_CTL_TRACEEN; | ||
222 | else | ||
223 | ctl &= ~RTIT_CTL_TRACEEN; | ||
224 | wrmsrl(MSR_IA32_RTIT_CTL, ctl); | ||
225 | |||
226 | /* | ||
227 | * A wrmsr that disables trace generation serializes other PT | ||
228 | * registers and causes all data packets to be written to memory, | ||
229 | * but a fence is required for the data to become globally visible. | ||
230 | * | ||
231 | * The below WMB, separating data store and aux_head store matches | ||
232 | * the consumer's RMB that separates aux_head load and data load. | ||
233 | */ | ||
234 | if (!start) | ||
235 | wmb(); | ||
236 | } | ||
237 | |||
238 | static void pt_config_buffer(void *buf, unsigned int topa_idx, | ||
239 | unsigned int output_off) | ||
240 | { | ||
241 | u64 reg; | ||
242 | |||
243 | wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf)); | ||
244 | |||
245 | reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32); | ||
246 | |||
247 | wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); | ||
248 | } | ||
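The register written here packs three fields, which pt_read_offset() below unpacks again: bits 6:0 are set to all ones, bits 31:7 hold the index of the current ToPA entry, and bits 63:32 hold the byte offset within that output region. A self-contained sketch of the layout:

	#include <stdio.h>
	#include <stdint.h>

	static uint64_t encode_output_mask(unsigned int topa_idx, unsigned int off)
	{
		return 0x7f | ((uint64_t)topa_idx << 7) | ((uint64_t)off << 32);
	}

	int main(void)
	{
		uint64_t reg = encode_output_mask(3, 0x1000);

		/* prints "idx=3 off=4096", mirroring pt_read_offset() */
		printf("idx=%llu off=%llu\n",
		       (unsigned long long)((reg & 0xffffff80) >> 7),
		       (unsigned long long)(reg >> 32));
		return 0;
	}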
249 | |||
250 | /* | ||
251 | * Keep ToPA table-related metadata on the same page as the actual table, | ||
252 | * taking up a few words from the top | ||
253 | */ | ||
254 | |||
255 | #define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1) | ||
256 | |||
257 | /** | ||
258 | * struct topa - page-sized ToPA table with metadata at the top | ||
259 | * @table: actual ToPA table entries, as understood by PT hardware | ||
260 | * @list: linkage to struct pt_buffer's list of tables | ||
261 | * @phys: physical address of this page | ||
262 | * @offset: offset of the first entry in this table in the buffer | ||
263 | * @size: total size of all entries in this table | ||
264 | * @last: index of the last initialized entry in this table | ||
265 | */ | ||
266 | struct topa { | ||
267 | struct topa_entry table[TENTS_PER_PAGE]; | ||
268 | struct list_head list; | ||
269 | u64 phys; | ||
270 | u64 offset; | ||
271 | size_t size; | ||
272 | int last; | ||
273 | }; | ||
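For a sense of scale, assuming 4 KiB pages and 8-byte ToPA entries (neither is spelled out in this hunk), TENTS_PER_PAGE works out to (4096 - 40) / 8 - 1 = 506 entries per page-sized table; the space left over by the 40-byte reservation and the trailing "- 1" is what holds the bookkeeping fields of struct topa above.

	/* Worked example under the assumptions stated above */
	enum { EXAMPLE_TENTS_PER_PAGE = ((4096 - 40) / 8) - 1 };	/* == 506 */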
274 | |||
275 | /* make -1 stand for the last table entry */ | ||
276 | #define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)]) | ||
277 | |||
278 | /** | ||
279 | * topa_alloc() - allocate page-sized ToPA table | ||
280 | * @cpu: CPU on which to allocate. | ||
281 | * @gfp: Allocation flags. | ||
282 | * | ||
283 | * Return: On success, return the pointer to ToPA table page. | ||
284 | */ | ||
285 | static struct topa *topa_alloc(int cpu, gfp_t gfp) | ||
286 | { | ||
287 | int node = cpu_to_node(cpu); | ||
288 | struct topa *topa; | ||
289 | struct page *p; | ||
290 | |||
291 | p = alloc_pages_node(node, gfp | __GFP_ZERO, 0); | ||
292 | if (!p) | ||
293 | return NULL; | ||
294 | |||
295 | topa = page_address(p); | ||
296 | topa->last = 0; | ||
297 | topa->phys = page_to_phys(p); | ||
298 | |||
299 | /* | ||
299 | * In case of single-entry ToPA, always put the self-referencing END | ||

301 | * link as the 2nd entry in the table | ||
302 | */ | ||
303 | if (!pt_cap_get(PT_CAP_topa_multiple_entries)) { | ||
304 | TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; | ||
305 | TOPA_ENTRY(topa, 1)->end = 1; | ||
306 | } | ||
307 | |||
308 | return topa; | ||
309 | } | ||
310 | |||
311 | /** | ||
312 | * topa_free() - free a page-sized ToPA table | ||
313 | * @topa: Table to deallocate. | ||
314 | */ | ||
315 | static void topa_free(struct topa *topa) | ||
316 | { | ||
317 | free_page((unsigned long)topa); | ||
318 | } | ||
319 | |||
320 | /** | ||
321 | * topa_insert_table() - insert a ToPA table into a buffer | ||
322 | * @buf: PT buffer that's being extended. | ||
323 | * @topa: New topa table to be inserted. | ||
324 | * | ||
325 | * If it's the first table in this buffer, set up buffer's pointers | ||
326 | * accordingly; otherwise, add an END=1 link entry pointing to @topa in the current | ||
327 | * "last" table and adjust the last table pointer to @topa. | ||
328 | */ | ||
329 | static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) | ||
330 | { | ||
331 | struct topa *last = buf->last; | ||
332 | |||
333 | list_add_tail(&topa->list, &buf->tables); | ||
334 | |||
335 | if (!buf->first) { | ||
336 | buf->first = buf->last = buf->cur = topa; | ||
337 | return; | ||
338 | } | ||
339 | |||
340 | topa->offset = last->offset + last->size; | ||
341 | buf->last = topa; | ||
342 | |||
343 | if (!pt_cap_get(PT_CAP_topa_multiple_entries)) | ||
344 | return; | ||
345 | |||
346 | BUG_ON(last->last != TENTS_PER_PAGE - 1); | ||
347 | |||
348 | TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT; | ||
349 | TOPA_ENTRY(last, -1)->end = 1; | ||
350 | } | ||
351 | |||
352 | /** | ||
353 | * topa_table_full() - check if a ToPA table is filled up | ||
354 | * @topa: ToPA table. | ||
355 | */ | ||
356 | static bool topa_table_full(struct topa *topa) | ||
357 | { | ||
358 | /* single-entry ToPA is a special case */ | ||
359 | if (!pt_cap_get(PT_CAP_topa_multiple_entries)) | ||
360 | return !!topa->last; | ||
361 | |||
362 | return topa->last == TENTS_PER_PAGE - 1; | ||
363 | } | ||
364 | |||
365 | /** | ||
366 | * topa_insert_pages() - create a list of ToPA tables | ||
367 | * @buf: PT buffer being initialized. | ||
368 | * @gfp: Allocation flags. | ||
369 | * | ||
370 | * This initializes a list of ToPA tables with entries from | ||
371 | * the data_pages provided by rb_alloc_aux(). | ||
372 | * | ||
373 | * Return: 0 on success or error code. | ||
374 | */ | ||
375 | static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) | ||
376 | { | ||
377 | struct topa *topa = buf->last; | ||
378 | int order = 0; | ||
379 | struct page *p; | ||
380 | |||
381 | p = virt_to_page(buf->data_pages[buf->nr_pages]); | ||
382 | if (PagePrivate(p)) | ||
383 | order = page_private(p); | ||
384 | |||
385 | if (topa_table_full(topa)) { | ||
386 | topa = topa_alloc(buf->cpu, gfp); | ||
387 | if (!topa) | ||
388 | return -ENOMEM; | ||
389 | |||
390 | topa_insert_table(buf, topa); | ||
391 | } | ||
392 | |||
393 | TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; | ||
394 | TOPA_ENTRY(topa, -1)->size = order; | ||
395 | if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) { | ||
396 | TOPA_ENTRY(topa, -1)->intr = 1; | ||
397 | TOPA_ENTRY(topa, -1)->stop = 1; | ||
398 | } | ||
399 | |||
400 | topa->last++; | ||
401 | topa->size += sizes(order); | ||
402 | |||
403 | buf->nr_pages += 1ul << order; | ||
404 | |||
405 | return 0; | ||
406 | } | ||
407 | |||
408 | /** | ||
409 | * pt_topa_dump() - print ToPA tables and their entries | ||
410 | * @buf: PT buffer. | ||
411 | */ | ||
412 | static void pt_topa_dump(struct pt_buffer *buf) | ||
413 | { | ||
414 | struct topa *topa; | ||
415 | |||
416 | list_for_each_entry(topa, &buf->tables, list) { | ||
417 | int i; | ||
418 | |||
419 | pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table, | ||
420 | topa->phys, topa->offset, topa->size); | ||
421 | for (i = 0; i < TENTS_PER_PAGE; i++) { | ||
422 | pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n", | ||
423 | &topa->table[i], | ||
424 | (unsigned long)topa->table[i].base << TOPA_SHIFT, | ||
425 | sizes(topa->table[i].size), | ||
426 | topa->table[i].end ? 'E' : ' ', | ||
427 | topa->table[i].intr ? 'I' : ' ', | ||
428 | topa->table[i].stop ? 'S' : ' ', | ||
429 | *(u64 *)&topa->table[i]); | ||
430 | if ((pt_cap_get(PT_CAP_topa_multiple_entries) && | ||
431 | topa->table[i].stop) || | ||
432 | topa->table[i].end) | ||
433 | break; | ||
434 | } | ||
435 | } | ||
436 | } | ||
437 | |||
438 | /** | ||
439 | * pt_buffer_advance() - advance to the next output region | ||
440 | * @buf: PT buffer. | ||
441 | * | ||
442 | * Advance the current pointers in the buffer to the next ToPA entry. | ||
443 | */ | ||
444 | static void pt_buffer_advance(struct pt_buffer *buf) | ||
445 | { | ||
446 | buf->output_off = 0; | ||
447 | buf->cur_idx++; | ||
448 | |||
449 | if (buf->cur_idx == buf->cur->last) { | ||
450 | if (buf->cur == buf->last) | ||
451 | buf->cur = buf->first; | ||
452 | else | ||
453 | buf->cur = list_entry(buf->cur->list.next, struct topa, | ||
454 | list); | ||
455 | buf->cur_idx = 0; | ||
456 | } | ||
457 | } | ||
458 | |||
459 | /** | ||
460 | * pt_update_head() - calculate current offsets and sizes | ||
461 | * @pt: Per-cpu pt context. | ||
462 | * | ||
463 | * Update buffer's current write pointer position and data size. | ||
464 | */ | ||
465 | static void pt_update_head(struct pt *pt) | ||
466 | { | ||
467 | struct pt_buffer *buf = perf_get_aux(&pt->handle); | ||
468 | u64 topa_idx, base, old; | ||
469 | |||
470 | /* offset of the first region in this table from the beginning of buf */ | ||
471 | base = buf->cur->offset + buf->output_off; | ||
472 | |||
473 | /* offset of the current output region within this table */ | ||
474 | for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++) | ||
475 | base += sizes(buf->cur->table[topa_idx].size); | ||
476 | |||
477 | if (buf->snapshot) { | ||
478 | local_set(&buf->data_size, base); | ||
479 | } else { | ||
480 | old = (local64_xchg(&buf->head, base) & | ||
481 | ((buf->nr_pages << PAGE_SHIFT) - 1)); | ||
482 | if (base < old) | ||
483 | base += buf->nr_pages << PAGE_SHIFT; | ||
484 | |||
485 | local_add(base - old, &buf->data_size); | ||
486 | } | ||
487 | } | ||
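The base < old test handles the hardware write pointer wrapping around the AUX buffer. An illustrative helper (not part of the patch) that mirrors the accounting:

	/*
	 * With a 64 KiB buffer, old = 60 KiB and base = 4 KiB means the
	 * pointer wrapped, so 68 KiB - 60 KiB = 8 KiB of new data is added
	 * to data_size.
	 */
	static unsigned long aux_head_delta(unsigned long old, unsigned long base,
					    unsigned long buf_size)
	{
		if (base < old)		/* write pointer wrapped around */
			base += buf_size;
		return base - old;
	}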
488 | |||
489 | /** | ||
490 | * pt_buffer_region() - obtain current output region's address | ||
491 | * @buf: PT buffer. | ||
492 | */ | ||
493 | static void *pt_buffer_region(struct pt_buffer *buf) | ||
494 | { | ||
495 | return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT); | ||
496 | } | ||
497 | |||
498 | /** | ||
499 | * pt_buffer_region_size() - obtain current output region's size | ||
500 | * @buf: PT buffer. | ||
501 | */ | ||
502 | static size_t pt_buffer_region_size(struct pt_buffer *buf) | ||
503 | { | ||
504 | return sizes(buf->cur->table[buf->cur_idx].size); | ||
505 | } | ||
506 | |||
507 | /** | ||
508 | * pt_handle_status() - take care of possible status conditions | ||
509 | * @pt: Per-cpu pt context. | ||
510 | */ | ||
511 | static void pt_handle_status(struct pt *pt) | ||
512 | { | ||
513 | struct pt_buffer *buf = perf_get_aux(&pt->handle); | ||
514 | int advance = 0; | ||
515 | u64 status; | ||
516 | |||
517 | rdmsrl(MSR_IA32_RTIT_STATUS, status); | ||
518 | |||
519 | if (status & RTIT_STATUS_ERROR) { | ||
520 | pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n"); | ||
521 | pt_topa_dump(buf); | ||
522 | status &= ~RTIT_STATUS_ERROR; | ||
523 | } | ||
524 | |||
525 | if (status & RTIT_STATUS_STOPPED) { | ||
526 | status &= ~RTIT_STATUS_STOPPED; | ||
527 | |||
528 | /* | ||
529 | * On systems that only do single-entry ToPA, hitting STOP | ||
530 | * means we are already losing data; need to let the decoder | ||
531 | * know. | ||
532 | */ | ||
533 | if (!pt_cap_get(PT_CAP_topa_multiple_entries) || | ||
534 | buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { | ||
535 | local_inc(&buf->lost); | ||
536 | advance++; | ||
537 | } | ||
538 | } | ||
539 | |||
540 | /* | ||
541 | * Also on single-entry ToPA implementations, interrupt will come | ||
542 | * before the output reaches its output region's boundary. | ||
543 | */ | ||
544 | if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot && | ||
545 | pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { | ||
546 | void *head = pt_buffer_region(buf); | ||
547 | |||
548 | /* everything within this margin needs to be zeroed out */ | ||
549 | memset(head + buf->output_off, 0, | ||
550 | pt_buffer_region_size(buf) - | ||
551 | buf->output_off); | ||
552 | advance++; | ||
553 | } | ||
554 | |||
555 | if (advance) | ||
556 | pt_buffer_advance(buf); | ||
557 | |||
558 | wrmsrl(MSR_IA32_RTIT_STATUS, status); | ||
559 | } | ||
560 | |||
561 | /** | ||
562 | * pt_read_offset() - translate registers into buffer pointers | ||
563 | * @buf: PT buffer. | ||
564 | * | ||
565 | * Set buffer's output pointers from MSR values. | ||
566 | */ | ||
567 | static void pt_read_offset(struct pt_buffer *buf) | ||
568 | { | ||
569 | u64 offset, base_topa; | ||
570 | |||
571 | rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa); | ||
572 | buf->cur = phys_to_virt(base_topa); | ||
573 | |||
574 | rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset); | ||
575 | /* offset within current output region */ | ||
576 | buf->output_off = offset >> 32; | ||
577 | /* index of current output region within this table */ | ||
578 | buf->cur_idx = (offset & 0xffffff80) >> 7; | ||
579 | } | ||
580 | |||
581 | /** | ||
582 | * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry | ||
583 | * @buf: PT buffer. | ||
584 | * @pg: Page offset in the buffer. | ||
585 | * | ||
586 | * When advancing to the next output region (ToPA entry), given a page offset | ||
587 | * into the buffer, we need to find the offset of the first page in the next | ||
588 | * region. | ||
589 | */ | ||
590 | static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) | ||
591 | { | ||
592 | struct topa_entry *te = buf->topa_index[pg]; | ||
593 | |||
594 | /* one region */ | ||
595 | if (buf->first == buf->last && buf->first->last == 1) | ||
596 | return pg; | ||
597 | |||
598 | do { | ||
599 | pg++; | ||
600 | pg &= buf->nr_pages - 1; | ||
601 | } while (buf->topa_index[pg] == te); | ||
602 | |||
603 | return pg; | ||
604 | } | ||
605 | |||
606 | /** | ||
607 | * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer | ||
608 | * @buf: PT buffer. | ||
609 | * @handle: Current output handle. | ||
610 | * | ||
611 | * Place INT and STOP marks to prevent overwriting old data that the consumer | ||
612 | * hasn't yet collected. | ||
613 | */ | ||
614 | static int pt_buffer_reset_markers(struct pt_buffer *buf, | ||
615 | struct perf_output_handle *handle) | ||
616 | |||
617 | { | ||
618 | unsigned long idx, npages, end; | ||
619 | |||
620 | if (buf->snapshot) | ||
621 | return 0; | ||
622 | |||
623 | /* can't stop in the middle of an output region */ | ||
624 | if (buf->output_off + handle->size + 1 < | ||
625 | sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) | ||
626 | return -EINVAL; | ||
627 | |||
628 | |||
629 | /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ | ||
630 | if (!pt_cap_get(PT_CAP_topa_multiple_entries)) | ||
631 | return 0; | ||
632 | |||
633 | /* clear STOP and INT from current entry */ | ||
634 | buf->topa_index[buf->stop_pos]->stop = 0; | ||
635 | buf->topa_index[buf->intr_pos]->intr = 0; | ||
636 | |||
637 | if (pt_cap_get(PT_CAP_topa_multiple_entries)) { | ||
638 | npages = (handle->size + 1) >> PAGE_SHIFT; | ||
639 | end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages; | ||
640 | /*if (end > handle->wakeup >> PAGE_SHIFT) | ||
641 | end = handle->wakeup >> PAGE_SHIFT;*/ | ||
642 | idx = end & (buf->nr_pages - 1); | ||
643 | buf->stop_pos = idx; | ||
644 | idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1; | ||
645 | idx &= buf->nr_pages - 1; | ||
646 | buf->intr_pos = idx; | ||
647 | } | ||
648 | |||
649 | buf->topa_index[buf->stop_pos]->stop = 1; | ||
650 | buf->topa_index[buf->intr_pos]->intr = 1; | ||
651 | |||
652 | return 0; | ||
653 | } | ||
654 | |||
655 | /** | ||
656 | * pt_buffer_setup_topa_index() - build topa_index[] table of regions | ||
657 | * @buf: PT buffer. | ||
658 | * | ||
659 | * topa_index[] references output regions indexed by offset into the | ||
660 | * buffer for purposes of quick reverse lookup. | ||
661 | */ | ||
662 | static void pt_buffer_setup_topa_index(struct pt_buffer *buf) | ||
663 | { | ||
664 | struct topa *cur = buf->first, *prev = buf->last; | ||
665 | struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), | ||
666 | *te_prev = TOPA_ENTRY(prev, prev->last - 1); | ||
667 | int pg = 0, idx = 0, ntopa = 0; | ||
668 | |||
669 | while (pg < buf->nr_pages) { | ||
670 | int tidx; | ||
671 | |||
672 | /* pages within one topa entry */ | ||
673 | for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++) | ||
674 | buf->topa_index[pg] = te_prev; | ||
675 | |||
676 | te_prev = te_cur; | ||
677 | |||
678 | if (idx == cur->last - 1) { | ||
679 | /* advance to next topa table */ | ||
680 | idx = 0; | ||
681 | cur = list_entry(cur->list.next, struct topa, list); | ||
682 | ntopa++; | ||
683 | } else | ||
684 | idx++; | ||
685 | te_cur = TOPA_ENTRY(cur, idx); | ||
686 | } | ||
687 | |||
688 | } | ||
689 | |||
690 | /** | ||
691 | * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head | ||
692 | * @buf: PT buffer. | ||
693 | * @head: Write pointer (aux_head) from AUX buffer. | ||
694 | * | ||
695 | * Find the ToPA table and entry corresponding to given @head and set buffer's | ||
696 | * "current" pointers accordingly. | ||
697 | */ | ||
698 | static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) | ||
699 | { | ||
700 | int pg; | ||
701 | |||
702 | if (buf->snapshot) | ||
703 | head &= (buf->nr_pages << PAGE_SHIFT) - 1; | ||
704 | |||
705 | pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); | ||
706 | pg = pt_topa_next_entry(buf, pg); | ||
707 | |||
708 | buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK); | ||
709 | buf->cur_idx = ((unsigned long)buf->topa_index[pg] - | ||
710 | (unsigned long)buf->cur) / sizeof(struct topa_entry); | ||
711 | buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1); | ||
712 | |||
713 | local64_set(&buf->head, head); | ||
714 | local_set(&buf->data_size, 0); | ||
715 | } | ||
716 | |||
717 | /** | ||
718 | * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer | ||
719 | * @buf: PT buffer. | ||
720 | */ | ||
721 | static void pt_buffer_fini_topa(struct pt_buffer *buf) | ||
722 | { | ||
723 | struct topa *topa, *iter; | ||
724 | |||
725 | list_for_each_entry_safe(topa, iter, &buf->tables, list) { | ||
726 | /* | ||
727 | * right now, this is in free_aux() path only, so | ||
728 | * no need to unlink this table from the list | ||
729 | */ | ||
730 | topa_free(topa); | ||
731 | } | ||
732 | } | ||
733 | |||
734 | /** | ||
735 | * pt_buffer_init_topa() - initialize ToPA table for pt buffer | ||
736 | * @buf: PT buffer. | ||
737 | * @size: Total size of all regions within this ToPA. | ||
738 | * @gfp: Allocation flags. | ||
739 | */ | ||
740 | static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, | ||
741 | gfp_t gfp) | ||
742 | { | ||
743 | struct topa *topa; | ||
744 | int err; | ||
745 | |||
746 | topa = topa_alloc(buf->cpu, gfp); | ||
747 | if (!topa) | ||
748 | return -ENOMEM; | ||
749 | |||
750 | topa_insert_table(buf, topa); | ||
751 | |||
752 | while (buf->nr_pages < nr_pages) { | ||
753 | err = topa_insert_pages(buf, gfp); | ||
754 | if (err) { | ||
755 | pt_buffer_fini_topa(buf); | ||
756 | return -ENOMEM; | ||
757 | } | ||
758 | } | ||
759 | |||
760 | pt_buffer_setup_topa_index(buf); | ||
761 | |||
762 | /* link last table to the first one, unless we're double buffering */ | ||
763 | if (pt_cap_get(PT_CAP_topa_multiple_entries)) { | ||
764 | TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; | ||
765 | TOPA_ENTRY(buf->last, -1)->end = 1; | ||
766 | } | ||
767 | |||
768 | pt_topa_dump(buf); | ||
769 | return 0; | ||
770 | } | ||
771 | |||
772 | /** | ||
773 | * pt_buffer_setup_aux() - set up topa tables for a PT buffer | ||
774 | * @cpu: Cpu on which to allocate, -1 means current. | ||
775 | * @pages: Array of pointers to buffer pages passed from perf core. | ||
776 | * @nr_pages: Number of pages in the buffer. | ||
777 | * @snapshot: If this is a snapshot/overwrite counter. | ||
778 | * | ||
779 | * This is a pmu::setup_aux callback that sets up ToPA tables and all the | ||
780 | * bookkeeping for an AUX buffer. | ||
781 | * | ||
782 | * Return: Our private PT buffer structure. | ||
783 | */ | ||
784 | static void * | ||
785 | pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot) | ||
786 | { | ||
787 | struct pt_buffer *buf; | ||
788 | int node, ret; | ||
789 | |||
790 | if (!nr_pages) | ||
791 | return NULL; | ||
792 | |||
793 | if (cpu == -1) | ||
794 | cpu = raw_smp_processor_id(); | ||
795 | node = cpu_to_node(cpu); | ||
796 | |||
797 | buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]), | ||
798 | GFP_KERNEL, node); | ||
799 | if (!buf) | ||
800 | return NULL; | ||
801 | |||
802 | buf->cpu = cpu; | ||
803 | buf->snapshot = snapshot; | ||
804 | buf->data_pages = pages; | ||
805 | |||
806 | INIT_LIST_HEAD(&buf->tables); | ||
807 | |||
808 | ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL); | ||
809 | if (ret) { | ||
810 | kfree(buf); | ||
811 | return NULL; | ||
812 | } | ||
813 | |||
814 | return buf; | ||
815 | } | ||
816 | |||
817 | /** | ||
818 | * pt_buffer_free_aux() - perf AUX deallocation path callback | ||
819 | * @data: PT buffer. | ||
820 | */ | ||
821 | static void pt_buffer_free_aux(void *data) | ||
822 | { | ||
823 | struct pt_buffer *buf = data; | ||
824 | |||
825 | pt_buffer_fini_topa(buf); | ||
826 | kfree(buf); | ||
827 | } | ||
828 | |||
829 | /** | ||
830 | * pt_buffer_is_full() - check if the buffer is full | ||
831 | * @buf: PT buffer. | ||
832 | * @pt: Per-cpu pt handle. | ||
833 | * | ||
834 | * If the user hasn't read data from the output region that aux_head | ||
835 | * points to, the buffer is considered full: the user needs to read at | ||
836 | * least this region and update aux_tail to point past it. | ||
837 | */ | ||
838 | static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt) | ||
839 | { | ||
840 | if (buf->snapshot) | ||
841 | return false; | ||
842 | |||
843 | if (local_read(&buf->data_size) >= pt->handle.size) | ||
844 | return true; | ||
845 | |||
846 | return false; | ||
847 | } | ||
848 | |||
849 | /** | ||
850 | * intel_pt_interrupt() - PT PMI handler | ||
851 | */ | ||
852 | void intel_pt_interrupt(void) | ||
853 | { | ||
854 | struct pt *pt = this_cpu_ptr(&pt_ctx); | ||
855 | struct pt_buffer *buf; | ||
856 | struct perf_event *event = pt->handle.event; | ||
857 | |||
858 | /* | ||
859 | * There may be a dangling PT bit in the interrupt status register | ||
860 | * after PT has been disabled by pt_event_stop(). Make sure we don't | ||
861 | * do anything (particularly, re-enable) for this event here. | ||
862 | */ | ||
863 | if (!ACCESS_ONCE(pt->handle_nmi)) | ||
864 | return; | ||
865 | |||
866 | pt_config_start(false); | ||
867 | |||
868 | if (!event) | ||
869 | return; | ||
870 | |||
871 | buf = perf_get_aux(&pt->handle); | ||
872 | if (!buf) | ||
873 | return; | ||
874 | |||
875 | pt_read_offset(buf); | ||
876 | |||
877 | pt_handle_status(pt); | ||
878 | |||
879 | pt_update_head(pt); | ||
880 | |||
881 | perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), | ||
882 | local_xchg(&buf->lost, 0)); | ||
883 | |||
884 | if (!event->hw.state) { | ||
885 | int ret; | ||
886 | |||
887 | buf = perf_aux_output_begin(&pt->handle, event); | ||
888 | if (!buf) { | ||
889 | event->hw.state = PERF_HES_STOPPED; | ||
890 | return; | ||
891 | } | ||
892 | |||
893 | pt_buffer_reset_offsets(buf, pt->handle.head); | ||
894 | ret = pt_buffer_reset_markers(buf, &pt->handle); | ||
895 | if (ret) { | ||
896 | perf_aux_output_end(&pt->handle, 0, true); | ||
897 | return; | ||
898 | } | ||
899 | |||
900 | pt_config_buffer(buf->cur->table, buf->cur_idx, | ||
901 | buf->output_off); | ||
902 | wrmsrl(MSR_IA32_RTIT_STATUS, 0); | ||
903 | pt_config(event); | ||
904 | } | ||
905 | } | ||
906 | |||
907 | /* | ||
908 | * PMU callbacks | ||
909 | */ | ||
910 | |||
911 | static void pt_event_start(struct perf_event *event, int mode) | ||
912 | { | ||
913 | struct pt *pt = this_cpu_ptr(&pt_ctx); | ||
914 | struct pt_buffer *buf = perf_get_aux(&pt->handle); | ||
915 | |||
916 | if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) { | ||
917 | event->hw.state = PERF_HES_STOPPED; | ||
918 | return; | ||
919 | } | ||
920 | |||
921 | ACCESS_ONCE(pt->handle_nmi) = 1; | ||
922 | event->hw.state = 0; | ||
923 | |||
924 | pt_config_buffer(buf->cur->table, buf->cur_idx, | ||
925 | buf->output_off); | ||
926 | wrmsrl(MSR_IA32_RTIT_STATUS, 0); | ||
927 | pt_config(event); | ||
928 | } | ||
929 | |||
930 | static void pt_event_stop(struct perf_event *event, int mode) | ||
931 | { | ||
932 | struct pt *pt = this_cpu_ptr(&pt_ctx); | ||
933 | |||
934 | /* | ||
935 | * Protect against the PMI racing with disabling wrmsr, | ||
936 | * see comment in intel_pt_interrupt(). | ||
937 | */ | ||
938 | ACCESS_ONCE(pt->handle_nmi) = 0; | ||
939 | pt_config_start(false); | ||
940 | |||
941 | if (event->hw.state == PERF_HES_STOPPED) | ||
942 | return; | ||
943 | |||
944 | event->hw.state = PERF_HES_STOPPED; | ||
945 | |||
946 | if (mode & PERF_EF_UPDATE) { | ||
947 | struct pt *pt = this_cpu_ptr(&pt_ctx); | ||
948 | struct pt_buffer *buf = perf_get_aux(&pt->handle); | ||
949 | |||
950 | if (!buf) | ||
951 | return; | ||
952 | |||
953 | if (WARN_ON_ONCE(pt->handle.event != event)) | ||
954 | return; | ||
955 | |||
956 | pt_read_offset(buf); | ||
957 | |||
958 | pt_handle_status(pt); | ||
959 | |||
960 | pt_update_head(pt); | ||
961 | } | ||
962 | } | ||
963 | |||
964 | static void pt_event_del(struct perf_event *event, int mode) | ||
965 | { | ||
966 | struct pt *pt = this_cpu_ptr(&pt_ctx); | ||
967 | struct pt_buffer *buf; | ||
968 | |||
969 | pt_event_stop(event, PERF_EF_UPDATE); | ||
970 | |||
971 | buf = perf_get_aux(&pt->handle); | ||
972 | |||
973 | if (buf) { | ||
974 | if (buf->snapshot) | ||
975 | pt->handle.head = | ||
976 | local_xchg(&buf->data_size, | ||
977 | buf->nr_pages << PAGE_SHIFT); | ||
978 | perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), | ||
979 | local_xchg(&buf->lost, 0)); | ||
980 | } | ||
981 | } | ||
982 | |||
983 | static int pt_event_add(struct perf_event *event, int mode) | ||
984 | { | ||
985 | struct pt_buffer *buf; | ||
986 | struct pt *pt = this_cpu_ptr(&pt_ctx); | ||
987 | struct hw_perf_event *hwc = &event->hw; | ||
988 | int ret = -EBUSY; | ||
989 | |||
990 | if (pt->handle.event) | ||
991 | goto fail; | ||
992 | |||
993 | buf = perf_aux_output_begin(&pt->handle, event); | ||
994 | ret = -EINVAL; | ||
995 | if (!buf) | ||
996 | goto fail_stop; | ||
997 | |||
998 | pt_buffer_reset_offsets(buf, pt->handle.head); | ||
999 | if (!buf->snapshot) { | ||
1000 | ret = pt_buffer_reset_markers(buf, &pt->handle); | ||
1001 | if (ret) | ||
1002 | goto fail_end_stop; | ||
1003 | } | ||
1004 | |||
1005 | if (mode & PERF_EF_START) { | ||
1006 | pt_event_start(event, 0); | ||
1007 | ret = -EBUSY; | ||
1008 | if (hwc->state == PERF_HES_STOPPED) | ||
1009 | goto fail_end_stop; | ||
1010 | } else { | ||
1011 | hwc->state = PERF_HES_STOPPED; | ||
1012 | } | ||
1013 | |||
1014 | return 0; | ||
1015 | |||
1016 | fail_end_stop: | ||
1017 | perf_aux_output_end(&pt->handle, 0, true); | ||
1018 | fail_stop: | ||
1019 | hwc->state = PERF_HES_STOPPED; | ||
1020 | fail: | ||
1021 | return ret; | ||
1022 | } | ||
1023 | |||
1024 | static void pt_event_read(struct perf_event *event) | ||
1025 | { | ||
1026 | } | ||
1027 | |||
1028 | static void pt_event_destroy(struct perf_event *event) | ||
1029 | { | ||
1030 | x86_del_exclusive(x86_lbr_exclusive_pt); | ||
1031 | } | ||
1032 | |||
1033 | static int pt_event_init(struct perf_event *event) | ||
1034 | { | ||
1035 | if (event->attr.type != pt_pmu.pmu.type) | ||
1036 | return -ENOENT; | ||
1037 | |||
1038 | if (!pt_event_valid(event)) | ||
1039 | return -EINVAL; | ||
1040 | |||
1041 | if (x86_add_exclusive(x86_lbr_exclusive_pt)) | ||
1042 | return -EBUSY; | ||
1043 | |||
1044 | event->destroy = pt_event_destroy; | ||
1045 | |||
1046 | return 0; | ||
1047 | } | ||
1048 | |||
1049 | static __init int pt_init(void) | ||
1050 | { | ||
1051 | int ret, cpu, prior_warn = 0; | ||
1052 | |||
1053 | BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); | ||
1054 | get_online_cpus(); | ||
1055 | for_each_online_cpu(cpu) { | ||
1056 | u64 ctl; | ||
1057 | |||
1058 | ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl); | ||
1059 | if (!ret && (ctl & RTIT_CTL_TRACEEN)) | ||
1060 | prior_warn++; | ||
1061 | } | ||
1062 | put_online_cpus(); | ||
1063 | |||
1064 | if (prior_warn) { | ||
1065 | x86_add_exclusive(x86_lbr_exclusive_pt); | ||
1066 | pr_warn("PT is enabled at boot time, doing nothing\n"); | ||
1067 | |||
1068 | return -EBUSY; | ||
1069 | } | ||
1070 | |||
1071 | ret = pt_pmu_hw_init(); | ||
1072 | if (ret) | ||
1073 | return ret; | ||
1074 | |||
1075 | if (!pt_cap_get(PT_CAP_topa_output)) { | ||
1076 | pr_warn("ToPA output is not supported on this CPU\n"); | ||
1077 | return -ENODEV; | ||
1078 | } | ||
1079 | |||
1080 | if (!pt_cap_get(PT_CAP_topa_multiple_entries)) | ||
1081 | pt_pmu.pmu.capabilities = | ||
1082 | PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; | ||
1083 | |||
1084 | pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; | ||
1085 | pt_pmu.pmu.attr_groups = pt_attr_groups; | ||
1086 | pt_pmu.pmu.task_ctx_nr = perf_sw_context; | ||
1087 | pt_pmu.pmu.event_init = pt_event_init; | ||
1088 | pt_pmu.pmu.add = pt_event_add; | ||
1089 | pt_pmu.pmu.del = pt_event_del; | ||
1090 | pt_pmu.pmu.start = pt_event_start; | ||
1091 | pt_pmu.pmu.stop = pt_event_stop; | ||
1092 | pt_pmu.pmu.read = pt_event_read; | ||
1093 | pt_pmu.pmu.setup_aux = pt_buffer_setup_aux; | ||
1094 | pt_pmu.pmu.free_aux = pt_buffer_free_aux; | ||
1095 | ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); | ||
1096 | |||
1097 | return ret; | ||
1098 | } | ||
1099 | |||
1100 | module_init(pt_init); | ||
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index c4bb8b8e5017..999289b94025 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c | |||
@@ -62,6 +62,14 @@ | |||
62 | #define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ | 62 | #define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ |
63 | #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ | 63 | #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ |
64 | 64 | ||
65 | #define NR_RAPL_DOMAINS 0x4 | ||
66 | static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { | ||
67 | "pp0-core", | ||
68 | "package", | ||
69 | "dram", | ||
70 | "pp1-gpu", | ||
71 | }; | ||
72 | |||
65 | /* Clients have PP0, PKG */ | 73 | /* Clients have PP0, PKG */ |
66 | #define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ | 74 | #define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ |
67 | 1<<RAPL_IDX_PKG_NRG_STAT|\ | 75 | 1<<RAPL_IDX_PKG_NRG_STAT|\ |
@@ -112,7 +120,6 @@ static struct perf_pmu_events_attr event_attr_##v = { \ | |||
112 | 120 | ||
113 | struct rapl_pmu { | 121 | struct rapl_pmu { |
114 | spinlock_t lock; | 122 | spinlock_t lock; |
115 | int hw_unit; /* 1/2^hw_unit Joule */ | ||
116 | int n_active; /* number of active events */ | 123 | int n_active; /* number of active events */ |
117 | struct list_head active_list; | 124 | struct list_head active_list; |
118 | struct pmu *pmu; /* pointer to rapl_pmu_class */ | 125 | struct pmu *pmu; /* pointer to rapl_pmu_class */ |
@@ -120,6 +127,7 @@ struct rapl_pmu { | |||
120 | struct hrtimer hrtimer; | 127 | struct hrtimer hrtimer; |
121 | }; | 128 | }; |
122 | 129 | ||
130 | static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ | ||
123 | static struct pmu rapl_pmu_class; | 131 | static struct pmu rapl_pmu_class; |
124 | static cpumask_t rapl_cpu_mask; | 132 | static cpumask_t rapl_cpu_mask; |
125 | static int rapl_cntr_mask; | 133 | static int rapl_cntr_mask; |
@@ -127,6 +135,7 @@ static int rapl_cntr_mask; | |||
127 | static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); | 135 | static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); |
128 | static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); | 136 | static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); |
129 | 137 | ||
138 | static struct x86_pmu_quirk *rapl_quirks; | ||
130 | static inline u64 rapl_read_counter(struct perf_event *event) | 139 | static inline u64 rapl_read_counter(struct perf_event *event) |
131 | { | 140 | { |
132 | u64 raw; | 141 | u64 raw; |
@@ -134,15 +143,28 @@ static inline u64 rapl_read_counter(struct perf_event *event) | |||
134 | return raw; | 143 | return raw; |
135 | } | 144 | } |
136 | 145 | ||
137 | static inline u64 rapl_scale(u64 v) | 146 | #define rapl_add_quirk(func_) \ |
147 | do { \ | ||
148 | static struct x86_pmu_quirk __quirk __initdata = { \ | ||
149 | .func = func_, \ | ||
150 | }; \ | ||
151 | __quirk.next = rapl_quirks; \ | ||
152 | rapl_quirks = &__quirk; \ | ||
153 | } while (0) | ||
154 | |||
155 | static inline u64 rapl_scale(u64 v, int cfg) | ||
138 | { | 156 | { |
157 | if (cfg > NR_RAPL_DOMAINS) { | ||
158 | pr_warn("invalid domain %d, failed to scale data\n", cfg); | ||
159 | return v; | ||
160 | } | ||
139 | /* | 161 | /* |
140 | * scale delta to smallest unit (1/2^32) | 162 | * scale delta to smallest unit (1/2^32) |
141 | * users must then scale back: count * 1/(1e9*2^32) to get Joules | 163 | * users must then scale back: count * 1/(1e9*2^32) to get Joules |
142 | * or use ldexp(count, -32). | 164 | * or use ldexp(count, -32). |
143 | * Watts = Joules/Time delta | 165 | * Watts = Joules/Time delta |
144 | */ | 166 | */ |
145 | return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit); | 167 | return v << (32 - rapl_hw_unit[cfg - 1]); |
146 | } | 168 | } |
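On the consumer side, the conversion hinted at in the comment is a single ldexp(). A user-space sketch (not part of the patch):

	#include <math.h>

	/* counts delivered by perf are in 1/2^32 Joule units after rapl_scale() */
	static double rapl_count_to_joules(unsigned long long count)
	{
		return ldexp((double)count, -32);	/* count * 2^-32 */
	}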
147 | 169 | ||
148 | static u64 rapl_event_update(struct perf_event *event) | 170 | static u64 rapl_event_update(struct perf_event *event) |
@@ -173,7 +195,7 @@ again: | |||
173 | delta = (new_raw_count << shift) - (prev_raw_count << shift); | 195 | delta = (new_raw_count << shift) - (prev_raw_count << shift); |
174 | delta >>= shift; | 196 | delta >>= shift; |
175 | 197 | ||
176 | sdelta = rapl_scale(delta); | 198 | sdelta = rapl_scale(delta, event->hw.config); |
177 | 199 | ||
178 | local64_add(sdelta, &event->count); | 200 | local64_add(sdelta, &event->count); |
179 | 201 | ||
@@ -546,12 +568,22 @@ static void rapl_cpu_init(int cpu) | |||
546 | cpumask_set_cpu(cpu, &rapl_cpu_mask); | 568 | cpumask_set_cpu(cpu, &rapl_cpu_mask); |
547 | } | 569 | } |
548 | 570 | ||
571 | static __init void rapl_hsw_server_quirk(void) | ||
572 | { | ||
573 | /* | ||
574 | * DRAM domain on HSW server has fixed energy unit which can be | ||
575 | * different than the unit from power unit MSR. | ||
576 | * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 | ||
577 | * of 2. Datasheet, September 2014, Reference Number: 330784-001 " | ||
578 | */ | ||
579 | rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; | ||
580 | } | ||
581 | |||
549 | static int rapl_cpu_prepare(int cpu) | 582 | static int rapl_cpu_prepare(int cpu) |
550 | { | 583 | { |
551 | struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); | 584 | struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); |
552 | int phys_id = topology_physical_package_id(cpu); | 585 | int phys_id = topology_physical_package_id(cpu); |
553 | u64 ms; | 586 | u64 ms; |
554 | u64 msr_rapl_power_unit_bits; | ||
555 | 587 | ||
556 | if (pmu) | 588 | if (pmu) |
557 | return 0; | 589 | return 0; |
@@ -559,24 +591,13 @@ static int rapl_cpu_prepare(int cpu) | |||
559 | if (phys_id < 0) | 591 | if (phys_id < 0) |
560 | return -1; | 592 | return -1; |
561 | 593 | ||
562 | /* protect rdmsrl() to handle virtualization */ | ||
563 | if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) | ||
564 | return -1; | ||
565 | |||
566 | pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); | 594 | pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); |
567 | if (!pmu) | 595 | if (!pmu) |
568 | return -1; | 596 | return -1; |
569 | |||
570 | spin_lock_init(&pmu->lock); | 597 | spin_lock_init(&pmu->lock); |
571 | 598 | ||
572 | INIT_LIST_HEAD(&pmu->active_list); | 599 | INIT_LIST_HEAD(&pmu->active_list); |
573 | 600 | ||
574 | /* | ||
575 | * grab power unit as: 1/2^unit Joules | ||
576 | * | ||
577 | * we cache in local PMU instance | ||
578 | */ | ||
579 | pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; | ||
580 | pmu->pmu = &rapl_pmu_class; | 601 | pmu->pmu = &rapl_pmu_class; |
581 | 602 | ||
582 | /* | 603 | /* |
@@ -586,8 +607,8 @@ static int rapl_cpu_prepare(int cpu) | |||
586 | * divide interval by 2 to avoid lockstep (2 * 100) | 607 | * divide interval by 2 to avoid lockstep (2 * 100) |
587 | * if hw unit is 32, then we use 2 ms 1/200/2 | 608 | * if hw unit is 32, then we use 2 ms 1/200/2 |
588 | */ | 609 | */ |
589 | if (pmu->hw_unit < 32) | 610 | if (rapl_hw_unit[0] < 32) |
590 | ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); | 611 | ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1)); |
591 | else | 612 | else |
592 | ms = 2; | 613 | ms = 2; |
593 | 614 | ||
@@ -655,6 +676,20 @@ static int rapl_cpu_notifier(struct notifier_block *self, | |||
655 | return NOTIFY_OK; | 676 | return NOTIFY_OK; |
656 | } | 677 | } |
657 | 678 | ||
679 | static int rapl_check_hw_unit(void) | ||
680 | { | ||
681 | u64 msr_rapl_power_unit_bits; | ||
682 | int i; | ||
683 | |||
684 | /* protect rdmsrl() to handle virtualization */ | ||
685 | if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) | ||
686 | return -1; | ||
687 | for (i = 0; i < NR_RAPL_DOMAINS; i++) | ||
688 | rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; | ||
689 | |||
690 | return 0; | ||
691 | } | ||
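Bits 12:8 of MSR_RAPL_POWER_UNIT encode the energy status unit as a power of two. As a worked example (the sample MSR value is an assumption, not taken from this patch), a reading of 0x000a1003 gives (0x000a1003 >> 8) & 0x1f = 16, i.e. one count per 1/2^16 J, roughly 15.3 microjoules:

	/* Same decode as above, shown stand-alone for illustration */
	static int rapl_energy_unit_from_msr(unsigned long long msr)
	{
		return (msr >> 8) & 0x1f;	/* e.g. 0x000a1003 -> 16 */
	}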
692 | |||
658 | static const struct x86_cpu_id rapl_cpu_match[] = { | 693 | static const struct x86_cpu_id rapl_cpu_match[] = { |
659 | [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, | 694 | [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, |
660 | [1] = {}, | 695 | [1] = {}, |
@@ -664,6 +699,8 @@ static int __init rapl_pmu_init(void) | |||
664 | { | 699 | { |
665 | struct rapl_pmu *pmu; | 700 | struct rapl_pmu *pmu; |
666 | int cpu, ret; | 701 | int cpu, ret; |
702 | struct x86_pmu_quirk *quirk; | ||
703 | int i; | ||
667 | 704 | ||
668 | /* | 705 | /* |
669 | * check for Intel processor family 6 | 706 | * check for Intel processor family 6 |
@@ -678,6 +715,11 @@ static int __init rapl_pmu_init(void) | |||
678 | rapl_cntr_mask = RAPL_IDX_CLN; | 715 | rapl_cntr_mask = RAPL_IDX_CLN; |
679 | rapl_pmu_events_group.attrs = rapl_events_cln_attr; | 716 | rapl_pmu_events_group.attrs = rapl_events_cln_attr; |
680 | break; | 717 | break; |
718 | case 63: /* Haswell-Server */ | ||
719 | rapl_add_quirk(rapl_hsw_server_quirk); | ||
720 | rapl_cntr_mask = RAPL_IDX_SRV; | ||
721 | rapl_pmu_events_group.attrs = rapl_events_srv_attr; | ||
722 | break; | ||
681 | case 60: /* Haswell */ | 723 | case 60: /* Haswell */ |
682 | case 69: /* Haswell-Celeron */ | 724 | case 69: /* Haswell-Celeron */ |
683 | rapl_cntr_mask = RAPL_IDX_HSW; | 725 | rapl_cntr_mask = RAPL_IDX_HSW; |
@@ -693,7 +735,13 @@ static int __init rapl_pmu_init(void) | |||
693 | /* unsupported */ | 735 | /* unsupported */ |
694 | return 0; | 736 | return 0; |
695 | } | 737 | } |
738 | ret = rapl_check_hw_unit(); | ||
739 | if (ret) | ||
740 | return ret; | ||
696 | 741 | ||
742 | /* run cpu model quirks */ | ||
743 | for (quirk = rapl_quirks; quirk; quirk = quirk->next) | ||
744 | quirk->func(); | ||
697 | cpu_notifier_register_begin(); | 745 | cpu_notifier_register_begin(); |
698 | 746 | ||
699 | for_each_online_cpu(cpu) { | 747 | for_each_online_cpu(cpu) { |
@@ -714,14 +762,18 @@ static int __init rapl_pmu_init(void) | |||
714 | 762 | ||
715 | pmu = __this_cpu_read(rapl_pmu); | 763 | pmu = __this_cpu_read(rapl_pmu); |
716 | 764 | ||
717 | pr_info("RAPL PMU detected, hw unit 2^-%d Joules," | 765 | pr_info("RAPL PMU detected," |
718 | " API unit is 2^-32 Joules," | 766 | " API unit is 2^-32 Joules," |
719 | " %d fixed counters" | 767 | " %d fixed counters" |
720 | " %llu ms ovfl timer\n", | 768 | " %llu ms ovfl timer\n", |
721 | pmu->hw_unit, | ||
722 | hweight32(rapl_cntr_mask), | 769 | hweight32(rapl_cntr_mask), |
723 | ktime_to_ms(pmu->timer_interval)); | 770 | ktime_to_ms(pmu->timer_interval)); |
724 | 771 | for (i = 0; i < NR_RAPL_DOMAINS; i++) { | |
772 | if (rapl_cntr_mask & (1 << i)) { | ||
773 | pr_info("hw unit of domain %s 2^-%d Joules\n", | ||
774 | rapl_domain_names[i], rapl_hw_unit[i]); | ||
775 | } | ||
776 | } | ||
725 | out: | 777 | out: |
726 | cpu_notifier_register_done(); | 778 | cpu_notifier_register_done(); |
727 | 779 | ||
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 21af6149edf2..12d9548457e7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c | |||
@@ -1132,8 +1132,7 @@ static int snbep_pci2phy_map_init(int devid) | |||
1132 | } | 1132 | } |
1133 | } | 1133 | } |
1134 | 1134 | ||
1135 | if (ubox_dev) | 1135 | pci_dev_put(ubox_dev); |
1136 | pci_dev_put(ubox_dev); | ||
1137 | 1136 | ||
1138 | return err ? pcibios_err_to_errno(err) : 0; | 1137 | return err ? pcibios_err_to_errno(err) : 0; |
1139 | } | 1138 | } |
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 60639093d536..3d423a101fae 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c | |||
@@ -41,6 +41,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) | |||
41 | { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, | 41 | { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, |
42 | { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, | 42 | { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, |
43 | { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, | 43 | { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, |
44 | { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 }, | ||
44 | { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, | 45 | { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, |
45 | { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, | 46 | { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, |
46 | { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, | 47 | { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, |
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index aceb2f90c716..c76d3e37c6e1 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c | |||
@@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) | |||
105 | #ifdef CONFIG_X86_32 | 105 | #ifdef CONFIG_X86_32 |
106 | struct pt_regs fixed_regs; | 106 | struct pt_regs fixed_regs; |
107 | 107 | ||
108 | if (!user_mode_vm(regs)) { | 108 | if (!user_mode(regs)) { |
109 | crash_fixup_ss_esp(&fixed_regs, regs); | 109 | crash_fixup_ss_esp(&fixed_regs, regs); |
110 | regs = &fixed_regs; | 110 | regs = &fixed_regs; |
111 | } | 111 | } |
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 3d3503351242..6367a780cc8c 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c | |||
@@ -286,13 +286,13 @@ static void __init x86_flattree_get_config(void) | |||
286 | initial_boot_params = dt = early_memremap(initial_dtb, map_len); | 286 | initial_boot_params = dt = early_memremap(initial_dtb, map_len); |
287 | size = of_get_flat_dt_size(); | 287 | size = of_get_flat_dt_size(); |
288 | if (map_len < size) { | 288 | if (map_len < size) { |
289 | early_iounmap(dt, map_len); | 289 | early_memunmap(dt, map_len); |
290 | initial_boot_params = dt = early_memremap(initial_dtb, size); | 290 | initial_boot_params = dt = early_memremap(initial_dtb, size); |
291 | map_len = size; | 291 | map_len = size; |
292 | } | 292 | } |
293 | 293 | ||
294 | unflatten_and_copy_device_tree(); | 294 | unflatten_and_copy_device_tree(); |
295 | early_iounmap(dt, map_len); | 295 | early_memunmap(dt, map_len); |
296 | } | 296 | } |
297 | #else | 297 | #else |
298 | static inline void x86_flattree_get_config(void) { } | 298 | static inline void x86_flattree_get_config(void) { } |
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index cf3df1d8d039..9c30acfadae2 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c | |||
@@ -25,10 +25,12 @@ unsigned int code_bytes = 64; | |||
25 | int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; | 25 | int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; |
26 | static int die_counter; | 26 | static int die_counter; |
27 | 27 | ||
28 | static void printk_stack_address(unsigned long address, int reliable) | 28 | static void printk_stack_address(unsigned long address, int reliable, |
29 | void *data) | ||
29 | { | 30 | { |
30 | pr_cont(" [<%p>] %s%pB\n", | 31 | printk("%s [<%p>] %s%pB\n", |
31 | (void *)address, reliable ? "" : "? ", (void *)address); | 32 | (char *)data, (void *)address, reliable ? "" : "? ", |
33 | (void *)address); | ||
32 | } | 34 | } |
33 | 35 | ||
34 | void printk_address(unsigned long address) | 36 | void printk_address(unsigned long address) |
@@ -155,8 +157,7 @@ static int print_trace_stack(void *data, char *name) | |||
155 | static void print_trace_address(void *data, unsigned long addr, int reliable) | 157 | static void print_trace_address(void *data, unsigned long addr, int reliable) |
156 | { | 158 | { |
157 | touch_nmi_watchdog(); | 159 | touch_nmi_watchdog(); |
158 | printk(data); | 160 | printk_stack_address(addr, reliable, data); |
159 | printk_stack_address(addr, reliable); | ||
160 | } | 161 | } |
161 | 162 | ||
162 | static const struct stacktrace_ops print_trace_ops = { | 163 | static const struct stacktrace_ops print_trace_ops = { |
@@ -278,7 +279,7 @@ int __die(const char *str, struct pt_regs *regs, long err) | |||
278 | print_modules(); | 279 | print_modules(); |
279 | show_regs(regs); | 280 | show_regs(regs); |
280 | #ifdef CONFIG_X86_32 | 281 | #ifdef CONFIG_X86_32 |
281 | if (user_mode_vm(regs)) { | 282 | if (user_mode(regs)) { |
282 | sp = regs->sp; | 283 | sp = regs->sp; |
283 | ss = regs->ss & 0xffff; | 284 | ss = regs->ss & 0xffff; |
284 | } else { | 285 | } else { |
@@ -307,7 +308,7 @@ void die(const char *str, struct pt_regs *regs, long err) | |||
307 | unsigned long flags = oops_begin(); | 308 | unsigned long flags = oops_begin(); |
308 | int sig = SIGSEGV; | 309 | int sig = SIGSEGV; |
309 | 310 | ||
310 | if (!user_mode_vm(regs)) | 311 | if (!user_mode(regs)) |
311 | report_bug(regs->ip, regs); | 312 | report_bug(regs->ip, regs); |
312 | 313 | ||
313 | if (__die(str, regs, err)) | 314 | if (__die(str, regs, err)) |
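
[Note] The dumpstack change above threads the caller's log-level prefix through the stacktrace callback's void *data, so each line is emitted with an explicit level instead of a bare printk(data) followed by a continuation. A minimal userspace analogue of passing a prefix through an opaque callback argument (simplified, assumed signatures):

    #include <stdio.h>

    /* Callback shape mirroring print_trace_address(data, addr, reliable). */
    typedef void (*trace_cb)(void *data, unsigned long addr, int reliable);

    static void print_stack_address(void *data, unsigned long addr, int reliable)
    {
        /* The prefix travels in 'data', so every line carries it explicitly. */
        printf("%s [<%#lx>] %s\n",
               (const char *)data, addr, reliable ? "ok" : "? unreliable");
    }

    static void walk_fake_stack(trace_cb cb, void *data)
    {
        unsigned long frames[] = { 0xc1001234UL, 0xc1005678UL, 0xc100abcdUL };
        for (unsigned int i = 0; i < sizeof(frames) / sizeof(frames[0]); i++)
            cb(data, frames[i], i != 1);
    }

    int main(void)
    {
        walk_fake_stack(print_stack_address, (void *)"<4>");
        return 0;
    }
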
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 5abd4cd4230c..464ffd69b92e 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -108,9 +108,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | |||
108 | for (i = 0; i < kstack_depth_to_print; i++) { | 108 | for (i = 0; i < kstack_depth_to_print; i++) { |
109 | if (kstack_end(stack)) | 109 | if (kstack_end(stack)) |
110 | break; | 110 | break; |
111 | if (i && ((i % STACKSLOTS_PER_LINE) == 0)) | 111 | if ((i % STACKSLOTS_PER_LINE) == 0) { |
112 | pr_cont("\n"); | 112 | if (i != 0) |
113 | pr_cont(" %08lx", *stack++); | 113 | pr_cont("\n"); |
114 | printk("%s %08lx", log_lvl, *stack++); | ||
115 | } else | ||
116 | pr_cont(" %08lx", *stack++); | ||
114 | touch_nmi_watchdog(); | 117 | touch_nmi_watchdog(); |
115 | } | 118 | } |
116 | pr_cont("\n"); | 119 | pr_cont("\n"); |
@@ -123,13 +126,13 @@ void show_regs(struct pt_regs *regs) | |||
123 | int i; | 126 | int i; |
124 | 127 | ||
125 | show_regs_print_info(KERN_EMERG); | 128 | show_regs_print_info(KERN_EMERG); |
126 | __show_regs(regs, !user_mode_vm(regs)); | 129 | __show_regs(regs, !user_mode(regs)); |
127 | 130 | ||
128 | /* | 131 | /* |
129 | * When in-kernel, we also print out the stack and code at the | 132 | * When in-kernel, we also print out the stack and code at the |
130 | * time of the fault.. | 133 | * time of the fault.. |
131 | */ | 134 | */ |
132 | if (!user_mode_vm(regs)) { | 135 | if (!user_mode(regs)) { |
133 | unsigned int code_prologue = code_bytes * 43 / 64; | 136 | unsigned int code_prologue = code_bytes * 43 / 64; |
134 | unsigned int code_len = code_bytes; | 137 | unsigned int code_len = code_bytes; |
135 | unsigned char c; | 138 | unsigned char c; |
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index ff86f19b5758..5f1c6266eb30 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -280,12 +280,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | |||
280 | pr_cont(" <EOI> "); | 280 | pr_cont(" <EOI> "); |
281 | } | 281 | } |
282 | } else { | 282 | } else { |
283 | if (((long) stack & (THREAD_SIZE-1)) == 0) | 283 | if (kstack_end(stack)) |
284 | break; | 284 | break; |
285 | } | 285 | } |
286 | if (i && ((i % STACKSLOTS_PER_LINE) == 0)) | 286 | if ((i % STACKSLOTS_PER_LINE) == 0) { |
287 | pr_cont("\n"); | 287 | if (i != 0) |
288 | pr_cont(" %016lx", *stack++); | 288 | pr_cont("\n"); |
289 | printk("%s %016lx", log_lvl, *stack++); | ||
290 | } else | ||
291 | pr_cont(" %016lx", *stack++); | ||
289 | touch_nmi_watchdog(); | 292 | touch_nmi_watchdog(); |
290 | } | 293 | } |
291 | preempt_enable(); | 294 | preempt_enable(); |
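
[Note] Both show_stack_log_lvl() variants above now start every output line with printk("%s ...", log_lvl, ...) and only use pr_cont() within a line, so the log level cannot get lost at a line break. The same loop shape in plain C (stand-in data, assumed slot count):

    #include <stdio.h>

    #define SLOTS_PER_LINE 4

    int main(void)
    {
        const char *log_lvl = "<0>";
        unsigned long stack[] = {
            0x1000, 0x1008, 0x1010, 0x1018,
            0x1020, 0x1028, 0x1030, 0x1038, 0x1040,
        };
        unsigned int n = sizeof(stack) / sizeof(stack[0]);

        for (unsigned int i = 0; i < n; i++) {
            if ((i % SLOTS_PER_LINE) == 0) {
                if (i != 0)
                    printf("\n");                        /* close previous line */
                printf("%s %016lx", log_lvl, stack[i]);  /* new line gets the prefix */
            } else {
                printf(" %016lx", stack[i]);             /* continuation on same line */
            }
        }
        printf("\n");
        return 0;
    }
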
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 46201deee923..e2ce85db2283 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type) | |||
149 | case E820_UNUSABLE: | 149 | case E820_UNUSABLE: |
150 | printk(KERN_CONT "unusable"); | 150 | printk(KERN_CONT "unusable"); |
151 | break; | 151 | break; |
152 | case E820_PRAM: | ||
153 | printk(KERN_CONT "persistent (type %u)", type); | ||
154 | break; | ||
152 | default: | 155 | default: |
153 | printk(KERN_CONT "type %u", type); | 156 | printk(KERN_CONT "type %u", type); |
154 | break; | 157 | break; |
@@ -343,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, | |||
343 | * continue building up new bios map based on this | 346 | * continue building up new bios map based on this |
344 | * information | 347 | * information |
345 | */ | 348 | */ |
346 | if (current_type != last_type) { | 349 | if (current_type != last_type || current_type == E820_PRAM) { |
347 | if (last_type != 0) { | 350 | if (last_type != 0) { |
348 | new_bios[new_bios_entry].size = | 351 | new_bios[new_bios_entry].size = |
349 | change_point[chgidx]->addr - last_addr; | 352 | change_point[chgidx]->addr - last_addr; |
@@ -661,7 +664,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len) | |||
661 | extmap = (struct e820entry *)(sdata->data); | 664 | extmap = (struct e820entry *)(sdata->data); |
662 | __append_e820_map(extmap, entries); | 665 | __append_e820_map(extmap, entries); |
663 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | 666 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
664 | early_iounmap(sdata, data_len); | 667 | early_memunmap(sdata, data_len); |
665 | printk(KERN_INFO "e820: extended physical RAM map:\n"); | 668 | printk(KERN_INFO "e820: extended physical RAM map:\n"); |
666 | e820_print_map("extended"); | 669 | e820_print_map("extended"); |
667 | } | 670 | } |
@@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) | |||
688 | register_nosave_region(pfn, PFN_UP(ei->addr)); | 691 | register_nosave_region(pfn, PFN_UP(ei->addr)); |
689 | 692 | ||
690 | pfn = PFN_DOWN(ei->addr + ei->size); | 693 | pfn = PFN_DOWN(ei->addr + ei->size); |
694 | |||
691 | if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) | 695 | if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) |
692 | register_nosave_region(PFN_UP(ei->addr), pfn); | 696 | register_nosave_region(PFN_UP(ei->addr), pfn); |
693 | 697 | ||
@@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align) | |||
748 | /* | 752 | /* |
749 | * Find the highest page frame number we have available | 753 | * Find the highest page frame number we have available |
750 | */ | 754 | */ |
751 | static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) | 755 | static unsigned long __init e820_end_pfn(unsigned long limit_pfn) |
752 | { | 756 | { |
753 | int i; | 757 | int i; |
754 | unsigned long last_pfn = 0; | 758 | unsigned long last_pfn = 0; |
@@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) | |||
759 | unsigned long start_pfn; | 763 | unsigned long start_pfn; |
760 | unsigned long end_pfn; | 764 | unsigned long end_pfn; |
761 | 765 | ||
762 | if (ei->type != type) | 766 | /* |
767 | * Persistent memory is accounted as ram for purposes of | ||
768 | * establishing max_pfn and mem_map. | ||
769 | */ | ||
770 | if (ei->type != E820_RAM && ei->type != E820_PRAM) | ||
763 | continue; | 771 | continue; |
764 | 772 | ||
765 | start_pfn = ei->addr >> PAGE_SHIFT; | 773 | start_pfn = ei->addr >> PAGE_SHIFT; |
@@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) | |||
784 | } | 792 | } |
785 | unsigned long __init e820_end_of_ram_pfn(void) | 793 | unsigned long __init e820_end_of_ram_pfn(void) |
786 | { | 794 | { |
787 | return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); | 795 | return e820_end_pfn(MAX_ARCH_PFN); |
788 | } | 796 | } |
789 | 797 | ||
790 | unsigned long __init e820_end_of_low_ram_pfn(void) | 798 | unsigned long __init e820_end_of_low_ram_pfn(void) |
791 | { | 799 | { |
792 | return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); | 800 | return e820_end_pfn(1UL << (32-PAGE_SHIFT)); |
793 | } | 801 | } |
794 | 802 | ||
795 | static void early_panic(char *msg) | 803 | static void early_panic(char *msg) |
@@ -866,6 +874,9 @@ static int __init parse_memmap_one(char *p) | |||
866 | } else if (*p == '$') { | 874 | } else if (*p == '$') { |
867 | start_at = memparse(p+1, &p); | 875 | start_at = memparse(p+1, &p); |
868 | e820_add_region(start_at, mem_size, E820_RESERVED); | 876 | e820_add_region(start_at, mem_size, E820_RESERVED); |
877 | } else if (*p == '!') { | ||
878 | start_at = memparse(p+1, &p); | ||
879 | e820_add_region(start_at, mem_size, E820_PRAM); | ||
869 | } else | 880 | } else |
870 | e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); | 881 | e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); |
871 | 882 | ||
@@ -907,6 +918,7 @@ static inline const char *e820_type_to_string(int e820_type) | |||
907 | case E820_ACPI: return "ACPI Tables"; | 918 | case E820_ACPI: return "ACPI Tables"; |
908 | case E820_NVS: return "ACPI Non-volatile Storage"; | 919 | case E820_NVS: return "ACPI Non-volatile Storage"; |
909 | case E820_UNUSABLE: return "Unusable memory"; | 920 | case E820_UNUSABLE: return "Unusable memory"; |
921 | case E820_PRAM: return "Persistent RAM"; | ||
910 | default: return "reserved"; | 922 | default: return "reserved"; |
911 | } | 923 | } |
912 | } | 924 | } |
@@ -940,7 +952,9 @@ void __init e820_reserve_resources(void) | |||
940 | * pci device BAR resource and insert them later in | 952 | * pci device BAR resource and insert them later in |
941 | * pcibios_resource_survey() | 953 | * pcibios_resource_survey() |
942 | */ | 954 | */ |
943 | if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) { | 955 | if (((e820.map[i].type != E820_RESERVED) && |
956 | (e820.map[i].type != E820_PRAM)) || | ||
957 | res->start < (1ULL<<20)) { | ||
944 | res->flags |= IORESOURCE_BUSY; | 958 | res->flags |= IORESOURCE_BUSY; |
945 | insert_resource(&iomem_resource, res); | 959 | insert_resource(&iomem_resource, res); |
946 | } | 960 | } |
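
[Note] The parse_memmap_one() hunk above adds a '!' separator so memmap=nn!ss can declare a range of persistent RAM (E820_PRAM), alongside the existing '@', '#' and '$' forms. A rough userspace parser for that syntax; memparse_simple() below is a simplified stand-in for the kernel's memparse(), handling only K/M/G suffixes:

    #include <stdio.h>
    #include <stdlib.h>

    /* Simplified memparse(): number with optional K/M/G suffix. */
    static unsigned long long memparse_simple(const char *s, char **end)
    {
        unsigned long long v = strtoull(s, end, 0);

        switch (**end) {
        case 'G': case 'g': v <<= 10; /* fall through */
        case 'M': case 'm': v <<= 10; /* fall through */
        case 'K': case 'k': v <<= 10; (*end)++; break;
        }
        return v;
    }

    int main(void)
    {
        /* memmap=nn!ss: nn bytes of persistent memory starting at ss. */
        char *p, arg[] = "4G!12G";
        unsigned long long size = memparse_simple(arg, &p);

        if (*p != '!') {
            fprintf(stderr, "expected '!' separator\n");
            return 1;
        }
        unsigned long long start = memparse_simple(p + 1, &p);
        printf("persistent range: start=%#llx size=%#llx\n", start, size);
        return 0;
    }

The '!' case only differs from '$' in the region type it registers, which is why the hunk is a three-line addition.
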
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index a62536a1be88..49ff55ef9b26 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c | |||
@@ -95,20 +95,6 @@ static unsigned long early_serial_base = 0x3f8; /* ttyS0 */ | |||
95 | #define DLL 0 /* Divisor Latch Low */ | 95 | #define DLL 0 /* Divisor Latch Low */ |
96 | #define DLH 1 /* Divisor latch High */ | 96 | #define DLH 1 /* Divisor latch High */ |
97 | 97 | ||
98 | static void mem32_serial_out(unsigned long addr, int offset, int value) | ||
99 | { | ||
100 | uint32_t *vaddr = (uint32_t *)addr; | ||
101 | /* shift implied by pointer type */ | ||
102 | writel(value, vaddr + offset); | ||
103 | } | ||
104 | |||
105 | static unsigned int mem32_serial_in(unsigned long addr, int offset) | ||
106 | { | ||
107 | uint32_t *vaddr = (uint32_t *)addr; | ||
108 | /* shift implied by pointer type */ | ||
109 | return readl(vaddr + offset); | ||
110 | } | ||
111 | |||
112 | static unsigned int io_serial_in(unsigned long addr, int offset) | 98 | static unsigned int io_serial_in(unsigned long addr, int offset) |
113 | { | 99 | { |
114 | return inb(addr + offset); | 100 | return inb(addr + offset); |
@@ -205,6 +191,20 @@ static __init void early_serial_init(char *s) | |||
205 | } | 191 | } |
206 | 192 | ||
207 | #ifdef CONFIG_PCI | 193 | #ifdef CONFIG_PCI |
194 | static void mem32_serial_out(unsigned long addr, int offset, int value) | ||
195 | { | ||
196 | u32 *vaddr = (u32 *)addr; | ||
197 | /* shift implied by pointer type */ | ||
198 | writel(value, vaddr + offset); | ||
199 | } | ||
200 | |||
201 | static unsigned int mem32_serial_in(unsigned long addr, int offset) | ||
202 | { | ||
203 | u32 *vaddr = (u32 *)addr; | ||
204 | /* shift implied by pointer type */ | ||
205 | return readl(vaddr + offset); | ||
206 | } | ||
207 | |||
208 | /* | 208 | /* |
209 | * early_pci_serial_init() | 209 | * early_pci_serial_init() |
210 | * | 210 | * |
@@ -217,8 +217,8 @@ static __init void early_pci_serial_init(char *s) | |||
217 | unsigned divisor; | 217 | unsigned divisor; |
218 | unsigned long baud = DEFAULT_BAUD; | 218 | unsigned long baud = DEFAULT_BAUD; |
219 | u8 bus, slot, func; | 219 | u8 bus, slot, func; |
220 | uint32_t classcode, bar0; | 220 | u32 classcode, bar0; |
221 | uint16_t cmdreg; | 221 | u16 cmdreg; |
222 | char *e; | 222 | char *e; |
223 | 223 | ||
224 | 224 | ||
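
[Note] The mem32_serial_in/out helpers moved under CONFIG_PCI above access an MMIO-mapped 16550 whose byte-wide registers are laid out on 32-bit strides, so "shift implied by pointer type" means vaddr + offset advances by offset * 4 bytes. A tiny demonstration of that arithmetic against an ordinary array standing in for the register window:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t regs[8] = { 0 };   /* stand-in for a 32-bit-stride UART window */
        uint32_t *base = regs;
        int offset = 5;             /* e.g. the Line Status Register index */

        /* Pointer arithmetic scales by sizeof(uint32_t): +5 means +20 bytes. */
        printf("byte distance: %td\n",
               (char *)(base + offset) - (char *)base);   /* prints 20 */

        base[offset] = 0x60;        /* what writel(value, base + offset) stores */
        printf("reg[%d] = %#x\n", offset, regs[offset]);
        return 0;
    }
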
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 31e2d5bf3e38..1c309763e321 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -395,10 +395,13 @@ sysenter_past_esp: | |||
395 | /*CFI_REL_OFFSET cs, 0*/ | 395 | /*CFI_REL_OFFSET cs, 0*/ |
396 | /* | 396 | /* |
397 | * Push current_thread_info()->sysenter_return to the stack. | 397 | * Push current_thread_info()->sysenter_return to the stack. |
398 | * A tiny bit of offset fixup is necessary - 4*4 means the 4 words | 398 | * A tiny bit of offset fixup is necessary: TI_sysenter_return |
399 | * pushed above; +8 corresponds to copy_thread's esp0 setting. | 399 | * is relative to thread_info, which is at the bottom of the |
400 | * kernel stack page. 4*4 means the 4 words pushed above; | ||
401 | * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; | ||
402 | * and THREAD_SIZE takes us to the bottom. | ||
400 | */ | 403 | */ |
401 | pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) | 404 | pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) |
402 | CFI_REL_OFFSET eip, 0 | 405 | CFI_REL_OFFSET eip, 0 |
403 | 406 | ||
404 | pushl_cfi %eax | 407 | pushl_cfi %eax |
@@ -432,7 +435,7 @@ sysenter_after_call: | |||
432 | TRACE_IRQS_OFF | 435 | TRACE_IRQS_OFF |
433 | movl TI_flags(%ebp), %ecx | 436 | movl TI_flags(%ebp), %ecx |
434 | testl $_TIF_ALLWORK_MASK, %ecx | 437 | testl $_TIF_ALLWORK_MASK, %ecx |
435 | jne sysexit_audit | 438 | jnz sysexit_audit |
436 | sysenter_exit: | 439 | sysenter_exit: |
437 | /* if something modifies registers it must also disable sysexit */ | 440 | /* if something modifies registers it must also disable sysexit */ |
438 | movl PT_EIP(%esp), %edx | 441 | movl PT_EIP(%esp), %edx |
@@ -460,7 +463,7 @@ sysenter_audit: | |||
460 | 463 | ||
461 | sysexit_audit: | 464 | sysexit_audit: |
462 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx | 465 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx |
463 | jne syscall_exit_work | 466 | jnz syscall_exit_work |
464 | TRACE_IRQS_ON | 467 | TRACE_IRQS_ON |
465 | ENABLE_INTERRUPTS(CLBR_ANY) | 468 | ENABLE_INTERRUPTS(CLBR_ANY) |
466 | movl %eax,%edx /* second arg, syscall return value */ | 469 | movl %eax,%edx /* second arg, syscall return value */ |
@@ -472,7 +475,7 @@ sysexit_audit: | |||
472 | TRACE_IRQS_OFF | 475 | TRACE_IRQS_OFF |
473 | movl TI_flags(%ebp), %ecx | 476 | movl TI_flags(%ebp), %ecx |
474 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx | 477 | testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx |
475 | jne syscall_exit_work | 478 | jnz syscall_exit_work |
476 | movl PT_EAX(%esp),%eax /* reload syscall return value */ | 479 | movl PT_EAX(%esp),%eax /* reload syscall return value */ |
477 | jmp sysenter_exit | 480 | jmp sysenter_exit |
478 | #endif | 481 | #endif |
@@ -510,7 +513,7 @@ syscall_exit: | |||
510 | TRACE_IRQS_OFF | 513 | TRACE_IRQS_OFF |
511 | movl TI_flags(%ebp), %ecx | 514 | movl TI_flags(%ebp), %ecx |
512 | testl $_TIF_ALLWORK_MASK, %ecx # current->work | 515 | testl $_TIF_ALLWORK_MASK, %ecx # current->work |
513 | jne syscall_exit_work | 516 | jnz syscall_exit_work |
514 | 517 | ||
515 | restore_all: | 518 | restore_all: |
516 | TRACE_IRQS_IRET | 519 | TRACE_IRQS_IRET |
@@ -612,7 +615,7 @@ work_notifysig: # deal with pending signals and | |||
612 | #ifdef CONFIG_VM86 | 615 | #ifdef CONFIG_VM86 |
613 | testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) | 616 | testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) |
614 | movl %esp, %eax | 617 | movl %esp, %eax |
615 | jne work_notifysig_v86 # returning to kernel-space or | 618 | jnz work_notifysig_v86 # returning to kernel-space or |
616 | # vm86-space | 619 | # vm86-space |
617 | 1: | 620 | 1: |
618 | #else | 621 | #else |
@@ -720,43 +723,22 @@ END(sysenter_badsys) | |||
720 | .endm | 723 | .endm |
721 | 724 | ||
722 | /* | 725 | /* |
723 | * Build the entry stubs and pointer table with some assembler magic. | 726 | * Build the entry stubs with some assembler magic. |
724 | * We pack 7 stubs into a single 32-byte chunk, which will fit in a | 727 | * We pack 1 stub into every 8-byte block. |
725 | * single cache line on all modern x86 implementations. | ||
726 | */ | 728 | */ |
727 | .section .init.rodata,"a" | 729 | .align 8 |
728 | ENTRY(interrupt) | ||
729 | .section .entry.text, "ax" | ||
730 | .p2align 5 | ||
731 | .p2align CONFIG_X86_L1_CACHE_SHIFT | ||
732 | ENTRY(irq_entries_start) | 730 | ENTRY(irq_entries_start) |
733 | RING0_INT_FRAME | 731 | RING0_INT_FRAME |
734 | vector=FIRST_EXTERNAL_VECTOR | 732 | vector=FIRST_EXTERNAL_VECTOR |
735 | .rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 | 733 | .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) |
736 | .balign 32 | 734 | pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ |
737 | .rept 7 | 735 | vector=vector+1 |
738 | .if vector < FIRST_SYSTEM_VECTOR | 736 | jmp common_interrupt |
739 | .if vector <> FIRST_EXTERNAL_VECTOR | ||
740 | CFI_ADJUST_CFA_OFFSET -4 | 737 | CFI_ADJUST_CFA_OFFSET -4 |
741 | .endif | 738 | .align 8 |
742 | 1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ | 739 | .endr |
743 | .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 | ||
744 | jmp 2f | ||
745 | .endif | ||
746 | .previous | ||
747 | .long 1b | ||
748 | .section .entry.text, "ax" | ||
749 | vector=vector+1 | ||
750 | .endif | ||
751 | .endr | ||
752 | 2: jmp common_interrupt | ||
753 | .endr | ||
754 | END(irq_entries_start) | 740 | END(irq_entries_start) |
755 | 741 | ||
756 | .previous | ||
757 | END(interrupt) | ||
758 | .previous | ||
759 | |||
760 | /* | 742 | /* |
761 | * the CPU automatically disables interrupts when executing an IRQ vector, | 743 | * the CPU automatically disables interrupts when executing an IRQ vector, |
762 | * so IRQ-flags tracing has to follow that: | 744 | * so IRQ-flags tracing has to follow that: |
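
[Note] The rewritten irq_entries_start above emits one fixed-size 8-byte stub per vector; each stub pushes ~vector + 0x80 so the immediate always stays in signed byte range (keeping the push short), and the bias plus complement can be undone later to recover the vector. The 64-bit entry code later in this diff uses the same packing. A quick check of that encoding for the external-vector range (0x20 is assumed here as FIRST_EXTERNAL_VECTOR):

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        /* Vectors 0x20..0xff are the external interrupt range on x86. */
        for (int vector = 0x20; vector <= 0xff; vector++) {
            int encoded = ~vector + 0x80;          /* value pushed by the stub */

            /* Fits in a signed byte, so a short push-imm8 form can be used. */
            assert(encoded >= -128 && encoded <= 127);

            /* Undo the +0x80 bias, complement, and mask to a byte. */
            int decoded = ~(encoded - 0x80) & 0xff;
            assert(decoded == vector);
        }
        printf("all external vectors encode into a signed byte\n");
        return 0;
    }
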
@@ -816,15 +798,9 @@ ENTRY(simd_coprocessor_error) | |||
816 | pushl_cfi $0 | 798 | pushl_cfi $0 |
817 | #ifdef CONFIG_X86_INVD_BUG | 799 | #ifdef CONFIG_X86_INVD_BUG |
818 | /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ | 800 | /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ |
819 | 661: pushl_cfi $do_general_protection | 801 | ALTERNATIVE "pushl_cfi $do_general_protection", \ |
820 | 662: | 802 | "pushl $do_simd_coprocessor_error", \ |
821 | .section .altinstructions,"a" | 803 | X86_FEATURE_XMM |
822 | altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f | ||
823 | .previous | ||
824 | .section .altinstr_replacement,"ax" | ||
825 | 663: pushl $do_simd_coprocessor_error | ||
826 | 664: | ||
827 | .previous | ||
828 | #else | 804 | #else |
829 | pushl_cfi $do_simd_coprocessor_error | 805 | pushl_cfi $do_simd_coprocessor_error |
830 | #endif | 806 | #endif |
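
[Note] The ALTERNATIVE hunk above collapses an open-coded .altinstructions/.altinstr_replacement pair into one macro: at boot the kernel patches the first sequence with the second when the CPU has X86_FEATURE_XMM. A userspace analogue (only an analogy, not the kernel's patching mechanism) is choosing a handler once, based on a CPUID-derived feature test, here with GCC/Clang's builtin:

    #include <stdio.h>

    static void handle_generic(void) { printf("general protection path\n"); }
    static void handle_simd(void)    { printf("SIMD coprocessor error path\n"); }

    int main(void)
    {
        /* The kernel rewrites the instruction bytes themselves; here we just
         * pick a function pointer once, keyed on the SSE (XMM) feature bit. */
        void (*handler)(void) =
            __builtin_cpu_supports("sse") ? handle_simd : handle_generic;

        handler();
        return 0;
    }
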
@@ -1240,20 +1216,13 @@ error_code: | |||
1240 | /*CFI_REL_OFFSET es, 0*/ | 1216 | /*CFI_REL_OFFSET es, 0*/ |
1241 | pushl_cfi %ds | 1217 | pushl_cfi %ds |
1242 | /*CFI_REL_OFFSET ds, 0*/ | 1218 | /*CFI_REL_OFFSET ds, 0*/ |
1243 | pushl_cfi %eax | 1219 | pushl_cfi_reg eax |
1244 | CFI_REL_OFFSET eax, 0 | 1220 | pushl_cfi_reg ebp |
1245 | pushl_cfi %ebp | 1221 | pushl_cfi_reg edi |
1246 | CFI_REL_OFFSET ebp, 0 | 1222 | pushl_cfi_reg esi |
1247 | pushl_cfi %edi | 1223 | pushl_cfi_reg edx |
1248 | CFI_REL_OFFSET edi, 0 | 1224 | pushl_cfi_reg ecx |
1249 | pushl_cfi %esi | 1225 | pushl_cfi_reg ebx |
1250 | CFI_REL_OFFSET esi, 0 | ||
1251 | pushl_cfi %edx | ||
1252 | CFI_REL_OFFSET edx, 0 | ||
1253 | pushl_cfi %ecx | ||
1254 | CFI_REL_OFFSET ecx, 0 | ||
1255 | pushl_cfi %ebx | ||
1256 | CFI_REL_OFFSET ebx, 0 | ||
1257 | cld | 1226 | cld |
1258 | movl $(__KERNEL_PERCPU), %ecx | 1227 | movl $(__KERNEL_PERCPU), %ecx |
1259 | movl %ecx, %fs | 1228 | movl %ecx, %fs |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f0095a76c182..c7b238494b31 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -14,27 +14,14 @@ | |||
14 | * NOTE: This code handles signal-recognition, which happens every time | 14 | * NOTE: This code handles signal-recognition, which happens every time |
15 | * after an interrupt and after each system call. | 15 | * after an interrupt and after each system call. |
16 | * | 16 | * |
17 | * Normal syscalls and interrupts don't save a full stack frame, this is | ||
18 | * only done for syscall tracing, signals or fork/exec et.al. | ||
19 | * | ||
20 | * A note on terminology: | 17 | * A note on terminology: |
21 | * - top of stack: Architecture defined interrupt frame from SS to RIP | 18 | * - iret frame: Architecture defined interrupt frame from SS to RIP |
22 | * at the top of the kernel process stack. | 19 | * at the top of the kernel process stack. |
23 | * - partial stack frame: partially saved registers up to R11. | ||
24 | * - full stack frame: Like partial stack frame, but all register saved. | ||
25 | * | 20 | * |
26 | * Some macro usage: | 21 | * Some macro usage: |
27 | * - CFI macros are used to generate dwarf2 unwind information for better | 22 | * - CFI macros are used to generate dwarf2 unwind information for better |
28 | * backtraces. They don't change any code. | 23 | * backtraces. They don't change any code. |
29 | * - SAVE_ALL/RESTORE_ALL - Save/restore all registers | ||
30 | * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. | ||
31 | * There are unfortunately lots of special cases where some registers | ||
32 | * not touched. The macro is a big mess that should be cleaned up. | ||
33 | * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. | ||
34 | * Gives a full stack frame. | ||
35 | * - ENTRY/END Define functions in the symbol table. | 24 | * - ENTRY/END Define functions in the symbol table. |
36 | * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack | ||
37 | * frame that is otherwise undefined after a SYSCALL | ||
38 | * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. | 25 | * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. |
39 | * - idtentry - Define exception entry points. | 26 | * - idtentry - Define exception entry points. |
40 | */ | 27 | */ |
@@ -70,10 +57,6 @@ | |||
70 | .section .entry.text, "ax" | 57 | .section .entry.text, "ax" |
71 | 58 | ||
72 | 59 | ||
73 | #ifndef CONFIG_PREEMPT | ||
74 | #define retint_kernel retint_restore_args | ||
75 | #endif | ||
76 | |||
77 | #ifdef CONFIG_PARAVIRT | 60 | #ifdef CONFIG_PARAVIRT |
78 | ENTRY(native_usergs_sysret64) | 61 | ENTRY(native_usergs_sysret64) |
79 | swapgs | 62 | swapgs |
@@ -82,9 +65,9 @@ ENDPROC(native_usergs_sysret64) | |||
82 | #endif /* CONFIG_PARAVIRT */ | 65 | #endif /* CONFIG_PARAVIRT */ |
83 | 66 | ||
84 | 67 | ||
85 | .macro TRACE_IRQS_IRETQ offset=ARGOFFSET | 68 | .macro TRACE_IRQS_IRETQ |
86 | #ifdef CONFIG_TRACE_IRQFLAGS | 69 | #ifdef CONFIG_TRACE_IRQFLAGS |
87 | bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ | 70 | bt $9,EFLAGS(%rsp) /* interrupts off? */ |
88 | jnc 1f | 71 | jnc 1f |
89 | TRACE_IRQS_ON | 72 | TRACE_IRQS_ON |
90 | 1: | 73 | 1: |
@@ -116,8 +99,8 @@ ENDPROC(native_usergs_sysret64) | |||
116 | call debug_stack_reset | 99 | call debug_stack_reset |
117 | .endm | 100 | .endm |
118 | 101 | ||
119 | .macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET | 102 | .macro TRACE_IRQS_IRETQ_DEBUG |
120 | bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ | 103 | bt $9,EFLAGS(%rsp) /* interrupts off? */ |
121 | jnc 1f | 104 | jnc 1f |
122 | TRACE_IRQS_ON_DEBUG | 105 | TRACE_IRQS_ON_DEBUG |
123 | 1: | 106 | 1: |
@@ -130,34 +113,7 @@ ENDPROC(native_usergs_sysret64) | |||
130 | #endif | 113 | #endif |
131 | 114 | ||
132 | /* | 115 | /* |
133 | * C code is not supposed to know about undefined top of stack. Every time | 116 | * empty frame |
134 | * a C function with an pt_regs argument is called from the SYSCALL based | ||
135 | * fast path FIXUP_TOP_OF_STACK is needed. | ||
136 | * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs | ||
137 | * manipulation. | ||
138 | */ | ||
139 | |||
140 | /* %rsp:at FRAMEEND */ | ||
141 | .macro FIXUP_TOP_OF_STACK tmp offset=0 | ||
142 | movq PER_CPU_VAR(old_rsp),\tmp | ||
143 | movq \tmp,RSP+\offset(%rsp) | ||
144 | movq $__USER_DS,SS+\offset(%rsp) | ||
145 | movq $__USER_CS,CS+\offset(%rsp) | ||
146 | movq RIP+\offset(%rsp),\tmp /* get rip */ | ||
147 | movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */ | ||
148 | movq R11+\offset(%rsp),\tmp /* get eflags */ | ||
149 | movq \tmp,EFLAGS+\offset(%rsp) | ||
150 | .endm | ||
151 | |||
152 | .macro RESTORE_TOP_OF_STACK tmp offset=0 | ||
153 | movq RSP+\offset(%rsp),\tmp | ||
154 | movq \tmp,PER_CPU_VAR(old_rsp) | ||
155 | movq EFLAGS+\offset(%rsp),\tmp | ||
156 | movq \tmp,R11+\offset(%rsp) | ||
157 | .endm | ||
158 | |||
159 | /* | ||
160 | * initial frame state for interrupts (and exceptions without error code) | ||
161 | */ | 117 | */ |
162 | .macro EMPTY_FRAME start=1 offset=0 | 118 | .macro EMPTY_FRAME start=1 offset=0 |
163 | .if \start | 119 | .if \start |
@@ -173,12 +129,12 @@ ENDPROC(native_usergs_sysret64) | |||
173 | * initial frame state for interrupts (and exceptions without error code) | 129 | * initial frame state for interrupts (and exceptions without error code) |
174 | */ | 130 | */ |
175 | .macro INTR_FRAME start=1 offset=0 | 131 | .macro INTR_FRAME start=1 offset=0 |
176 | EMPTY_FRAME \start, SS+8+\offset-RIP | 132 | EMPTY_FRAME \start, 5*8+\offset |
177 | /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ | 133 | /*CFI_REL_OFFSET ss, 4*8+\offset*/ |
178 | CFI_REL_OFFSET rsp, RSP+\offset-RIP | 134 | CFI_REL_OFFSET rsp, 3*8+\offset |
179 | /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ | 135 | /*CFI_REL_OFFSET rflags, 2*8+\offset*/ |
180 | /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ | 136 | /*CFI_REL_OFFSET cs, 1*8+\offset*/ |
181 | CFI_REL_OFFSET rip, RIP+\offset-RIP | 137 | CFI_REL_OFFSET rip, 0*8+\offset |
182 | .endm | 138 | .endm |
183 | 139 | ||
184 | /* | 140 | /* |
@@ -186,30 +142,23 @@ ENDPROC(native_usergs_sysret64) | |||
186 | * with vector already pushed) | 142 | * with vector already pushed) |
187 | */ | 143 | */ |
188 | .macro XCPT_FRAME start=1 offset=0 | 144 | .macro XCPT_FRAME start=1 offset=0 |
189 | INTR_FRAME \start, RIP+\offset-ORIG_RAX | 145 | INTR_FRAME \start, 1*8+\offset |
190 | .endm | ||
191 | |||
192 | /* | ||
193 | * frame that enables calling into C. | ||
194 | */ | ||
195 | .macro PARTIAL_FRAME start=1 offset=0 | ||
196 | XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET | ||
197 | CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET | ||
198 | CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET | ||
199 | CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET | ||
200 | CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET | ||
201 | CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET | ||
202 | CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET | ||
203 | CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET | ||
204 | CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET | ||
205 | CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET | ||
206 | .endm | 146 | .endm |
207 | 147 | ||
208 | /* | 148 | /* |
209 | * frame that enables passing a complete pt_regs to a C function. | 149 | * frame that enables passing a complete pt_regs to a C function. |
210 | */ | 150 | */ |
211 | .macro DEFAULT_FRAME start=1 offset=0 | 151 | .macro DEFAULT_FRAME start=1 offset=0 |
212 | PARTIAL_FRAME \start, R11+\offset-R15 | 152 | XCPT_FRAME \start, ORIG_RAX+\offset |
153 | CFI_REL_OFFSET rdi, RDI+\offset | ||
154 | CFI_REL_OFFSET rsi, RSI+\offset | ||
155 | CFI_REL_OFFSET rdx, RDX+\offset | ||
156 | CFI_REL_OFFSET rcx, RCX+\offset | ||
157 | CFI_REL_OFFSET rax, RAX+\offset | ||
158 | CFI_REL_OFFSET r8, R8+\offset | ||
159 | CFI_REL_OFFSET r9, R9+\offset | ||
160 | CFI_REL_OFFSET r10, R10+\offset | ||
161 | CFI_REL_OFFSET r11, R11+\offset | ||
213 | CFI_REL_OFFSET rbx, RBX+\offset | 162 | CFI_REL_OFFSET rbx, RBX+\offset |
214 | CFI_REL_OFFSET rbp, RBP+\offset | 163 | CFI_REL_OFFSET rbp, RBP+\offset |
215 | CFI_REL_OFFSET r12, R12+\offset | 164 | CFI_REL_OFFSET r12, R12+\offset |
@@ -218,105 +167,30 @@ ENDPROC(native_usergs_sysret64) | |||
218 | CFI_REL_OFFSET r15, R15+\offset | 167 | CFI_REL_OFFSET r15, R15+\offset |
219 | .endm | 168 | .endm |
220 | 169 | ||
221 | ENTRY(save_paranoid) | ||
222 | XCPT_FRAME 1 RDI+8 | ||
223 | cld | ||
224 | movq %rdi, RDI+8(%rsp) | ||
225 | movq %rsi, RSI+8(%rsp) | ||
226 | movq_cfi rdx, RDX+8 | ||
227 | movq_cfi rcx, RCX+8 | ||
228 | movq_cfi rax, RAX+8 | ||
229 | movq %r8, R8+8(%rsp) | ||
230 | movq %r9, R9+8(%rsp) | ||
231 | movq %r10, R10+8(%rsp) | ||
232 | movq %r11, R11+8(%rsp) | ||
233 | movq_cfi rbx, RBX+8 | ||
234 | movq %rbp, RBP+8(%rsp) | ||
235 | movq %r12, R12+8(%rsp) | ||
236 | movq %r13, R13+8(%rsp) | ||
237 | movq %r14, R14+8(%rsp) | ||
238 | movq %r15, R15+8(%rsp) | ||
239 | movl $1,%ebx | ||
240 | movl $MSR_GS_BASE,%ecx | ||
241 | rdmsr | ||
242 | testl %edx,%edx | ||
243 | js 1f /* negative -> in kernel */ | ||
244 | SWAPGS | ||
245 | xorl %ebx,%ebx | ||
246 | 1: ret | ||
247 | CFI_ENDPROC | ||
248 | END(save_paranoid) | ||
249 | |||
250 | /* | 170 | /* |
251 | * A newly forked process directly context switches into this address. | 171 | * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. |
252 | * | 172 | * |
253 | * rdi: prev task we switched from | 173 | * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, |
254 | */ | 174 | * then loads new ss, cs, and rip from previously programmed MSRs. |
255 | ENTRY(ret_from_fork) | 175 | * rflags gets masked by a value from another MSR (so CLD and CLAC |
256 | DEFAULT_FRAME | 176 | * are not needed). SYSCALL does not save anything on the stack |
257 | 177 | * and does not change rsp. | |
258 | LOCK ; btr $TIF_FORK,TI_flags(%r8) | ||
259 | |||
260 | pushq_cfi $0x0002 | ||
261 | popfq_cfi # reset kernel eflags | ||
262 | |||
263 | call schedule_tail # rdi: 'prev' task parameter | ||
264 | |||
265 | GET_THREAD_INFO(%rcx) | ||
266 | |||
267 | RESTORE_REST | ||
268 | |||
269 | testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? | ||
270 | jz 1f | ||
271 | |||
272 | /* | ||
273 | * By the time we get here, we have no idea whether our pt_regs, | ||
274 | * ti flags, and ti status came from the 64-bit SYSCALL fast path, | ||
275 | * the slow path, or one of the ia32entry paths. | ||
276 | * Use int_ret_from_sys_call to return, since it can safely handle | ||
277 | * all of the above. | ||
278 | */ | ||
279 | jmp int_ret_from_sys_call | ||
280 | |||
281 | 1: | ||
282 | subq $REST_SKIP, %rsp # leave space for volatiles | ||
283 | CFI_ADJUST_CFA_OFFSET REST_SKIP | ||
284 | movq %rbp, %rdi | ||
285 | call *%rbx | ||
286 | movl $0, RAX(%rsp) | ||
287 | RESTORE_REST | ||
288 | jmp int_ret_from_sys_call | ||
289 | CFI_ENDPROC | ||
290 | END(ret_from_fork) | ||
291 | |||
292 | /* | ||
293 | * System call entry. Up to 6 arguments in registers are supported. | ||
294 | * | 178 | * |
295 | * SYSCALL does not save anything on the stack and does not change the | 179 | * Registers on entry: |
296 | * stack pointer. However, it does mask the flags register for us, so | ||
297 | * CLD and CLAC are not needed. | ||
298 | */ | ||
299 | |||
300 | /* | ||
301 | * Register setup: | ||
302 | * rax system call number | 180 | * rax system call number |
181 | * rcx return address | ||
182 | * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) | ||
303 | * rdi arg0 | 183 | * rdi arg0 |
304 | * rcx return address for syscall/sysret, C arg3 | ||
305 | * rsi arg1 | 184 | * rsi arg1 |
306 | * rdx arg2 | 185 | * rdx arg2 |
307 | * r10 arg3 (--> moved to rcx for C) | 186 | * r10 arg3 (needs to be moved to rcx to conform to C ABI) |
308 | * r8 arg4 | 187 | * r8 arg4 |
309 | * r9 arg5 | 188 | * r9 arg5 |
310 | * r11 eflags for syscall/sysret, temporary for C | 189 | * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) |
311 | * r12-r15,rbp,rbx saved by C code, not touched. | ||
312 | * | 190 | * |
313 | * Interrupts are off on entry. | ||
314 | * Only called from user space. | 191 | * Only called from user space. |
315 | * | 192 | * |
316 | * XXX if we had a free scratch register we could save the RSP into the stack frame | 193 | * When user can change pt_regs->foo always force IRET. That is because |
317 | * and report it properly in ps. Unfortunately we haven't. | ||
318 | * | ||
319 | * When user can change the frames always force IRET. That is because | ||
320 | * it deals with uncanonical addresses better. SYSRET has trouble | 194 | * it deals with uncanonical addresses better. SYSRET has trouble |
321 | * with them due to bugs in both AMD and Intel CPUs. | 195 | * with them due to bugs in both AMD and Intel CPUs. |
322 | */ | 196 | */ |
@@ -324,9 +198,15 @@ END(ret_from_fork) | |||
324 | ENTRY(system_call) | 198 | ENTRY(system_call) |
325 | CFI_STARTPROC simple | 199 | CFI_STARTPROC simple |
326 | CFI_SIGNAL_FRAME | 200 | CFI_SIGNAL_FRAME |
327 | CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET | 201 | CFI_DEF_CFA rsp,0 |
328 | CFI_REGISTER rip,rcx | 202 | CFI_REGISTER rip,rcx |
329 | /*CFI_REGISTER rflags,r11*/ | 203 | /*CFI_REGISTER rflags,r11*/ |
204 | |||
205 | /* | ||
206 | * Interrupts are off on entry. | ||
207 | * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, | ||
208 | * it is too small to ever cause noticeable irq latency. | ||
209 | */ | ||
330 | SWAPGS_UNSAFE_STACK | 210 | SWAPGS_UNSAFE_STACK |
331 | /* | 211 | /* |
332 | * A hypervisor implementation might want to use a label | 212 | * A hypervisor implementation might want to use a label |
@@ -335,18 +215,38 @@ ENTRY(system_call) | |||
335 | */ | 215 | */ |
336 | GLOBAL(system_call_after_swapgs) | 216 | GLOBAL(system_call_after_swapgs) |
337 | 217 | ||
338 | movq %rsp,PER_CPU_VAR(old_rsp) | 218 | movq %rsp,PER_CPU_VAR(rsp_scratch) |
339 | movq PER_CPU_VAR(kernel_stack),%rsp | 219 | movq PER_CPU_VAR(kernel_stack),%rsp |
220 | |||
221 | /* Construct struct pt_regs on stack */ | ||
222 | pushq_cfi $__USER_DS /* pt_regs->ss */ | ||
223 | pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ | ||
340 | /* | 224 | /* |
341 | * No need to follow this irqs off/on section - it's straight | 225 | * Re-enable interrupts. |
342 | * and short: | 226 | * We use 'rsp_scratch' as a scratch space, hence irq-off block above |
227 | * must execute atomically in the face of possible interrupt-driven | ||
228 | * task preemption. We must enable interrupts only after we're done | ||
229 | * with using rsp_scratch: | ||
343 | */ | 230 | */ |
344 | ENABLE_INTERRUPTS(CLBR_NONE) | 231 | ENABLE_INTERRUPTS(CLBR_NONE) |
345 | SAVE_ARGS 8, 0, rax_enosys=1 | 232 | pushq_cfi %r11 /* pt_regs->flags */ |
346 | movq_cfi rax,(ORIG_RAX-ARGOFFSET) | 233 | pushq_cfi $__USER_CS /* pt_regs->cs */ |
347 | movq %rcx,RIP-ARGOFFSET(%rsp) | 234 | pushq_cfi %rcx /* pt_regs->ip */ |
348 | CFI_REL_OFFSET rip,RIP-ARGOFFSET | 235 | CFI_REL_OFFSET rip,0 |
349 | testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 236 | pushq_cfi_reg rax /* pt_regs->orig_ax */ |
237 | pushq_cfi_reg rdi /* pt_regs->di */ | ||
238 | pushq_cfi_reg rsi /* pt_regs->si */ | ||
239 | pushq_cfi_reg rdx /* pt_regs->dx */ | ||
240 | pushq_cfi_reg rcx /* pt_regs->cx */ | ||
241 | pushq_cfi $-ENOSYS /* pt_regs->ax */ | ||
242 | pushq_cfi_reg r8 /* pt_regs->r8 */ | ||
243 | pushq_cfi_reg r9 /* pt_regs->r9 */ | ||
244 | pushq_cfi_reg r10 /* pt_regs->r10 */ | ||
245 | pushq_cfi_reg r11 /* pt_regs->r11 */ | ||
246 | sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ | ||
247 | CFI_ADJUST_CFA_OFFSET 6*8 | ||
248 | |||
249 | testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) | ||
350 | jnz tracesys | 250 | jnz tracesys |
351 | system_call_fastpath: | 251 | system_call_fastpath: |
352 | #if __SYSCALL_MASK == ~0 | 252 | #if __SYSCALL_MASK == ~0 |
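
[Note] The system_call prologue above builds a struct pt_regs directly on the stack: it pushes ss, sp, flags, cs, ip, orig_ax and the caller-clobbered registers, then merely reserves 6*8 bytes for bp, bx and r12-r15 instead of storing them. Because the stack grows down, the push order is the reverse of the struct layout. A sketch of that layout (field order mirrors the x86_64 pt_regs, with every slot assumed to be 8 bytes; this is not the kernel header):

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    struct ptregs_sketch {
        uint64_t r15, r14, r13, r12, bp, bx;            /* slots only reserved */
        uint64_t r11, r10, r9, r8, ax, cx, dx, si, di;  /* pushed after the iret frame */
        uint64_t orig_ax;                               /* syscall number */
        uint64_t ip, cs, flags, sp, ss;                 /* pushed first (iret frame) */
    };

    int main(void)
    {
        printf("sizeof(pt_regs) = %zu\n", sizeof(struct ptregs_sketch));      /* 168 */
        printf("RSP offset      = %zu\n", offsetof(struct ptregs_sketch, sp));
        printf("ORIG_RAX offset = %zu\n", offsetof(struct ptregs_sketch, orig_ax));
        printf("unsaved slots   = %zu bytes\n", offsetof(struct ptregs_sketch, r11));
        return 0;
    }

The 48-byte hole at the start of the structure is why the fast path can skip SAVE_EXTRA_REGS entirely: the C ABI guarantees those registers survive the call anyway.
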
@@ -355,18 +255,21 @@ system_call_fastpath: | |||
355 | andl $__SYSCALL_MASK,%eax | 255 | andl $__SYSCALL_MASK,%eax |
356 | cmpl $__NR_syscall_max,%eax | 256 | cmpl $__NR_syscall_max,%eax |
357 | #endif | 257 | #endif |
358 | ja ret_from_sys_call /* and return regs->ax */ | 258 | ja 1f /* return -ENOSYS (already in pt_regs->ax) */ |
359 | movq %r10,%rcx | 259 | movq %r10,%rcx |
360 | call *sys_call_table(,%rax,8) # XXX: rip relative | 260 | call *sys_call_table(,%rax,8) |
361 | movq %rax,RAX-ARGOFFSET(%rsp) | 261 | movq %rax,RAX(%rsp) |
262 | 1: | ||
362 | /* | 263 | /* |
363 | * Syscall return path ending with SYSRET (fast path) | 264 | * Syscall return path ending with SYSRET (fast path). |
364 | * Has incomplete stack frame and undefined top of stack. | 265 | * Has incompletely filled pt_regs. |
365 | */ | 266 | */ |
366 | ret_from_sys_call: | ||
367 | LOCKDEP_SYS_EXIT | 267 | LOCKDEP_SYS_EXIT |
268 | /* | ||
269 | * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, | ||
270 | * it is too small to ever cause noticeable irq latency. | ||
271 | */ | ||
368 | DISABLE_INTERRUPTS(CLBR_NONE) | 272 | DISABLE_INTERRUPTS(CLBR_NONE) |
369 | TRACE_IRQS_OFF | ||
370 | 273 | ||
371 | /* | 274 | /* |
372 | * We must check ti flags with interrupts (or at least preemption) | 275 | * We must check ti flags with interrupts (or at least preemption) |
@@ -376,72 +279,73 @@ ret_from_sys_call: | |||
376 | * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is | 279 | * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is |
377 | * very bad. | 280 | * very bad. |
378 | */ | 281 | */ |
379 | testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 282 | testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) |
380 | jnz int_ret_from_sys_call_fixup /* Go the the slow path */ | 283 | jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ |
381 | 284 | ||
382 | CFI_REMEMBER_STATE | 285 | CFI_REMEMBER_STATE |
383 | /* | 286 | |
384 | * sysretq will re-enable interrupts: | 287 | RESTORE_C_REGS_EXCEPT_RCX_R11 |
385 | */ | 288 | movq RIP(%rsp),%rcx |
386 | TRACE_IRQS_ON | ||
387 | movq RIP-ARGOFFSET(%rsp),%rcx | ||
388 | CFI_REGISTER rip,rcx | 289 | CFI_REGISTER rip,rcx |
389 | RESTORE_ARGS 1,-ARG_SKIP,0 | 290 | movq EFLAGS(%rsp),%r11 |
390 | /*CFI_REGISTER rflags,r11*/ | 291 | /*CFI_REGISTER rflags,r11*/ |
391 | movq PER_CPU_VAR(old_rsp), %rsp | 292 | movq RSP(%rsp),%rsp |
293 | /* | ||
294 | * 64bit SYSRET restores rip from rcx, | ||
295 | * rflags from r11 (but RF and VM bits are forced to 0), | ||
296 | * cs and ss are loaded from MSRs. | ||
297 | * Restoration of rflags re-enables interrupts. | ||
298 | */ | ||
392 | USERGS_SYSRET64 | 299 | USERGS_SYSRET64 |
393 | 300 | ||
394 | CFI_RESTORE_STATE | 301 | CFI_RESTORE_STATE |
395 | 302 | ||
396 | int_ret_from_sys_call_fixup: | 303 | /* Do syscall entry tracing */ |
397 | FIXUP_TOP_OF_STACK %r11, -ARGOFFSET | ||
398 | jmp int_ret_from_sys_call_irqs_off | ||
399 | |||
400 | /* Do syscall tracing */ | ||
401 | tracesys: | 304 | tracesys: |
402 | leaq -REST_SKIP(%rsp), %rdi | 305 | movq %rsp, %rdi |
403 | movq $AUDIT_ARCH_X86_64, %rsi | 306 | movl $AUDIT_ARCH_X86_64, %esi |
404 | call syscall_trace_enter_phase1 | 307 | call syscall_trace_enter_phase1 |
405 | test %rax, %rax | 308 | test %rax, %rax |
406 | jnz tracesys_phase2 /* if needed, run the slow path */ | 309 | jnz tracesys_phase2 /* if needed, run the slow path */ |
407 | LOAD_ARGS 0 /* else restore clobbered regs */ | 310 | RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ |
311 | movq ORIG_RAX(%rsp), %rax | ||
408 | jmp system_call_fastpath /* and return to the fast path */ | 312 | jmp system_call_fastpath /* and return to the fast path */ |
409 | 313 | ||
410 | tracesys_phase2: | 314 | tracesys_phase2: |
411 | SAVE_REST | 315 | SAVE_EXTRA_REGS |
412 | FIXUP_TOP_OF_STACK %rdi | ||
413 | movq %rsp, %rdi | 316 | movq %rsp, %rdi |
414 | movq $AUDIT_ARCH_X86_64, %rsi | 317 | movl $AUDIT_ARCH_X86_64, %esi |
415 | movq %rax,%rdx | 318 | movq %rax,%rdx |
416 | call syscall_trace_enter_phase2 | 319 | call syscall_trace_enter_phase2 |
417 | 320 | ||
418 | /* | 321 | /* |
419 | * Reload arg registers from stack in case ptrace changed them. | 322 | * Reload registers from stack in case ptrace changed them. |
420 | * We don't reload %rax because syscall_trace_entry_phase2() returned | 323 | * We don't reload %rax because syscall_trace_entry_phase2() returned |
421 | * the value it wants us to use in the table lookup. | 324 | * the value it wants us to use in the table lookup. |
422 | */ | 325 | */ |
423 | LOAD_ARGS ARGOFFSET, 1 | 326 | RESTORE_C_REGS_EXCEPT_RAX |
424 | RESTORE_REST | 327 | RESTORE_EXTRA_REGS |
425 | #if __SYSCALL_MASK == ~0 | 328 | #if __SYSCALL_MASK == ~0 |
426 | cmpq $__NR_syscall_max,%rax | 329 | cmpq $__NR_syscall_max,%rax |
427 | #else | 330 | #else |
428 | andl $__SYSCALL_MASK,%eax | 331 | andl $__SYSCALL_MASK,%eax |
429 | cmpl $__NR_syscall_max,%eax | 332 | cmpl $__NR_syscall_max,%eax |
430 | #endif | 333 | #endif |
431 | ja int_ret_from_sys_call /* RAX(%rsp) is already set */ | 334 | ja 1f /* return -ENOSYS (already in pt_regs->ax) */ |
432 | movq %r10,%rcx /* fixup for C */ | 335 | movq %r10,%rcx /* fixup for C */ |
433 | call *sys_call_table(,%rax,8) | 336 | call *sys_call_table(,%rax,8) |
434 | movq %rax,RAX-ARGOFFSET(%rsp) | 337 | movq %rax,RAX(%rsp) |
435 | /* Use IRET because user could have changed frame */ | 338 | 1: |
339 | /* Use IRET because user could have changed pt_regs->foo */ | ||
436 | 340 | ||
437 | /* | 341 | /* |
438 | * Syscall return path ending with IRET. | 342 | * Syscall return path ending with IRET. |
439 | * Has correct top of stack, but partial stack frame. | 343 | * Has correct iret frame. |
440 | */ | 344 | */ |
441 | GLOBAL(int_ret_from_sys_call) | 345 | GLOBAL(int_ret_from_sys_call) |
442 | DISABLE_INTERRUPTS(CLBR_NONE) | 346 | DISABLE_INTERRUPTS(CLBR_NONE) |
347 | int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ | ||
443 | TRACE_IRQS_OFF | 348 | TRACE_IRQS_OFF |
444 | int_ret_from_sys_call_irqs_off: | ||
445 | movl $_TIF_ALLWORK_MASK,%edi | 349 | movl $_TIF_ALLWORK_MASK,%edi |
446 | /* edi: mask to check */ | 350 | /* edi: mask to check */ |
447 | GLOBAL(int_with_check) | 351 | GLOBAL(int_with_check) |
@@ -450,8 +354,8 @@ GLOBAL(int_with_check) | |||
450 | movl TI_flags(%rcx),%edx | 354 | movl TI_flags(%rcx),%edx |
451 | andl %edi,%edx | 355 | andl %edi,%edx |
452 | jnz int_careful | 356 | jnz int_careful |
453 | andl $~TS_COMPAT,TI_status(%rcx) | 357 | andl $~TS_COMPAT,TI_status(%rcx) |
454 | jmp retint_swapgs | 358 | jmp syscall_return |
455 | 359 | ||
456 | /* Either reschedule or signal or syscall exit tracking needed. */ | 360 | /* Either reschedule or signal or syscall exit tracking needed. */ |
457 | /* First do a reschedule test. */ | 361 | /* First do a reschedule test. */ |
@@ -468,12 +372,11 @@ int_careful: | |||
468 | TRACE_IRQS_OFF | 372 | TRACE_IRQS_OFF |
469 | jmp int_with_check | 373 | jmp int_with_check |
470 | 374 | ||
471 | /* handle signals and tracing -- both require a full stack frame */ | 375 | /* handle signals and tracing -- both require a full pt_regs */ |
472 | int_very_careful: | 376 | int_very_careful: |
473 | TRACE_IRQS_ON | 377 | TRACE_IRQS_ON |
474 | ENABLE_INTERRUPTS(CLBR_NONE) | 378 | ENABLE_INTERRUPTS(CLBR_NONE) |
475 | int_check_syscall_exit_work: | 379 | SAVE_EXTRA_REGS |
476 | SAVE_REST | ||
477 | /* Check for syscall exit trace */ | 380 | /* Check for syscall exit trace */ |
478 | testl $_TIF_WORK_SYSCALL_EXIT,%edx | 381 | testl $_TIF_WORK_SYSCALL_EXIT,%edx |
479 | jz int_signal | 382 | jz int_signal |
@@ -492,86 +395,192 @@ int_signal: | |||
492 | call do_notify_resume | 395 | call do_notify_resume |
493 | 1: movl $_TIF_WORK_MASK,%edi | 396 | 1: movl $_TIF_WORK_MASK,%edi |
494 | int_restore_rest: | 397 | int_restore_rest: |
495 | RESTORE_REST | 398 | RESTORE_EXTRA_REGS |
496 | DISABLE_INTERRUPTS(CLBR_NONE) | 399 | DISABLE_INTERRUPTS(CLBR_NONE) |
497 | TRACE_IRQS_OFF | 400 | TRACE_IRQS_OFF |
498 | jmp int_with_check | 401 | jmp int_with_check |
402 | |||
403 | syscall_return: | ||
404 | /* The IRETQ could re-enable interrupts: */ | ||
405 | DISABLE_INTERRUPTS(CLBR_ANY) | ||
406 | TRACE_IRQS_IRETQ | ||
407 | |||
408 | /* | ||
409 | * Try to use SYSRET instead of IRET if we're returning to | ||
410 | * a completely clean 64-bit userspace context. | ||
411 | */ | ||
412 | movq RCX(%rsp),%rcx | ||
413 | cmpq %rcx,RIP(%rsp) /* RCX == RIP */ | ||
414 | jne opportunistic_sysret_failed | ||
415 | |||
416 | /* | ||
417 | * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP | ||
418 | * in kernel space. This essentially lets the user take over | ||
419 | * the kernel, since userspace controls RSP. It's not worth | ||
420 | * testing for canonicalness exactly -- this check detects any | ||
421 | * of the 17 high bits set, which is true for non-canonical | ||
422 | * or kernel addresses. (This will pessimize vsyscall=native. | ||
423 | * Big deal.) | ||
424 | * | ||
425 | * If virtual addresses ever become wider, this will need | ||
426 | * to be updated to remain correct on both old and new CPUs. | ||
427 | */ | ||
428 | .ifne __VIRTUAL_MASK_SHIFT - 47 | ||
429 | .error "virtual address width changed -- SYSRET checks need update" | ||
430 | .endif | ||
431 | shr $__VIRTUAL_MASK_SHIFT, %rcx | ||
432 | jnz opportunistic_sysret_failed | ||
433 | |||
434 | cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ | ||
435 | jne opportunistic_sysret_failed | ||
436 | |||
437 | movq R11(%rsp),%r11 | ||
438 | cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ | ||
439 | jne opportunistic_sysret_failed | ||
440 | |||
441 | /* | ||
442 | * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, | ||
443 | * restoring TF results in a trap from userspace immediately after | ||
444 | * SYSRET. This would cause an infinite loop whenever #DB happens | ||
445 | * with register state that satisfies the opportunistic SYSRET | ||
446 | * conditions. For example, single-stepping this user code: | ||
447 | * | ||
448 | * movq $stuck_here,%rcx | ||
449 | * pushfq | ||
450 | * popq %r11 | ||
451 | * stuck_here: | ||
452 | * | ||
453 | * would never get past 'stuck_here'. | ||
454 | */ | ||
455 | testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 | ||
456 | jnz opportunistic_sysret_failed | ||
457 | |||
458 | /* nothing to check for RSP */ | ||
459 | |||
460 | cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ | ||
461 | jne opportunistic_sysret_failed | ||
462 | |||
463 | /* | ||
464 | * We win! This label is here just for ease of understanding | ||
465 | * perf profiles. Nothing jumps here. | ||
466 | */ | ||
467 | syscall_return_via_sysret: | ||
468 | CFI_REMEMBER_STATE | ||
469 | /* r11 is already restored (see code above) */ | ||
470 | RESTORE_C_REGS_EXCEPT_R11 | ||
471 | movq RSP(%rsp),%rsp | ||
472 | USERGS_SYSRET64 | ||
473 | CFI_RESTORE_STATE | ||
474 | |||
475 | opportunistic_sysret_failed: | ||
476 | SWAPGS | ||
477 | jmp restore_c_regs_and_iret | ||
499 | CFI_ENDPROC | 478 | CFI_ENDPROC |
500 | END(system_call) | 479 | END(system_call) |
501 | 480 | ||
481 | |||
502 | .macro FORK_LIKE func | 482 | .macro FORK_LIKE func |
503 | ENTRY(stub_\func) | 483 | ENTRY(stub_\func) |
504 | CFI_STARTPROC | 484 | CFI_STARTPROC |
505 | popq %r11 /* save return address */ | 485 | DEFAULT_FRAME 0, 8 /* offset 8: return address */ |
506 | PARTIAL_FRAME 0 | 486 | SAVE_EXTRA_REGS 8 |
507 | SAVE_REST | 487 | jmp sys_\func |
508 | pushq %r11 /* put it back on stack */ | ||
509 | FIXUP_TOP_OF_STACK %r11, 8 | ||
510 | DEFAULT_FRAME 0 8 /* offset 8: return address */ | ||
511 | call sys_\func | ||
512 | RESTORE_TOP_OF_STACK %r11, 8 | ||
513 | ret $REST_SKIP /* pop extended registers */ | ||
514 | CFI_ENDPROC | 488 | CFI_ENDPROC |
515 | END(stub_\func) | 489 | END(stub_\func) |
516 | .endm | 490 | .endm |
517 | 491 | ||
518 | .macro FIXED_FRAME label,func | ||
519 | ENTRY(\label) | ||
520 | CFI_STARTPROC | ||
521 | PARTIAL_FRAME 0 8 /* offset 8: return address */ | ||
522 | FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET | ||
523 | call \func | ||
524 | RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET | ||
525 | ret | ||
526 | CFI_ENDPROC | ||
527 | END(\label) | ||
528 | .endm | ||
529 | |||
530 | FORK_LIKE clone | 492 | FORK_LIKE clone |
531 | FORK_LIKE fork | 493 | FORK_LIKE fork |
532 | FORK_LIKE vfork | 494 | FORK_LIKE vfork |
533 | FIXED_FRAME stub_iopl, sys_iopl | ||
534 | 495 | ||
535 | ENTRY(stub_execve) | 496 | ENTRY(stub_execve) |
536 | CFI_STARTPROC | 497 | CFI_STARTPROC |
537 | addq $8, %rsp | 498 | DEFAULT_FRAME 0, 8 |
538 | PARTIAL_FRAME 0 | 499 | call sys_execve |
539 | SAVE_REST | 500 | return_from_execve: |
540 | FIXUP_TOP_OF_STACK %r11 | 501 | testl %eax, %eax |
541 | call sys_execve | 502 | jz 1f |
542 | movq %rax,RAX(%rsp) | 503 | /* exec failed, can use fast SYSRET code path in this case */ |
543 | RESTORE_REST | 504 | ret |
544 | jmp int_ret_from_sys_call | 505 | 1: |
506 | /* must use IRET code path (pt_regs->cs may have changed) */ | ||
507 | addq $8, %rsp | ||
508 | CFI_ADJUST_CFA_OFFSET -8 | ||
509 | ZERO_EXTRA_REGS | ||
510 | movq %rax,RAX(%rsp) | ||
511 | jmp int_ret_from_sys_call | ||
545 | CFI_ENDPROC | 512 | CFI_ENDPROC |
546 | END(stub_execve) | 513 | END(stub_execve) |
547 | 514 | /* | |
548 | ENTRY(stub_execveat) | 515 | * Remaining execve stubs are only 7 bytes long. |
516 | * ENTRY() often aligns to 16 bytes, which in this case has no benefits. | ||
517 | */ | ||
518 | .align 8 | ||
519 | GLOBAL(stub_execveat) | ||
549 | CFI_STARTPROC | 520 | CFI_STARTPROC |
550 | addq $8, %rsp | 521 | DEFAULT_FRAME 0, 8 |
551 | PARTIAL_FRAME 0 | 522 | call sys_execveat |
552 | SAVE_REST | 523 | jmp return_from_execve |
553 | FIXUP_TOP_OF_STACK %r11 | ||
554 | call sys_execveat | ||
555 | RESTORE_TOP_OF_STACK %r11 | ||
556 | movq %rax,RAX(%rsp) | ||
557 | RESTORE_REST | ||
558 | jmp int_ret_from_sys_call | ||
559 | CFI_ENDPROC | 524 | CFI_ENDPROC |
560 | END(stub_execveat) | 525 | END(stub_execveat) |
561 | 526 | ||
527 | #ifdef CONFIG_X86_X32_ABI | ||
528 | .align 8 | ||
529 | GLOBAL(stub_x32_execve) | ||
530 | CFI_STARTPROC | ||
531 | DEFAULT_FRAME 0, 8 | ||
532 | call compat_sys_execve | ||
533 | jmp return_from_execve | ||
534 | CFI_ENDPROC | ||
535 | END(stub_x32_execve) | ||
536 | .align 8 | ||
537 | GLOBAL(stub_x32_execveat) | ||
538 | CFI_STARTPROC | ||
539 | DEFAULT_FRAME 0, 8 | ||
540 | call compat_sys_execveat | ||
541 | jmp return_from_execve | ||
542 | CFI_ENDPROC | ||
543 | END(stub_x32_execveat) | ||
544 | #endif | ||
545 | |||
546 | #ifdef CONFIG_IA32_EMULATION | ||
547 | .align 8 | ||
548 | GLOBAL(stub32_execve) | ||
549 | CFI_STARTPROC | ||
550 | call compat_sys_execve | ||
551 | jmp return_from_execve | ||
552 | CFI_ENDPROC | ||
553 | END(stub32_execve) | ||
554 | .align 8 | ||
555 | GLOBAL(stub32_execveat) | ||
556 | CFI_STARTPROC | ||
557 | call compat_sys_execveat | ||
558 | jmp return_from_execve | ||
559 | CFI_ENDPROC | ||
560 | END(stub32_execveat) | ||
561 | #endif | ||
562 | |||
562 | /* | 563 | /* |
563 | * sigreturn is special because it needs to restore all registers on return. | 564 | * sigreturn is special because it needs to restore all registers on return. |
564 | * This cannot be done with SYSRET, so use the IRET return path instead. | 565 | * This cannot be done with SYSRET, so use the IRET return path instead. |
565 | */ | 566 | */ |
566 | ENTRY(stub_rt_sigreturn) | 567 | ENTRY(stub_rt_sigreturn) |
567 | CFI_STARTPROC | 568 | CFI_STARTPROC |
568 | addq $8, %rsp | 569 | DEFAULT_FRAME 0, 8 |
569 | PARTIAL_FRAME 0 | 570 | /* |
570 | SAVE_REST | 571 | * SAVE_EXTRA_REGS result is not normally needed: |
571 | FIXUP_TOP_OF_STACK %r11 | 572 | * sigreturn overwrites all pt_regs->GPREGS. |
573 | * But sigreturn can fail (!), and there is no easy way to detect that. | ||
574 | * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, | ||
575 | * we SAVE_EXTRA_REGS here. | ||
576 | */ | ||
577 | SAVE_EXTRA_REGS 8 | ||
572 | call sys_rt_sigreturn | 578 | call sys_rt_sigreturn |
573 | movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer | 579 | return_from_stub: |
574 | RESTORE_REST | 580 | addq $8, %rsp |
581 | CFI_ADJUST_CFA_OFFSET -8 | ||
582 | RESTORE_EXTRA_REGS | ||
583 | movq %rax,RAX(%rsp) | ||
575 | jmp int_ret_from_sys_call | 584 | jmp int_ret_from_sys_call |
576 | CFI_ENDPROC | 585 | CFI_ENDPROC |
577 | END(stub_rt_sigreturn) | 586 | END(stub_rt_sigreturn) |
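
[Note] The opportunistic-SYSRET path in the hunk above only takes SYSRET when RCX == RIP, R11 == RFLAGS, CS and SS are the user selectors, RF and TF are clear, and RIP has none of its 17 high bits set, since a non-canonical RIP would make SYSRET #GP in kernel mode on Intel CPUs. The address test reduces to a single shift. A sketch for 48-bit virtual addresses (__VIRTUAL_MASK_SHIFT == 47 assumed, as the .ifne guard in the hunk requires):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    #define VIRTUAL_MASK_SHIFT 47   /* 48-bit virtual addresses */

    /* True only for user-half canonical addresses: all 17 high bits clear.
     * Kernel addresses and non-canonical addresses both fail this test. */
    static bool rip_ok_for_sysret(uint64_t rip)
    {
        return (rip >> VIRTUAL_MASK_SHIFT) == 0;
    }

    int main(void)
    {
        printf("user rip       : %d\n", rip_ok_for_sysret(0x00007f0000001000ULL)); /* 1 */
        printf("non-canonical  : %d\n", rip_ok_for_sysret(0x0000800000000000ULL)); /* 0 */
        printf("kernel address : %d\n", rip_ok_for_sysret(0xffffffff81000000ULL)); /* 0 */
        return 0;
    }

Rejecting the whole upper half is coarser than a strict canonicalness test, but as the hunk's comment notes, the only thing it pessimizes is returns into kernel-half addresses, which must go through IRET anyway.
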
@@ -579,86 +588,70 @@ END(stub_rt_sigreturn) | |||
579 | #ifdef CONFIG_X86_X32_ABI | 588 | #ifdef CONFIG_X86_X32_ABI |
580 | ENTRY(stub_x32_rt_sigreturn) | 589 | ENTRY(stub_x32_rt_sigreturn) |
581 | CFI_STARTPROC | 590 | CFI_STARTPROC |
582 | addq $8, %rsp | 591 | DEFAULT_FRAME 0, 8 |
583 | PARTIAL_FRAME 0 | 592 | SAVE_EXTRA_REGS 8 |
584 | SAVE_REST | ||
585 | FIXUP_TOP_OF_STACK %r11 | ||
586 | call sys32_x32_rt_sigreturn | 593 | call sys32_x32_rt_sigreturn |
587 | movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer | 594 | jmp return_from_stub |
588 | RESTORE_REST | ||
589 | jmp int_ret_from_sys_call | ||
590 | CFI_ENDPROC | 595 | CFI_ENDPROC |
591 | END(stub_x32_rt_sigreturn) | 596 | END(stub_x32_rt_sigreturn) |
597 | #endif | ||
592 | 598 | ||
593 | ENTRY(stub_x32_execve) | 599 | /* |
594 | CFI_STARTPROC | 600 | * A newly forked process directly context switches into this address. |
595 | addq $8, %rsp | 601 | * |
596 | PARTIAL_FRAME 0 | 602 | * rdi: prev task we switched from |
597 | SAVE_REST | 603 | */ |
598 | FIXUP_TOP_OF_STACK %r11 | 604 | ENTRY(ret_from_fork) |
599 | call compat_sys_execve | 605 | DEFAULT_FRAME |
600 | RESTORE_TOP_OF_STACK %r11 | ||
601 | movq %rax,RAX(%rsp) | ||
602 | RESTORE_REST | ||
603 | jmp int_ret_from_sys_call | ||
604 | CFI_ENDPROC | ||
605 | END(stub_x32_execve) | ||
606 | 606 | ||
607 | ENTRY(stub_x32_execveat) | 607 | LOCK ; btr $TIF_FORK,TI_flags(%r8) |
608 | CFI_STARTPROC | 608 | |
609 | addq $8, %rsp | 609 | pushq_cfi $0x0002 |
610 | PARTIAL_FRAME 0 | 610 | popfq_cfi # reset kernel eflags |
611 | SAVE_REST | 611 | |
612 | FIXUP_TOP_OF_STACK %r11 | 612 | call schedule_tail # rdi: 'prev' task parameter |
613 | call compat_sys_execveat | 613 | |
614 | RESTORE_TOP_OF_STACK %r11 | 614 | RESTORE_EXTRA_REGS |
615 | movq %rax,RAX(%rsp) | 615 | |
616 | RESTORE_REST | 616 | testl $3,CS(%rsp) # from kernel_thread? |
617 | |||
618 | /* | ||
619 | * By the time we get here, we have no idea whether our pt_regs, | ||
620 | * ti flags, and ti status came from the 64-bit SYSCALL fast path, | ||
621 | * the slow path, or one of the ia32entry paths. | ||
622 | * Use IRET code path to return, since it can safely handle | ||
623 | * all of the above. | ||
624 | */ | ||
625 | jnz int_ret_from_sys_call | ||
626 | |||
627 | /* We came from kernel_thread */ | ||
628 | /* nb: we depend on RESTORE_EXTRA_REGS above */ | ||
629 | movq %rbp, %rdi | ||
630 | call *%rbx | ||
631 | movl $0, RAX(%rsp) | ||
632 | RESTORE_EXTRA_REGS | ||
617 | jmp int_ret_from_sys_call | 633 | jmp int_ret_from_sys_call |
618 | CFI_ENDPROC | 634 | CFI_ENDPROC |
619 | END(stub_x32_execveat) | 635 | END(ret_from_fork) |
620 | |||
621 | #endif | ||
622 | 636 | ||
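A rough C restatement of the new ret_from_fork above, for readers not following the asm: it assumes, as the comments imply, that for a kernel thread rbx holds the callback and rbp its argument (set up by copy_thread), and all function and parameter names below are illustrative, not kernel source.

    #include <linux/sched.h>
    #include <asm/ptrace.h>

    /* Illustrative sketch only -- not kernel source. */
    static void ret_from_fork_sketch(struct task_struct *prev, struct pt_regs *regs,
                                     int (*fn)(void *), void *arg)
    {
            schedule_tail(prev);            /* rdi: 'prev' task parameter */

            if ((regs->cs & 3) == 0) {      /* testl $3,CS(%rsp): came from kernel_thread */
                    fn(arg);                /* call *%rbx with %rbp as the argument */
                    regs->ax = 0;           /* movl $0, RAX(%rsp) */
            }
            /* either way, leave through the IRET path (int_ret_from_sys_call) */
    }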
623 | /* | 637 | /* |
624 | * Build the entry stubs and pointer table with some assembler magic. | 638 | * Build the entry stubs with some assembler magic. |
625 | * We pack 7 stubs into a single 32-byte chunk, which will fit in a | 639 | * We pack 1 stub into every 8-byte block. |
626 | * single cache line on all modern x86 implementations. | ||
627 | */ | 640 | */ |
628 | .section .init.rodata,"a" | 641 | .align 8 |
629 | ENTRY(interrupt) | ||
630 | .section .entry.text | ||
631 | .p2align 5 | ||
632 | .p2align CONFIG_X86_L1_CACHE_SHIFT | ||
633 | ENTRY(irq_entries_start) | 642 | ENTRY(irq_entries_start) |
634 | INTR_FRAME | 643 | INTR_FRAME |
635 | vector=FIRST_EXTERNAL_VECTOR | 644 | vector=FIRST_EXTERNAL_VECTOR |
636 | .rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 | 645 | .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) |
637 | .balign 32 | 646 | pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ |
638 | .rept 7 | 647 | vector=vector+1 |
639 | .if vector < FIRST_SYSTEM_VECTOR | 648 | jmp common_interrupt |
640 | .if vector <> FIRST_EXTERNAL_VECTOR | ||
641 | CFI_ADJUST_CFA_OFFSET -8 | 649 | CFI_ADJUST_CFA_OFFSET -8 |
642 | .endif | 650 | .align 8 |
643 | 1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ | 651 | .endr |
644 | .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 | ||
645 | jmp 2f | ||
646 | .endif | ||
647 | .previous | ||
648 | .quad 1b | ||
649 | .section .entry.text | ||
650 | vector=vector+1 | ||
651 | .endif | ||
652 | .endr | ||
653 | 2: jmp common_interrupt | ||
654 | .endr | ||
655 | CFI_ENDPROC | 652 | CFI_ENDPROC |
656 | END(irq_entries_start) | 653 | END(irq_entries_start) |
657 | 654 | ||
658 | .previous | ||
659 | END(interrupt) | ||
660 | .previous | ||
661 | |||
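On the new layout: each stub is a push imm8 (2 bytes) plus a jmp (at most 5 bytes), so it fits its 8-byte slot, and irqinit.c further down in this diff computes a stub's address as irq_entries_start + 8 * (vector - FIRST_EXTERNAL_VECTOR). The pushed value ~vector + 0x80 stays in signed-byte range, and after common_interrupt's addq $-0x80 the handler can recover the vector as ~orig_ax. A stand-alone check of that arithmetic (plain C, not kernel code; 32 and 256 stand in for the external-vector bounds):

    #include <assert.h>

    int main(void)
    {
            for (long vector = 32; vector < 256; vector++) {
                    long pushed = ~vector + 0x80;            /* pushq_cfi $(~vector+0x80) */
                    assert(pushed >= -128 && pushed <= 127); /* fits a sign-extended imm8 */
                    long orig_ax = pushed - 0x80;            /* addq $-0x80,(%rsp) */
                    assert(~orig_ax == vector);              /* vector recovered as ~orig_ax */
            }
            return 0;
    }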
662 | /* | 655 | /* |
663 | * Interrupt entry/exit. | 656 | * Interrupt entry/exit. |
664 | * | 657 | * |
@@ -669,47 +662,45 @@ END(interrupt) | |||
669 | 662 | ||
670 | /* 0(%rsp): ~(interrupt number) */ | 663 | /* 0(%rsp): ~(interrupt number) */ |
671 | .macro interrupt func | 664 | .macro interrupt func |
672 | /* reserve pt_regs for scratch regs and rbp */ | ||
673 | subq $ORIG_RAX-RBP, %rsp | ||
674 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP | ||
675 | cld | 665 | cld |
676 | /* start from rbp in pt_regs and jump over */ | 666 | /* |
677 | movq_cfi rdi, (RDI-RBP) | 667 | * Since nothing in interrupt handling code touches r12...r15 members |
678 | movq_cfi rsi, (RSI-RBP) | 668 | * of "struct pt_regs", and since interrupts can nest, we can save |
679 | movq_cfi rdx, (RDX-RBP) | 669 | * four stack slots and simultaneously provide |
680 | movq_cfi rcx, (RCX-RBP) | 670 | * an unwind-friendly stack layout by saving "truncated" pt_regs |
681 | movq_cfi rax, (RAX-RBP) | 671 | * exactly up to rbp slot, without these members. |
682 | movq_cfi r8, (R8-RBP) | 672 | */ |
683 | movq_cfi r9, (R9-RBP) | 673 | ALLOC_PT_GPREGS_ON_STACK -RBP |
684 | movq_cfi r10, (R10-RBP) | 674 | SAVE_C_REGS -RBP |
685 | movq_cfi r11, (R11-RBP) | 675 | /* this goes to 0(%rsp) for unwinder, not for saving the value: */ |
686 | 676 | SAVE_EXTRA_REGS_RBP -RBP | |
687 | /* Save rbp so that we can unwind from get_irq_regs() */ | ||
688 | movq_cfi rbp, 0 | ||
689 | |||
690 | /* Save previous stack value */ | ||
691 | movq %rsp, %rsi | ||
692 | 677 | ||
693 | leaq -RBP(%rsp),%rdi /* arg1 for handler */ | 678 | leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ |
694 | testl $3, CS-RBP(%rsi) | 679 | |
680 | testl $3, CS-RBP(%rsp) | ||
695 | je 1f | 681 | je 1f |
696 | SWAPGS | 682 | SWAPGS |
683 | 1: | ||
697 | /* | 684 | /* |
685 | * Save previous stack pointer, optionally switch to interrupt stack. | ||
698 | * irq_count is used to check if a CPU is already on an interrupt stack | 686 | * irq_count is used to check if a CPU is already on an interrupt stack |
699 | * or not. While this is essentially redundant with preempt_count it is | 687 | * or not. While this is essentially redundant with preempt_count it is |
700 | * a little cheaper to use a separate counter in the PDA (short of | 688 | * a little cheaper to use a separate counter in the PDA (short of |
701 | * moving irq_enter into assembly, which would be too much work) | 689 | * moving irq_enter into assembly, which would be too much work) |
702 | */ | 690 | */ |
703 | 1: incl PER_CPU_VAR(irq_count) | 691 | movq %rsp, %rsi |
692 | incl PER_CPU_VAR(irq_count) | ||
704 | cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp | 693 | cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp |
705 | CFI_DEF_CFA_REGISTER rsi | 694 | CFI_DEF_CFA_REGISTER rsi |
706 | |||
707 | /* Store previous stack value */ | ||
708 | pushq %rsi | 695 | pushq %rsi |
696 | /* | ||
697 | * For debugger: | ||
698 | * "CFA (Current Frame Address) is the value on stack + offset" | ||
699 | */ | ||
709 | CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ | 700 | CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ |
710 | 0x77 /* DW_OP_breg7 */, 0, \ | 701 | 0x77 /* DW_OP_breg7 (rsp) */, 0, \ |
711 | 0x06 /* DW_OP_deref */, \ | 702 | 0x06 /* DW_OP_deref */, \ |
712 | 0x08 /* DW_OP_const1u */, SS+8-RBP, \ | 703 | 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ |
713 | 0x22 /* DW_OP_plus */ | 704 | 0x22 /* DW_OP_plus */ |
714 | /* We entered an interrupt context - irqs are off: */ | 705 | /* We entered an interrupt context - irqs are off: */ |
715 | TRACE_IRQS_OFF | 706 | TRACE_IRQS_OFF |
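The CFI_ESCAPE just above encodes DW_CFA_def_cfa_expression with the byte stream DW_OP_breg7(rsp)+0, DW_OP_deref, DW_OP_const1u(SIZEOF_PTREGS-RBP), DW_OP_plus; read back, that is CFA = *(%rsp) + (SIZEOF_PTREGS - RBP), i.e. the previous stack pointer just pushed, plus a constant. A tiny stand-alone evaluator of only that expression, with a made-up placeholder for the constant:

    #include <stdint.h>
    #include <stdio.h>

    /* Evaluate: DW_OP_breg7 0; DW_OP_deref; DW_OP_const1u k; DW_OP_plus. */
    static uint64_t cfa_from_expr(const uint64_t *rsp, uint8_t k)
    {
            return *rsp + k;        /* CFA = value on the stack + offset */
    }

    int main(void)
    {
            uint64_t saved_prev_rsp = 0xffff880000001000ull; /* made-up value at 0(%rsp) */

            printf("CFA = %#llx\n",
                   (unsigned long long)cfa_from_expr(&saved_prev_rsp, 0x78 /* placeholder */));
            return 0;
    }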
@@ -727,7 +718,7 @@ common_interrupt: | |||
727 | ASM_CLAC | 718 | ASM_CLAC |
728 | addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ | 719 | addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ |
729 | interrupt do_IRQ | 720 | interrupt do_IRQ |
730 | /* 0(%rsp): old_rsp-ARGOFFSET */ | 721 | /* 0(%rsp): old RSP */ |
731 | ret_from_intr: | 722 | ret_from_intr: |
732 | DISABLE_INTERRUPTS(CLBR_NONE) | 723 | DISABLE_INTERRUPTS(CLBR_NONE) |
733 | TRACE_IRQS_OFF | 724 | TRACE_IRQS_OFF |
@@ -735,19 +726,18 @@ ret_from_intr: | |||
735 | 726 | ||
736 | /* Restore saved previous stack */ | 727 | /* Restore saved previous stack */ |
737 | popq %rsi | 728 | popq %rsi |
738 | CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ | 729 | CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ |
739 | leaq ARGOFFSET-RBP(%rsi), %rsp | 730 | /* return code expects complete pt_regs - adjust rsp accordingly: */ |
731 | leaq -RBP(%rsi),%rsp | ||
740 | CFI_DEF_CFA_REGISTER rsp | 732 | CFI_DEF_CFA_REGISTER rsp |
741 | CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET | 733 | CFI_ADJUST_CFA_OFFSET RBP |
742 | 734 | ||
743 | exit_intr: | 735 | testl $3,CS(%rsp) |
744 | GET_THREAD_INFO(%rcx) | ||
745 | testl $3,CS-ARGOFFSET(%rsp) | ||
746 | je retint_kernel | 736 | je retint_kernel |
747 | |||
748 | /* Interrupt came from user space */ | 737 | /* Interrupt came from user space */ |
738 | |||
739 | GET_THREAD_INFO(%rcx) | ||
749 | /* | 740 | /* |
750 | * Has a correct top of stack, but a partial stack frame | ||
751 | * %rcx: thread info. Interrupts off. | 741 | * %rcx: thread info. Interrupts off. |
752 | */ | 742 | */ |
753 | retint_with_reschedule: | 743 | retint_with_reschedule: |
@@ -766,84 +756,34 @@ retint_swapgs: /* return to user-space */ | |||
766 | DISABLE_INTERRUPTS(CLBR_ANY) | 756 | DISABLE_INTERRUPTS(CLBR_ANY) |
767 | TRACE_IRQS_IRETQ | 757 | TRACE_IRQS_IRETQ |
768 | 758 | ||
769 | /* | ||
770 | * Try to use SYSRET instead of IRET if we're returning to | ||
771 | * a completely clean 64-bit userspace context. | ||
772 | */ | ||
773 | movq (RCX-R11)(%rsp), %rcx | ||
774 | cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */ | ||
775 | jne opportunistic_sysret_failed | ||
776 | |||
777 | /* | ||
778 | * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP | ||
779 | * in kernel space. This essentially lets the user take over | ||
780 | * the kernel, since userspace controls RSP. It's not worth | ||
781 | * testing for canonicalness exactly -- this check detects any | ||
782 | * of the 17 high bits set, which is true for non-canonical | ||
783 | * or kernel addresses. (This will pessimize vsyscall=native. | ||
784 | * Big deal.) | ||
785 | * | ||
786 | * If virtual addresses ever become wider, this will need | ||
787 | * to be updated to remain correct on both old and new CPUs. | ||
788 | */ | ||
789 | .ifne __VIRTUAL_MASK_SHIFT - 47 | ||
790 | .error "virtual address width changed -- sysret checks need update" | ||
791 | .endif | ||
792 | shr $__VIRTUAL_MASK_SHIFT, %rcx | ||
793 | jnz opportunistic_sysret_failed | ||
794 | |||
795 | cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */ | ||
796 | jne opportunistic_sysret_failed | ||
797 | |||
798 | movq (R11-ARGOFFSET)(%rsp), %r11 | ||
799 | cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */ | ||
800 | jne opportunistic_sysret_failed | ||
801 | |||
802 | /* | ||
803 | * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, | ||
804 | * restoring TF results in a trap from userspace immediately after | ||
805 | * SYSRET. This would cause an infinite loop whenever #DB happens | ||
806 | * with register state that satisfies the opportunistic SYSRET | ||
807 | * conditions. For example, single-stepping this user code: | ||
808 | * | ||
809 | * movq $stuck_here,%rcx | ||
810 | * pushfq | ||
811 | * popq %r11 | ||
812 | * stuck_here: | ||
813 | * | ||
814 | * would never get past 'stuck_here'. | ||
815 | */ | ||
816 | testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 | ||
817 | jnz opportunistic_sysret_failed | ||
818 | |||
819 | /* nothing to check for RSP */ | ||
820 | |||
821 | cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */ | ||
822 | jne opportunistic_sysret_failed | ||
823 | |||
824 | /* | ||
825 | * We win! This label is here just for ease of understanding | ||
826 | * perf profiles. Nothing jumps here. | ||
827 | */ | ||
828 | irq_return_via_sysret: | ||
829 | CFI_REMEMBER_STATE | ||
830 | RESTORE_ARGS 1,8,1 | ||
831 | movq (RSP-RIP)(%rsp),%rsp | ||
832 | USERGS_SYSRET64 | ||
833 | CFI_RESTORE_STATE | ||
834 | |||
835 | opportunistic_sysret_failed: | ||
836 | SWAPGS | 759 | SWAPGS |
837 | jmp restore_args | 760 | jmp restore_c_regs_and_iret |
838 | 761 | ||
839 | retint_restore_args: /* return to kernel space */ | 762 | /* Returning to kernel space */ |
840 | DISABLE_INTERRUPTS(CLBR_ANY) | 763 | retint_kernel: |
764 | #ifdef CONFIG_PREEMPT | ||
765 | /* Interrupts are off */ | ||
766 | /* Check if we need preemption */ | ||
767 | bt $9,EFLAGS(%rsp) /* interrupts were off? */ | ||
768 | jnc 1f | ||
769 | 0: cmpl $0,PER_CPU_VAR(__preempt_count) | ||
770 | jnz 1f | ||
771 | call preempt_schedule_irq | ||
772 | jmp 0b | ||
773 | 1: | ||
774 | #endif | ||
841 | /* | 775 | /* |
842 | * The iretq could re-enable interrupts: | 776 | * The iretq could re-enable interrupts: |
843 | */ | 777 | */ |
844 | TRACE_IRQS_IRETQ | 778 | TRACE_IRQS_IRETQ |
845 | restore_args: | 779 | |
846 | RESTORE_ARGS 1,8,1 | 780 | /* |
781 | * At this label, code paths which return to kernel and to user, | ||
782 | * which come from interrupts/exception and from syscalls, merge. | ||
783 | */ | ||
784 | restore_c_regs_and_iret: | ||
785 | RESTORE_C_REGS | ||
786 | REMOVE_PT_GPREGS_FROM_STACK 8 | ||
847 | 787 | ||
848 | irq_return: | 788 | irq_return: |
849 | INTERRUPT_RETURN | 789 | INTERRUPT_RETURN |
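retint_kernel above open-codes the CONFIG_PREEMPT policy that used to live in a separate ENTRY(retint_kernel) stub: preempt only if the interrupted kernel context had interrupts enabled (bit 9 of the saved EFLAGS) and the per-CPU __preempt_count is zero, re-checking after each preempt_schedule_irq(). A stand-alone restatement (plain C; the callbacks stand in for the per-CPU read and the kernel call):

    /* Not kernel code: mirrors the bt/jnc + cmpl/jnz + call/jmp sequence above. */
    static void maybe_preempt_on_irq_return(unsigned long saved_eflags,
                                            int (*read_preempt_count)(void),
                                            void (*preempt_schedule_irq)(void))
    {
            const unsigned long eflags_if = 1UL << 9;   /* bt $9,EFLAGS(%rsp) */

            if (!(saved_eflags & eflags_if))
                    return;                             /* interrupts were off: jnc 1f */
            while (read_preempt_count() == 0)           /* cmpl $0,__preempt_count */
                    preempt_schedule_irq();             /* then jmp 0b and re-check */
    }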
@@ -914,28 +854,17 @@ retint_signal: | |||
914 | jz retint_swapgs | 854 | jz retint_swapgs |
915 | TRACE_IRQS_ON | 855 | TRACE_IRQS_ON |
916 | ENABLE_INTERRUPTS(CLBR_NONE) | 856 | ENABLE_INTERRUPTS(CLBR_NONE) |
917 | SAVE_REST | 857 | SAVE_EXTRA_REGS |
918 | movq $-1,ORIG_RAX(%rsp) | 858 | movq $-1,ORIG_RAX(%rsp) |
919 | xorl %esi,%esi # oldset | 859 | xorl %esi,%esi # oldset |
920 | movq %rsp,%rdi # &pt_regs | 860 | movq %rsp,%rdi # &pt_regs |
921 | call do_notify_resume | 861 | call do_notify_resume |
922 | RESTORE_REST | 862 | RESTORE_EXTRA_REGS |
923 | DISABLE_INTERRUPTS(CLBR_NONE) | 863 | DISABLE_INTERRUPTS(CLBR_NONE) |
924 | TRACE_IRQS_OFF | 864 | TRACE_IRQS_OFF |
925 | GET_THREAD_INFO(%rcx) | 865 | GET_THREAD_INFO(%rcx) |
926 | jmp retint_with_reschedule | 866 | jmp retint_with_reschedule |
927 | 867 | ||
928 | #ifdef CONFIG_PREEMPT | ||
929 | /* Returning to kernel space. Check if we need preemption */ | ||
930 | /* rcx: threadinfo. interrupts off. */ | ||
931 | ENTRY(retint_kernel) | ||
932 | cmpl $0,PER_CPU_VAR(__preempt_count) | ||
933 | jnz retint_restore_args | ||
934 | bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ | ||
935 | jnc retint_restore_args | ||
936 | call preempt_schedule_irq | ||
937 | jmp exit_intr | ||
938 | #endif | ||
939 | CFI_ENDPROC | 868 | CFI_ENDPROC |
940 | END(common_interrupt) | 869 | END(common_interrupt) |
941 | 870 | ||
@@ -1024,7 +953,7 @@ apicinterrupt IRQ_WORK_VECTOR \ | |||
1024 | /* | 953 | /* |
1025 | * Exception entry points. | 954 | * Exception entry points. |
1026 | */ | 955 | */ |
1027 | #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) | 956 | #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) |
1028 | 957 | ||
1029 | .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 | 958 | .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 |
1030 | ENTRY(\sym) | 959 | ENTRY(\sym) |
@@ -1046,8 +975,7 @@ ENTRY(\sym) | |||
1046 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ | 975 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
1047 | .endif | 976 | .endif |
1048 | 977 | ||
1049 | subq $ORIG_RAX-R15, %rsp | 978 | ALLOC_PT_GPREGS_ON_STACK |
1050 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 | ||
1051 | 979 | ||
1052 | .if \paranoid | 980 | .if \paranoid |
1053 | .if \paranoid == 1 | 981 | .if \paranoid == 1 |
@@ -1055,10 +983,11 @@ ENTRY(\sym) | |||
1055 | testl $3, CS(%rsp) /* If coming from userspace, switch */ | 983 | testl $3, CS(%rsp) /* If coming from userspace, switch */ |
1056 | jnz 1f /* stacks. */ | 984 | jnz 1f /* stacks. */ |
1057 | .endif | 985 | .endif |
1058 | call save_paranoid | 986 | call paranoid_entry |
1059 | .else | 987 | .else |
1060 | call error_entry | 988 | call error_entry |
1061 | .endif | 989 | .endif |
990 | /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ | ||
1062 | 991 | ||
1063 | DEFAULT_FRAME 0 | 992 | DEFAULT_FRAME 0 |
1064 | 993 | ||
@@ -1080,19 +1009,20 @@ ENTRY(\sym) | |||
1080 | .endif | 1009 | .endif |
1081 | 1010 | ||
1082 | .if \shift_ist != -1 | 1011 | .if \shift_ist != -1 |
1083 | subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) | 1012 | subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) |
1084 | .endif | 1013 | .endif |
1085 | 1014 | ||
1086 | call \do_sym | 1015 | call \do_sym |
1087 | 1016 | ||
1088 | .if \shift_ist != -1 | 1017 | .if \shift_ist != -1 |
1089 | addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) | 1018 | addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) |
1090 | .endif | 1019 | .endif |
1091 | 1020 | ||
1021 | /* these procedures expect "no swapgs" flag in ebx */ | ||
1092 | .if \paranoid | 1022 | .if \paranoid |
1093 | jmp paranoid_exit /* %ebx: no swapgs flag */ | 1023 | jmp paranoid_exit |
1094 | .else | 1024 | .else |
1095 | jmp error_exit /* %ebx: no swapgs flag */ | 1025 | jmp error_exit |
1096 | .endif | 1026 | .endif |
1097 | 1027 | ||
1098 | .if \paranoid == 1 | 1028 | .if \paranoid == 1 |
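For idtentry's shift_ist case above: CPU_TSS_IST indexes the ist[] array in the per-CPU cpu_tss, and the stub moves that IST pointer down by EXCEPTION_STKSZ around the handler call so a nested exception on the same IST lands on a fresh slice of stack. Roughly, in C (a sketch; the field names follow the usual tss_struct/x86_hw_tss layout and are not taken from this diff):

    /* Sketch only: what the .if \shift_ist != -1 bracketing amounts to. */
    static void call_with_shifted_ist(struct tss_struct *tss, int shift_ist,
                                      void (*do_sym)(struct pt_regs *, long),
                                      struct pt_regs *regs, long error_code)
    {
            tss->x86_tss.ist[shift_ist - 1] -= EXCEPTION_STKSZ; /* subq ... CPU_TSS_IST(\shift_ist) */
            do_sym(regs, error_code);
            tss->x86_tss.ist[shift_ist - 1] += EXCEPTION_STKSZ; /* addq ... CPU_TSS_IST(\shift_ist) */
    }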
@@ -1296,7 +1226,9 @@ ENTRY(xen_failsafe_callback) | |||
1296 | addq $0x30,%rsp | 1226 | addq $0x30,%rsp |
1297 | CFI_ADJUST_CFA_OFFSET -0x30 | 1227 | CFI_ADJUST_CFA_OFFSET -0x30 |
1298 | pushq_cfi $-1 /* orig_ax = -1 => not a system call */ | 1228 | pushq_cfi $-1 /* orig_ax = -1 => not a system call */ |
1299 | SAVE_ALL | 1229 | ALLOC_PT_GPREGS_ON_STACK |
1230 | SAVE_C_REGS | ||
1231 | SAVE_EXTRA_REGS | ||
1300 | jmp error_exit | 1232 | jmp error_exit |
1301 | CFI_ENDPROC | 1233 | CFI_ENDPROC |
1302 | END(xen_failsafe_callback) | 1234 | END(xen_failsafe_callback) |
@@ -1328,59 +1260,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 | |||
1328 | idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) | 1260 | idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) |
1329 | #endif | 1261 | #endif |
1330 | 1262 | ||
1331 | /* | 1263 | /* |
1332 | * "Paranoid" exit path from exception stack. This is invoked | 1264 | * Save all registers in pt_regs, and switch gs if needed. |
1333 | * only on return from non-NMI IST interrupts that came | 1265 | * Use slow, but surefire "are we in kernel?" check. |
1334 | * from kernel space. | 1266 | * Return: ebx=0: need swapgs on exit, ebx=1: otherwise |
1335 | * | 1267 | */ |
1336 | * We may be returning to very strange contexts (e.g. very early | 1268 | ENTRY(paranoid_entry) |
1337 | * in syscall entry), so checking for preemption here would | 1269 | XCPT_FRAME 1 15*8 |
1338 | * be complicated. Fortunately, there's no good reason | 1270 | cld |
1339 | * to try to handle preemption here. | 1271 | SAVE_C_REGS 8 |
1340 | */ | 1272 | SAVE_EXTRA_REGS 8 |
1273 | movl $1,%ebx | ||
1274 | movl $MSR_GS_BASE,%ecx | ||
1275 | rdmsr | ||
1276 | testl %edx,%edx | ||
1277 | js 1f /* negative -> in kernel */ | ||
1278 | SWAPGS | ||
1279 | xorl %ebx,%ebx | ||
1280 | 1: ret | ||
1281 | CFI_ENDPROC | ||
1282 | END(paranoid_entry) | ||
1341 | 1283 | ||
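paranoid_entry's "are we in kernel?" check reads MSR_GS_BASE and looks only at the sign of its high half: the kernel's per-CPU GS base is a kernel address, i.e. negative when viewed as signed, so a non-negative high word means user GS is still loaded, SWAPGS is done now, and ebx is cleared so it is done again on exit. (error_entry, further down, makes the same decision from the saved CS instead.) A stand-alone restatement of the test:

    #include <stdbool.h>
    #include <stdint.h>

    /* Not kernel code: gs_base is the raw 64-bit MSR_GS_BASE value (edx:eax). */
    static bool need_swapgs(uint64_t gs_base)
    {
            int32_t high = (int32_t)(gs_base >> 32);    /* %edx after rdmsr */

            return high >= 0;                           /* js 1f: negative => already kernel GS */
    }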
1342 | /* ebx: no swapgs flag */ | 1284 | /* |
1285 | * "Paranoid" exit path from exception stack. This is invoked | ||
1286 | * only on return from non-NMI IST interrupts that came | ||
1287 | * from kernel space. | ||
1288 | * | ||
1289 | * We may be returning to very strange contexts (e.g. very early | ||
1290 | * in syscall entry), so checking for preemption here would | ||
1291 | * be complicated. Fortunately, there's no good reason | ||
1292 | * to try to handle preemption here. | ||
1293 | */ | ||
1294 | /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ | ||
1343 | ENTRY(paranoid_exit) | 1295 | ENTRY(paranoid_exit) |
1344 | DEFAULT_FRAME | 1296 | DEFAULT_FRAME |
1345 | DISABLE_INTERRUPTS(CLBR_NONE) | 1297 | DISABLE_INTERRUPTS(CLBR_NONE) |
1346 | TRACE_IRQS_OFF_DEBUG | 1298 | TRACE_IRQS_OFF_DEBUG |
1347 | testl %ebx,%ebx /* swapgs needed? */ | 1299 | testl %ebx,%ebx /* swapgs needed? */ |
1348 | jnz paranoid_restore | 1300 | jnz paranoid_exit_no_swapgs |
1349 | TRACE_IRQS_IRETQ 0 | 1301 | TRACE_IRQS_IRETQ |
1350 | SWAPGS_UNSAFE_STACK | 1302 | SWAPGS_UNSAFE_STACK |
1351 | RESTORE_ALL 8 | 1303 | jmp paranoid_exit_restore |
1352 | INTERRUPT_RETURN | 1304 | paranoid_exit_no_swapgs: |
1353 | paranoid_restore: | 1305 | TRACE_IRQS_IRETQ_DEBUG |
1354 | TRACE_IRQS_IRETQ_DEBUG 0 | 1306 | paranoid_exit_restore: |
1355 | RESTORE_ALL 8 | 1307 | RESTORE_EXTRA_REGS |
1308 | RESTORE_C_REGS | ||
1309 | REMOVE_PT_GPREGS_FROM_STACK 8 | ||
1356 | INTERRUPT_RETURN | 1310 | INTERRUPT_RETURN |
1357 | CFI_ENDPROC | 1311 | CFI_ENDPROC |
1358 | END(paranoid_exit) | 1312 | END(paranoid_exit) |
1359 | 1313 | ||
1360 | /* | 1314 | /* |
1361 | * Exception entry point. This expects an error code/orig_rax on the stack. | 1315 | * Save all registers in pt_regs, and switch gs if needed. |
1362 | * returns in "no swapgs flag" in %ebx. | 1316 | * Return: ebx=0: need swapgs on exit, ebx=1: otherwise |
1363 | */ | 1317 | */ |
1364 | ENTRY(error_entry) | 1318 | ENTRY(error_entry) |
1365 | XCPT_FRAME | 1319 | XCPT_FRAME 1 15*8 |
1366 | CFI_ADJUST_CFA_OFFSET 15*8 | ||
1367 | /* oldrax contains error code */ | ||
1368 | cld | 1320 | cld |
1369 | movq %rdi, RDI+8(%rsp) | 1321 | SAVE_C_REGS 8 |
1370 | movq %rsi, RSI+8(%rsp) | 1322 | SAVE_EXTRA_REGS 8 |
1371 | movq %rdx, RDX+8(%rsp) | ||
1372 | movq %rcx, RCX+8(%rsp) | ||
1373 | movq %rax, RAX+8(%rsp) | ||
1374 | movq %r8, R8+8(%rsp) | ||
1375 | movq %r9, R9+8(%rsp) | ||
1376 | movq %r10, R10+8(%rsp) | ||
1377 | movq %r11, R11+8(%rsp) | ||
1378 | movq_cfi rbx, RBX+8 | ||
1379 | movq %rbp, RBP+8(%rsp) | ||
1380 | movq %r12, R12+8(%rsp) | ||
1381 | movq %r13, R13+8(%rsp) | ||
1382 | movq %r14, R14+8(%rsp) | ||
1383 | movq %r15, R15+8(%rsp) | ||
1384 | xorl %ebx,%ebx | 1323 | xorl %ebx,%ebx |
1385 | testl $3,CS+8(%rsp) | 1324 | testl $3,CS+8(%rsp) |
1386 | je error_kernelspace | 1325 | je error_kernelspace |
@@ -1390,12 +1329,12 @@ error_sti: | |||
1390 | TRACE_IRQS_OFF | 1329 | TRACE_IRQS_OFF |
1391 | ret | 1330 | ret |
1392 | 1331 | ||
1393 | /* | 1332 | /* |
1394 | * There are two places in the kernel that can potentially fault with | 1333 | * There are two places in the kernel that can potentially fault with |
1395 | * usergs. Handle them here. B stepping K8s sometimes report a | 1334 | * usergs. Handle them here. B stepping K8s sometimes report a |
1396 | * truncated RIP for IRET exceptions returning to compat mode. Check | 1335 | * truncated RIP for IRET exceptions returning to compat mode. Check |
1397 | * for these here too. | 1336 | * for these here too. |
1398 | */ | 1337 | */ |
1399 | error_kernelspace: | 1338 | error_kernelspace: |
1400 | CFI_REL_OFFSET rcx, RCX+8 | 1339 | CFI_REL_OFFSET rcx, RCX+8 |
1401 | incl %ebx | 1340 | incl %ebx |
@@ -1425,11 +1364,11 @@ error_bad_iret: | |||
1425 | END(error_entry) | 1364 | END(error_entry) |
1426 | 1365 | ||
1427 | 1366 | ||
1428 | /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ | 1367 | /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ |
1429 | ENTRY(error_exit) | 1368 | ENTRY(error_exit) |
1430 | DEFAULT_FRAME | 1369 | DEFAULT_FRAME |
1431 | movl %ebx,%eax | 1370 | movl %ebx,%eax |
1432 | RESTORE_REST | 1371 | RESTORE_EXTRA_REGS |
1433 | DISABLE_INTERRUPTS(CLBR_NONE) | 1372 | DISABLE_INTERRUPTS(CLBR_NONE) |
1434 | TRACE_IRQS_OFF | 1373 | TRACE_IRQS_OFF |
1435 | GET_THREAD_INFO(%rcx) | 1374 | GET_THREAD_INFO(%rcx) |
@@ -1444,19 +1383,7 @@ ENTRY(error_exit) | |||
1444 | CFI_ENDPROC | 1383 | CFI_ENDPROC |
1445 | END(error_exit) | 1384 | END(error_exit) |
1446 | 1385 | ||
1447 | /* | 1386 | /* Runs on exception stack */ |
1448 | * Test if a given stack is an NMI stack or not. | ||
1449 | */ | ||
1450 | .macro test_in_nmi reg stack nmi_ret normal_ret | ||
1451 | cmpq %\reg, \stack | ||
1452 | ja \normal_ret | ||
1453 | subq $EXCEPTION_STKSZ, %\reg | ||
1454 | cmpq %\reg, \stack | ||
1455 | jb \normal_ret | ||
1456 | jmp \nmi_ret | ||
1457 | .endm | ||
1458 | |||
1459 | /* runs on exception stack */ | ||
1460 | ENTRY(nmi) | 1387 | ENTRY(nmi) |
1461 | INTR_FRAME | 1388 | INTR_FRAME |
1462 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1389 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
@@ -1492,7 +1419,7 @@ ENTRY(nmi) | |||
1492 | * NMI. | 1419 | * NMI. |
1493 | */ | 1420 | */ |
1494 | 1421 | ||
1495 | /* Use %rdx as out temp variable throughout */ | 1422 | /* Use %rdx as our temp variable throughout */ |
1496 | pushq_cfi %rdx | 1423 | pushq_cfi %rdx |
1497 | CFI_REL_OFFSET rdx, 0 | 1424 | CFI_REL_OFFSET rdx, 0 |
1498 | 1425 | ||
@@ -1517,8 +1444,17 @@ ENTRY(nmi) | |||
1517 | * We check the variable because the first NMI could be in a | 1444 | * We check the variable because the first NMI could be in a |
1518 | * breakpoint routine using a breakpoint stack. | 1445 | * breakpoint routine using a breakpoint stack. |
1519 | */ | 1446 | */ |
1520 | lea 6*8(%rsp), %rdx | 1447 | lea 6*8(%rsp), %rdx |
1521 | test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi | 1448 | /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ |
1449 | cmpq %rdx, 4*8(%rsp) | ||
1450 | /* If the stack pointer is above the NMI stack, this is a normal NMI */ | ||
1451 | ja first_nmi | ||
1452 | subq $EXCEPTION_STKSZ, %rdx | ||
1453 | cmpq %rdx, 4*8(%rsp) | ||
1454 | /* If it is below the NMI stack, it is a normal NMI */ | ||
1455 | jb first_nmi | ||
1456 | /* Ah, it is within the NMI stack, treat it as nested */ | ||
1457 | |||
1522 | CFI_REMEMBER_STATE | 1458 | CFI_REMEMBER_STATE |
1523 | 1459 | ||
1524 | nested_nmi: | 1460 | nested_nmi: |
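The open-coded test that replaced the old test_in_nmi macro classifies the interrupted stack pointer (the saved RSP at 4*8(%rsp)) against the NMI stack whose top is taken as rsp+6*8: the NMI is treated as nested exactly when top - EXCEPTION_STKSZ <= sp <= top. A stand-alone version of the two comparisons (plain C, names invented):

    #include <stdbool.h>
    #include <stdint.h>

    /* Not kernel code: mirrors the cmpq/ja and cmpq/jb pair above. */
    static bool interrupted_on_nmi_stack(uint64_t prev_sp, uint64_t nmi_stack_top,
                                         uint64_t exception_stksz)
    {
            if (prev_sp > nmi_stack_top)                    /* ja first_nmi */
                    return false;
            if (prev_sp < nmi_stack_top - exception_stksz)  /* jb first_nmi */
                    return false;
            return true;                                    /* within the NMI stack: nested */
    }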
@@ -1611,7 +1547,7 @@ first_nmi: | |||
1611 | .rept 5 | 1547 | .rept 5 |
1612 | pushq_cfi 11*8(%rsp) | 1548 | pushq_cfi 11*8(%rsp) |
1613 | .endr | 1549 | .endr |
1614 | CFI_DEF_CFA_OFFSET SS+8-RIP | 1550 | CFI_DEF_CFA_OFFSET 5*8 |
1615 | 1551 | ||
1616 | /* Everything up to here is safe from nested NMIs */ | 1552 | /* Everything up to here is safe from nested NMIs */ |
1617 | 1553 | ||
@@ -1639,7 +1575,7 @@ repeat_nmi: | |||
1639 | pushq_cfi -6*8(%rsp) | 1575 | pushq_cfi -6*8(%rsp) |
1640 | .endr | 1576 | .endr |
1641 | subq $(5*8), %rsp | 1577 | subq $(5*8), %rsp |
1642 | CFI_DEF_CFA_OFFSET SS+8-RIP | 1578 | CFI_DEF_CFA_OFFSET 5*8 |
1643 | end_repeat_nmi: | 1579 | end_repeat_nmi: |
1644 | 1580 | ||
1645 | /* | 1581 | /* |
@@ -1648,16 +1584,16 @@ end_repeat_nmi: | |||
1648 | * so that we repeat another NMI. | 1584 | * so that we repeat another NMI. |
1649 | */ | 1585 | */ |
1650 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ | 1586 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
1651 | subq $ORIG_RAX-R15, %rsp | 1587 | ALLOC_PT_GPREGS_ON_STACK |
1652 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 | 1588 | |
1653 | /* | 1589 | /* |
1654 | * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit | 1590 | * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit |
1655 | * as we should not be calling schedule in NMI context. | 1591 | * as we should not be calling schedule in NMI context. |
1656 | * Even with normal interrupts enabled. An NMI should not be | 1592 | * Even with normal interrupts enabled. An NMI should not be |
1657 | * setting NEED_RESCHED or anything that normal interrupts and | 1593 | * setting NEED_RESCHED or anything that normal interrupts and |
1658 | * exceptions might do. | 1594 | * exceptions might do. |
1659 | */ | 1595 | */ |
1660 | call save_paranoid | 1596 | call paranoid_entry |
1661 | DEFAULT_FRAME 0 | 1597 | DEFAULT_FRAME 0 |
1662 | 1598 | ||
1663 | /* | 1599 | /* |
@@ -1688,8 +1624,10 @@ end_repeat_nmi: | |||
1688 | nmi_swapgs: | 1624 | nmi_swapgs: |
1689 | SWAPGS_UNSAFE_STACK | 1625 | SWAPGS_UNSAFE_STACK |
1690 | nmi_restore: | 1626 | nmi_restore: |
1627 | RESTORE_EXTRA_REGS | ||
1628 | RESTORE_C_REGS | ||
1691 | /* Pop the extra iret frame at once */ | 1629 | /* Pop the extra iret frame at once */ |
1692 | RESTORE_ALL 6*8 | 1630 | REMOVE_PT_GPREGS_FROM_STACK 6*8 |
1693 | 1631 | ||
1694 | /* Clear the NMI executing stack variable */ | 1632 | /* Clear the NMI executing stack variable */ |
1695 | movq $0, 5*8(%rsp) | 1633 | movq $0, 5*8(%rsp) |
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index c4f8d4659070..2b55ee6db053 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c | |||
@@ -177,9 +177,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) | |||
177 | */ | 177 | */ |
178 | load_ucode_bsp(); | 178 | load_ucode_bsp(); |
179 | 179 | ||
180 | if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) | ||
181 | early_printk("Kernel alive\n"); | ||
182 | |||
183 | clear_page(init_level4_pgt); | 180 | clear_page(init_level4_pgt); |
184 | /* set init_level4_pgt kernel high mapping*/ | 181 | /* set init_level4_pgt kernel high mapping*/ |
185 | init_level4_pgt[511] = early_level4_pgt[511]; | 182 | init_level4_pgt[511] = early_level4_pgt[511]; |
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f36bd42d6f0c..d031bad9e07e 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <asm/cpufeature.h> | 22 | #include <asm/cpufeature.h> |
23 | #include <asm/percpu.h> | 23 | #include <asm/percpu.h> |
24 | #include <asm/nops.h> | 24 | #include <asm/nops.h> |
25 | #include <asm/bootparam.h> | ||
25 | 26 | ||
26 | /* Physical address */ | 27 | /* Physical address */ |
27 | #define pa(X) ((X) - __PAGE_OFFSET) | 28 | #define pa(X) ((X) - __PAGE_OFFSET) |
@@ -90,7 +91,7 @@ ENTRY(startup_32) | |||
90 | 91 | ||
91 | /* test KEEP_SEGMENTS flag to see if the bootloader is asking | 92 | /* test KEEP_SEGMENTS flag to see if the bootloader is asking |
92 | us to not reload segments */ | 93 | us to not reload segments */ |
93 | testb $(1<<6), BP_loadflags(%esi) | 94 | testb $KEEP_SEGMENTS, BP_loadflags(%esi) |
94 | jnz 2f | 95 | jnz 2f |
95 | 96 | ||
96 | /* | 97 | /* |
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6fd514d9f69a..ae6588b301c2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit | 2 | * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit |
3 | * | 3 | * |
4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | 4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE |
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | 5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> |
@@ -56,7 +56,7 @@ startup_64: | |||
56 | * %rsi holds a physical pointer to real_mode_data. | 56 | * %rsi holds a physical pointer to real_mode_data. |
57 | * | 57 | * |
58 | * We come here either directly from a 64bit bootloader, or from | 58 | * We come here either directly from a 64bit bootloader, or from |
59 | * arch/x86_64/boot/compressed/head.S. | 59 | * arch/x86/boot/compressed/head_64.S. |
60 | * | 60 | * |
61 | * We only come here initially at boot nothing else comes here. | 61 | * We only come here initially at boot nothing else comes here. |
62 | * | 62 | * |
@@ -146,7 +146,7 @@ startup_64: | |||
146 | leaq level2_kernel_pgt(%rip), %rdi | 146 | leaq level2_kernel_pgt(%rip), %rdi |
147 | leaq 4096(%rdi), %r8 | 147 | leaq 4096(%rdi), %r8 |
148 | /* See if it is a valid page table entry */ | 148 | /* See if it is a valid page table entry */ |
149 | 1: testq $1, 0(%rdi) | 149 | 1: testb $1, 0(%rdi) |
150 | jz 2f | 150 | jz 2f |
151 | addq %rbp, 0(%rdi) | 151 | addq %rbp, 0(%rdi) |
152 | /* Go to the next page */ | 152 | /* Go to the next page */ |
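The testq -> testb change in this fixup loop is safe because only bit 0 of the entry (the present bit) is examined, and that bit lives in the low byte; testb simply has a shorter encoding. The loop in rough C terms (a sketch, not the kernel's page-table types):

    #include <stdint.h>

    /* Not kernel code: add the physical relocation delta (%rbp) to every
     * present entry in [entry, end), stepping one 8-byte entry at a time. */
    static void fixup_page_table(uint64_t *entry, uint64_t *end, uint64_t delta)
    {
            for (; entry < end; entry++)    /* "Go to the next page" */
                    if (*entry & 1)         /* testb $1, 0(%rdi): present bit */
                            *entry += delta;/* addq %rbp, 0(%rdi) */
    }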
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index d5651fce0b71..009183276bb7 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
@@ -42,8 +42,8 @@ void kernel_fpu_enable(void) | |||
42 | * be set (so that the clts/stts pair does nothing that is | 42 | * be set (so that the clts/stts pair does nothing that is |
43 | * visible in the interrupted kernel thread). | 43 | * visible in the interrupted kernel thread). |
44 | * | 44 | * |
45 | * Except for the eagerfpu case when we return 1 unless we've already | 45 | * Except for the eagerfpu case when we return true; in the likely case |
46 | * been eager and saved the state in kernel_fpu_begin(). | 46 | * the thread has FPU but we are not going to set/clear TS. |
47 | */ | 47 | */ |
48 | static inline bool interrupted_kernel_fpu_idle(void) | 48 | static inline bool interrupted_kernel_fpu_idle(void) |
49 | { | 49 | { |
@@ -51,7 +51,7 @@ static inline bool interrupted_kernel_fpu_idle(void) | |||
51 | return false; | 51 | return false; |
52 | 52 | ||
53 | if (use_eager_fpu()) | 53 | if (use_eager_fpu()) |
54 | return __thread_has_fpu(current); | 54 | return true; |
55 | 55 | ||
56 | return !__thread_has_fpu(current) && | 56 | return !__thread_has_fpu(current) && |
57 | (read_cr0() & X86_CR0_TS); | 57 | (read_cr0() & X86_CR0_TS); |
@@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void) | |||
68 | static inline bool interrupted_user_mode(void) | 68 | static inline bool interrupted_user_mode(void) |
69 | { | 69 | { |
70 | struct pt_regs *regs = get_irq_regs(); | 70 | struct pt_regs *regs = get_irq_regs(); |
71 | return regs && user_mode_vm(regs); | 71 | return regs && user_mode(regs); |
72 | } | 72 | } |
73 | 73 | ||
74 | /* | 74 | /* |
@@ -94,9 +94,10 @@ void __kernel_fpu_begin(void) | |||
94 | 94 | ||
95 | if (__thread_has_fpu(me)) { | 95 | if (__thread_has_fpu(me)) { |
96 | __save_init_fpu(me); | 96 | __save_init_fpu(me); |
97 | } else if (!use_eager_fpu()) { | 97 | } else { |
98 | this_cpu_write(fpu_owner_task, NULL); | 98 | this_cpu_write(fpu_owner_task, NULL); |
99 | clts(); | 99 | if (!use_eager_fpu()) |
100 | clts(); | ||
100 | } | 101 | } |
101 | } | 102 | } |
102 | EXPORT_SYMBOL(__kernel_fpu_begin); | 103 | EXPORT_SYMBOL(__kernel_fpu_begin); |
@@ -107,7 +108,7 @@ void __kernel_fpu_end(void) | |||
107 | 108 | ||
108 | if (__thread_has_fpu(me)) { | 109 | if (__thread_has_fpu(me)) { |
109 | if (WARN_ON(restore_fpu_checking(me))) | 110 | if (WARN_ON(restore_fpu_checking(me))) |
110 | drop_init_fpu(me); | 111 | fpu_reset_state(me); |
111 | } else if (!use_eager_fpu()) { | 112 | } else if (!use_eager_fpu()) { |
112 | stts(); | 113 | stts(); |
113 | } | 114 | } |
@@ -120,10 +121,13 @@ void unlazy_fpu(struct task_struct *tsk) | |||
120 | { | 121 | { |
121 | preempt_disable(); | 122 | preempt_disable(); |
122 | if (__thread_has_fpu(tsk)) { | 123 | if (__thread_has_fpu(tsk)) { |
123 | __save_init_fpu(tsk); | 124 | if (use_eager_fpu()) { |
124 | __thread_fpu_end(tsk); | 125 | __save_fpu(tsk); |
125 | } else | 126 | } else { |
126 | tsk->thread.fpu_counter = 0; | 127 | __save_init_fpu(tsk); |
128 | __thread_fpu_end(tsk); | ||
129 | } | ||
130 | } | ||
127 | preempt_enable(); | 131 | preempt_enable(); |
128 | } | 132 | } |
129 | EXPORT_SYMBOL(unlazy_fpu); | 133 | EXPORT_SYMBOL(unlazy_fpu); |
@@ -221,11 +225,12 @@ void fpu_finit(struct fpu *fpu) | |||
221 | return; | 225 | return; |
222 | } | 226 | } |
223 | 227 | ||
228 | memset(fpu->state, 0, xstate_size); | ||
229 | |||
224 | if (cpu_has_fxsr) { | 230 | if (cpu_has_fxsr) { |
225 | fx_finit(&fpu->state->fxsave); | 231 | fx_finit(&fpu->state->fxsave); |
226 | } else { | 232 | } else { |
227 | struct i387_fsave_struct *fp = &fpu->state->fsave; | 233 | struct i387_fsave_struct *fp = &fpu->state->fsave; |
228 | memset(fp, 0, xstate_size); | ||
229 | fp->cwd = 0xffff037fu; | 234 | fp->cwd = 0xffff037fu; |
230 | fp->swd = 0xffff0000u; | 235 | fp->swd = 0xffff0000u; |
231 | fp->twd = 0xffffffffu; | 236 | fp->twd = 0xffffffffu; |
@@ -247,7 +252,7 @@ int init_fpu(struct task_struct *tsk) | |||
247 | if (tsk_used_math(tsk)) { | 252 | if (tsk_used_math(tsk)) { |
248 | if (cpu_has_fpu && tsk == current) | 253 | if (cpu_has_fpu && tsk == current) |
249 | unlazy_fpu(tsk); | 254 | unlazy_fpu(tsk); |
250 | tsk->thread.fpu.last_cpu = ~0; | 255 | task_disable_lazy_fpu_restore(tsk); |
251 | return 0; | 256 | return 0; |
252 | } | 257 | } |
253 | 258 | ||
@@ -336,6 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, | |||
336 | unsigned int pos, unsigned int count, | 341 | unsigned int pos, unsigned int count, |
337 | void *kbuf, void __user *ubuf) | 342 | void *kbuf, void __user *ubuf) |
338 | { | 343 | { |
344 | struct xsave_struct *xsave; | ||
339 | int ret; | 345 | int ret; |
340 | 346 | ||
341 | if (!cpu_has_xsave) | 347 | if (!cpu_has_xsave) |
@@ -345,19 +351,19 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, | |||
345 | if (ret) | 351 | if (ret) |
346 | return ret; | 352 | return ret; |
347 | 353 | ||
354 | xsave = &target->thread.fpu.state->xsave; | ||
355 | |||
348 | /* | 356 | /* |
349 | * Copy the 48bytes defined by the software first into the xstate | 357 | * Copy the 48bytes defined by the software first into the xstate |
350 | * memory layout in the thread struct, so that we can copy the entire | 358 | * memory layout in the thread struct, so that we can copy the entire |
351 | * xstateregs to the user using one user_regset_copyout(). | 359 | * xstateregs to the user using one user_regset_copyout(). |
352 | */ | 360 | */ |
353 | memcpy(&target->thread.fpu.state->fxsave.sw_reserved, | 361 | memcpy(&xsave->i387.sw_reserved, |
354 | xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); | 362 | xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); |
355 | |||
356 | /* | 363 | /* |
357 | * Copy the xstate memory layout. | 364 | * Copy the xstate memory layout. |
358 | */ | 365 | */ |
359 | ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, | 366 | ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); |
360 | &target->thread.fpu.state->xsave, 0, -1); | ||
361 | return ret; | 367 | return ret; |
362 | } | 368 | } |
363 | 369 | ||
@@ -365,8 +371,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, | |||
365 | unsigned int pos, unsigned int count, | 371 | unsigned int pos, unsigned int count, |
366 | const void *kbuf, const void __user *ubuf) | 372 | const void *kbuf, const void __user *ubuf) |
367 | { | 373 | { |
374 | struct xsave_struct *xsave; | ||
368 | int ret; | 375 | int ret; |
369 | struct xsave_hdr_struct *xsave_hdr; | ||
370 | 376 | ||
371 | if (!cpu_has_xsave) | 377 | if (!cpu_has_xsave) |
372 | return -ENODEV; | 378 | return -ENODEV; |
@@ -375,22 +381,18 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, | |||
375 | if (ret) | 381 | if (ret) |
376 | return ret; | 382 | return ret; |
377 | 383 | ||
378 | ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, | 384 | xsave = &target->thread.fpu.state->xsave; |
379 | &target->thread.fpu.state->xsave, 0, -1); | ||
380 | 385 | ||
386 | ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); | ||
381 | /* | 387 | /* |
382 | * mxcsr reserved bits must be masked to zero for security reasons. | 388 | * mxcsr reserved bits must be masked to zero for security reasons. |
383 | */ | 389 | */ |
384 | target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; | 390 | xsave->i387.mxcsr &= mxcsr_feature_mask; |
385 | 391 | xsave->xsave_hdr.xstate_bv &= pcntxt_mask; | |
386 | xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr; | ||
387 | |||
388 | xsave_hdr->xstate_bv &= pcntxt_mask; | ||
389 | /* | 392 | /* |
390 | * These bits must be zero. | 393 | * These bits must be zero. |
391 | */ | 394 | */ |
392 | memset(xsave_hdr->reserved, 0, 48); | 395 | memset(&xsave->xsave_hdr.reserved, 0, 48); |
393 | |||
394 | return ret; | 396 | return ret; |
395 | } | 397 | } |
396 | 398 | ||
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 4ddaf66ea35f..37dae792dbbe 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c | |||
@@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | |||
54 | * because the ->io_bitmap_max value must match the bitmap | 54 | * because the ->io_bitmap_max value must match the bitmap |
55 | * contents: | 55 | * contents: |
56 | */ | 56 | */ |
57 | tss = &per_cpu(init_tss, get_cpu()); | 57 | tss = &per_cpu(cpu_tss, get_cpu()); |
58 | 58 | ||
59 | if (turn_on) | 59 | if (turn_on) |
60 | bitmap_clear(t->io_bitmap_ptr, from, num); | 60 | bitmap_clear(t->io_bitmap_ptr, from, num); |
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 67b1cbe0093a..e5952c225532 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c | |||
@@ -295,7 +295,7 @@ int check_irq_vectors_for_cpu_disable(void) | |||
295 | 295 | ||
296 | this_cpu = smp_processor_id(); | 296 | this_cpu = smp_processor_id(); |
297 | cpumask_copy(&online_new, cpu_online_mask); | 297 | cpumask_copy(&online_new, cpu_online_mask); |
298 | cpu_clear(this_cpu, online_new); | 298 | cpumask_clear_cpu(this_cpu, &online_new); |
299 | 299 | ||
300 | this_count = 0; | 300 | this_count = 0; |
301 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { | 301 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { |
@@ -307,7 +307,7 @@ int check_irq_vectors_for_cpu_disable(void) | |||
307 | 307 | ||
308 | data = irq_desc_get_irq_data(desc); | 308 | data = irq_desc_get_irq_data(desc); |
309 | cpumask_copy(&affinity_new, data->affinity); | 309 | cpumask_copy(&affinity_new, data->affinity); |
310 | cpu_clear(this_cpu, affinity_new); | 310 | cpumask_clear_cpu(this_cpu, &affinity_new); |
311 | 311 | ||
312 | /* Do not count inactive or per-cpu irqs. */ | 312 | /* Do not count inactive or per-cpu irqs. */ |
313 | if (!irq_has_action(irq) || irqd_is_per_cpu(data)) | 313 | if (!irq_has_action(irq) || irqd_is_per_cpu(data)) |
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 28d28f5eb8f4..f9fd86a7fcc7 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) | |||
165 | if (unlikely(!desc)) | 165 | if (unlikely(!desc)) |
166 | return false; | 166 | return false; |
167 | 167 | ||
168 | if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { | 168 | if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) { |
169 | if (unlikely(overflow)) | 169 | if (unlikely(overflow)) |
170 | print_stack_overflow(); | 170 | print_stack_overflow(); |
171 | desc->handle_irq(irq, desc); | 171 | desc->handle_irq(irq, desc); |
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index e4b503d5558c..394e643d7830 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c | |||
@@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) | |||
44 | u64 estack_top, estack_bottom; | 44 | u64 estack_top, estack_bottom; |
45 | u64 curbase = (u64)task_stack_page(current); | 45 | u64 curbase = (u64)task_stack_page(current); |
46 | 46 | ||
47 | if (user_mode_vm(regs)) | 47 | if (user_mode(regs)) |
48 | return; | 48 | return; |
49 | 49 | ||
50 | if (regs->sp >= curbase + sizeof(struct thread_info) + | 50 | if (regs->sp >= curbase + sizeof(struct thread_info) + |
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 70e181ea1eac..cd10a6437264 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -178,7 +178,8 @@ void __init native_init_IRQ(void) | |||
178 | #endif | 178 | #endif |
179 | for_each_clear_bit_from(i, used_vectors, first_system_vector) { | 179 | for_each_clear_bit_from(i, used_vectors, first_system_vector) { |
180 | /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ | 180 | /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ |
181 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); | 181 | set_intr_gate(i, irq_entries_start + |
182 | 8 * (i - FIRST_EXTERNAL_VECTOR)); | ||
182 | } | 183 | } |
183 | #ifdef CONFIG_X86_LOCAL_APIC | 184 | #ifdef CONFIG_X86_LOCAL_APIC |
184 | for_each_clear_bit_from(i, used_vectors, NR_VECTORS) | 185 | for_each_clear_bit_from(i, used_vectors, NR_VECTORS) |
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 25ecd56cefa8..d6178d9791db 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c | |||
@@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) | |||
126 | #ifdef CONFIG_X86_32 | 126 | #ifdef CONFIG_X86_32 |
127 | switch (regno) { | 127 | switch (regno) { |
128 | case GDB_SS: | 128 | case GDB_SS: |
129 | if (!user_mode_vm(regs)) | 129 | if (!user_mode(regs)) |
130 | *(unsigned long *)mem = __KERNEL_DS; | 130 | *(unsigned long *)mem = __KERNEL_DS; |
131 | break; | 131 | break; |
132 | case GDB_SP: | 132 | case GDB_SP: |
133 | if (!user_mode_vm(regs)) | 133 | if (!user_mode(regs)) |
134 | *(unsigned long *)mem = kernel_stack_pointer(regs); | 134 | *(unsigned long *)mem = kernel_stack_pointer(regs); |
135 | break; | 135 | break; |
136 | case GDB_GS: | 136 | case GDB_GS: |
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 4e3d5a9621fe..1deffe6cc873 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c | |||
@@ -354,6 +354,7 @@ int __copy_instruction(u8 *dest, u8 *src) | |||
354 | { | 354 | { |
355 | struct insn insn; | 355 | struct insn insn; |
356 | kprobe_opcode_t buf[MAX_INSN_SIZE]; | 356 | kprobe_opcode_t buf[MAX_INSN_SIZE]; |
357 | int length; | ||
357 | unsigned long recovered_insn = | 358 | unsigned long recovered_insn = |
358 | recover_probed_instruction(buf, (unsigned long)src); | 359 | recover_probed_instruction(buf, (unsigned long)src); |
359 | 360 | ||
@@ -361,16 +362,18 @@ int __copy_instruction(u8 *dest, u8 *src) | |||
361 | return 0; | 362 | return 0; |
362 | kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); | 363 | kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); |
363 | insn_get_length(&insn); | 364 | insn_get_length(&insn); |
365 | length = insn.length; | ||
366 | |||
364 | /* Another subsystem puts a breakpoint, failed to recover */ | 367 | /* Another subsystem puts a breakpoint, failed to recover */ |
365 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) | 368 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) |
366 | return 0; | 369 | return 0; |
367 | memcpy(dest, insn.kaddr, insn.length); | 370 | memcpy(dest, insn.kaddr, length); |
368 | 371 | ||
369 | #ifdef CONFIG_X86_64 | 372 | #ifdef CONFIG_X86_64 |
370 | if (insn_rip_relative(&insn)) { | 373 | if (insn_rip_relative(&insn)) { |
371 | s64 newdisp; | 374 | s64 newdisp; |
372 | u8 *disp; | 375 | u8 *disp; |
373 | kernel_insn_init(&insn, dest, insn.length); | 376 | kernel_insn_init(&insn, dest, length); |
374 | insn_get_displacement(&insn); | 377 | insn_get_displacement(&insn); |
375 | /* | 378 | /* |
376 | * The copied instruction uses the %rip-relative addressing | 379 | * The copied instruction uses the %rip-relative addressing |
@@ -394,7 +397,7 @@ int __copy_instruction(u8 *dest, u8 *src) | |||
394 | *(s32 *) disp = (s32) newdisp; | 397 | *(s32 *) disp = (s32) newdisp; |
395 | } | 398 | } |
396 | #endif | 399 | #endif |
397 | return insn.length; | 400 | return length; |
398 | } | 401 | } |
399 | 402 | ||
400 | static int arch_copy_kprobe(struct kprobe *p) | 403 | static int arch_copy_kprobe(struct kprobe *p) |
@@ -602,7 +605,7 @@ int kprobe_int3_handler(struct pt_regs *regs) | |||
602 | struct kprobe *p; | 605 | struct kprobe *p; |
603 | struct kprobe_ctlblk *kcb; | 606 | struct kprobe_ctlblk *kcb; |
604 | 607 | ||
605 | if (user_mode_vm(regs)) | 608 | if (user_mode(regs)) |
606 | return 0; | 609 | return 0; |
607 | 610 | ||
608 | addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); | 611 | addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); |
@@ -1007,7 +1010,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, | |||
1007 | struct die_args *args = data; | 1010 | struct die_args *args = data; |
1008 | int ret = NOTIFY_DONE; | 1011 | int ret = NOTIFY_DONE; |
1009 | 1012 | ||
1010 | if (args->regs && user_mode_vm(args->regs)) | 1013 | if (args->regs && user_mode(args->regs)) |
1011 | return ret; | 1014 | return ret; |
1012 | 1015 | ||
1013 | if (val == DIE_GPF) { | 1016 | if (val == DIE_GPF) { |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e354cc6446ab..9435620062df 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -513,7 +513,7 @@ void __init kvm_guest_init(void) | |||
513 | * can get false positives too easily, for example if the host is | 513 | * can get false positives too easily, for example if the host is |
514 | * overcommitted. | 514 | * overcommitted. |
515 | */ | 515 | */ |
516 | watchdog_enable_hardlockup_detector(false); | 516 | hardlockup_detector_disable(); |
517 | } | 517 | } |
518 | 518 | ||
519 | static noinline uint32_t __kvm_cpuid_base(void) | 519 | static noinline uint32_t __kvm_cpuid_base(void) |
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index d1ac80b72c72..005c03e93fc5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c | |||
@@ -33,6 +33,7 @@ | |||
33 | 33 | ||
34 | #include <asm/page.h> | 34 | #include <asm/page.h> |
35 | #include <asm/pgtable.h> | 35 | #include <asm/pgtable.h> |
36 | #include <asm/setup.h> | ||
36 | 37 | ||
37 | #if 0 | 38 | #if 0 |
38 | #define DEBUGP(fmt, ...) \ | 39 | #define DEBUGP(fmt, ...) \ |
@@ -47,21 +48,13 @@ do { \ | |||
47 | 48 | ||
48 | #ifdef CONFIG_RANDOMIZE_BASE | 49 | #ifdef CONFIG_RANDOMIZE_BASE |
49 | static unsigned long module_load_offset; | 50 | static unsigned long module_load_offset; |
50 | static int randomize_modules = 1; | ||
51 | 51 | ||
52 | /* Mutex protects the module_load_offset. */ | 52 | /* Mutex protects the module_load_offset. */ |
53 | static DEFINE_MUTEX(module_kaslr_mutex); | 53 | static DEFINE_MUTEX(module_kaslr_mutex); |
54 | 54 | ||
55 | static int __init parse_nokaslr(char *p) | ||
56 | { | ||
57 | randomize_modules = 0; | ||
58 | return 0; | ||
59 | } | ||
60 | early_param("nokaslr", parse_nokaslr); | ||
61 | |||
62 | static unsigned long int get_module_load_offset(void) | 55 | static unsigned long int get_module_load_offset(void) |
63 | { | 56 | { |
64 | if (randomize_modules) { | 57 | if (kaslr_enabled()) { |
65 | mutex_lock(&module_kaslr_mutex); | 58 | mutex_lock(&module_kaslr_mutex); |
66 | /* | 59 | /* |
67 | * Calculate the module_load_offset the first time this | 60 | * Calculate the module_load_offset the first time this |
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 548d25f00c90..c614dd492f5f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
443 | .ptep_modify_prot_start = __ptep_modify_prot_start, | 443 | .ptep_modify_prot_start = __ptep_modify_prot_start, |
444 | .ptep_modify_prot_commit = __ptep_modify_prot_commit, | 444 | .ptep_modify_prot_commit = __ptep_modify_prot_commit, |
445 | 445 | ||
446 | #if PAGETABLE_LEVELS >= 3 | 446 | #if CONFIG_PGTABLE_LEVELS >= 3 |
447 | #ifdef CONFIG_X86_PAE | 447 | #ifdef CONFIG_X86_PAE |
448 | .set_pte_atomic = native_set_pte_atomic, | 448 | .set_pte_atomic = native_set_pte_atomic, |
449 | .pte_clear = native_pte_clear, | 449 | .pte_clear = native_pte_clear, |
@@ -454,13 +454,13 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
454 | .pmd_val = PTE_IDENT, | 454 | .pmd_val = PTE_IDENT, |
455 | .make_pmd = PTE_IDENT, | 455 | .make_pmd = PTE_IDENT, |
456 | 456 | ||
457 | #if PAGETABLE_LEVELS == 4 | 457 | #if CONFIG_PGTABLE_LEVELS == 4 |
458 | .pud_val = PTE_IDENT, | 458 | .pud_val = PTE_IDENT, |
459 | .make_pud = PTE_IDENT, | 459 | .make_pud = PTE_IDENT, |
460 | 460 | ||
461 | .set_pgd = native_set_pgd, | 461 | .set_pgd = native_set_pgd, |
462 | #endif | 462 | #endif |
463 | #endif /* PAGETABLE_LEVELS >= 3 */ | 463 | #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ |
464 | 464 | ||
465 | .pte_val = PTE_IDENT, | 465 | .pte_val = PTE_IDENT, |
466 | .pgd_val = PTE_IDENT, | 466 | .pgd_val = PTE_IDENT, |
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 781861cc5ee8..da8cb987b973 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c | |||
@@ -131,10 +131,11 @@ void perf_get_regs_user(struct perf_regs *regs_user, | |||
131 | } | 131 | } |
132 | 132 | ||
133 | /* | 133 | /* |
134 | * RIP, flags, and the argument registers are usually saved. | 134 | * These registers are always saved on 64-bit syscall entry. |
135 | * orig_ax is probably okay, too. | 135 | * On 32-bit entry points, they are also saved, except for r8..r11. |
136 | */ | 136 | */ |
137 | regs_user_copy->ip = user_regs->ip; | 137 | regs_user_copy->ip = user_regs->ip; |
138 | regs_user_copy->ax = user_regs->ax; | ||
138 | regs_user_copy->cx = user_regs->cx; | 139 | regs_user_copy->cx = user_regs->cx; |
139 | regs_user_copy->dx = user_regs->dx; | 140 | regs_user_copy->dx = user_regs->dx; |
140 | regs_user_copy->si = user_regs->si; | 141 | regs_user_copy->si = user_regs->si; |
@@ -145,9 +146,12 @@ void perf_get_regs_user(struct perf_regs *regs_user, | |||
145 | regs_user_copy->r11 = user_regs->r11; | 146 | regs_user_copy->r11 = user_regs->r11; |
146 | regs_user_copy->orig_ax = user_regs->orig_ax; | 147 | regs_user_copy->orig_ax = user_regs->orig_ax; |
147 | regs_user_copy->flags = user_regs->flags; | 148 | regs_user_copy->flags = user_regs->flags; |
149 | regs_user_copy->sp = user_regs->sp; | ||
150 | regs_user_copy->cs = user_regs->cs; | ||
151 | regs_user_copy->ss = user_regs->ss; | ||
148 | 152 | ||
149 | /* | 153 | /* |
150 | * Don't even try to report the "rest" regs. | 154 | * Most system calls don't save these registers, don't report them. |
151 | */ | 155 | */ |
152 | regs_user_copy->bx = -1; | 156 | regs_user_copy->bx = -1; |
153 | regs_user_copy->bp = -1; | 157 | regs_user_copy->bp = -1; |
@@ -158,37 +162,13 @@ void perf_get_regs_user(struct perf_regs *regs_user, | |||
158 | 162 | ||
159 | /* | 163 | /* |
160 | * For this to be at all useful, we need a reasonable guess for | 164 | * For this to be at all useful, we need a reasonable guess for |
161 | * sp and the ABI. Be careful: we're in NMI context, and we're | 165 | * the ABI. Be careful: we're in NMI context, and we're |
162 | * considering current to be the current task, so we should | 166 | * considering current to be the current task, so we should |
163 | * be careful not to look at any other percpu variables that might | 167 | * be careful not to look at any other percpu variables that might |
164 | * change during context switches. | 168 | * change during context switches. |
165 | */ | 169 | */ |
166 | if (IS_ENABLED(CONFIG_IA32_EMULATION) && | 170 | regs_user->abi = user_64bit_mode(user_regs) ? |
167 | task_thread_info(current)->status & TS_COMPAT) { | 171 | PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; |
168 | /* Easy case: we're in a compat syscall. */ | ||
169 | regs_user->abi = PERF_SAMPLE_REGS_ABI_32; | ||
170 | regs_user_copy->sp = user_regs->sp; | ||
171 | regs_user_copy->cs = user_regs->cs; | ||
172 | regs_user_copy->ss = user_regs->ss; | ||
173 | } else if (user_regs->orig_ax != -1) { | ||
174 | /* | ||
175 | * We're probably in a 64-bit syscall. | ||
176 | * Warning: this code is severely racy. At least it's better | ||
177 | * than just blindly copying user_regs. | ||
178 | */ | ||
179 | regs_user->abi = PERF_SAMPLE_REGS_ABI_64; | ||
180 | regs_user_copy->sp = this_cpu_read(old_rsp); | ||
181 | regs_user_copy->cs = __USER_CS; | ||
182 | regs_user_copy->ss = __USER_DS; | ||
183 | regs_user_copy->cx = -1; /* usually contains garbage */ | ||
184 | } else { | ||
185 | /* We're probably in an interrupt or exception. */ | ||
186 | regs_user->abi = user_64bit_mode(user_regs) ? | ||
187 | PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; | ||
188 | regs_user_copy->sp = user_regs->sp; | ||
189 | regs_user_copy->cs = user_regs->cs; | ||
190 | regs_user_copy->ss = user_regs->ss; | ||
191 | } | ||
192 | 172 | ||
193 | regs_user->regs = regs_user_copy; | 173 | regs_user->regs = regs_user_copy; |
194 | } | 174 | } |
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c new file mode 100644 index 000000000000..3420c874ddc5 --- /dev/null +++ b/arch/x86/kernel/pmem.c | |||
@@ -0,0 +1,53 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2015, Christoph Hellwig. | ||
3 | */ | ||
4 | #include <linux/memblock.h> | ||
5 | #include <linux/platform_device.h> | ||
6 | #include <linux/slab.h> | ||
7 | #include <asm/e820.h> | ||
8 | #include <asm/page_types.h> | ||
9 | #include <asm/setup.h> | ||
10 | |||
11 | static __init void register_pmem_device(struct resource *res) | ||
12 | { | ||
13 | struct platform_device *pdev; | ||
14 | int error; | ||
15 | |||
16 | pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO); | ||
17 | if (!pdev) | ||
18 | return; | ||
19 | |||
20 | error = platform_device_add_resources(pdev, res, 1); | ||
21 | if (error) | ||
22 | goto out_put_pdev; | ||
23 | |||
24 | error = platform_device_add(pdev); | ||
25 | if (error) | ||
26 | goto out_put_pdev; | ||
27 | return; | ||
28 | |||
29 | out_put_pdev: | ||
30 | dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n"); | ||
31 | platform_device_put(pdev); | ||
32 | } | ||
33 | |||
34 | static __init int register_pmem_devices(void) | ||
35 | { | ||
36 | int i; | ||
37 | |||
38 | for (i = 0; i < e820.nr_map; i++) { | ||
39 | struct e820entry *ei = &e820.map[i]; | ||
40 | |||
41 | if (ei->type == E820_PRAM) { | ||
42 | struct resource res = { | ||
43 | .flags = IORESOURCE_MEM, | ||
44 | .start = ei->addr, | ||
45 | .end = ei->addr + ei->size - 1, | ||
46 | }; | ||
47 | register_pmem_device(&res); | ||
48 | } | ||
49 | } | ||
50 | |||
51 | return 0; | ||
52 | } | ||
53 | device_initcall(register_pmem_devices); | ||
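The file above only creates platform devices for E820_PRAM ranges; a block driver has to claim them by matching on the "pmem" name. As a rough illustration (a hypothetical driver skeleton, not the real pmem block driver), the consumer side could look like this:

#include <linux/module.h>
#include <linux/platform_device.h>

/* Hypothetical skeleton: binds to the "pmem" devices registered from the
 * E820_PRAM entries above and merely reports the range it was handed. */
static int pmem_sketch_probe(struct platform_device *pdev)
{
        struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);

        if (!res)
                return -ENXIO;

        dev_info(&pdev->dev, "persistent memory range %pR\n", res);
        return 0;
}

static struct platform_driver pmem_sketch_driver = {
        .probe  = pmem_sketch_probe,
        .driver = {
                .name = "pmem",         /* matches the device name used above */
        },
};
module_platform_driver(pmem_sketch_driver);
MODULE_LICENSE("GPL");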
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 046e2d620bbe..8213da62b1b7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -9,7 +9,7 @@ | |||
9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/pm.h> | 11 | #include <linux/pm.h> |
12 | #include <linux/clockchips.h> | 12 | #include <linux/tick.h> |
13 | #include <linux/random.h> | 13 | #include <linux/random.h> |
14 | #include <linux/user-return-notifier.h> | 14 | #include <linux/user-return-notifier.h> |
15 | #include <linux/dmi.h> | 15 | #include <linux/dmi.h> |
@@ -24,6 +24,7 @@ | |||
24 | #include <asm/syscalls.h> | 24 | #include <asm/syscalls.h> |
25 | #include <asm/idle.h> | 25 | #include <asm/idle.h> |
26 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
27 | #include <asm/mwait.h> | ||
27 | #include <asm/i387.h> | 28 | #include <asm/i387.h> |
28 | #include <asm/fpu-internal.h> | 29 | #include <asm/fpu-internal.h> |
29 | #include <asm/debugreg.h> | 30 | #include <asm/debugreg.h> |
@@ -37,7 +38,26 @@ | |||
37 | * section. Since TSS's are completely CPU-local, we want them | 38 | * section. Since TSS's are completely CPU-local, we want them |
38 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. | 39 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. |
39 | */ | 40 | */ |
40 | __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; | 41 | __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { |
42 | .x86_tss = { | ||
43 | .sp0 = TOP_OF_INIT_STACK, | ||
44 | #ifdef CONFIG_X86_32 | ||
45 | .ss0 = __KERNEL_DS, | ||
46 | .ss1 = __KERNEL_CS, | ||
47 | .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, | ||
48 | #endif | ||
49 | }, | ||
50 | #ifdef CONFIG_X86_32 | ||
51 | /* | ||
52 | * Note that the .io_bitmap member must be extra-big. This is because | ||
53 | * the CPU will access an additional byte beyond the end of the IO | ||
54 | * permission bitmap. The extra byte must be all 1 bits, and must | ||
55 | * be within the limit. | ||
56 | */ | ||
57 | .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, | ||
58 | #endif | ||
59 | }; | ||
60 | EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); | ||
41 | 61 | ||
42 | #ifdef CONFIG_X86_64 | 62 | #ifdef CONFIG_X86_64 |
43 | static DEFINE_PER_CPU(unsigned char, is_idle); | 63 | static DEFINE_PER_CPU(unsigned char, is_idle); |
@@ -69,8 +89,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) | |||
69 | 89 | ||
70 | dst->thread.fpu_counter = 0; | 90 | dst->thread.fpu_counter = 0; |
71 | dst->thread.fpu.has_fpu = 0; | 91 | dst->thread.fpu.has_fpu = 0; |
72 | dst->thread.fpu.last_cpu = ~0; | ||
73 | dst->thread.fpu.state = NULL; | 92 | dst->thread.fpu.state = NULL; |
93 | task_disable_lazy_fpu_restore(dst); | ||
74 | if (tsk_used_math(src)) { | 94 | if (tsk_used_math(src)) { |
75 | int err = fpu_alloc(&dst->thread.fpu); | 95 | int err = fpu_alloc(&dst->thread.fpu); |
76 | if (err) | 96 | if (err) |
@@ -109,7 +129,7 @@ void exit_thread(void) | |||
109 | unsigned long *bp = t->io_bitmap_ptr; | 129 | unsigned long *bp = t->io_bitmap_ptr; |
110 | 130 | ||
111 | if (bp) { | 131 | if (bp) { |
112 | struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); | 132 | struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); |
113 | 133 | ||
114 | t->io_bitmap_ptr = NULL; | 134 | t->io_bitmap_ptr = NULL; |
115 | clear_thread_flag(TIF_IO_BITMAP); | 135 | clear_thread_flag(TIF_IO_BITMAP); |
@@ -131,13 +151,18 @@ void flush_thread(void) | |||
131 | 151 | ||
132 | flush_ptrace_hw_breakpoint(tsk); | 152 | flush_ptrace_hw_breakpoint(tsk); |
133 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | 153 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); |
134 | drop_init_fpu(tsk); | 154 | |
135 | /* | 155 | if (!use_eager_fpu()) { |
136 | * Free the FPU state for non xsave platforms. They get reallocated | 156 | /* FPU state will be reallocated lazily at the first use. */ |
137 | * lazily at the first use. | 157 | drop_fpu(tsk); |
138 | */ | ||
139 | if (!use_eager_fpu()) | ||
140 | free_thread_xstate(tsk); | 158 | free_thread_xstate(tsk); |
159 | } else if (!used_math()) { | ||
160 | /* kthread execs. TODO: cleanup this horror. */ | ||
161 | if (WARN_ON(init_fpu(tsk))) | ||
162 | force_sig(SIGKILL, tsk); | ||
163 | user_fpu_begin(); | ||
164 | restore_init_xstate(); | ||
165 | } | ||
141 | } | 166 | } |
142 | 167 | ||
143 | static void hard_disable_TSC(void) | 168 | static void hard_disable_TSC(void) |
@@ -377,14 +402,11 @@ static void amd_e400_idle(void) | |||
377 | 402 | ||
378 | if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { | 403 | if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { |
379 | cpumask_set_cpu(cpu, amd_e400_c1e_mask); | 404 | cpumask_set_cpu(cpu, amd_e400_c1e_mask); |
380 | /* | 405 | /* Force broadcast so ACPI can not interfere. */ |
381 | * Force broadcast so ACPI can not interfere. | 406 | tick_broadcast_force(); |
382 | */ | ||
383 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, | ||
384 | &cpu); | ||
385 | pr_info("Switch to broadcast mode on CPU%d\n", cpu); | 407 | pr_info("Switch to broadcast mode on CPU%d\n", cpu); |
386 | } | 408 | } |
387 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); | 409 | tick_broadcast_enter(); |
388 | 410 | ||
389 | default_idle(); | 411 | default_idle(); |
390 | 412 | ||
@@ -393,12 +415,59 @@ static void amd_e400_idle(void) | |||
393 | * called with interrupts disabled. | 415 | * called with interrupts disabled. |
394 | */ | 416 | */ |
395 | local_irq_disable(); | 417 | local_irq_disable(); |
396 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); | 418 | tick_broadcast_exit(); |
397 | local_irq_enable(); | 419 | local_irq_enable(); |
398 | } else | 420 | } else |
399 | default_idle(); | 421 | default_idle(); |
400 | } | 422 | } |
401 | 423 | ||
424 | /* | ||
425 | * Intel Core2 and older machines prefer MWAIT over HALT for C1. | ||
426 | * We can't rely on cpuidle installing MWAIT, because it will not load | ||
427 | * on systems that support only C1 -- so the boot default must be MWAIT. | ||
428 | * | ||
429 | * Some AMD machines are the opposite; they depend on using HALT. | ||
430 | * | ||
431 | * So for default C1, which is used during boot until cpuidle loads, | ||
432 | * use MWAIT-C1 on Intel HW that has it, else use HALT. | ||
433 | */ | ||
434 | static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) | ||
435 | { | ||
436 | if (c->x86_vendor != X86_VENDOR_INTEL) | ||
437 | return 0; | ||
438 | |||
439 | if (!cpu_has(c, X86_FEATURE_MWAIT)) | ||
440 | return 0; | ||
441 | |||
442 | return 1; | ||
443 | } | ||
444 | |||
445 | /* | ||
446 | * MONITOR/MWAIT with no hints, used for the default C1 state. | ||
447 | * This invokes MWAIT with interrupts enabled and no flags, | ||
448 | * which is backwards compatible with the original MWAIT implementation. | ||
449 | */ | ||
450 | |||
451 | static void mwait_idle(void) | ||
452 | { | ||
453 | if (!current_set_polling_and_test()) { | ||
454 | if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { | ||
455 | smp_mb(); /* quirk */ | ||
456 | clflush((void *)¤t_thread_info()->flags); | ||
457 | smp_mb(); /* quirk */ | ||
458 | } | ||
459 | |||
460 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | ||
461 | if (!need_resched()) | ||
462 | __sti_mwait(0, 0); | ||
463 | else | ||
464 | local_irq_enable(); | ||
465 | } else { | ||
466 | local_irq_enable(); | ||
467 | } | ||
468 | __current_clr_polling(); | ||
469 | } | ||
470 | |||
402 | void select_idle_routine(const struct cpuinfo_x86 *c) | 471 | void select_idle_routine(const struct cpuinfo_x86 *c) |
403 | { | 472 | { |
404 | #ifdef CONFIG_SMP | 473 | #ifdef CONFIG_SMP |
@@ -412,6 +481,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) | |||
412 | /* E400: APIC timer interrupt does not wake up CPU from C1e */ | 481 | /* E400: APIC timer interrupt does not wake up CPU from C1e */ |
413 | pr_info("using AMD E400 aware idle routine\n"); | 482 | pr_info("using AMD E400 aware idle routine\n"); |
414 | x86_idle = amd_e400_idle; | 483 | x86_idle = amd_e400_idle; |
484 | } else if (prefer_mwait_c1_over_halt(c)) { | ||
485 | pr_info("using mwait in idle threads\n"); | ||
486 | x86_idle = mwait_idle; | ||
415 | } else | 487 | } else |
416 | x86_idle = default_idle; | 488 | x86_idle = default_idle; |
417 | } | 489 | } |
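For orientation, the x86_idle pointer chosen in select_idle_routine() is what the architecture hands to the generic idle loop; a minimal sketch of that dispatch is below (the real entry point is arch_cpu_idle() in this file, and the loop shape is simplified and assumed):

#include <linux/sched.h>

/* Sketch only: the generic idle code keeps calling the architecture hook
 * until there is work to do. */
static void (*x86_idle_sketch)(void);   /* stands in for the real x86_idle */

static void arch_cpu_idle_sketch(void)
{
        x86_idle_sketch();              /* default_idle, mwait_idle or amd_e400_idle */
}

static void cpu_idle_loop_sketch(void)
{
        while (1) {
                while (!need_resched())
                        arch_cpu_idle_sketch();
                schedule();             /* run whatever made need_resched() true */
        }
}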
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 603c4f99cb5a..8ed2106b06da 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all) | |||
73 | unsigned long sp; | 73 | unsigned long sp; |
74 | unsigned short ss, gs; | 74 | unsigned short ss, gs; |
75 | 75 | ||
76 | if (user_mode_vm(regs)) { | 76 | if (user_mode(regs)) { |
77 | sp = regs->sp; | 77 | sp = regs->sp; |
78 | ss = regs->ss & 0xffff; | 78 | ss = regs->ss & 0xffff; |
79 | gs = get_user_gs(regs); | 79 | gs = get_user_gs(regs); |
@@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) | |||
206 | regs->ip = new_ip; | 206 | regs->ip = new_ip; |
207 | regs->sp = new_sp; | 207 | regs->sp = new_sp; |
208 | regs->flags = X86_EFLAGS_IF; | 208 | regs->flags = X86_EFLAGS_IF; |
209 | /* | 209 | force_iret(); |
210 | * force it to the iret return path by making it look as if there was | ||
211 | * some work pending. | ||
212 | */ | ||
213 | set_thread_flag(TIF_NOTIFY_RESUME); | ||
214 | } | 210 | } |
215 | EXPORT_SYMBOL_GPL(start_thread); | 211 | EXPORT_SYMBOL_GPL(start_thread); |
216 | 212 | ||
@@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
248 | struct thread_struct *prev = &prev_p->thread, | 244 | struct thread_struct *prev = &prev_p->thread, |
249 | *next = &next_p->thread; | 245 | *next = &next_p->thread; |
250 | int cpu = smp_processor_id(); | 246 | int cpu = smp_processor_id(); |
251 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 247 | struct tss_struct *tss = &per_cpu(cpu_tss, cpu); |
252 | fpu_switch_t fpu; | 248 | fpu_switch_t fpu; |
253 | 249 | ||
254 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ | 250 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ |
@@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
256 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); | 252 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); |
257 | 253 | ||
258 | /* | 254 | /* |
259 | * Reload esp0. | ||
260 | */ | ||
261 | load_sp0(tss, next); | ||
262 | |||
263 | /* | ||
264 | * Save away %gs. No need to save %fs, as it was saved on the | 255 | * Save away %gs. No need to save %fs, as it was saved on the |
265 | * stack on entry. No need to save %es and %ds, as those are | 256 | * stack on entry. No need to save %es and %ds, as those are |
266 | * always kernel segments while inside the kernel. Doing this | 257 | * always kernel segments while inside the kernel. Doing this |
@@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
310 | */ | 301 | */ |
311 | arch_end_context_switch(next_p); | 302 | arch_end_context_switch(next_p); |
312 | 303 | ||
304 | /* | ||
305 | * Reload esp0, kernel_stack, and current_top_of_stack. This changes | ||
306 | * current_thread_info(). | ||
307 | */ | ||
308 | load_sp0(tss, next); | ||
313 | this_cpu_write(kernel_stack, | 309 | this_cpu_write(kernel_stack, |
314 | (unsigned long)task_stack_page(next_p) + | 310 | (unsigned long)task_stack_page(next_p) + |
315 | THREAD_SIZE - KERNEL_STACK_OFFSET); | 311 | THREAD_SIZE); |
312 | this_cpu_write(cpu_current_top_of_stack, | ||
313 | (unsigned long)task_stack_page(next_p) + | ||
314 | THREAD_SIZE); | ||
316 | 315 | ||
317 | /* | 316 | /* |
318 | * Restore %gs if needed (which is common) | 317 | * Restore %gs if needed (which is common) |
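The force_iret() call that replaces the deleted comment in start_thread() does exactly what that comment described. A hedged sketch of the helper, inferred from the comment it supersedes rather than taken from its actual definition:

#include <linux/sched.h>

/* Assumed shape of force_iret(): pretend there is work pending so the task
 * leaves the kernel through the full iret path and honours the cs/ss/flags
 * values that start_thread() just wrote into pt_regs. */
static inline void force_iret_sketch(void)
{
        set_thread_flag(TIF_NOTIFY_RESUME);
}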
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 67fcc43577d2..4baaa972f52a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -52,7 +52,7 @@ | |||
52 | 52 | ||
53 | asmlinkage extern void ret_from_fork(void); | 53 | asmlinkage extern void ret_from_fork(void); |
54 | 54 | ||
55 | __visible DEFINE_PER_CPU(unsigned long, old_rsp); | 55 | __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); |
56 | 56 | ||
57 | /* Prints also some state that isn't saved in the pt_regs */ | 57 | /* Prints also some state that isn't saved in the pt_regs */ |
58 | void __show_regs(struct pt_regs *regs, int all) | 58 | void __show_regs(struct pt_regs *regs, int all) |
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
161 | p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; | 161 | p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; |
162 | childregs = task_pt_regs(p); | 162 | childregs = task_pt_regs(p); |
163 | p->thread.sp = (unsigned long) childregs; | 163 | p->thread.sp = (unsigned long) childregs; |
164 | p->thread.usersp = me->thread.usersp; | ||
165 | set_tsk_thread_flag(p, TIF_FORK); | 164 | set_tsk_thread_flag(p, TIF_FORK); |
166 | p->thread.io_bitmap_ptr = NULL; | 165 | p->thread.io_bitmap_ptr = NULL; |
167 | 166 | ||
@@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
207 | */ | 206 | */ |
208 | if (clone_flags & CLONE_SETTLS) { | 207 | if (clone_flags & CLONE_SETTLS) { |
209 | #ifdef CONFIG_IA32_EMULATION | 208 | #ifdef CONFIG_IA32_EMULATION |
210 | if (test_thread_flag(TIF_IA32)) | 209 | if (is_ia32_task()) |
211 | err = do_set_thread_area(p, -1, | 210 | err = do_set_thread_area(p, -1, |
212 | (struct user_desc __user *)childregs->si, 0); | 211 | (struct user_desc __user *)childregs->si, 0); |
213 | else | 212 | else |
@@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, | |||
235 | loadsegment(es, _ds); | 234 | loadsegment(es, _ds); |
236 | loadsegment(ds, _ds); | 235 | loadsegment(ds, _ds); |
237 | load_gs_index(0); | 236 | load_gs_index(0); |
238 | current->thread.usersp = new_sp; | ||
239 | regs->ip = new_ip; | 237 | regs->ip = new_ip; |
240 | regs->sp = new_sp; | 238 | regs->sp = new_sp; |
241 | this_cpu_write(old_rsp, new_sp); | ||
242 | regs->cs = _cs; | 239 | regs->cs = _cs; |
243 | regs->ss = _ss; | 240 | regs->ss = _ss; |
244 | regs->flags = X86_EFLAGS_IF; | 241 | regs->flags = X86_EFLAGS_IF; |
242 | force_iret(); | ||
245 | } | 243 | } |
246 | 244 | ||
247 | void | 245 | void |
@@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
277 | struct thread_struct *prev = &prev_p->thread; | 275 | struct thread_struct *prev = &prev_p->thread; |
278 | struct thread_struct *next = &next_p->thread; | 276 | struct thread_struct *next = &next_p->thread; |
279 | int cpu = smp_processor_id(); | 277 | int cpu = smp_processor_id(); |
280 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 278 | struct tss_struct *tss = &per_cpu(cpu_tss, cpu); |
281 | unsigned fsindex, gsindex; | 279 | unsigned fsindex, gsindex; |
282 | fpu_switch_t fpu; | 280 | fpu_switch_t fpu; |
283 | 281 | ||
284 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); | 282 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); |
285 | 283 | ||
286 | /* Reload esp0 and ss1. */ | ||
287 | load_sp0(tss, next); | ||
288 | |||
289 | /* We must save %fs and %gs before load_TLS() because | 284 | /* We must save %fs and %gs before load_TLS() because |
290 | * %fs and %gs may be cleared by load_TLS(). | 285 | * %fs and %gs may be cleared by load_TLS(). |
291 | * | 286 | * |
@@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
401 | /* | 396 | /* |
402 | * Switch the PDA and FPU contexts. | 397 | * Switch the PDA and FPU contexts. |
403 | */ | 398 | */ |
404 | prev->usersp = this_cpu_read(old_rsp); | ||
405 | this_cpu_write(old_rsp, next->usersp); | ||
406 | this_cpu_write(current_task, next_p); | 399 | this_cpu_write(current_task, next_p); |
407 | 400 | ||
408 | /* | 401 | /* |
@@ -413,9 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
413 | task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); | 406 | task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); |
414 | this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); | 407 | this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); |
415 | 408 | ||
409 | /* Reload esp0 and ss1. This changes current_thread_info(). */ | ||
410 | load_sp0(tss, next); | ||
411 | |||
416 | this_cpu_write(kernel_stack, | 412 | this_cpu_write(kernel_stack, |
417 | (unsigned long)task_stack_page(next_p) + | 413 | (unsigned long)task_stack_page(next_p) + THREAD_SIZE); |
418 | THREAD_SIZE - KERNEL_STACK_OFFSET); | ||
419 | 414 | ||
420 | /* | 415 | /* |
421 | * Now maybe reload the debug registers and handle I/O bitmaps | 416 | * Now maybe reload the debug registers and handle I/O bitmaps |
@@ -602,6 +597,5 @@ long sys_arch_prctl(int code, unsigned long addr) | |||
602 | 597 | ||
603 | unsigned long KSTK_ESP(struct task_struct *task) | 598 | unsigned long KSTK_ESP(struct task_struct *task) |
604 | { | 599 | { |
605 | return (test_tsk_thread_flag(task, TIF_IA32)) ? | 600 | return task_pt_regs(task)->sp; |
606 | (task_pt_regs(task)->sp) : ((task)->thread.usersp); | ||
607 | } | 601 | } |
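With thread.usersp gone, KSTK_ESP() always reads the user stack pointer from the pt_regs frame saved at the top of the kernel stack. A hedged sketch of the arithmetic behind task_pt_regs() under the layout used here (no extra top-of-stack padding assumed, matching the sp0 setup in copy_thread() above):

#include <linux/sched.h>
#include <asm/ptrace.h>

/* Assumed layout: the user-mode pt_regs frame sits immediately below the top
 * of the THREAD_SIZE kernel stack. */
static inline struct pt_regs *task_pt_regs_sketch(struct task_struct *tsk)
{
        unsigned long top = (unsigned long)task_stack_page(tsk) + THREAD_SIZE;

        return (struct pt_regs *)top - 1;
}

/* KSTK_ESP() then reduces to: */
static inline unsigned long kstk_esp_sketch(struct task_struct *tsk)
{
        return task_pt_regs_sketch(tsk)->sp;
}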
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e510618b2e91..a7bc79480719 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
@@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task, | |||
364 | case offsetof(struct user_regs_struct,cs): | 364 | case offsetof(struct user_regs_struct,cs): |
365 | if (unlikely(value == 0)) | 365 | if (unlikely(value == 0)) |
366 | return -EIO; | 366 | return -EIO; |
367 | #ifdef CONFIG_IA32_EMULATION | 367 | task_pt_regs(task)->cs = value; |
368 | if (test_tsk_thread_flag(task, TIF_IA32)) | ||
369 | task_pt_regs(task)->cs = value; | ||
370 | #endif | ||
371 | break; | 368 | break; |
372 | case offsetof(struct user_regs_struct,ss): | 369 | case offsetof(struct user_regs_struct,ss): |
373 | if (unlikely(value == 0)) | 370 | if (unlikely(value == 0)) |
374 | return -EIO; | 371 | return -EIO; |
375 | #ifdef CONFIG_IA32_EMULATION | 372 | task_pt_regs(task)->ss = value; |
376 | if (test_tsk_thread_flag(task, TIF_IA32)) | ||
377 | task_pt_regs(task)->ss = value; | ||
378 | #endif | ||
379 | break; | 373 | break; |
380 | } | 374 | } |
381 | 375 | ||
@@ -1421,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk, | |||
1421 | memset(info, 0, sizeof(*info)); | 1415 | memset(info, 0, sizeof(*info)); |
1422 | info->si_signo = SIGTRAP; | 1416 | info->si_signo = SIGTRAP; |
1423 | info->si_code = si_code; | 1417 | info->si_code = si_code; |
1424 | info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; | 1418 | info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; |
1425 | } | 1419 | } |
1426 | 1420 | ||
1427 | void user_single_step_siginfo(struct task_struct *tsk, | 1421 | void user_single_step_siginfo(struct task_struct *tsk, |
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2f355d229a58..e5ecd20e72dd 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c | |||
@@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, | |||
141 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); | 141 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); |
142 | } | 142 | } |
143 | 143 | ||
144 | static struct pvclock_vsyscall_time_info *pvclock_vdso_info; | ||
145 | |||
146 | static struct pvclock_vsyscall_time_info * | ||
147 | pvclock_get_vsyscall_user_time_info(int cpu) | ||
148 | { | ||
149 | if (!pvclock_vdso_info) { | ||
150 | BUG(); | ||
151 | return NULL; | ||
152 | } | ||
153 | |||
154 | return &pvclock_vdso_info[cpu]; | ||
155 | } | ||
156 | |||
157 | struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) | ||
158 | { | ||
159 | return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; | ||
160 | } | ||
161 | |||
144 | #ifdef CONFIG_X86_64 | 162 | #ifdef CONFIG_X86_64 |
163 | static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, | ||
164 | void *v) | ||
165 | { | ||
166 | struct task_migration_notifier *mn = v; | ||
167 | struct pvclock_vsyscall_time_info *pvti; | ||
168 | |||
169 | pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); | ||
170 | |||
171 | /* this is NULL when pvclock vsyscall is not initialized */ | ||
172 | if (unlikely(pvti == NULL)) | ||
173 | return NOTIFY_DONE; | ||
174 | |||
175 | pvti->migrate_count++; | ||
176 | |||
177 | return NOTIFY_DONE; | ||
178 | } | ||
179 | |||
180 | static struct notifier_block pvclock_migrate = { | ||
181 | .notifier_call = pvclock_task_migrate, | ||
182 | }; | ||
183 | |||
145 | /* | 184 | /* |
146 | * Initialize the generic pvclock vsyscall state. This will allocate | 185 | * Initialize the generic pvclock vsyscall state. This will allocate |
147 | * a/some page(s) for the per-vcpu pvclock information, set up a | 186 | * a/some page(s) for the per-vcpu pvclock information, set up a |
@@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, | |||
155 | 194 | ||
156 | WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); | 195 | WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); |
157 | 196 | ||
197 | pvclock_vdso_info = i; | ||
198 | |||
158 | for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { | 199 | for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { |
159 | __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, | 200 | __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, |
160 | __pa(i) + (idx*PAGE_SIZE), | 201 | __pa(i) + (idx*PAGE_SIZE), |
161 | PAGE_KERNEL_VVAR); | 202 | PAGE_KERNEL_VVAR); |
162 | } | 203 | } |
163 | 204 | ||
205 | |||
206 | register_task_migration_notifier(&pvclock_migrate); | ||
207 | |||
164 | return 0; | 208 | return 0; |
165 | } | 209 | } |
166 | #endif | 210 | #endif |
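The migrate_count bump exists so a userspace (vDSO) reader can notice that it sampled the pvti of a CPU it is no longer running on and must retry. A hedged sketch of the reader-side check follows; the migrate_count field comes from the hunk above, while the time-sampling helper is left hypothetical:

#include <asm/pvclock.h>

/* Hypothetical helper: samples pvti with the usual version-retry loop. */
extern u64 sample_pvti_sketch(struct pvclock_vcpu_time_info *pvti);

static u64 pvclock_read_sketch(struct pvclock_vsyscall_time_info *pvti)
{
        u32 migrate_count;
        u64 ns;

        do {
                migrate_count = pvti->migrate_count;    /* snapshot before sampling */
                ns = sample_pvti_sketch(&pvti->pvti);
        } while (pvti->migrate_count != migrate_count); /* migrated: pvti may be stale */

        return ns;
}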
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index e13f8e7c22a6..77630d57e7bf 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S | |||
@@ -226,23 +226,23 @@ swap_pages: | |||
226 | movl (%ebx), %ecx | 226 | movl (%ebx), %ecx |
227 | addl $4, %ebx | 227 | addl $4, %ebx |
228 | 1: | 228 | 1: |
229 | testl $0x1, %ecx /* is it a destination page */ | 229 | testb $0x1, %cl /* is it a destination page */ |
230 | jz 2f | 230 | jz 2f |
231 | movl %ecx, %edi | 231 | movl %ecx, %edi |
232 | andl $0xfffff000, %edi | 232 | andl $0xfffff000, %edi |
233 | jmp 0b | 233 | jmp 0b |
234 | 2: | 234 | 2: |
235 | testl $0x2, %ecx /* is it an indirection page */ | 235 | testb $0x2, %cl /* is it an indirection page */ |
236 | jz 2f | 236 | jz 2f |
237 | movl %ecx, %ebx | 237 | movl %ecx, %ebx |
238 | andl $0xfffff000, %ebx | 238 | andl $0xfffff000, %ebx |
239 | jmp 0b | 239 | jmp 0b |
240 | 2: | 240 | 2: |
241 | testl $0x4, %ecx /* is it the done indicator */ | 241 | testb $0x4, %cl /* is it the done indicator */ |
242 | jz 2f | 242 | jz 2f |
243 | jmp 3f | 243 | jmp 3f |
244 | 2: | 244 | 2: |
245 | testl $0x8, %ecx /* is it the source indicator */ | 245 | testb $0x8, %cl /* is it the source indicator */ |
246 | jz 0b /* Ignore it otherwise */ | 246 | jz 0b /* Ignore it otherwise */ |
247 | movl %ecx, %esi /* For every source page do a copy */ | 247 | movl %ecx, %esi /* For every source page do a copy */ |
248 | andl $0xfffff000, %esi | 248 | andl $0xfffff000, %esi |
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 3fd2c693e475..98111b38ebfd 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S | |||
@@ -123,7 +123,7 @@ identity_mapped: | |||
123 | * Set cr4 to a known state: | 123 | * Set cr4 to a known state: |
124 | * - physical address extension enabled | 124 | * - physical address extension enabled |
125 | */ | 125 | */ |
126 | movq $X86_CR4_PAE, %rax | 126 | movl $X86_CR4_PAE, %eax |
127 | movq %rax, %cr4 | 127 | movq %rax, %cr4 |
128 | 128 | ||
129 | jmp 1f | 129 | jmp 1f |
@@ -221,23 +221,23 @@ swap_pages: | |||
221 | movq (%rbx), %rcx | 221 | movq (%rbx), %rcx |
222 | addq $8, %rbx | 222 | addq $8, %rbx |
223 | 1: | 223 | 1: |
224 | testq $0x1, %rcx /* is it a destination page? */ | 224 | testb $0x1, %cl /* is it a destination page? */ |
225 | jz 2f | 225 | jz 2f |
226 | movq %rcx, %rdi | 226 | movq %rcx, %rdi |
227 | andq $0xfffffffffffff000, %rdi | 227 | andq $0xfffffffffffff000, %rdi |
228 | jmp 0b | 228 | jmp 0b |
229 | 2: | 229 | 2: |
230 | testq $0x2, %rcx /* is it an indirection page? */ | 230 | testb $0x2, %cl /* is it an indirection page? */ |
231 | jz 2f | 231 | jz 2f |
232 | movq %rcx, %rbx | 232 | movq %rcx, %rbx |
233 | andq $0xfffffffffffff000, %rbx | 233 | andq $0xfffffffffffff000, %rbx |
234 | jmp 0b | 234 | jmp 0b |
235 | 2: | 235 | 2: |
236 | testq $0x4, %rcx /* is it the done indicator? */ | 236 | testb $0x4, %cl /* is it the done indicator? */ |
237 | jz 2f | 237 | jz 2f |
238 | jmp 3f | 238 | jmp 3f |
239 | 2: | 239 | 2: |
240 | testq $0x8, %rcx /* is it the source indicator? */ | 240 | testb $0x8, %cl /* is it the source indicator? */ |
241 | jz 0b /* Ignore it otherwise */ | 241 | jz 0b /* Ignore it otherwise */ |
242 | movq %rcx, %rsi /* For every source page do a copy */ | 242 | movq %rcx, %rsi /* For every source page do a copy */ |
243 | andq $0xfffffffffffff000, %rsi | 243 | andq $0xfffffffffffff000, %rsi |
@@ -246,17 +246,17 @@ swap_pages: | |||
246 | movq %rsi, %rax | 246 | movq %rsi, %rax |
247 | 247 | ||
248 | movq %r10, %rdi | 248 | movq %r10, %rdi |
249 | movq $512, %rcx | 249 | movl $512, %ecx |
250 | rep ; movsq | 250 | rep ; movsq |
251 | 251 | ||
252 | movq %rax, %rdi | 252 | movq %rax, %rdi |
253 | movq %rdx, %rsi | 253 | movq %rdx, %rsi |
254 | movq $512, %rcx | 254 | movl $512, %ecx |
255 | rep ; movsq | 255 | rep ; movsq |
256 | 256 | ||
257 | movq %rdx, %rdi | 257 | movq %rdx, %rdi |
258 | movq %r10, %rsi | 258 | movq %r10, %rsi |
259 | movq $512, %rcx | 259 | movl $512, %ecx |
260 | rep ; movsq | 260 | rep ; movsq |
261 | 261 | ||
262 | lea PAGE_SIZE(%rax), %rsi | 262 | lea PAGE_SIZE(%rax), %rsi |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0a2421cca01f..d74ac33290ae 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -354,7 +354,7 @@ static void __init relocate_initrd(void) | |||
354 | mapaddr = ramdisk_image & PAGE_MASK; | 354 | mapaddr = ramdisk_image & PAGE_MASK; |
355 | p = early_memremap(mapaddr, clen+slop); | 355 | p = early_memremap(mapaddr, clen+slop); |
356 | memcpy(q, p+slop, clen); | 356 | memcpy(q, p+slop, clen); |
357 | early_iounmap(p, clen+slop); | 357 | early_memunmap(p, clen+slop); |
358 | q += clen; | 358 | q += clen; |
359 | ramdisk_image += clen; | 359 | ramdisk_image += clen; |
360 | ramdisk_size -= clen; | 360 | ramdisk_size -= clen; |
@@ -438,7 +438,7 @@ static void __init parse_setup_data(void) | |||
438 | data_len = data->len + sizeof(struct setup_data); | 438 | data_len = data->len + sizeof(struct setup_data); |
439 | data_type = data->type; | 439 | data_type = data->type; |
440 | pa_next = data->next; | 440 | pa_next = data->next; |
441 | early_iounmap(data, sizeof(*data)); | 441 | early_memunmap(data, sizeof(*data)); |
442 | 442 | ||
443 | switch (data_type) { | 443 | switch (data_type) { |
444 | case SETUP_E820_EXT: | 444 | case SETUP_E820_EXT: |
@@ -470,7 +470,7 @@ static void __init e820_reserve_setup_data(void) | |||
470 | E820_RAM, E820_RESERVED_KERN); | 470 | E820_RAM, E820_RESERVED_KERN); |
471 | found = 1; | 471 | found = 1; |
472 | pa_data = data->next; | 472 | pa_data = data->next; |
473 | early_iounmap(data, sizeof(*data)); | 473 | early_memunmap(data, sizeof(*data)); |
474 | } | 474 | } |
475 | if (!found) | 475 | if (!found) |
476 | return; | 476 | return; |
@@ -491,7 +491,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) | |||
491 | data = early_memremap(pa_data, sizeof(*data)); | 491 | data = early_memremap(pa_data, sizeof(*data)); |
492 | memblock_reserve(pa_data, sizeof(*data) + data->len); | 492 | memblock_reserve(pa_data, sizeof(*data) + data->len); |
493 | pa_data = data->next; | 493 | pa_data = data->next; |
494 | early_iounmap(data, sizeof(*data)); | 494 | early_memunmap(data, sizeof(*data)); |
495 | } | 495 | } |
496 | } | 496 | } |
497 | 497 | ||
@@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void) | |||
832 | static int | 832 | static int |
833 | dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) | 833 | dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) |
834 | { | 834 | { |
835 | pr_emerg("Kernel Offset: 0x%lx from 0x%lx " | 835 | if (kaslr_enabled()) { |
836 | "(relocation range: 0x%lx-0x%lx)\n", | 836 | pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", |
837 | (unsigned long)&_text - __START_KERNEL, __START_KERNEL, | 837 | (unsigned long)&_text - __START_KERNEL, |
838 | __START_KERNEL_map, MODULES_VADDR-1); | 838 | __START_KERNEL, |
839 | __START_KERNEL_map, | ||
840 | MODULES_VADDR-1); | ||
841 | } else { | ||
842 | pr_emerg("Kernel Offset: disabled\n"); | ||
843 | } | ||
839 | 844 | ||
840 | return 0; | 845 | return 0; |
841 | } | 846 | } |
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e5042463c1bc..1ea14fd53933 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c | |||
@@ -61,8 +61,7 @@ | |||
61 | regs->seg = GET_SEG(seg) | 3; \ | 61 | regs->seg = GET_SEG(seg) | 3; \ |
62 | } while (0) | 62 | } while (0) |
63 | 63 | ||
64 | int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | 64 | int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) |
65 | unsigned long *pax) | ||
66 | { | 65 | { |
67 | void __user *buf; | 66 | void __user *buf; |
68 | unsigned int tmpflags; | 67 | unsigned int tmpflags; |
@@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | |||
81 | #endif /* CONFIG_X86_32 */ | 80 | #endif /* CONFIG_X86_32 */ |
82 | 81 | ||
83 | COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); | 82 | COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); |
84 | COPY(dx); COPY(cx); COPY(ip); | 83 | COPY(dx); COPY(cx); COPY(ip); COPY(ax); |
85 | 84 | ||
86 | #ifdef CONFIG_X86_64 | 85 | #ifdef CONFIG_X86_64 |
87 | COPY(r8); | 86 | COPY(r8); |
@@ -94,27 +93,20 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | |||
94 | COPY(r15); | 93 | COPY(r15); |
95 | #endif /* CONFIG_X86_64 */ | 94 | #endif /* CONFIG_X86_64 */ |
96 | 95 | ||
97 | #ifdef CONFIG_X86_32 | ||
98 | COPY_SEG_CPL3(cs); | 96 | COPY_SEG_CPL3(cs); |
99 | COPY_SEG_CPL3(ss); | 97 | COPY_SEG_CPL3(ss); |
100 | #else /* !CONFIG_X86_32 */ | ||
101 | /* Kernel saves and restores only the CS segment register on signals, | ||
102 | * which is the bare minimum needed to allow mixed 32/64-bit code. | ||
103 | * App's signal handler can save/restore other segments if needed. */ | ||
104 | COPY_SEG_CPL3(cs); | ||
105 | #endif /* CONFIG_X86_32 */ | ||
106 | 98 | ||
107 | get_user_ex(tmpflags, &sc->flags); | 99 | get_user_ex(tmpflags, &sc->flags); |
108 | regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); | 100 | regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); |
109 | regs->orig_ax = -1; /* disable syscall checks */ | 101 | regs->orig_ax = -1; /* disable syscall checks */ |
110 | 102 | ||
111 | get_user_ex(buf, &sc->fpstate); | 103 | get_user_ex(buf, &sc->fpstate); |
112 | |||
113 | get_user_ex(*pax, &sc->ax); | ||
114 | } get_user_catch(err); | 104 | } get_user_catch(err); |
115 | 105 | ||
116 | err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); | 106 | err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); |
117 | 107 | ||
108 | force_iret(); | ||
109 | |||
118 | return err; | 110 | return err; |
119 | } | 111 | } |
120 | 112 | ||
@@ -162,8 +154,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, | |||
162 | #else /* !CONFIG_X86_32 */ | 154 | #else /* !CONFIG_X86_32 */ |
163 | put_user_ex(regs->flags, &sc->flags); | 155 | put_user_ex(regs->flags, &sc->flags); |
164 | put_user_ex(regs->cs, &sc->cs); | 156 | put_user_ex(regs->cs, &sc->cs); |
165 | put_user_ex(0, &sc->gs); | 157 | put_user_ex(0, &sc->__pad2); |
166 | put_user_ex(0, &sc->fs); | 158 | put_user_ex(0, &sc->__pad1); |
159 | put_user_ex(regs->ss, &sc->ss); | ||
167 | #endif /* CONFIG_X86_32 */ | 160 | #endif /* CONFIG_X86_32 */ |
168 | 161 | ||
169 | put_user_ex(fpstate, &sc->fpstate); | 162 | put_user_ex(fpstate, &sc->fpstate); |
@@ -457,9 +450,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, | |||
457 | 450 | ||
458 | regs->sp = (unsigned long)frame; | 451 | regs->sp = (unsigned long)frame; |
459 | 452 | ||
460 | /* Set up the CS register to run signal handlers in 64-bit mode, | 453 | /* |
461 | even if the handler happens to be interrupting 32-bit code. */ | 454 | * Set up the CS and SS registers to run signal handlers in |
455 | * 64-bit mode, even if the handler happens to be interrupting | ||
456 | * 32-bit or 16-bit code. | ||
457 | * | ||
458 | * SS is subtle. In 64-bit mode, we don't need any particular | ||
459 | * SS descriptor, but we do need SS to be valid. It's possible | ||
460 | * that the old SS is entirely bogus -- this can happen if the | ||
461 | * signal we're trying to deliver is #GP or #SS caused by a bad | ||
462 | * SS value. | ||
463 | */ | ||
462 | regs->cs = __USER_CS; | 464 | regs->cs = __USER_CS; |
465 | regs->ss = __USER_DS; | ||
463 | 466 | ||
464 | return 0; | 467 | return 0; |
465 | } | 468 | } |
@@ -539,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void) | |||
539 | { | 542 | { |
540 | struct pt_regs *regs = current_pt_regs(); | 543 | struct pt_regs *regs = current_pt_regs(); |
541 | struct sigframe __user *frame; | 544 | struct sigframe __user *frame; |
542 | unsigned long ax; | ||
543 | sigset_t set; | 545 | sigset_t set; |
544 | 546 | ||
545 | frame = (struct sigframe __user *)(regs->sp - 8); | 547 | frame = (struct sigframe __user *)(regs->sp - 8); |
@@ -553,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void) | |||
553 | 555 | ||
554 | set_current_blocked(&set); | 556 | set_current_blocked(&set); |
555 | 557 | ||
556 | if (restore_sigcontext(regs, &frame->sc, &ax)) | 558 | if (restore_sigcontext(regs, &frame->sc)) |
557 | goto badframe; | 559 | goto badframe; |
558 | return ax; | 560 | return regs->ax; |
559 | 561 | ||
560 | badframe: | 562 | badframe: |
561 | signal_fault(regs, frame, "sigreturn"); | 563 | signal_fault(regs, frame, "sigreturn"); |
@@ -568,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void) | |||
568 | { | 570 | { |
569 | struct pt_regs *regs = current_pt_regs(); | 571 | struct pt_regs *regs = current_pt_regs(); |
570 | struct rt_sigframe __user *frame; | 572 | struct rt_sigframe __user *frame; |
571 | unsigned long ax; | ||
572 | sigset_t set; | 573 | sigset_t set; |
573 | 574 | ||
574 | frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); | 575 | frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); |
@@ -579,37 +580,23 @@ asmlinkage long sys_rt_sigreturn(void) | |||
579 | 580 | ||
580 | set_current_blocked(&set); | 581 | set_current_blocked(&set); |
581 | 582 | ||
582 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) | 583 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) |
583 | goto badframe; | 584 | goto badframe; |
584 | 585 | ||
585 | if (restore_altstack(&frame->uc.uc_stack)) | 586 | if (restore_altstack(&frame->uc.uc_stack)) |
586 | goto badframe; | 587 | goto badframe; |
587 | 588 | ||
588 | return ax; | 589 | return regs->ax; |
589 | 590 | ||
590 | badframe: | 591 | badframe: |
591 | signal_fault(regs, frame, "rt_sigreturn"); | 592 | signal_fault(regs, frame, "rt_sigreturn"); |
592 | return 0; | 593 | return 0; |
593 | } | 594 | } |
594 | 595 | ||
595 | /* | ||
596 | * OK, we're invoking a handler: | ||
597 | */ | ||
598 | static int signr_convert(int sig) | ||
599 | { | ||
600 | #ifdef CONFIG_X86_32 | ||
601 | struct thread_info *info = current_thread_info(); | ||
602 | |||
603 | if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) | ||
604 | return info->exec_domain->signal_invmap[sig]; | ||
605 | #endif /* CONFIG_X86_32 */ | ||
606 | return sig; | ||
607 | } | ||
608 | |||
609 | static int | 596 | static int |
610 | setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) | 597 | setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) |
611 | { | 598 | { |
612 | int usig = signr_convert(ksig->sig); | 599 | int usig = ksig->sig; |
613 | sigset_t *set = sigmask_to_save(); | 600 | sigset_t *set = sigmask_to_save(); |
614 | compat_sigset_t *cset = (compat_sigset_t *) set; | 601 | compat_sigset_t *cset = (compat_sigset_t *) set; |
615 | 602 | ||
@@ -629,7 +616,8 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) | |||
629 | static void | 616 | static void |
630 | handle_signal(struct ksignal *ksig, struct pt_regs *regs) | 617 | handle_signal(struct ksignal *ksig, struct pt_regs *regs) |
631 | { | 618 | { |
632 | bool failed; | 619 | bool stepping, failed; |
620 | |||
633 | /* Are we from a system call? */ | 621 | /* Are we from a system call? */ |
634 | if (syscall_get_nr(current, regs) >= 0) { | 622 | if (syscall_get_nr(current, regs) >= 0) { |
635 | /* If so, check system call restarting.. */ | 623 | /* If so, check system call restarting.. */ |
@@ -653,12 +641,13 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) | |||
653 | } | 641 | } |
654 | 642 | ||
655 | /* | 643 | /* |
656 | * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF | 644 | * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now |
657 | * flag so that register information in the sigcontext is correct. | 645 | * so that register information in the sigcontext is correct and |
646 | * then notify the tracer before entering the signal handler. | ||
658 | */ | 647 | */ |
659 | if (unlikely(regs->flags & X86_EFLAGS_TF) && | 648 | stepping = test_thread_flag(TIF_SINGLESTEP); |
660 | likely(test_and_clear_thread_flag(TIF_FORCED_TF))) | 649 | if (stepping) |
661 | regs->flags &= ~X86_EFLAGS_TF; | 650 | user_disable_single_step(current); |
662 | 651 | ||
663 | failed = (setup_rt_frame(ksig, regs) < 0); | 652 | failed = (setup_rt_frame(ksig, regs) < 0); |
664 | if (!failed) { | 653 | if (!failed) { |
@@ -669,19 +658,17 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) | |||
669 | * it might disable possible debug exception from the | 658 | * it might disable possible debug exception from the |
670 | * signal handler. | 659 | * signal handler. |
671 | * | 660 | * |
672 | * Clear TF when entering the signal handler, but | 661 | * Clear TF for the case when it wasn't set by debugger to |
673 | * notify any tracer that was single-stepping it. | 662 | * avoid the recursive send_sigtrap() in SIGTRAP handler. |
674 | * The tracer may want to single-step inside the | ||
675 | * handler too. | ||
676 | */ | 663 | */ |
677 | regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); | 664 | regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); |
678 | /* | 665 | /* |
679 | * Ensure the signal handler starts with the new fpu state. | 666 | * Ensure the signal handler starts with the new fpu state. |
680 | */ | 667 | */ |
681 | if (used_math()) | 668 | if (used_math()) |
682 | drop_init_fpu(current); | 669 | fpu_reset_state(current); |
683 | } | 670 | } |
684 | signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); | 671 | signal_setup_done(failed, ksig, stepping); |
685 | } | 672 | } |
686 | 673 | ||
687 | #ifdef CONFIG_X86_32 | 674 | #ifdef CONFIG_X86_32 |
@@ -780,7 +767,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void) | |||
780 | struct pt_regs *regs = current_pt_regs(); | 767 | struct pt_regs *regs = current_pt_regs(); |
781 | struct rt_sigframe_x32 __user *frame; | 768 | struct rt_sigframe_x32 __user *frame; |
782 | sigset_t set; | 769 | sigset_t set; |
783 | unsigned long ax; | ||
784 | 770 | ||
785 | frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); | 771 | frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); |
786 | 772 | ||
@@ -791,13 +777,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void) | |||
791 | 777 | ||
792 | set_current_blocked(&set); | 778 | set_current_blocked(&set); |
793 | 779 | ||
794 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) | 780 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) |
795 | goto badframe; | 781 | goto badframe; |
796 | 782 | ||
797 | if (compat_restore_altstack(&frame->uc.uc_stack)) | 783 | if (compat_restore_altstack(&frame->uc.uc_stack)) |
798 | goto badframe; | 784 | goto badframe; |
799 | 785 | ||
800 | return ax; | 786 | return regs->ax; |
801 | 787 | ||
802 | badframe: | 788 | badframe: |
803 | signal_fault(regs, frame, "x32 rt_sigreturn"); | 789 | signal_fault(regs, frame, "x32 rt_sigreturn"); |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index febc6aabc72e..50e547eac8cd 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -77,9 +77,6 @@ | |||
77 | #include <asm/realmode.h> | 77 | #include <asm/realmode.h> |
78 | #include <asm/misc.h> | 78 | #include <asm/misc.h> |
79 | 79 | ||
80 | /* State of each CPU */ | ||
81 | DEFINE_PER_CPU(int, cpu_state) = { 0 }; | ||
82 | |||
83 | /* Number of siblings per CPU package */ | 80 | /* Number of siblings per CPU package */ |
84 | int smp_num_siblings = 1; | 81 | int smp_num_siblings = 1; |
85 | EXPORT_SYMBOL(smp_num_siblings); | 82 | EXPORT_SYMBOL(smp_num_siblings); |
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused) | |||
257 | lock_vector_lock(); | 254 | lock_vector_lock(); |
258 | set_cpu_online(smp_processor_id(), true); | 255 | set_cpu_online(smp_processor_id(), true); |
259 | unlock_vector_lock(); | 256 | unlock_vector_lock(); |
260 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; | 257 | cpu_set_state_online(smp_processor_id()); |
261 | x86_platform.nmi_init(); | 258 | x86_platform.nmi_init(); |
262 | 259 | ||
263 | /* enable local interrupts */ | 260 | /* enable local interrupts */ |
@@ -779,6 +776,26 @@ out: | |||
779 | return boot_error; | 776 | return boot_error; |
780 | } | 777 | } |
781 | 778 | ||
779 | void common_cpu_up(unsigned int cpu, struct task_struct *idle) | ||
780 | { | ||
781 | /* Just in case we booted with a single CPU. */ | ||
782 | alternatives_enable_smp(); | ||
783 | |||
784 | per_cpu(current_task, cpu) = idle; | ||
785 | |||
786 | #ifdef CONFIG_X86_32 | ||
787 | /* Stack for startup_32 can be just as for start_secondary onwards */ | ||
788 | irq_ctx_init(cpu); | ||
789 | per_cpu(cpu_current_top_of_stack, cpu) = | ||
790 | (unsigned long)task_stack_page(idle) + THREAD_SIZE; | ||
791 | #else | ||
792 | clear_tsk_thread_flag(idle, TIF_FORK); | ||
793 | initial_gs = per_cpu_offset(cpu); | ||
794 | #endif | ||
795 | per_cpu(kernel_stack, cpu) = | ||
796 | (unsigned long)task_stack_page(idle) + THREAD_SIZE; | ||
797 | } | ||
798 | |||
782 | /* | 799 | /* |
783 | * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad | 800 | * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad |
784 | * (ie clustered apic addressing mode), this is a LOGICAL apic ID. | 801 | * (ie clustered apic addressing mode), this is a LOGICAL apic ID. |
@@ -796,23 +813,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) | |||
796 | int cpu0_nmi_registered = 0; | 813 | int cpu0_nmi_registered = 0; |
797 | unsigned long timeout; | 814 | unsigned long timeout; |
798 | 815 | ||
799 | /* Just in case we booted with a single CPU. */ | ||
800 | alternatives_enable_smp(); | ||
801 | |||
802 | idle->thread.sp = (unsigned long) (((struct pt_regs *) | 816 | idle->thread.sp = (unsigned long) (((struct pt_regs *) |
803 | (THREAD_SIZE + task_stack_page(idle))) - 1); | 817 | (THREAD_SIZE + task_stack_page(idle))) - 1); |
804 | per_cpu(current_task, cpu) = idle; | ||
805 | 818 | ||
806 | #ifdef CONFIG_X86_32 | ||
807 | /* Stack for startup_32 can be just as for start_secondary onwards */ | ||
808 | irq_ctx_init(cpu); | ||
809 | #else | ||
810 | clear_tsk_thread_flag(idle, TIF_FORK); | ||
811 | initial_gs = per_cpu_offset(cpu); | ||
812 | #endif | ||
813 | per_cpu(kernel_stack, cpu) = | ||
814 | (unsigned long)task_stack_page(idle) - | ||
815 | KERNEL_STACK_OFFSET + THREAD_SIZE; | ||
816 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); | 819 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); |
817 | initial_code = (unsigned long)start_secondary; | 820 | initial_code = (unsigned long)start_secondary; |
818 | stack_start = idle->thread.sp; | 821 | stack_start = idle->thread.sp; |
@@ -948,11 +951,16 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) | |||
948 | */ | 951 | */ |
949 | mtrr_save_state(); | 952 | mtrr_save_state(); |
950 | 953 | ||
951 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; | 954 | /* x86 CPUs take themselves offline, so delayed offline is OK. */ |
955 | err = cpu_check_up_prepare(cpu); | ||
956 | if (err && err != -EBUSY) | ||
957 | return err; | ||
952 | 958 | ||
953 | /* the FPU context is blank, nobody can own it */ | 959 | /* the FPU context is blank, nobody can own it */ |
954 | __cpu_disable_lazy_restore(cpu); | 960 | __cpu_disable_lazy_restore(cpu); |
955 | 961 | ||
962 | common_cpu_up(cpu, tidle); | ||
963 | |||
956 | err = do_boot_cpu(apicid, cpu, tidle); | 964 | err = do_boot_cpu(apicid, cpu, tidle); |
957 | if (err) { | 965 | if (err) { |
958 | pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); | 966 | pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); |
@@ -1086,8 +1094,6 @@ static int __init smp_sanity_check(unsigned max_cpus) | |||
1086 | return SMP_NO_APIC; | 1094 | return SMP_NO_APIC; |
1087 | } | 1095 | } |
1088 | 1096 | ||
1089 | verify_local_APIC(); | ||
1090 | |||
1091 | /* | 1097 | /* |
1092 | * If SMP should be disabled, then really disable it! | 1098 | * If SMP should be disabled, then really disable it! |
1093 | */ | 1099 | */ |
@@ -1191,7 +1197,7 @@ void __init native_smp_prepare_boot_cpu(void) | |||
1191 | switch_to_new_gdt(me); | 1197 | switch_to_new_gdt(me); |
1192 | /* already set me in cpu_online_mask in boot_cpu_init() */ | 1198 | /* already set me in cpu_online_mask in boot_cpu_init() */ |
1193 | cpumask_set_cpu(me, cpu_callout_mask); | 1199 | cpumask_set_cpu(me, cpu_callout_mask); |
1194 | per_cpu(cpu_state, me) = CPU_ONLINE; | 1200 | cpu_set_state_online(me); |
1195 | } | 1201 | } |
1196 | 1202 | ||
1197 | void __init native_smp_cpus_done(unsigned int max_cpus) | 1203 | void __init native_smp_cpus_done(unsigned int max_cpus) |
@@ -1318,14 +1324,10 @@ static void __ref remove_cpu_from_maps(int cpu) | |||
1318 | numa_remove_cpu(cpu); | 1324 | numa_remove_cpu(cpu); |
1319 | } | 1325 | } |
1320 | 1326 | ||
1321 | static DEFINE_PER_CPU(struct completion, die_complete); | ||
1322 | |||
1323 | void cpu_disable_common(void) | 1327 | void cpu_disable_common(void) |
1324 | { | 1328 | { |
1325 | int cpu = smp_processor_id(); | 1329 | int cpu = smp_processor_id(); |
1326 | 1330 | ||
1327 | init_completion(&per_cpu(die_complete, smp_processor_id())); | ||
1328 | |||
1329 | remove_siblinginfo(cpu); | 1331 | remove_siblinginfo(cpu); |
1330 | 1332 | ||
1331 | /* It's now safe to remove this processor from the online map */ | 1333 | /* It's now safe to remove this processor from the online map */ |
@@ -1349,24 +1351,27 @@ int native_cpu_disable(void) | |||
1349 | return 0; | 1351 | return 0; |
1350 | } | 1352 | } |
1351 | 1353 | ||
1352 | void cpu_die_common(unsigned int cpu) | 1354 | int common_cpu_die(unsigned int cpu) |
1353 | { | 1355 | { |
1354 | wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); | 1356 | int ret = 0; |
1355 | } | ||
1356 | 1357 | ||
1357 | void native_cpu_die(unsigned int cpu) | ||
1358 | { | ||
1359 | /* We don't do anything here: idle task is faking death itself. */ | 1358 | /* We don't do anything here: idle task is faking death itself. */ |
1360 | 1359 | ||
1361 | cpu_die_common(cpu); | ||
1362 | |||
1363 | /* They ack this in play_dead() by setting CPU_DEAD */ | 1360 | /* They ack this in play_dead() by setting CPU_DEAD */ |
1364 | if (per_cpu(cpu_state, cpu) == CPU_DEAD) { | 1361 | if (cpu_wait_death(cpu, 5)) { |
1365 | if (system_state == SYSTEM_RUNNING) | 1362 | if (system_state == SYSTEM_RUNNING) |
1366 | pr_info("CPU %u is now offline\n", cpu); | 1363 | pr_info("CPU %u is now offline\n", cpu); |
1367 | } else { | 1364 | } else { |
1368 | pr_err("CPU %u didn't die...\n", cpu); | 1365 | pr_err("CPU %u didn't die...\n", cpu); |
1366 | ret = -1; | ||
1369 | } | 1367 | } |
1368 | |||
1369 | return ret; | ||
1370 | } | ||
1371 | |||
1372 | void native_cpu_die(unsigned int cpu) | ||
1373 | { | ||
1374 | common_cpu_die(cpu); | ||
1370 | } | 1375 | } |
1371 | 1376 | ||
1372 | void play_dead_common(void) | 1377 | void play_dead_common(void) |
@@ -1375,10 +1380,8 @@ void play_dead_common(void) | |||
1375 | reset_lazy_tlbstate(); | 1380 | reset_lazy_tlbstate(); |
1376 | amd_e400_remove_cpu(raw_smp_processor_id()); | 1381 | amd_e400_remove_cpu(raw_smp_processor_id()); |
1377 | 1382 | ||
1378 | mb(); | ||
1379 | /* Ack it */ | 1383 | /* Ack it */ |
1380 | __this_cpu_write(cpu_state, CPU_DEAD); | 1384 | (void)cpu_report_death(); |
1381 | complete(&per_cpu(die_complete, smp_processor_id())); | ||
1382 | 1385 | ||
1383 | /* | 1386 | /* |
1384 | * With physical CPU hotplug, we should halt the cpu | 1387 | * With physical CPU hotplug, we should halt the cpu |
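The per-cpu completion is replaced by the generic CPU-death handshake; the two halves used above pair up as in the minimal restatement below (the 5-second timeout is taken from common_cpu_die(), the header location is assumed):

#include <linux/cpu.h>  /* assumed to declare cpu_wait_death()/cpu_report_death() */

/* Surviving CPU, from common_cpu_die(): wait up to 5 seconds for the dying
 * CPU to acknowledge, instead of waiting on the old per-cpu completion. */
static int wait_for_death_sketch(unsigned int cpu)
{
        return cpu_wait_death(cpu, 5) ? 0 : -1;
}

/* Dying CPU, from play_dead_common(): report death before halting; the return
 * value (whether the waiter had already given up) is deliberately ignored. */
static void report_death_sketch(void)
{
        (void)cpu_report_death();
}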
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 30277e27431a..10e0272d789a 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c | |||
@@ -34,10 +34,26 @@ static unsigned long get_align_mask(void) | |||
34 | return va_align.mask; | 34 | return va_align.mask; |
35 | } | 35 | } |
36 | 36 | ||
37 | /* | ||
38 | * To avoid aliasing in the I$ on AMD F15h, the bits defined by the | ||
39 | * va_align.bits, [12:upper_bit), are set to a random value instead of | ||
40 | * zeroing them. This random value is computed once per boot. This form | ||
41 | * of ASLR is known as "per-boot ASLR". | ||
42 | * | ||
43 | * To achieve this, the random value is added to the info.align_offset | ||
44 | * value before calling vm_unmapped_area() or ORed directly to the | ||
45 | * address. | ||
46 | */ | ||
47 | static unsigned long get_align_bits(void) | ||
48 | { | ||
49 | return va_align.bits & get_align_mask(); | ||
50 | } | ||
51 | |||
37 | unsigned long align_vdso_addr(unsigned long addr) | 52 | unsigned long align_vdso_addr(unsigned long addr) |
38 | { | 53 | { |
39 | unsigned long align_mask = get_align_mask(); | 54 | unsigned long align_mask = get_align_mask(); |
40 | return (addr + align_mask) & ~align_mask; | 55 | addr = (addr + align_mask) & ~align_mask; |
56 | return addr | get_align_bits(); | ||
41 | } | 57 | } |
42 | 58 | ||
43 | static int __init control_va_addr_alignment(char *str) | 59 | static int __init control_va_addr_alignment(char *str) |
@@ -135,8 +151,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
135 | info.length = len; | 151 | info.length = len; |
136 | info.low_limit = begin; | 152 | info.low_limit = begin; |
137 | info.high_limit = end; | 153 | info.high_limit = end; |
138 | info.align_mask = filp ? get_align_mask() : 0; | 154 | info.align_mask = 0; |
139 | info.align_offset = pgoff << PAGE_SHIFT; | 155 | info.align_offset = pgoff << PAGE_SHIFT; |
156 | if (filp) { | ||
157 | info.align_mask = get_align_mask(); | ||
158 | info.align_offset += get_align_bits(); | ||
159 | } | ||
140 | return vm_unmapped_area(&info); | 160 | return vm_unmapped_area(&info); |
141 | } | 161 | } |
142 | 162 | ||
@@ -174,8 +194,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
174 | info.length = len; | 194 | info.length = len; |
175 | info.low_limit = PAGE_SIZE; | 195 | info.low_limit = PAGE_SIZE; |
176 | info.high_limit = mm->mmap_base; | 196 | info.high_limit = mm->mmap_base; |
177 | info.align_mask = filp ? get_align_mask() : 0; | 197 | info.align_mask = 0; |
178 | info.align_offset = pgoff << PAGE_SHIFT; | 198 | info.align_offset = pgoff << PAGE_SHIFT; |
199 | if (filp) { | ||
200 | info.align_mask = get_align_mask(); | ||
201 | info.align_offset += get_align_bits(); | ||
202 | } | ||
179 | addr = vm_unmapped_area(&info); | 203 | addr = vm_unmapped_area(&info); |
180 | if (!(addr & ~PAGE_MASK)) | 204 | if (!(addr & ~PAGE_MASK)) |
181 | return addr; | 205 | return addr; |
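To make the per-boot ASLR comment above concrete, here is a hedged, user-space style sketch of the arithmetic with made-up mask/bits values (the real values come from va_align, which is filled in elsewhere at boot):

#include <stdint.h>
#include <stdio.h>

/* Illustration only: round a page-aligned address up to the I$-aliasing
 * boundary, then fill the aligned-away bits with a per-boot random value
 * instead of leaving them zero. */
static uint64_t align_with_random_bits(uint64_t addr, uint64_t mask, uint64_t bits)
{
        addr = (addr + mask) & ~mask;   /* what align_vdso_addr() did before */
        return addr | (bits & mask);    /* the new get_align_bits() contribution */
}

int main(void)
{
        /* Hypothetical F15h-style values: bits [12:15) participate in aliasing. */
        uint64_t mask = 0x7000;         /* stands in for get_align_mask() */
        uint64_t bits = 0x3000;         /* per-boot random value */

        /* Prints 0x12b000: aligned to 0x128000, then ORed with the random bits. */
        printf("%#llx\n",
               (unsigned long long)align_with_random_bits(0x123000, mask, bits));
        return 0;
}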
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index e9bcd57d8a9e..3777189c4a19 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c | |||
@@ -5,21 +5,29 @@ | |||
5 | #include <linux/cache.h> | 5 | #include <linux/cache.h> |
6 | #include <asm/asm-offsets.h> | 6 | #include <asm/asm-offsets.h> |
7 | 7 | ||
8 | #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; | 8 | #ifdef CONFIG_IA32_EMULATION |
9 | #define SYM(sym, compat) compat | ||
10 | #else | ||
11 | #define SYM(sym, compat) sym | ||
12 | #define ia32_sys_call_table sys_call_table | ||
13 | #define __NR_ia32_syscall_max __NR_syscall_max | ||
14 | #endif | ||
15 | |||
16 | #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; | ||
9 | #include <asm/syscalls_32.h> | 17 | #include <asm/syscalls_32.h> |
10 | #undef __SYSCALL_I386 | 18 | #undef __SYSCALL_I386 |
11 | 19 | ||
12 | #define __SYSCALL_I386(nr, sym, compat) [nr] = sym, | 20 | #define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), |
13 | 21 | ||
14 | typedef asmlinkage void (*sys_call_ptr_t)(void); | 22 | typedef asmlinkage void (*sys_call_ptr_t)(void); |
15 | 23 | ||
16 | extern asmlinkage void sys_ni_syscall(void); | 24 | extern asmlinkage void sys_ni_syscall(void); |
17 | 25 | ||
18 | __visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { | 26 | __visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { |
19 | /* | 27 | /* |
20 | * Smells like a compiler bug -- it doesn't work | 28 | * Smells like a compiler bug -- it doesn't work |
21 | * when the & below is removed. | 29 | * when the & below is removed. |
22 | */ | 30 | */ |
23 | [0 ... __NR_syscall_max] = &sys_ni_syscall, | 31 | [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, |
24 | #include <asm/syscalls_32.h> | 32 | #include <asm/syscalls_32.h> |
25 | }; | 33 | }; |
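A worked expansion may help here; the entry used below is an assumed but representative line from asm/syscalls_32.h:

/*
 * Assumed entry:       __SYSCALL_I386(5, sys_open, compat_sys_open)
 *
 * With CONFIG_IA32_EMULATION, SYM(sym, compat) picks the compat symbol, so the
 * two passes over syscalls_32.h expand to:
 *
 *      extern asmlinkage void compat_sys_open(void);
 *      [5] = compat_sys_open,
 *
 * Without IA32 emulation, ia32_sys_call_table is #defined back to
 * sys_call_table, __NR_ia32_syscall_max back to __NR_syscall_max, and the
 * native symbol is used instead:
 *
 *      extern asmlinkage void sys_open(void);
 *      [5] = sys_open,
 */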
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index b79133abda48..5ecbfe5099da 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c | |||
@@ -57,7 +57,7 @@ int rodata_test(void) | |||
57 | /* test 3: check the value hasn't changed */ | 57 | /* test 3: check the value hasn't changed */ |
58 | /* If this test fails, we managed to overwrite the data */ | 58 | /* If this test fails, we managed to overwrite the data */ |
59 | if (!rodata_test_data) { | 59 | if (!rodata_test_data) { |
60 | printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n"); | 60 | printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n"); |
61 | return -ENODEV; | 61 | return -ENODEV; |
62 | } | 62 | } |
63 | /* test 4: check if the rodata section is 4Kb aligned */ | 63 | /* test 4: check if the rodata section is 4Kb aligned */ |
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 25adc0e16eaa..d39c09119db6 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c | |||
@@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs) | |||
30 | { | 30 | { |
31 | unsigned long pc = instruction_pointer(regs); | 31 | unsigned long pc = instruction_pointer(regs); |
32 | 32 | ||
33 | if (!user_mode_vm(regs) && in_lock_functions(pc)) { | 33 | if (!user_mode(regs) && in_lock_functions(pc)) { |
34 | #ifdef CONFIG_FRAME_POINTER | 34 | #ifdef CONFIG_FRAME_POINTER |
35 | return *(unsigned long *)(regs->bp + sizeof(long)); | 35 | return *(unsigned long *)(regs->bp + sizeof(long)); |
36 | #else | 36 | #else |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4ff5d162ff9f..324ab5247687 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) | |||
112 | { | 112 | { |
113 | enum ctx_state prev_state; | 113 | enum ctx_state prev_state; |
114 | 114 | ||
115 | if (user_mode_vm(regs)) { | 115 | if (user_mode(regs)) { |
116 | /* Other than that, we're just an exception. */ | 116 | /* Other than that, we're just an exception. */ |
117 | prev_state = exception_enter(); | 117 | prev_state = exception_enter(); |
118 | } else { | 118 | } else { |
@@ -123,7 +123,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) | |||
123 | * but we need to notify RCU. | 123 | * but we need to notify RCU. |
124 | */ | 124 | */ |
125 | rcu_nmi_enter(); | 125 | rcu_nmi_enter(); |
126 | prev_state = IN_KERNEL; /* the value is irrelevant. */ | 126 | prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */ |
127 | } | 127 | } |
128 | 128 | ||
129 | /* | 129 | /* |
@@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) | |||
146 | /* Must be before exception_exit. */ | 146 | /* Must be before exception_exit. */ |
147 | preempt_count_sub(HARDIRQ_OFFSET); | 147 | preempt_count_sub(HARDIRQ_OFFSET); |
148 | 148 | ||
149 | if (user_mode_vm(regs)) | 149 | if (user_mode(regs)) |
150 | return exception_exit(prev_state); | 150 | return exception_exit(prev_state); |
151 | else | 151 | else |
152 | rcu_nmi_exit(); | 152 | rcu_nmi_exit(); |
@@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) | |||
158 | * | 158 | * |
159 | * IST exception handlers normally cannot schedule. As a special | 159 | * IST exception handlers normally cannot schedule. As a special |
160 | * exception, if the exception interrupted userspace code (i.e. | 160 | * exception, if the exception interrupted userspace code (i.e. |
161 | * user_mode_vm(regs) would return true) and the exception was not | 161 | * user_mode(regs) would return true) and the exception was not |
162 | * a double fault, it can be safe to schedule. ist_begin_non_atomic() | 162 | * a double fault, it can be safe to schedule. ist_begin_non_atomic() |
163 | * begins a non-atomic section within an ist_enter()/ist_exit() region. | 163 | * begins a non-atomic section within an ist_enter()/ist_exit() region. |
164 | * Callers are responsible for enabling interrupts themselves inside | 164 | * Callers are responsible for enabling interrupts themselves inside |
@@ -167,15 +167,15 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) | |||
167 | */ | 167 | */ |
168 | void ist_begin_non_atomic(struct pt_regs *regs) | 168 | void ist_begin_non_atomic(struct pt_regs *regs) |
169 | { | 169 | { |
170 | BUG_ON(!user_mode_vm(regs)); | 170 | BUG_ON(!user_mode(regs)); |
171 | 171 | ||
172 | /* | 172 | /* |
173 | * Sanity check: we need to be on the normal thread stack. This | 173 | * Sanity check: we need to be on the normal thread stack. This |
174 | * will catch asm bugs and any attempt to use ist_preempt_enable | 174 | * will catch asm bugs and any attempt to use ist_preempt_enable |
175 | * from double_fault. | 175 | * from double_fault. |
176 | */ | 176 | */ |
177 | BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) | 177 | BUG_ON((unsigned long)(current_top_of_stack() - |
178 | & ~(THREAD_SIZE - 1)) != 0); | 178 | current_stack_pointer()) >= THREAD_SIZE); |
179 | 179 | ||
180 | preempt_count_sub(HARDIRQ_OFFSET); | 180 | preempt_count_sub(HARDIRQ_OFFSET); |
181 | } | 181 | } |
@@ -194,8 +194,7 @@ static nokprobe_inline int | |||
194 | do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, | 194 | do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, |
195 | struct pt_regs *regs, long error_code) | 195 | struct pt_regs *regs, long error_code) |
196 | { | 196 | { |
197 | #ifdef CONFIG_X86_32 | 197 | if (v8086_mode(regs)) { |
198 | if (regs->flags & X86_VM_MASK) { | ||
199 | /* | 198 | /* |
200 | * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. | 199 | * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. |
201 | * On nmi (interrupt 2), do_trap should not be called. | 200 | * On nmi (interrupt 2), do_trap should not be called. |
@@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, | |||
207 | } | 206 | } |
208 | return -1; | 207 | return -1; |
209 | } | 208 | } |
210 | #endif | 209 | |
211 | if (!user_mode(regs)) { | 210 | if (!user_mode(regs)) { |
212 | if (!fixup_exception(regs)) { | 211 | if (!fixup_exception(regs)) { |
213 | tsk->thread.error_code = error_code; | 212 | tsk->thread.error_code = error_code; |
@@ -384,7 +383,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) | |||
384 | goto exit; | 383 | goto exit; |
385 | conditional_sti(regs); | 384 | conditional_sti(regs); |
386 | 385 | ||
387 | if (!user_mode_vm(regs)) | 386 | if (!user_mode(regs)) |
388 | die("bounds", regs, error_code); | 387 | die("bounds", regs, error_code); |
389 | 388 | ||
390 | if (!cpu_feature_enabled(X86_FEATURE_MPX)) { | 389 | if (!cpu_feature_enabled(X86_FEATURE_MPX)) { |
@@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code) | |||
462 | prev_state = exception_enter(); | 461 | prev_state = exception_enter(); |
463 | conditional_sti(regs); | 462 | conditional_sti(regs); |
464 | 463 | ||
465 | #ifdef CONFIG_X86_32 | 464 | if (v8086_mode(regs)) { |
466 | if (regs->flags & X86_VM_MASK) { | ||
467 | local_irq_enable(); | 465 | local_irq_enable(); |
468 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); | 466 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); |
469 | goto exit; | 467 | goto exit; |
470 | } | 468 | } |
471 | #endif | ||
472 | 469 | ||
473 | tsk = current; | 470 | tsk = current; |
474 | if (!user_mode(regs)) { | 471 | if (!user_mode(regs)) { |
@@ -587,7 +584,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) | |||
587 | /* Copy the remainder of the stack from the current stack. */ | 584 | /* Copy the remainder of the stack from the current stack. */ |
588 | memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); | 585 | memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); |
589 | 586 | ||
590 | BUG_ON(!user_mode_vm(&new_stack->regs)); | 587 | BUG_ON(!user_mode(&new_stack->regs)); |
591 | return new_stack; | 588 | return new_stack; |
592 | } | 589 | } |
593 | NOKPROBE_SYMBOL(fixup_bad_iret); | 590 | NOKPROBE_SYMBOL(fixup_bad_iret); |
@@ -637,7 +634,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) | |||
637 | * then it's very likely the result of an icebp/int01 trap. | 634 | * then it's very likely the result of an icebp/int01 trap. |
638 | * User wants a sigtrap for that. | 635 | * User wants a sigtrap for that. |
639 | */ | 636 | */ |
640 | if (!dr6 && user_mode_vm(regs)) | 637 | if (!dr6 && user_mode(regs)) |
641 | user_icebp = 1; | 638 | user_icebp = 1; |
642 | 639 | ||
643 | /* Catch kmemcheck conditions first of all! */ | 640 | /* Catch kmemcheck conditions first of all! */ |
@@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) | |||
673 | /* It's safe to allow irq's after DR6 has been saved */ | 670 | /* It's safe to allow irq's after DR6 has been saved */ |
674 | preempt_conditional_sti(regs); | 671 | preempt_conditional_sti(regs); |
675 | 672 | ||
676 | if (regs->flags & X86_VM_MASK) { | 673 | if (v8086_mode(regs)) { |
677 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, | 674 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, |
678 | X86_TRAP_DB); | 675 | X86_TRAP_DB); |
679 | preempt_conditional_cli(regs); | 676 | preempt_conditional_cli(regs); |
@@ -721,7 +718,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) | |||
721 | return; | 718 | return; |
722 | conditional_sti(regs); | 719 | conditional_sti(regs); |
723 | 720 | ||
724 | if (!user_mode_vm(regs)) | 721 | if (!user_mode(regs)) |
725 | { | 722 | { |
726 | if (!fixup_exception(regs)) { | 723 | if (!fixup_exception(regs)) { |
727 | task->thread.error_code = error_code; | 724 | task->thread.error_code = error_code; |
@@ -734,7 +731,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) | |||
734 | /* | 731 | /* |
735 | * Save the info for the exception handler and clear the error. | 732 | * Save the info for the exception handler and clear the error. |
736 | */ | 733 | */ |
737 | save_init_fpu(task); | 734 | unlazy_fpu(task); |
738 | task->thread.trap_nr = trapnr; | 735 | task->thread.trap_nr = trapnr; |
739 | task->thread.error_code = error_code; | 736 | task->thread.error_code = error_code; |
740 | info.si_signo = SIGFPE; | 737 | info.si_signo = SIGFPE; |
@@ -863,7 +860,7 @@ void math_state_restore(void) | |||
863 | kernel_fpu_disable(); | 860 | kernel_fpu_disable(); |
864 | __thread_fpu_begin(tsk); | 861 | __thread_fpu_begin(tsk); |
865 | if (unlikely(restore_fpu_checking(tsk))) { | 862 | if (unlikely(restore_fpu_checking(tsk))) { |
866 | drop_init_fpu(tsk); | 863 | fpu_reset_state(tsk); |
867 | force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); | 864 | force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); |
868 | } else { | 865 | } else { |
869 | tsk->thread.fpu_counter++; | 866 | tsk->thread.fpu_counter++; |
@@ -925,9 +922,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) | |||
925 | /* Set of traps needed for early debugging. */ | 922 | /* Set of traps needed for early debugging. */ |
926 | void __init early_trap_init(void) | 923 | void __init early_trap_init(void) |
927 | { | 924 | { |
928 | set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); | 925 | /* |
926 | * Don't use IST to set DEBUG_STACK as it doesn't work until TSS | ||
927 | * is ready in cpu_init() <-- trap_init(). Before trap_init(), | ||
928 | * CPU runs at ring 0 so it is impossible to hit an invalid | ||
929 | * stack. Using the original stack works well enough at this | ||
930 | * early stage. DEBUG_STACK will be equipped after cpu_init() in | ||
931 | * trap_init(). | ||
932 | * | ||
933 | * We don't need to set trace_idt_table like set_intr_gate(), | ||
934 | * since we don't have trace_debug and it will be reset to | ||
935 | * 'debug' in trap_init() by set_intr_gate_ist(). | ||
936 | */ | ||
937 | set_intr_gate_notrace(X86_TRAP_DB, debug); | ||
929 | /* int3 can be called from all */ | 938 | /* int3 can be called from all */ |
930 | set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); | 939 | set_system_intr_gate(X86_TRAP_BP, &int3); |
931 | #ifdef CONFIG_X86_32 | 940 | #ifdef CONFIG_X86_32 |
932 | set_intr_gate(X86_TRAP_PF, page_fault); | 941 | set_intr_gate(X86_TRAP_PF, page_fault); |
933 | #endif | 942 | #endif |
@@ -1005,6 +1014,15 @@ void __init trap_init(void) | |||
1005 | */ | 1014 | */ |
1006 | cpu_init(); | 1015 | cpu_init(); |
1007 | 1016 | ||
1017 | /* | ||
1018 | * X86_TRAP_DB and X86_TRAP_BP have been set | ||
1019 | * in early_trap_init(). However, IST works only after | ||
1020 | * cpu_init() loads TSS. See comments in early_trap_init(). | ||
1021 | */ | ||
1022 | set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); | ||
1023 | /* int3 can be called from all */ | ||
1024 | set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); | ||
1025 | |||
1008 | x86_init.irqs.trap_init(); | 1026 | x86_init.irqs.trap_init(); |
1009 | 1027 | ||
1010 | #ifdef CONFIG_X86_64 | 1028 | #ifdef CONFIG_X86_64 |
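
Two themes run through the traps.c hunks above: the user_mode_vm()/user_mode() split collapses into a single user_mode() check, and ist_begin_non_atomic() replaces the old stack-mask comparison with a distance check against the per-CPU top of stack. A small user-space model of that distance check, with made-up addresses and an illustrative THREAD_SIZE:

#include <assert.h>
#include <stdbool.h>

#define THREAD_SIZE (16 * 1024UL)	/* illustrative; the real value is per-config */

/* Mirrors the new BUG_ON() condition, inverted: sp is on the thread
 * stack iff it lies less than THREAD_SIZE bytes below the recorded top. */
static bool on_thread_stack(unsigned long top, unsigned long sp)
{
	return (top - sp) < THREAD_SIZE;
}

int main(void)
{
	unsigned long top = 0x100000UL;	/* made-up top-of-stack address */

	assert(on_thread_stack(top, top - 8));			/* just below top */
	assert(!on_thread_stack(top, top - THREAD_SIZE));	/* past the end */
	return 0;
}
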
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 81f8adb0679e..0b81ad67da07 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c | |||
@@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, | |||
912 | int ret = NOTIFY_DONE; | 912 | int ret = NOTIFY_DONE; |
913 | 913 | ||
914 | /* We are only interested in userspace traps */ | 914 | /* We are only interested in userspace traps */ |
915 | if (regs && !user_mode_vm(regs)) | 915 | if (regs && !user_mode(regs)) |
916 | return NOTIFY_DONE; | 916 | return NOTIFY_DONE; |
917 | 917 | ||
918 | switch (val) { | 918 | switch (val) { |
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e8edcf52e069..fc9db6ef2a95 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) | |||
150 | do_exit(SIGSEGV); | 150 | do_exit(SIGSEGV); |
151 | } | 151 | } |
152 | 152 | ||
153 | tss = &per_cpu(init_tss, get_cpu()); | 153 | tss = &per_cpu(cpu_tss, get_cpu()); |
154 | current->thread.sp0 = current->thread.saved_sp0; | 154 | current->thread.sp0 = current->thread.saved_sp0; |
155 | current->thread.sysenter_cs = __KERNEL_CS; | 155 | current->thread.sysenter_cs = __KERNEL_CS; |
156 | load_sp0(tss, ¤t->thread); | 156 | load_sp0(tss, ¤t->thread); |
@@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk | |||
318 | tsk->thread.saved_fs = info->regs32->fs; | 318 | tsk->thread.saved_fs = info->regs32->fs; |
319 | tsk->thread.saved_gs = get_user_gs(info->regs32); | 319 | tsk->thread.saved_gs = get_user_gs(info->regs32); |
320 | 320 | ||
321 | tss = &per_cpu(init_tss, get_cpu()); | 321 | tss = &per_cpu(cpu_tss, get_cpu()); |
322 | tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; | 322 | tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; |
323 | if (cpu_has_sep) | 323 | if (cpu_has_sep) |
324 | tsk->thread.sysenter_cs = 0; | 324 | tsk->thread.sysenter_cs = 0; |
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c index c7d791f32b98..51e330416995 100644 --- a/arch/x86/kernel/vsyscall_gtod.c +++ b/arch/x86/kernel/vsyscall_gtod.c | |||
@@ -31,30 +31,30 @@ void update_vsyscall(struct timekeeper *tk) | |||
31 | gtod_write_begin(vdata); | 31 | gtod_write_begin(vdata); |
32 | 32 | ||
33 | /* copy vsyscall data */ | 33 | /* copy vsyscall data */ |
34 | vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode; | 34 | vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; |
35 | vdata->cycle_last = tk->tkr.cycle_last; | 35 | vdata->cycle_last = tk->tkr_mono.cycle_last; |
36 | vdata->mask = tk->tkr.mask; | 36 | vdata->mask = tk->tkr_mono.mask; |
37 | vdata->mult = tk->tkr.mult; | 37 | vdata->mult = tk->tkr_mono.mult; |
38 | vdata->shift = tk->tkr.shift; | 38 | vdata->shift = tk->tkr_mono.shift; |
39 | 39 | ||
40 | vdata->wall_time_sec = tk->xtime_sec; | 40 | vdata->wall_time_sec = tk->xtime_sec; |
41 | vdata->wall_time_snsec = tk->tkr.xtime_nsec; | 41 | vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; |
42 | 42 | ||
43 | vdata->monotonic_time_sec = tk->xtime_sec | 43 | vdata->monotonic_time_sec = tk->xtime_sec |
44 | + tk->wall_to_monotonic.tv_sec; | 44 | + tk->wall_to_monotonic.tv_sec; |
45 | vdata->monotonic_time_snsec = tk->tkr.xtime_nsec | 45 | vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec |
46 | + ((u64)tk->wall_to_monotonic.tv_nsec | 46 | + ((u64)tk->wall_to_monotonic.tv_nsec |
47 | << tk->tkr.shift); | 47 | << tk->tkr_mono.shift); |
48 | while (vdata->monotonic_time_snsec >= | 48 | while (vdata->monotonic_time_snsec >= |
49 | (((u64)NSEC_PER_SEC) << tk->tkr.shift)) { | 49 | (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { |
50 | vdata->monotonic_time_snsec -= | 50 | vdata->monotonic_time_snsec -= |
51 | ((u64)NSEC_PER_SEC) << tk->tkr.shift; | 51 | ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; |
52 | vdata->monotonic_time_sec++; | 52 | vdata->monotonic_time_sec++; |
53 | } | 53 | } |
54 | 54 | ||
55 | vdata->wall_time_coarse_sec = tk->xtime_sec; | 55 | vdata->wall_time_coarse_sec = tk->xtime_sec; |
56 | vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >> | 56 | vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> |
57 | tk->tkr.shift); | 57 | tk->tkr_mono.shift); |
58 | 58 | ||
59 | vdata->monotonic_time_coarse_sec = | 59 | vdata->monotonic_time_coarse_sec = |
60 | vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; | 60 | vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; |
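
The vsyscall_gtod.c hunk above only renames tk->tkr to tk->tkr_mono, but it is a convenient place to recall what the loop it touches does: xtime_nsec is kept scaled by 2^shift, and whole seconds are carried out of it until the shifted remainder is below one second. A stand-alone sketch with invented values:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	unsigned int shift = 8;				/* illustrative clock shift */
	uint64_t sec = 100;				/* whole seconds so far */
	uint64_t snsec = (3ULL * NSEC_PER_SEC) << shift; /* 3 s of shifted ns */

	/* Same carry loop as update_vsyscall(): move whole seconds out of
	 * the shifted-nanosecond remainder. */
	while (snsec >= (NSEC_PER_SEC << shift)) {
		snsec -= NSEC_PER_SEC << shift;
		sec++;
	}
	printf("sec=%llu snsec=%llu\n",
	       (unsigned long long)sec, (unsigned long long)snsec); /* 103 and 0 */
	return 0;
}
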
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index cdc6cf903078..87a815b85f3e 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c | |||
@@ -342,7 +342,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) | |||
342 | config_enabled(CONFIG_IA32_EMULATION)); | 342 | config_enabled(CONFIG_IA32_EMULATION)); |
343 | 343 | ||
344 | if (!buf) { | 344 | if (!buf) { |
345 | drop_init_fpu(tsk); | 345 | fpu_reset_state(tsk); |
346 | return 0; | 346 | return 0; |
347 | } | 347 | } |
348 | 348 | ||
@@ -416,7 +416,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) | |||
416 | */ | 416 | */ |
417 | user_fpu_begin(); | 417 | user_fpu_begin(); |
418 | if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { | 418 | if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { |
419 | drop_init_fpu(tsk); | 419 | fpu_reset_state(tsk); |
420 | return -1; | 420 | return -1; |
421 | } | 421 | } |
422 | } | 422 | } |
@@ -678,19 +678,13 @@ void xsave_init(void) | |||
678 | this_func(); | 678 | this_func(); |
679 | } | 679 | } |
680 | 680 | ||
681 | static inline void __init eager_fpu_init_bp(void) | 681 | /* |
682 | { | 682 | * setup_init_fpu_buf() is __init and it is OK to call it here because |
683 | current->thread.fpu.state = | 683 | * init_xstate_buf will be unset only once during boot. |
684 | alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); | 684 | */ |
685 | if (!init_xstate_buf) | 685 | void __init_refok eager_fpu_init(void) |
686 | setup_init_fpu_buf(); | ||
687 | } | ||
688 | |||
689 | void eager_fpu_init(void) | ||
690 | { | 686 | { |
691 | static __refdata void (*boot_func)(void) = eager_fpu_init_bp; | 687 | WARN_ON(used_math()); |
692 | |||
693 | clear_used_math(); | ||
694 | current_thread_info()->status = 0; | 688 | current_thread_info()->status = 0; |
695 | 689 | ||
696 | if (eagerfpu == ENABLE) | 690 | if (eagerfpu == ENABLE) |
@@ -701,21 +695,8 @@ void eager_fpu_init(void) | |||
701 | return; | 695 | return; |
702 | } | 696 | } |
703 | 697 | ||
704 | if (boot_func) { | 698 | if (!init_xstate_buf) |
705 | boot_func(); | 699 | setup_init_fpu_buf(); |
706 | boot_func = NULL; | ||
707 | } | ||
708 | |||
709 | /* | ||
710 | * This is same as math_state_restore(). But use_xsave() is | ||
711 | * not yet patched to use math_state_restore(). | ||
712 | */ | ||
713 | init_fpu(current); | ||
714 | __thread_fpu_begin(current); | ||
715 | if (cpu_has_xsave) | ||
716 | xrstor_state(init_xstate_buf, -1); | ||
717 | else | ||
718 | fxrstor_checking(&init_xstate_buf->i387); | ||
719 | } | 700 | } |
720 | 701 | ||
721 | /* | 702 | /* |
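
The xsave.c change above folds eager_fpu_init_bp() into eager_fpu_init() and keys the one-time setup_init_fpu_buf() call on init_xstate_buf still being NULL, which is why reaching the __init callee from a __init_refok caller is safe. A tiny user-space model of that guard-by-pointer idiom, with placeholder names only:

#include <stdio.h>
#include <stdlib.h>

static void *init_state_buf;		/* stands in for init_xstate_buf */

static void setup_init_buf(void)	/* stands in for setup_init_fpu_buf() */
{
	init_state_buf = calloc(1, 512);	/* pretend xstate_size == 512 */
	puts("init buffer set up");
}

static void eager_init(void)
{
	if (!init_state_buf)	/* NULL exactly once, early on */
		setup_init_buf();
}

int main(void)
{
	eager_init();
	eager_init();	/* second call sees the buffer and does nothing */
	free(init_state_buf);
	return 0;
}
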
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 08f790dfadc9..16e8f962eaad 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | 1 | ||
2 | ccflags-y += -Ivirt/kvm -Iarch/x86/kvm | 2 | ccflags-y += -Iarch/x86/kvm |
3 | 3 | ||
4 | CFLAGS_x86.o := -I. | 4 | CFLAGS_x86.o := -I. |
5 | CFLAGS_svm.o := -I. | 5 | CFLAGS_svm.o := -I. |
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8a80737ee6e6..59b69f6a2844 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c | |||
@@ -104,6 +104,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) | |||
104 | ((best->eax & 0xff00) >> 8) != 0) | 104 | ((best->eax & 0xff00) >> 8) != 0) |
105 | return -EINVAL; | 105 | return -EINVAL; |
106 | 106 | ||
107 | /* Update physical-address width */ | ||
108 | vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); | ||
109 | |||
107 | kvm_pmu_cpuid_update(vcpu); | 110 | kvm_pmu_cpuid_update(vcpu); |
108 | return 0; | 111 | return 0; |
109 | } | 112 | } |
@@ -135,6 +138,21 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) | |||
135 | } | 138 | } |
136 | } | 139 | } |
137 | 140 | ||
141 | int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) | ||
142 | { | ||
143 | struct kvm_cpuid_entry2 *best; | ||
144 | |||
145 | best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); | ||
146 | if (!best || best->eax < 0x80000008) | ||
147 | goto not_found; | ||
148 | best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); | ||
149 | if (best) | ||
150 | return best->eax & 0xff; | ||
151 | not_found: | ||
152 | return 36; | ||
153 | } | ||
154 | EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr); | ||
155 | |||
138 | /* when an old userspace process fills a new kernel module */ | 156 | /* when an old userspace process fills a new kernel module */ |
139 | int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, | 157 | int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, |
140 | struct kvm_cpuid *cpuid, | 158 | struct kvm_cpuid *cpuid, |
@@ -757,21 +775,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, | |||
757 | } | 775 | } |
758 | EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); | 776 | EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); |
759 | 777 | ||
760 | int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) | ||
761 | { | ||
762 | struct kvm_cpuid_entry2 *best; | ||
763 | |||
764 | best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); | ||
765 | if (!best || best->eax < 0x80000008) | ||
766 | goto not_found; | ||
767 | best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); | ||
768 | if (best) | ||
769 | return best->eax & 0xff; | ||
770 | not_found: | ||
771 | return 36; | ||
772 | } | ||
773 | EXPORT_SYMBOL_GPL(cpuid_maxphyaddr); | ||
774 | |||
775 | /* | 778 | /* |
776 | * If no match is found, check whether we exceed the vCPU's limit | 779 | * If no match is found, check whether we exceed the vCPU's limit |
777 | * and return the content of the highest valid _standard_ leaf instead. | 780 | * and return the content of the highest valid _standard_ leaf instead. |
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 4452eedfaedd..c3b1ad9fca81 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h | |||
@@ -20,13 +20,19 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, | |||
20 | struct kvm_cpuid_entry2 __user *entries); | 20 | struct kvm_cpuid_entry2 __user *entries); |
21 | void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); | 21 | void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); |
22 | 22 | ||
23 | int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); | ||
24 | |||
25 | static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) | ||
26 | { | ||
27 | return vcpu->arch.maxphyaddr; | ||
28 | } | ||
23 | 29 | ||
24 | static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) | 30 | static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) |
25 | { | 31 | { |
26 | struct kvm_cpuid_entry2 *best; | 32 | struct kvm_cpuid_entry2 *best; |
27 | 33 | ||
28 | if (!static_cpu_has(X86_FEATURE_XSAVE)) | 34 | if (!static_cpu_has(X86_FEATURE_XSAVE)) |
29 | return 0; | 35 | return false; |
30 | 36 | ||
31 | best = kvm_find_cpuid_entry(vcpu, 1, 0); | 37 | best = kvm_find_cpuid_entry(vcpu, 1, 0); |
32 | return best && (best->ecx & bit(X86_FEATURE_XSAVE)); | 38 | return best && (best->ecx & bit(X86_FEATURE_XSAVE)); |
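
The cpuid.c/cpuid.h pair above moves the MAXPHYADDR lookup to kvm_update_cpuid() time and caches the result in vcpu->arch.maxphyaddr, leaving cpuid_maxphyaddr() as a trivial accessor. The lookup itself reads bits 7:0 of EAX from leaf 0x80000008 and falls back to 36 when the leaf is absent. A user-space sketch performing the same query against the host CPU (GCC's <cpuid.h>, x86 only):

#include <cpuid.h>
#include <stdio.h>

static unsigned int query_maxphyaddr(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* No 0x80000008 leaf: fall back to 36 bits, as the hunk does. */
	if (!__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) ||
	    eax < 0x80000008)
		return 36;
	__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
	return eax & 0xff;	/* EAX[7:0] = physical-address width */
}

int main(void)
{
	printf("MAXPHYADDR = %u bits\n", query_maxphyaddr());
	return 0;
}
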
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 106c01557f2b..630bcb0d7a04 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c | |||
@@ -248,27 +248,7 @@ struct mode_dual { | |||
248 | struct opcode mode64; | 248 | struct opcode mode64; |
249 | }; | 249 | }; |
250 | 250 | ||
251 | /* EFLAGS bit definitions. */ | ||
252 | #define EFLG_ID (1<<21) | ||
253 | #define EFLG_VIP (1<<20) | ||
254 | #define EFLG_VIF (1<<19) | ||
255 | #define EFLG_AC (1<<18) | ||
256 | #define EFLG_VM (1<<17) | ||
257 | #define EFLG_RF (1<<16) | ||
258 | #define EFLG_IOPL (3<<12) | ||
259 | #define EFLG_NT (1<<14) | ||
260 | #define EFLG_OF (1<<11) | ||
261 | #define EFLG_DF (1<<10) | ||
262 | #define EFLG_IF (1<<9) | ||
263 | #define EFLG_TF (1<<8) | ||
264 | #define EFLG_SF (1<<7) | ||
265 | #define EFLG_ZF (1<<6) | ||
266 | #define EFLG_AF (1<<4) | ||
267 | #define EFLG_PF (1<<2) | ||
268 | #define EFLG_CF (1<<0) | ||
269 | |||
270 | #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a | 251 | #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a |
271 | #define EFLG_RESERVED_ONE_MASK 2 | ||
272 | 252 | ||
273 | enum x86_transfer_type { | 253 | enum x86_transfer_type { |
274 | X86_TRANSFER_NONE, | 254 | X86_TRANSFER_NONE, |
@@ -317,7 +297,8 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) | |||
317 | * These EFLAGS bits are restored from saved value during emulation, and | 297 | * These EFLAGS bits are restored from saved value during emulation, and |
318 | * any changes are written back to the saved value after emulation. | 298 | * any changes are written back to the saved value after emulation. |
319 | */ | 299 | */ |
320 | #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) | 300 | #define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\ |
301 | X86_EFLAGS_PF|X86_EFLAGS_CF) | ||
321 | 302 | ||
322 | #ifdef CONFIG_X86_64 | 303 | #ifdef CONFIG_X86_64 |
323 | #define ON64(x) x | 304 | #define ON64(x) x |
@@ -478,6 +459,25 @@ static void assign_masked(ulong *dest, ulong src, ulong mask) | |||
478 | *dest = (*dest & ~mask) | (src & mask); | 459 | *dest = (*dest & ~mask) | (src & mask); |
479 | } | 460 | } |
480 | 461 | ||
462 | static void assign_register(unsigned long *reg, u64 val, int bytes) | ||
463 | { | ||
464 | /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ | ||
465 | switch (bytes) { | ||
466 | case 1: | ||
467 | *(u8 *)reg = (u8)val; | ||
468 | break; | ||
469 | case 2: | ||
470 | *(u16 *)reg = (u16)val; | ||
471 | break; | ||
472 | case 4: | ||
473 | *reg = (u32)val; | ||
474 | break; /* 64b: zero-extend */ | ||
475 | case 8: | ||
476 | *reg = val; | ||
477 | break; | ||
478 | } | ||
479 | } | ||
480 | |||
481 | static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) | 481 | static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) |
482 | { | 482 | { |
483 | return (1UL << (ctxt->ad_bytes << 3)) - 1; | 483 | return (1UL << (ctxt->ad_bytes << 3)) - 1; |
@@ -943,6 +943,22 @@ FASTOP2(xadd); | |||
943 | 943 | ||
944 | FASTOP2R(cmp, cmp_r); | 944 | FASTOP2R(cmp, cmp_r); |
945 | 945 | ||
946 | static int em_bsf_c(struct x86_emulate_ctxt *ctxt) | ||
947 | { | ||
948 | /* If src is zero, do not writeback, but update flags */ | ||
949 | if (ctxt->src.val == 0) | ||
950 | ctxt->dst.type = OP_NONE; | ||
951 | return fastop(ctxt, em_bsf); | ||
952 | } | ||
953 | |||
954 | static int em_bsr_c(struct x86_emulate_ctxt *ctxt) | ||
955 | { | ||
956 | /* If src is zero, do not writeback, but update flags */ | ||
957 | if (ctxt->src.val == 0) | ||
958 | ctxt->dst.type = OP_NONE; | ||
959 | return fastop(ctxt, em_bsr); | ||
960 | } | ||
961 | |||
946 | static u8 test_cc(unsigned int condition, unsigned long flags) | 962 | static u8 test_cc(unsigned int condition, unsigned long flags) |
947 | { | 963 | { |
948 | u8 rc; | 964 | u8 rc; |
@@ -1399,7 +1415,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, | |||
1399 | unsigned int in_page, n; | 1415 | unsigned int in_page, n; |
1400 | unsigned int count = ctxt->rep_prefix ? | 1416 | unsigned int count = ctxt->rep_prefix ? |
1401 | address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1; | 1417 | address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1; |
1402 | in_page = (ctxt->eflags & EFLG_DF) ? | 1418 | in_page = (ctxt->eflags & X86_EFLAGS_DF) ? |
1403 | offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : | 1419 | offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : |
1404 | PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); | 1420 | PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); |
1405 | n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count); | 1421 | n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count); |
@@ -1412,7 +1428,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, | |||
1412 | } | 1428 | } |
1413 | 1429 | ||
1414 | if (ctxt->rep_prefix && (ctxt->d & String) && | 1430 | if (ctxt->rep_prefix && (ctxt->d & String) && |
1415 | !(ctxt->eflags & EFLG_DF)) { | 1431 | !(ctxt->eflags & X86_EFLAGS_DF)) { |
1416 | ctxt->dst.data = rc->data + rc->pos; | 1432 | ctxt->dst.data = rc->data + rc->pos; |
1417 | ctxt->dst.type = OP_MEM_STR; | 1433 | ctxt->dst.type = OP_MEM_STR; |
1418 | ctxt->dst.count = (rc->end - rc->pos) / size; | 1434 | ctxt->dst.count = (rc->end - rc->pos) / size; |
@@ -1691,21 +1707,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1691 | 1707 | ||
1692 | static void write_register_operand(struct operand *op) | 1708 | static void write_register_operand(struct operand *op) |
1693 | { | 1709 | { |
1694 | /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ | 1710 | return assign_register(op->addr.reg, op->val, op->bytes); |
1695 | switch (op->bytes) { | ||
1696 | case 1: | ||
1697 | *(u8 *)op->addr.reg = (u8)op->val; | ||
1698 | break; | ||
1699 | case 2: | ||
1700 | *(u16 *)op->addr.reg = (u16)op->val; | ||
1701 | break; | ||
1702 | case 4: | ||
1703 | *op->addr.reg = (u32)op->val; | ||
1704 | break; /* 64b: zero-extend */ | ||
1705 | case 8: | ||
1706 | *op->addr.reg = op->val; | ||
1707 | break; | ||
1708 | } | ||
1709 | } | 1711 | } |
1710 | 1712 | ||
1711 | static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) | 1713 | static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) |
@@ -1792,32 +1794,34 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, | |||
1792 | { | 1794 | { |
1793 | int rc; | 1795 | int rc; |
1794 | unsigned long val, change_mask; | 1796 | unsigned long val, change_mask; |
1795 | int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | 1797 | int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT; |
1796 | int cpl = ctxt->ops->cpl(ctxt); | 1798 | int cpl = ctxt->ops->cpl(ctxt); |
1797 | 1799 | ||
1798 | rc = emulate_pop(ctxt, &val, len); | 1800 | rc = emulate_pop(ctxt, &val, len); |
1799 | if (rc != X86EMUL_CONTINUE) | 1801 | if (rc != X86EMUL_CONTINUE) |
1800 | return rc; | 1802 | return rc; |
1801 | 1803 | ||
1802 | change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF | 1804 | change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | |
1803 | | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID; | 1805 | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF | |
1806 | X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT | | ||
1807 | X86_EFLAGS_AC | X86_EFLAGS_ID; | ||
1804 | 1808 | ||
1805 | switch(ctxt->mode) { | 1809 | switch(ctxt->mode) { |
1806 | case X86EMUL_MODE_PROT64: | 1810 | case X86EMUL_MODE_PROT64: |
1807 | case X86EMUL_MODE_PROT32: | 1811 | case X86EMUL_MODE_PROT32: |
1808 | case X86EMUL_MODE_PROT16: | 1812 | case X86EMUL_MODE_PROT16: |
1809 | if (cpl == 0) | 1813 | if (cpl == 0) |
1810 | change_mask |= EFLG_IOPL; | 1814 | change_mask |= X86_EFLAGS_IOPL; |
1811 | if (cpl <= iopl) | 1815 | if (cpl <= iopl) |
1812 | change_mask |= EFLG_IF; | 1816 | change_mask |= X86_EFLAGS_IF; |
1813 | break; | 1817 | break; |
1814 | case X86EMUL_MODE_VM86: | 1818 | case X86EMUL_MODE_VM86: |
1815 | if (iopl < 3) | 1819 | if (iopl < 3) |
1816 | return emulate_gp(ctxt, 0); | 1820 | return emulate_gp(ctxt, 0); |
1817 | change_mask |= EFLG_IF; | 1821 | change_mask |= X86_EFLAGS_IF; |
1818 | break; | 1822 | break; |
1819 | default: /* real mode */ | 1823 | default: /* real mode */ |
1820 | change_mask |= (EFLG_IOPL | EFLG_IF); | 1824 | change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF); |
1821 | break; | 1825 | break; |
1822 | } | 1826 | } |
1823 | 1827 | ||
@@ -1918,7 +1922,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt) | |||
1918 | 1922 | ||
1919 | static int em_pushf(struct x86_emulate_ctxt *ctxt) | 1923 | static int em_pushf(struct x86_emulate_ctxt *ctxt) |
1920 | { | 1924 | { |
1921 | ctxt->src.val = (unsigned long)ctxt->eflags & ~EFLG_VM; | 1925 | ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM; |
1922 | return em_push(ctxt); | 1926 | return em_push(ctxt); |
1923 | } | 1927 | } |
1924 | 1928 | ||
@@ -1926,6 +1930,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) | |||
1926 | { | 1930 | { |
1927 | int rc = X86EMUL_CONTINUE; | 1931 | int rc = X86EMUL_CONTINUE; |
1928 | int reg = VCPU_REGS_RDI; | 1932 | int reg = VCPU_REGS_RDI; |
1933 | u32 val; | ||
1929 | 1934 | ||
1930 | while (reg >= VCPU_REGS_RAX) { | 1935 | while (reg >= VCPU_REGS_RAX) { |
1931 | if (reg == VCPU_REGS_RSP) { | 1936 | if (reg == VCPU_REGS_RSP) { |
@@ -1933,9 +1938,10 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) | |||
1933 | --reg; | 1938 | --reg; |
1934 | } | 1939 | } |
1935 | 1940 | ||
1936 | rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes); | 1941 | rc = emulate_pop(ctxt, &val, ctxt->op_bytes); |
1937 | if (rc != X86EMUL_CONTINUE) | 1942 | if (rc != X86EMUL_CONTINUE) |
1938 | break; | 1943 | break; |
1944 | assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes); | ||
1939 | --reg; | 1945 | --reg; |
1940 | } | 1946 | } |
1941 | return rc; | 1947 | return rc; |
@@ -1956,7 +1962,7 @@ static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) | |||
1956 | if (rc != X86EMUL_CONTINUE) | 1962 | if (rc != X86EMUL_CONTINUE) |
1957 | return rc; | 1963 | return rc; |
1958 | 1964 | ||
1959 | ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); | 1965 | ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC); |
1960 | 1966 | ||
1961 | ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); | 1967 | ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); |
1962 | rc = em_push(ctxt); | 1968 | rc = em_push(ctxt); |
@@ -2022,10 +2028,14 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) | |||
2022 | unsigned long temp_eip = 0; | 2028 | unsigned long temp_eip = 0; |
2023 | unsigned long temp_eflags = 0; | 2029 | unsigned long temp_eflags = 0; |
2024 | unsigned long cs = 0; | 2030 | unsigned long cs = 0; |
2025 | unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | | 2031 | unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | |
2026 | EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | | 2032 | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF | |
2027 | EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ | 2033 | X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF | |
2028 | unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; | 2034 | X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF | |
2035 | X86_EFLAGS_AC | X86_EFLAGS_ID | | ||
2036 | X86_EFLAGS_FIXED; | ||
2037 | unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF | | ||
2038 | X86_EFLAGS_VIP; | ||
2029 | 2039 | ||
2030 | /* TODO: Add stack limit check */ | 2040 | /* TODO: Add stack limit check */ |
2031 | 2041 | ||
@@ -2054,7 +2064,6 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) | |||
2054 | 2064 | ||
2055 | ctxt->_eip = temp_eip; | 2065 | ctxt->_eip = temp_eip; |
2056 | 2066 | ||
2057 | |||
2058 | if (ctxt->op_bytes == 4) | 2067 | if (ctxt->op_bytes == 4) |
2059 | ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); | 2068 | ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); |
2060 | else if (ctxt->op_bytes == 2) { | 2069 | else if (ctxt->op_bytes == 2) { |
@@ -2063,7 +2072,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) | |||
2063 | } | 2072 | } |
2064 | 2073 | ||
2065 | ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ | 2074 | ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ |
2066 | ctxt->eflags |= EFLG_RESERVED_ONE_MASK; | 2075 | ctxt->eflags |= X86_EFLAGS_FIXED; |
2067 | ctxt->ops->set_nmi_mask(ctxt, false); | 2076 | ctxt->ops->set_nmi_mask(ctxt, false); |
2068 | 2077 | ||
2069 | return rc; | 2078 | return rc; |
@@ -2145,12 +2154,12 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) | |||
2145 | ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { | 2154 | ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { |
2146 | *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); | 2155 | *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); |
2147 | *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32); | 2156 | *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32); |
2148 | ctxt->eflags &= ~EFLG_ZF; | 2157 | ctxt->eflags &= ~X86_EFLAGS_ZF; |
2149 | } else { | 2158 | } else { |
2150 | ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) | | 2159 | ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) | |
2151 | (u32) reg_read(ctxt, VCPU_REGS_RBX); | 2160 | (u32) reg_read(ctxt, VCPU_REGS_RBX); |
2152 | 2161 | ||
2153 | ctxt->eflags |= EFLG_ZF; | 2162 | ctxt->eflags |= X86_EFLAGS_ZF; |
2154 | } | 2163 | } |
2155 | return X86EMUL_CONTINUE; | 2164 | return X86EMUL_CONTINUE; |
2156 | } | 2165 | } |
@@ -2222,7 +2231,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) | |||
2222 | ctxt->src.val = ctxt->dst.orig_val; | 2231 | ctxt->src.val = ctxt->dst.orig_val; |
2223 | fastop(ctxt, em_cmp); | 2232 | fastop(ctxt, em_cmp); |
2224 | 2233 | ||
2225 | if (ctxt->eflags & EFLG_ZF) { | 2234 | if (ctxt->eflags & X86_EFLAGS_ZF) { |
2226 | /* Success: write back to memory; no update of EAX */ | 2235 | /* Success: write back to memory; no update of EAX */ |
2227 | ctxt->src.type = OP_NONE; | 2236 | ctxt->src.type = OP_NONE; |
2228 | ctxt->dst.val = ctxt->src.orig_val; | 2237 | ctxt->dst.val = ctxt->src.orig_val; |
@@ -2381,14 +2390,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) | |||
2381 | 2390 | ||
2382 | ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); | 2391 | ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); |
2383 | ctxt->eflags &= ~msr_data; | 2392 | ctxt->eflags &= ~msr_data; |
2384 | ctxt->eflags |= EFLG_RESERVED_ONE_MASK; | 2393 | ctxt->eflags |= X86_EFLAGS_FIXED; |
2385 | #endif | 2394 | #endif |
2386 | } else { | 2395 | } else { |
2387 | /* legacy mode */ | 2396 | /* legacy mode */ |
2388 | ops->get_msr(ctxt, MSR_STAR, &msr_data); | 2397 | ops->get_msr(ctxt, MSR_STAR, &msr_data); |
2389 | ctxt->_eip = (u32)msr_data; | 2398 | ctxt->_eip = (u32)msr_data; |
2390 | 2399 | ||
2391 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF); | 2400 | ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); |
2392 | } | 2401 | } |
2393 | 2402 | ||
2394 | return X86EMUL_CONTINUE; | 2403 | return X86EMUL_CONTINUE; |
@@ -2425,8 +2434,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) | |||
2425 | if ((msr_data & 0xfffc) == 0x0) | 2434 | if ((msr_data & 0xfffc) == 0x0) |
2426 | return emulate_gp(ctxt, 0); | 2435 | return emulate_gp(ctxt, 0); |
2427 | 2436 | ||
2428 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF); | 2437 | ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); |
2429 | cs_sel = (u16)msr_data & ~SELECTOR_RPL_MASK; | 2438 | cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK; |
2430 | ss_sel = cs_sel + 8; | 2439 | ss_sel = cs_sel + 8; |
2431 | if (efer & EFER_LMA) { | 2440 | if (efer & EFER_LMA) { |
2432 | cs.d = 0; | 2441 | cs.d = 0; |
@@ -2493,8 +2502,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) | |||
2493 | return emulate_gp(ctxt, 0); | 2502 | return emulate_gp(ctxt, 0); |
2494 | break; | 2503 | break; |
2495 | } | 2504 | } |
2496 | cs_sel |= SELECTOR_RPL_MASK; | 2505 | cs_sel |= SEGMENT_RPL_MASK; |
2497 | ss_sel |= SELECTOR_RPL_MASK; | 2506 | ss_sel |= SEGMENT_RPL_MASK; |
2498 | 2507 | ||
2499 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); | 2508 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); |
2500 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); | 2509 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); |
@@ -2512,7 +2521,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) | |||
2512 | return false; | 2521 | return false; |
2513 | if (ctxt->mode == X86EMUL_MODE_VM86) | 2522 | if (ctxt->mode == X86EMUL_MODE_VM86) |
2514 | return true; | 2523 | return true; |
2515 | iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | 2524 | iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT; |
2516 | return ctxt->ops->cpl(ctxt) > iopl; | 2525 | return ctxt->ops->cpl(ctxt) > iopl; |
2517 | } | 2526 | } |
2518 | 2527 | ||
@@ -2782,10 +2791,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | |||
2782 | return ret; | 2791 | return ret; |
2783 | ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, | 2792 | ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, |
2784 | X86_TRANSFER_TASK_SWITCH, NULL); | 2793 | X86_TRANSFER_TASK_SWITCH, NULL); |
2785 | if (ret != X86EMUL_CONTINUE) | ||
2786 | return ret; | ||
2787 | 2794 | ||
2788 | return X86EMUL_CONTINUE; | 2795 | return ret; |
2789 | } | 2796 | } |
2790 | 2797 | ||
2791 | static int task_switch_32(struct x86_emulate_ctxt *ctxt, | 2798 | static int task_switch_32(struct x86_emulate_ctxt *ctxt, |
@@ -2954,7 +2961,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2954 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, | 2961 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, |
2955 | struct operand *op) | 2962 | struct operand *op) |
2956 | { | 2963 | { |
2957 | int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count; | 2964 | int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count; |
2958 | 2965 | ||
2959 | register_address_increment(ctxt, reg, df * op->bytes); | 2966 | register_address_increment(ctxt, reg, df * op->bytes); |
2960 | op->addr.mem.ea = register_address(ctxt, reg); | 2967 | op->addr.mem.ea = register_address(ctxt, reg); |
@@ -3323,7 +3330,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt) | |||
3323 | return X86EMUL_CONTINUE; | 3330 | return X86EMUL_CONTINUE; |
3324 | } | 3331 | } |
3325 | 3332 | ||
3326 | static int em_vmcall(struct x86_emulate_ctxt *ctxt) | 3333 | static int em_hypercall(struct x86_emulate_ctxt *ctxt) |
3327 | { | 3334 | { |
3328 | int rc = ctxt->ops->fix_hypercall(ctxt); | 3335 | int rc = ctxt->ops->fix_hypercall(ctxt); |
3329 | 3336 | ||
@@ -3395,17 +3402,6 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt) | |||
3395 | return em_lgdt_lidt(ctxt, true); | 3402 | return em_lgdt_lidt(ctxt, true); |
3396 | } | 3403 | } |
3397 | 3404 | ||
3398 | static int em_vmmcall(struct x86_emulate_ctxt *ctxt) | ||
3399 | { | ||
3400 | int rc; | ||
3401 | |||
3402 | rc = ctxt->ops->fix_hypercall(ctxt); | ||
3403 | |||
3404 | /* Disable writeback. */ | ||
3405 | ctxt->dst.type = OP_NONE; | ||
3406 | return rc; | ||
3407 | } | ||
3408 | |||
3409 | static int em_lidt(struct x86_emulate_ctxt *ctxt) | 3405 | static int em_lidt(struct x86_emulate_ctxt *ctxt) |
3410 | { | 3406 | { |
3411 | return em_lgdt_lidt(ctxt, false); | 3407 | return em_lgdt_lidt(ctxt, false); |
@@ -3504,7 +3500,8 @@ static int em_sahf(struct x86_emulate_ctxt *ctxt) | |||
3504 | { | 3500 | { |
3505 | u32 flags; | 3501 | u32 flags; |
3506 | 3502 | ||
3507 | flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF; | 3503 | flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | |
3504 | X86_EFLAGS_SF; | ||
3508 | flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8; | 3505 | flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8; |
3509 | 3506 | ||
3510 | ctxt->eflags &= ~0xffUL; | 3507 | ctxt->eflags &= ~0xffUL; |
@@ -3769,7 +3766,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) | |||
3769 | 3766 | ||
3770 | static const struct opcode group7_rm0[] = { | 3767 | static const struct opcode group7_rm0[] = { |
3771 | N, | 3768 | N, |
3772 | I(SrcNone | Priv | EmulateOnUD, em_vmcall), | 3769 | I(SrcNone | Priv | EmulateOnUD, em_hypercall), |
3773 | N, N, N, N, N, N, | 3770 | N, N, N, N, N, N, |
3774 | }; | 3771 | }; |
3775 | 3772 | ||
@@ -3781,7 +3778,7 @@ static const struct opcode group7_rm1[] = { | |||
3781 | 3778 | ||
3782 | static const struct opcode group7_rm3[] = { | 3779 | static const struct opcode group7_rm3[] = { |
3783 | DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), | 3780 | DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), |
3784 | II(SrcNone | Prot | EmulateOnUD, em_vmmcall, vmmcall), | 3781 | II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall), |
3785 | DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), | 3782 | DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), |
3786 | DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), | 3783 | DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), |
3787 | DIP(SrcNone | Prot | Priv, stgi, check_svme), | 3784 | DIP(SrcNone | Prot | Priv, stgi, check_svme), |
@@ -4192,7 +4189,8 @@ static const struct opcode twobyte_table[256] = { | |||
4192 | N, N, | 4189 | N, N, |
4193 | G(BitOp, group8), | 4190 | G(BitOp, group8), |
4194 | F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), | 4191 | F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), |
4195 | F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), | 4192 | I(DstReg | SrcMem | ModRM, em_bsf_c), |
4193 | I(DstReg | SrcMem | ModRM, em_bsr_c), | ||
4196 | D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), | 4194 | D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), |
4197 | /* 0xC0 - 0xC7 */ | 4195 | /* 0xC0 - 0xC7 */ |
4198 | F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), | 4196 | F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), |
@@ -4759,9 +4757,9 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) | |||
4759 | if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) || | 4757 | if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) || |
4760 | (ctxt->b == 0xae) || (ctxt->b == 0xaf)) | 4758 | (ctxt->b == 0xae) || (ctxt->b == 0xaf)) |
4761 | && (((ctxt->rep_prefix == REPE_PREFIX) && | 4759 | && (((ctxt->rep_prefix == REPE_PREFIX) && |
4762 | ((ctxt->eflags & EFLG_ZF) == 0)) | 4760 | ((ctxt->eflags & X86_EFLAGS_ZF) == 0)) |
4763 | || ((ctxt->rep_prefix == REPNE_PREFIX) && | 4761 | || ((ctxt->rep_prefix == REPNE_PREFIX) && |
4764 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) | 4762 | ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF)))) |
4765 | return true; | 4763 | return true; |
4766 | 4764 | ||
4767 | return false; | 4765 | return false; |
@@ -4913,7 +4911,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | |||
4913 | /* All REP prefixes have the same first termination condition */ | 4911 | /* All REP prefixes have the same first termination condition */ |
4914 | if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { | 4912 | if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { |
4915 | ctxt->eip = ctxt->_eip; | 4913 | ctxt->eip = ctxt->_eip; |
4916 | ctxt->eflags &= ~EFLG_RF; | 4914 | ctxt->eflags &= ~X86_EFLAGS_RF; |
4917 | goto done; | 4915 | goto done; |
4918 | } | 4916 | } |
4919 | } | 4917 | } |
@@ -4963,9 +4961,9 @@ special_insn: | |||
4963 | } | 4961 | } |
4964 | 4962 | ||
4965 | if (ctxt->rep_prefix && (ctxt->d & String)) | 4963 | if (ctxt->rep_prefix && (ctxt->d & String)) |
4966 | ctxt->eflags |= EFLG_RF; | 4964 | ctxt->eflags |= X86_EFLAGS_RF; |
4967 | else | 4965 | else |
4968 | ctxt->eflags &= ~EFLG_RF; | 4966 | ctxt->eflags &= ~X86_EFLAGS_RF; |
4969 | 4967 | ||
4970 | if (ctxt->execute) { | 4968 | if (ctxt->execute) { |
4971 | if (ctxt->d & Fastop) { | 4969 | if (ctxt->d & Fastop) { |
@@ -5014,7 +5012,7 @@ special_insn: | |||
5014 | rc = emulate_int(ctxt, ctxt->src.val); | 5012 | rc = emulate_int(ctxt, ctxt->src.val); |
5015 | break; | 5013 | break; |
5016 | case 0xce: /* into */ | 5014 | case 0xce: /* into */ |
5017 | if (ctxt->eflags & EFLG_OF) | 5015 | if (ctxt->eflags & X86_EFLAGS_OF) |
5018 | rc = emulate_int(ctxt, 4); | 5016 | rc = emulate_int(ctxt, 4); |
5019 | break; | 5017 | break; |
5020 | case 0xe9: /* jmp rel */ | 5018 | case 0xe9: /* jmp rel */ |
@@ -5027,19 +5025,19 @@ special_insn: | |||
5027 | break; | 5025 | break; |
5028 | case 0xf5: /* cmc */ | 5026 | case 0xf5: /* cmc */ |
5029 | /* complement carry flag from eflags reg */ | 5027 | /* complement carry flag from eflags reg */ |
5030 | ctxt->eflags ^= EFLG_CF; | 5028 | ctxt->eflags ^= X86_EFLAGS_CF; |
5031 | break; | 5029 | break; |
5032 | case 0xf8: /* clc */ | 5030 | case 0xf8: /* clc */ |
5033 | ctxt->eflags &= ~EFLG_CF; | 5031 | ctxt->eflags &= ~X86_EFLAGS_CF; |
5034 | break; | 5032 | break; |
5035 | case 0xf9: /* stc */ | 5033 | case 0xf9: /* stc */ |
5036 | ctxt->eflags |= EFLG_CF; | 5034 | ctxt->eflags |= X86_EFLAGS_CF; |
5037 | break; | 5035 | break; |
5038 | case 0xfc: /* cld */ | 5036 | case 0xfc: /* cld */ |
5039 | ctxt->eflags &= ~EFLG_DF; | 5037 | ctxt->eflags &= ~X86_EFLAGS_DF; |
5040 | break; | 5038 | break; |
5041 | case 0xfd: /* std */ | 5039 | case 0xfd: /* std */ |
5042 | ctxt->eflags |= EFLG_DF; | 5040 | ctxt->eflags |= X86_EFLAGS_DF; |
5043 | break; | 5041 | break; |
5044 | default: | 5042 | default: |
5045 | goto cannot_emulate; | 5043 | goto cannot_emulate; |
@@ -5100,7 +5098,7 @@ writeback: | |||
5100 | } | 5098 | } |
5101 | goto done; /* skip rip writeback */ | 5099 | goto done; /* skip rip writeback */ |
5102 | } | 5100 | } |
5103 | ctxt->eflags &= ~EFLG_RF; | 5101 | ctxt->eflags &= ~X86_EFLAGS_RF; |
5104 | } | 5102 | } |
5105 | 5103 | ||
5106 | ctxt->eip = ctxt->_eip; | 5104 | ctxt->eip = ctxt->_eip; |
@@ -5137,8 +5135,7 @@ twobyte_insn: | |||
5137 | case 0x40 ... 0x4f: /* cmov */ | 5135 | case 0x40 ... 0x4f: /* cmov */ |
5138 | if (test_cc(ctxt->b, ctxt->eflags)) | 5136 | if (test_cc(ctxt->b, ctxt->eflags)) |
5139 | ctxt->dst.val = ctxt->src.val; | 5137 | ctxt->dst.val = ctxt->src.val; |
5140 | else if (ctxt->mode != X86EMUL_MODE_PROT64 || | 5138 | else if (ctxt->op_bytes != 4) |
5141 | ctxt->op_bytes != 4) | ||
5142 | ctxt->dst.type = OP_NONE; /* no writeback */ | 5139 | ctxt->dst.type = OP_NONE; /* no writeback */ |
5143 | break; | 5140 | break; |
5144 | case 0x80 ... 0x8f: /* jnz rel, etc*/ | 5141 | case 0x80 ... 0x8f: /* jnz rel, etc*/ |
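
Besides switching the emulator to the shared X86_EFLAGS_* definitions, the emulate.c hunks add em_bsf_c()/em_bsr_c(), which suppress the destination writeback when the source operand is zero. That mirrors hardware behaviour: BSF/BSR with a zero source set ZF and, on current CPUs, leave the destination register untouched (architecturally undefined on Intel). A quick user-space check, x86-64 inline asm with illustrative values:

#include <stdio.h>

int main(void)
{
	unsigned long dst = 0xdeadbeefUL;	/* pre-existing destination value */
	unsigned long src = 0;			/* zero source: ZF is set ... */

	/* ... and the destination is left alone, which is what skipping
	 * the emulator writeback reproduces. */
	asm volatile("bsf %1, %0" : "+r"(dst) : "r"(src) : "cc");

	printf("dst after bsf with zero source: %#lx\n", dst);	/* 0xdeadbeef */
	return 0;
}
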
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 298781d4cfb4..4dce6f8b6129 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -443,7 +443,8 @@ static inline int pit_in_range(gpa_t addr) | |||
443 | (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); | 443 | (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); |
444 | } | 444 | } |
445 | 445 | ||
446 | static int pit_ioport_write(struct kvm_io_device *this, | 446 | static int pit_ioport_write(struct kvm_vcpu *vcpu, |
447 | struct kvm_io_device *this, | ||
447 | gpa_t addr, int len, const void *data) | 448 | gpa_t addr, int len, const void *data) |
448 | { | 449 | { |
449 | struct kvm_pit *pit = dev_to_pit(this); | 450 | struct kvm_pit *pit = dev_to_pit(this); |
@@ -519,7 +520,8 @@ static int pit_ioport_write(struct kvm_io_device *this, | |||
519 | return 0; | 520 | return 0; |
520 | } | 521 | } |
521 | 522 | ||
522 | static int pit_ioport_read(struct kvm_io_device *this, | 523 | static int pit_ioport_read(struct kvm_vcpu *vcpu, |
524 | struct kvm_io_device *this, | ||
523 | gpa_t addr, int len, void *data) | 525 | gpa_t addr, int len, void *data) |
524 | { | 526 | { |
525 | struct kvm_pit *pit = dev_to_pit(this); | 527 | struct kvm_pit *pit = dev_to_pit(this); |
@@ -589,7 +591,8 @@ static int pit_ioport_read(struct kvm_io_device *this, | |||
589 | return 0; | 591 | return 0; |
590 | } | 592 | } |
591 | 593 | ||
592 | static int speaker_ioport_write(struct kvm_io_device *this, | 594 | static int speaker_ioport_write(struct kvm_vcpu *vcpu, |
595 | struct kvm_io_device *this, | ||
593 | gpa_t addr, int len, const void *data) | 596 | gpa_t addr, int len, const void *data) |
594 | { | 597 | { |
595 | struct kvm_pit *pit = speaker_to_pit(this); | 598 | struct kvm_pit *pit = speaker_to_pit(this); |
@@ -606,8 +609,9 @@ static int speaker_ioport_write(struct kvm_io_device *this, | |||
606 | return 0; | 609 | return 0; |
607 | } | 610 | } |
608 | 611 | ||
609 | static int speaker_ioport_read(struct kvm_io_device *this, | 612 | static int speaker_ioport_read(struct kvm_vcpu *vcpu, |
610 | gpa_t addr, int len, void *data) | 613 | struct kvm_io_device *this, |
614 | gpa_t addr, int len, void *data) | ||
611 | { | 615 | { |
612 | struct kvm_pit *pit = speaker_to_pit(this); | 616 | struct kvm_pit *pit = speaker_to_pit(this); |
613 | struct kvm_kpit_state *pit_state = &pit->pit_state; | 617 | struct kvm_kpit_state *pit_state = &pit->pit_state; |
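
The i8254.c hunks above (and the i8259.c/ioapic.c ones that follow) all apply the same interface change: the kvm_io_device read/write callbacks gain a leading struct kvm_vcpu * argument so device models can see which vCPU issued the access. A generic, self-contained illustration of threading such a context argument through a callback table; the struct names are placeholders, not the KVM API:

#include <stdio.h>

struct ctx { int id; };			/* stands in for struct kvm_vcpu */
struct dev;

struct dev_ops {
	/* old shape: int (*write)(struct dev *d, int addr, int val); */
	int (*write)(struct ctx *c, struct dev *d, int addr, int val);
};

struct dev { const struct dev_ops *ops; };	/* stands in for the io device */

static int pit_write(struct ctx *c, struct dev *d, int addr, int val)
{
	(void)d;
	printf("ctx %d: write %#x to port %#x\n", c->id, val, addr);
	return 0;
}

static const struct dev_ops pit_ops = { .write = pit_write };

int main(void)
{
	struct ctx vcpu0 = { .id = 0 };
	struct dev pit = { .ops = &pit_ops };

	pit.ops->write(&vcpu0, &pit, 0x40, 0x34);
	return 0;
}
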
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index dd1b16b611b0..c84990b42b5b 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h | |||
@@ -3,7 +3,7 @@ | |||
3 | 3 | ||
4 | #include <linux/kthread.h> | 4 | #include <linux/kthread.h> |
5 | 5 | ||
6 | #include "iodev.h" | 6 | #include <kvm/iodev.h> |
7 | 7 | ||
8 | struct kvm_kpit_channel_state { | 8 | struct kvm_kpit_channel_state { |
9 | u32 count; /* can be 65536 */ | 9 | u32 count; /* can be 65536 */ |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 9541ba34126b..fef922ff2635 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -529,42 +529,42 @@ static int picdev_read(struct kvm_pic *s, | |||
529 | return 0; | 529 | return 0; |
530 | } | 530 | } |
531 | 531 | ||
532 | static int picdev_master_write(struct kvm_io_device *dev, | 532 | static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, |
533 | gpa_t addr, int len, const void *val) | 533 | gpa_t addr, int len, const void *val) |
534 | { | 534 | { |
535 | return picdev_write(container_of(dev, struct kvm_pic, dev_master), | 535 | return picdev_write(container_of(dev, struct kvm_pic, dev_master), |
536 | addr, len, val); | 536 | addr, len, val); |
537 | } | 537 | } |
538 | 538 | ||
539 | static int picdev_master_read(struct kvm_io_device *dev, | 539 | static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, |
540 | gpa_t addr, int len, void *val) | 540 | gpa_t addr, int len, void *val) |
541 | { | 541 | { |
542 | return picdev_read(container_of(dev, struct kvm_pic, dev_master), | 542 | return picdev_read(container_of(dev, struct kvm_pic, dev_master), |
543 | addr, len, val); | 543 | addr, len, val); |
544 | } | 544 | } |
545 | 545 | ||
546 | static int picdev_slave_write(struct kvm_io_device *dev, | 546 | static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, |
547 | gpa_t addr, int len, const void *val) | 547 | gpa_t addr, int len, const void *val) |
548 | { | 548 | { |
549 | return picdev_write(container_of(dev, struct kvm_pic, dev_slave), | 549 | return picdev_write(container_of(dev, struct kvm_pic, dev_slave), |
550 | addr, len, val); | 550 | addr, len, val); |
551 | } | 551 | } |
552 | 552 | ||
553 | static int picdev_slave_read(struct kvm_io_device *dev, | 553 | static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, |
554 | gpa_t addr, int len, void *val) | 554 | gpa_t addr, int len, void *val) |
555 | { | 555 | { |
556 | return picdev_read(container_of(dev, struct kvm_pic, dev_slave), | 556 | return picdev_read(container_of(dev, struct kvm_pic, dev_slave), |
557 | addr, len, val); | 557 | addr, len, val); |
558 | } | 558 | } |
559 | 559 | ||
560 | static int picdev_eclr_write(struct kvm_io_device *dev, | 560 | static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, |
561 | gpa_t addr, int len, const void *val) | 561 | gpa_t addr, int len, const void *val) |
562 | { | 562 | { |
563 | return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), | 563 | return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), |
564 | addr, len, val); | 564 | addr, len, val); |
565 | } | 565 | } |
566 | 566 | ||
567 | static int picdev_eclr_read(struct kvm_io_device *dev, | 567 | static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, |
568 | gpa_t addr, int len, void *val) | 568 | gpa_t addr, int len, void *val) |
569 | { | 569 | { |
570 | return picdev_read(container_of(dev, struct kvm_pic, dev_eclr), | 570 | return picdev_read(container_of(dev, struct kvm_pic, dev_eclr), |
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 46d4449772bc..28146f03c514 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c | |||
@@ -206,6 +206,8 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, | |||
206 | 206 | ||
207 | old_irr = ioapic->irr; | 207 | old_irr = ioapic->irr; |
208 | ioapic->irr |= mask; | 208 | ioapic->irr |= mask; |
209 | if (edge) | ||
210 | ioapic->irr_delivered &= ~mask; | ||
209 | if ((edge && old_irr == ioapic->irr) || | 211 | if ((edge && old_irr == ioapic->irr) || |
210 | (!edge && entry.fields.remote_irr)) { | 212 | (!edge && entry.fields.remote_irr)) { |
211 | ret = 0; | 213 | ret = 0; |
@@ -349,7 +351,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) | |||
349 | irqe.shorthand = 0; | 351 | irqe.shorthand = 0; |
350 | 352 | ||
351 | if (irqe.trig_mode == IOAPIC_EDGE_TRIG) | 353 | if (irqe.trig_mode == IOAPIC_EDGE_TRIG) |
352 | ioapic->irr &= ~(1 << irq); | 354 | ioapic->irr_delivered |= 1 << irq; |
353 | 355 | ||
354 | if (irq == RTC_GSI && line_status) { | 356 | if (irq == RTC_GSI && line_status) { |
355 | /* | 357 | /* |
@@ -473,13 +475,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, | |||
473 | } | 475 | } |
474 | } | 476 | } |
475 | 477 | ||
476 | bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) | ||
477 | { | ||
478 | struct kvm_ioapic *ioapic = kvm->arch.vioapic; | ||
479 | smp_rmb(); | ||
480 | return test_bit(vector, ioapic->handled_vectors); | ||
481 | } | ||
482 | |||
483 | void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode) | 478 | void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode) |
484 | { | 479 | { |
485 | struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; | 480 | struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; |
@@ -500,8 +495,8 @@ static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr) | |||
500 | (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); | 495 | (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); |
501 | } | 496 | } |
502 | 497 | ||
503 | static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, | 498 | static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, |
504 | void *val) | 499 | gpa_t addr, int len, void *val) |
505 | { | 500 | { |
506 | struct kvm_ioapic *ioapic = to_ioapic(this); | 501 | struct kvm_ioapic *ioapic = to_ioapic(this); |
507 | u32 result; | 502 | u32 result; |
@@ -543,8 +538,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, | |||
543 | return 0; | 538 | return 0; |
544 | } | 539 | } |
545 | 540 | ||
546 | static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, | 541 | static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, |
547 | const void *val) | 542 | gpa_t addr, int len, const void *val) |
548 | { | 543 | { |
549 | struct kvm_ioapic *ioapic = to_ioapic(this); | 544 | struct kvm_ioapic *ioapic = to_ioapic(this); |
550 | u32 data; | 545 | u32 data; |
@@ -599,6 +594,7 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) | |||
599 | ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; | 594 | ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; |
600 | ioapic->ioregsel = 0; | 595 | ioapic->ioregsel = 0; |
601 | ioapic->irr = 0; | 596 | ioapic->irr = 0; |
597 | ioapic->irr_delivered = 0; | ||
602 | ioapic->id = 0; | 598 | ioapic->id = 0; |
603 | memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); | 599 | memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); |
604 | rtc_irq_eoi_tracking_reset(ioapic); | 600 | rtc_irq_eoi_tracking_reset(ioapic); |
@@ -656,6 +652,7 @@ int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) | |||
656 | 652 | ||
657 | spin_lock(&ioapic->lock); | 653 | spin_lock(&ioapic->lock); |
658 | memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); | 654 | memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); |
655 | state->irr &= ~ioapic->irr_delivered; | ||
659 | spin_unlock(&ioapic->lock); | 656 | spin_unlock(&ioapic->lock); |
660 | return 0; | 657 | return 0; |
661 | } | 658 | } |
@@ -669,6 +666,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) | |||
669 | spin_lock(&ioapic->lock); | 666 | spin_lock(&ioapic->lock); |
670 | memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); | 667 | memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); |
671 | ioapic->irr = 0; | 668 | ioapic->irr = 0; |
669 | ioapic->irr_delivered = 0; | ||
672 | update_handled_vectors(ioapic); | 670 | update_handled_vectors(ioapic); |
673 | kvm_vcpu_request_scan_ioapic(kvm); | 671 | kvm_vcpu_request_scan_ioapic(kvm); |
674 | kvm_ioapic_inject_all(ioapic, state->irr); | 672 | kvm_ioapic_inject_all(ioapic, state->irr); |
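
The new irr_delivered field changes how edge-triggered interrupts are reported: delivery no longer clears the bit from irr (see ioapic_service() above); instead the bit is recorded in irr_delivered and masked out when userspace reads the state back through kvm_get_ioapic(). A standalone illustration of that masking, using plain local variables rather than the kernel structures:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t irr           = 0x5;  /* pins 0 and 2 raised */
        uint32_t irr_delivered = 0x4;  /* pin 2 is edge-triggered and was delivered */

        /* what kvm_get_ioapic() now reports: delivered edge IRQs are hidden */
        uint32_t reported = irr & ~irr_delivered;
        printf("reported irr = %#x\n", reported);  /* prints 0x1 */
        return 0;
    }
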
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index c2e36d934af4..ca0b0b4e6256 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h | |||
@@ -3,7 +3,7 @@ | |||
3 | 3 | ||
4 | #include <linux/kvm_host.h> | 4 | #include <linux/kvm_host.h> |
5 | 5 | ||
6 | #include "iodev.h" | 6 | #include <kvm/iodev.h> |
7 | 7 | ||
8 | struct kvm; | 8 | struct kvm; |
9 | struct kvm_vcpu; | 9 | struct kvm_vcpu; |
@@ -77,6 +77,7 @@ struct kvm_ioapic { | |||
77 | struct rtc_status rtc_status; | 77 | struct rtc_status rtc_status; |
78 | struct delayed_work eoi_inject; | 78 | struct delayed_work eoi_inject; |
79 | u32 irq_eoi[IOAPIC_NUM_PINS]; | 79 | u32 irq_eoi[IOAPIC_NUM_PINS]; |
80 | u32 irr_delivered; | ||
80 | }; | 81 | }; |
81 | 82 | ||
82 | #ifdef DEBUG | 83 | #ifdef DEBUG |
@@ -97,13 +98,19 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) | |||
97 | return kvm->arch.vioapic; | 98 | return kvm->arch.vioapic; |
98 | } | 99 | } |
99 | 100 | ||
101 | static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) | ||
102 | { | ||
103 | struct kvm_ioapic *ioapic = kvm->arch.vioapic; | ||
104 | smp_rmb(); | ||
105 | return test_bit(vector, ioapic->handled_vectors); | ||
106 | } | ||
107 | |||
100 | void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); | 108 | void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); |
101 | bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | 109 | bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, |
102 | int short_hand, unsigned int dest, int dest_mode); | 110 | int short_hand, unsigned int dest, int dest_mode); |
103 | int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); | 111 | int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); |
104 | void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, | 112 | void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, |
105 | int trigger_mode); | 113 | int trigger_mode); |
106 | bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); | ||
107 | int kvm_ioapic_init(struct kvm *kvm); | 114 | int kvm_ioapic_init(struct kvm *kvm); |
108 | void kvm_ioapic_destroy(struct kvm *kvm); | 115 | void kvm_ioapic_destroy(struct kvm *kvm); |
109 | int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, | 116 | int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, |
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2d03568e9498..ad68c73008c5 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h | |||
@@ -27,7 +27,7 @@ | |||
27 | #include <linux/kvm_host.h> | 27 | #include <linux/kvm_host.h> |
28 | #include <linux/spinlock.h> | 28 | #include <linux/spinlock.h> |
29 | 29 | ||
30 | #include "iodev.h" | 30 | #include <kvm/iodev.h> |
31 | #include "ioapic.h" | 31 | #include "ioapic.h" |
32 | #include "lapic.h" | 32 | #include "lapic.h" |
33 | 33 | ||
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4ee827d7bf36..d67206a7b99a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -133,6 +133,28 @@ static inline int kvm_apic_id(struct kvm_lapic *apic) | |||
133 | return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; | 133 | return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; |
134 | } | 134 | } |
135 | 135 | ||
136 | /* The logical map is definitely wrong if we have multiple | ||
137 | * modes at the same time. (Physical map is always right.) | ||
138 | */ | ||
139 | static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map) | ||
140 | { | ||
141 | return !(map->mode & (map->mode - 1)); | ||
142 | } | ||
143 | |||
144 | static inline void | ||
145 | apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid) | ||
146 | { | ||
147 | unsigned lid_bits; | ||
148 | |||
149 | BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER != 4); | ||
150 | BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT != 8); | ||
151 | BUILD_BUG_ON(KVM_APIC_MODE_X2APIC != 16); | ||
152 | lid_bits = map->mode; | ||
153 | |||
154 | *cid = dest_id >> lid_bits; | ||
155 | *lid = dest_id & ((1 << lid_bits) - 1); | ||
156 | } | ||
157 | |||
136 | static void recalculate_apic_map(struct kvm *kvm) | 158 | static void recalculate_apic_map(struct kvm *kvm) |
137 | { | 159 | { |
138 | struct kvm_apic_map *new, *old = NULL; | 160 | struct kvm_apic_map *new, *old = NULL; |
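
The BUILD_BUG_ONs above pin the new encoding: the addressing-mode constant doubles as the number of logical-ID bits (4 for xAPIC cluster, 8 for xAPIC flat, 16 for x2APIC), so apic_logical_id() is just a shift and a mask. A standalone sketch of the split, assuming only those three constants, as asserted above:

    #include <stdio.h>
    #include <stdint.h>

    static void split_logical_id(unsigned lid_bits, uint32_t dest_id,
                                 uint16_t *cid, uint16_t *lid)
    {
        *cid = dest_id >> lid_bits;                 /* cluster id */
        *lid = dest_id & ((1u << lid_bits) - 1);    /* bitmap of CPUs in the cluster */
    }

    int main(void)
    {
        uint16_t cid, lid;
        split_logical_id(4, 0xAB, &cid, &lid);        /* xAPIC cluster mode */
        printf("cluster %u, bitmap %#x\n", cid, lid); /* cluster 10, bitmap 0xb */
        return 0;
    }
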
@@ -146,48 +168,6 @@ static void recalculate_apic_map(struct kvm *kvm) | |||
146 | if (!new) | 168 | if (!new) |
147 | goto out; | 169 | goto out; |
148 | 170 | ||
149 | new->ldr_bits = 8; | ||
150 | /* flat mode is default */ | ||
151 | new->cid_shift = 8; | ||
152 | new->cid_mask = 0; | ||
153 | new->lid_mask = 0xff; | ||
154 | new->broadcast = APIC_BROADCAST; | ||
155 | |||
156 | kvm_for_each_vcpu(i, vcpu, kvm) { | ||
157 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
158 | |||
159 | if (!kvm_apic_present(vcpu)) | ||
160 | continue; | ||
161 | |||
162 | if (apic_x2apic_mode(apic)) { | ||
163 | new->ldr_bits = 32; | ||
164 | new->cid_shift = 16; | ||
165 | new->cid_mask = new->lid_mask = 0xffff; | ||
166 | new->broadcast = X2APIC_BROADCAST; | ||
167 | } else if (kvm_apic_get_reg(apic, APIC_LDR)) { | ||
168 | if (kvm_apic_get_reg(apic, APIC_DFR) == | ||
169 | APIC_DFR_CLUSTER) { | ||
170 | new->cid_shift = 4; | ||
171 | new->cid_mask = 0xf; | ||
172 | new->lid_mask = 0xf; | ||
173 | } else { | ||
174 | new->cid_shift = 8; | ||
175 | new->cid_mask = 0; | ||
176 | new->lid_mask = 0xff; | ||
177 | } | ||
178 | } | ||
179 | |||
180 | /* | ||
181 | * All APICs have to be configured in the same mode by an OS. | ||
182 | * We take advatage of this while building logical id loockup | ||
183 | * table. After reset APICs are in software disabled mode, so if | ||
184 | * we find apic with different setting we assume this is the mode | ||
185 | * OS wants all apics to be in; build lookup table accordingly. | ||
186 | */ | ||
187 | if (kvm_apic_sw_enabled(apic)) | ||
188 | break; | ||
189 | } | ||
190 | |||
191 | kvm_for_each_vcpu(i, vcpu, kvm) { | 171 | kvm_for_each_vcpu(i, vcpu, kvm) { |
192 | struct kvm_lapic *apic = vcpu->arch.apic; | 172 | struct kvm_lapic *apic = vcpu->arch.apic; |
193 | u16 cid, lid; | 173 | u16 cid, lid; |
@@ -198,11 +178,25 @@ static void recalculate_apic_map(struct kvm *kvm) | |||
198 | 178 | ||
199 | aid = kvm_apic_id(apic); | 179 | aid = kvm_apic_id(apic); |
200 | ldr = kvm_apic_get_reg(apic, APIC_LDR); | 180 | ldr = kvm_apic_get_reg(apic, APIC_LDR); |
201 | cid = apic_cluster_id(new, ldr); | ||
202 | lid = apic_logical_id(new, ldr); | ||
203 | 181 | ||
204 | if (aid < ARRAY_SIZE(new->phys_map)) | 182 | if (aid < ARRAY_SIZE(new->phys_map)) |
205 | new->phys_map[aid] = apic; | 183 | new->phys_map[aid] = apic; |
184 | |||
185 | if (apic_x2apic_mode(apic)) { | ||
186 | new->mode |= KVM_APIC_MODE_X2APIC; | ||
187 | } else if (ldr) { | ||
188 | ldr = GET_APIC_LOGICAL_ID(ldr); | ||
189 | if (kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) | ||
190 | new->mode |= KVM_APIC_MODE_XAPIC_FLAT; | ||
191 | else | ||
192 | new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; | ||
193 | } | ||
194 | |||
195 | if (!kvm_apic_logical_map_valid(new)) | ||
196 | continue; | ||
197 | |||
198 | apic_logical_id(new, ldr, &cid, &lid); | ||
199 | |||
206 | if (lid && cid < ARRAY_SIZE(new->logical_map)) | 200 | if (lid && cid < ARRAY_SIZE(new->logical_map)) |
207 | new->logical_map[cid][ffs(lid) - 1] = apic; | 201 | new->logical_map[cid][ffs(lid) - 1] = apic; |
208 | } | 202 | } |
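
recalculate_apic_map() now just ORs one mode bit per LAPIC (x2APIC, or xAPIC flat/cluster according to the DFR when the LDR is non-zero) into map->mode; kvm_apic_logical_map_valid() then trusts the logical map only when a single bit ended up set, the usual power-of-two test. A quick standalone check of that predicate:

    #include <stdio.h>
    #include <stdbool.h>

    static bool logical_map_valid(unsigned mode)
    {
        return !(mode & (mode - 1));   /* true iff at most one mode bit is set */
    }

    int main(void)
    {
        printf("%d %d\n", logical_map_valid(4), logical_map_valid(4 | 16)); /* 1 0 */
        return 0;
    }
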
@@ -588,15 +582,23 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) | |||
588 | apic_update_ppr(apic); | 582 | apic_update_ppr(apic); |
589 | } | 583 | } |
590 | 584 | ||
591 | static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) | 585 | static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda) |
592 | { | 586 | { |
593 | return dest == (apic_x2apic_mode(apic) ? | 587 | if (apic_x2apic_mode(apic)) |
594 | X2APIC_BROADCAST : APIC_BROADCAST); | 588 | return mda == X2APIC_BROADCAST; |
589 | |||
590 | return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST; | ||
595 | } | 591 | } |
596 | 592 | ||
597 | static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) | 593 | static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) |
598 | { | 594 | { |
599 | return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); | 595 | if (kvm_apic_broadcast(apic, mda)) |
596 | return true; | ||
597 | |||
598 | if (apic_x2apic_mode(apic)) | ||
599 | return mda == kvm_apic_id(apic); | ||
600 | |||
601 | return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic)); | ||
600 | } | 602 | } |
601 | 603 | ||
602 | static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) | 604 | static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) |
@@ -613,6 +615,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) | |||
613 | && (logical_id & mda & 0xffff) != 0; | 615 | && (logical_id & mda & 0xffff) != 0; |
614 | 616 | ||
615 | logical_id = GET_APIC_LOGICAL_ID(logical_id); | 617 | logical_id = GET_APIC_LOGICAL_ID(logical_id); |
618 | mda = GET_APIC_DEST_FIELD(mda); | ||
616 | 619 | ||
617 | switch (kvm_apic_get_reg(apic, APIC_DFR)) { | 620 | switch (kvm_apic_get_reg(apic, APIC_DFR)) { |
618 | case APIC_DFR_FLAT: | 621 | case APIC_DFR_FLAT: |
@@ -627,10 +630,27 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) | |||
627 | } | 630 | } |
628 | } | 631 | } |
629 | 632 | ||
633 | /* KVM APIC implementation has two quirks | ||
634 | * - dest always begins at 0 while xAPIC MDA has offset 24, | ||
635 | * - IOxAPIC messages have to be delivered (directly) to x2APIC. | ||
636 | */ | ||
637 | static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source, | ||
638 | struct kvm_lapic *target) | ||
639 | { | ||
640 | bool ipi = source != NULL; | ||
641 | bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); | ||
642 | |||
643 | if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda) | ||
644 | return X2APIC_BROADCAST; | ||
645 | |||
646 | return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); | ||
647 | } | ||
648 | |||
630 | bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | 649 | bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, |
631 | int short_hand, unsigned int dest, int dest_mode) | 650 | int short_hand, unsigned int dest, int dest_mode) |
632 | { | 651 | { |
633 | struct kvm_lapic *target = vcpu->arch.apic; | 652 | struct kvm_lapic *target = vcpu->arch.apic; |
653 | u32 mda = kvm_apic_mda(dest, source, target); | ||
634 | 654 | ||
635 | apic_debug("target %p, source %p, dest 0x%x, " | 655 | apic_debug("target %p, source %p, dest 0x%x, " |
636 | "dest_mode 0x%x, short_hand 0x%x\n", | 656 | "dest_mode 0x%x, short_hand 0x%x\n", |
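
kvm_apic_mda() above compensates for the two quirks its comment names: KVM stores xAPIC destinations at bit 0 rather than in MDA bits 31:24, and an IOAPIC-originated broadcast must still reach x2APIC CPUs. A standalone restatement follows; the numeric values of APIC_BROADCAST (0xff), X2APIC_BROADCAST (0xffffffff) and the 24-bit SET_APIC_DEST_FIELD shift are taken from the xAPIC/x2APIC architecture, not from this hunk:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define APIC_BROADCAST    0xffu
    #define X2APIC_BROADCAST  0xffffffffu

    static uint32_t demo_mda(uint32_t dest_id, bool ipi, bool x2apic_mda)
    {
        if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
            return X2APIC_BROADCAST;                 /* IOxAPIC broadcast to x2APIC */
        return x2apic_mda ? dest_id : dest_id << 24; /* SET_APIC_DEST_FIELD for xAPIC */
    }

    int main(void)
    {
        printf("%#x\n", demo_mda(0xff, false, true)); /* 0xffffffff */
        printf("%#x\n", demo_mda(0x03, true, false)); /* 0x3000000  */
        return 0;
    }
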
@@ -640,9 +660,9 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | |||
640 | switch (short_hand) { | 660 | switch (short_hand) { |
641 | case APIC_DEST_NOSHORT: | 661 | case APIC_DEST_NOSHORT: |
642 | if (dest_mode == APIC_DEST_PHYSICAL) | 662 | if (dest_mode == APIC_DEST_PHYSICAL) |
643 | return kvm_apic_match_physical_addr(target, dest); | 663 | return kvm_apic_match_physical_addr(target, mda); |
644 | else | 664 | else |
645 | return kvm_apic_match_logical_addr(target, dest); | 665 | return kvm_apic_match_logical_addr(target, mda); |
646 | case APIC_DEST_SELF: | 666 | case APIC_DEST_SELF: |
647 | return target == source; | 667 | return target == source; |
648 | case APIC_DEST_ALLINC: | 668 | case APIC_DEST_ALLINC: |
@@ -664,6 +684,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, | |||
664 | struct kvm_lapic **dst; | 684 | struct kvm_lapic **dst; |
665 | int i; | 685 | int i; |
666 | bool ret = false; | 686 | bool ret = false; |
687 | bool x2apic_ipi = src && apic_x2apic_mode(src); | ||
667 | 688 | ||
668 | *r = -1; | 689 | *r = -1; |
669 | 690 | ||
@@ -675,15 +696,15 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, | |||
675 | if (irq->shorthand) | 696 | if (irq->shorthand) |
676 | return false; | 697 | return false; |
677 | 698 | ||
699 | if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) | ||
700 | return false; | ||
701 | |||
678 | rcu_read_lock(); | 702 | rcu_read_lock(); |
679 | map = rcu_dereference(kvm->arch.apic_map); | 703 | map = rcu_dereference(kvm->arch.apic_map); |
680 | 704 | ||
681 | if (!map) | 705 | if (!map) |
682 | goto out; | 706 | goto out; |
683 | 707 | ||
684 | if (irq->dest_id == map->broadcast) | ||
685 | goto out; | ||
686 | |||
687 | ret = true; | 708 | ret = true; |
688 | 709 | ||
689 | if (irq->dest_mode == APIC_DEST_PHYSICAL) { | 710 | if (irq->dest_mode == APIC_DEST_PHYSICAL) { |
@@ -692,16 +713,20 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, | |||
692 | 713 | ||
693 | dst = &map->phys_map[irq->dest_id]; | 714 | dst = &map->phys_map[irq->dest_id]; |
694 | } else { | 715 | } else { |
695 | u32 mda = irq->dest_id << (32 - map->ldr_bits); | 716 | u16 cid; |
696 | u16 cid = apic_cluster_id(map, mda); | 717 | |
718 | if (!kvm_apic_logical_map_valid(map)) { | ||
719 | ret = false; | ||
720 | goto out; | ||
721 | } | ||
722 | |||
723 | apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); | ||
697 | 724 | ||
698 | if (cid >= ARRAY_SIZE(map->logical_map)) | 725 | if (cid >= ARRAY_SIZE(map->logical_map)) |
699 | goto out; | 726 | goto out; |
700 | 727 | ||
701 | dst = map->logical_map[cid]; | 728 | dst = map->logical_map[cid]; |
702 | 729 | ||
703 | bitmap = apic_logical_id(map, mda); | ||
704 | |||
705 | if (irq->delivery_mode == APIC_DM_LOWEST) { | 730 | if (irq->delivery_mode == APIC_DM_LOWEST) { |
706 | int l = -1; | 731 | int l = -1; |
707 | for_each_set_bit(i, &bitmap, 16) { | 732 | for_each_set_bit(i, &bitmap, 16) { |
@@ -1037,7 +1062,7 @@ static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) | |||
1037 | addr < apic->base_address + LAPIC_MMIO_LENGTH; | 1062 | addr < apic->base_address + LAPIC_MMIO_LENGTH; |
1038 | } | 1063 | } |
1039 | 1064 | ||
1040 | static int apic_mmio_read(struct kvm_io_device *this, | 1065 | static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, |
1041 | gpa_t address, int len, void *data) | 1066 | gpa_t address, int len, void *data) |
1042 | { | 1067 | { |
1043 | struct kvm_lapic *apic = to_lapic(this); | 1068 | struct kvm_lapic *apic = to_lapic(this); |
@@ -1357,7 +1382,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) | |||
1357 | return ret; | 1382 | return ret; |
1358 | } | 1383 | } |
1359 | 1384 | ||
1360 | static int apic_mmio_write(struct kvm_io_device *this, | 1385 | static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, |
1361 | gpa_t address, int len, const void *data) | 1386 | gpa_t address, int len, const void *data) |
1362 | { | 1387 | { |
1363 | struct kvm_lapic *apic = to_lapic(this); | 1388 | struct kvm_lapic *apic = to_lapic(this); |
@@ -1497,8 +1522,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | |||
1497 | return; | 1522 | return; |
1498 | } | 1523 | } |
1499 | 1524 | ||
1500 | if (!kvm_vcpu_is_bsp(apic->vcpu)) | ||
1501 | value &= ~MSR_IA32_APICBASE_BSP; | ||
1502 | vcpu->arch.apic_base = value; | 1525 | vcpu->arch.apic_base = value; |
1503 | 1526 | ||
1504 | /* update jump label if enable bit changes */ | 1527 | /* update jump label if enable bit changes */ |
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 0bc6c656625b..9d28383fc1e7 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h | |||
@@ -1,7 +1,7 @@ | |||
1 | #ifndef __KVM_X86_LAPIC_H | 1 | #ifndef __KVM_X86_LAPIC_H |
2 | #define __KVM_X86_LAPIC_H | 2 | #define __KVM_X86_LAPIC_H |
3 | 3 | ||
4 | #include "iodev.h" | 4 | #include <kvm/iodev.h> |
5 | 5 | ||
6 | #include <linux/kvm_host.h> | 6 | #include <linux/kvm_host.h> |
7 | 7 | ||
@@ -148,21 +148,6 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm) | |||
148 | return kvm_x86_ops->vm_has_apicv(kvm); | 148 | return kvm_x86_ops->vm_has_apicv(kvm); |
149 | } | 149 | } |
150 | 150 | ||
151 | static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) | ||
152 | { | ||
153 | u16 cid; | ||
154 | ldr >>= 32 - map->ldr_bits; | ||
155 | cid = (ldr >> map->cid_shift) & map->cid_mask; | ||
156 | |||
157 | return cid; | ||
158 | } | ||
159 | |||
160 | static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) | ||
161 | { | ||
162 | ldr >>= (32 - map->ldr_bits); | ||
163 | return ldr & map->lid_mask; | ||
164 | } | ||
165 | |||
166 | static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) | 151 | static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) |
167 | { | 152 | { |
168 | return vcpu->arch.apic->pending_events; | 153 | return vcpu->arch.apic->pending_events; |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cee759299a35..146f295ee322 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -4465,6 +4465,79 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, | |||
4465 | kvm_flush_remote_tlbs(kvm); | 4465 | kvm_flush_remote_tlbs(kvm); |
4466 | } | 4466 | } |
4467 | 4467 | ||
4468 | static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, | ||
4469 | unsigned long *rmapp) | ||
4470 | { | ||
4471 | u64 *sptep; | ||
4472 | struct rmap_iterator iter; | ||
4473 | int need_tlb_flush = 0; | ||
4474 | pfn_t pfn; | ||
4475 | struct kvm_mmu_page *sp; | ||
4476 | |||
4477 | for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { | ||
4478 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); | ||
4479 | |||
4480 | sp = page_header(__pa(sptep)); | ||
4481 | pfn = spte_to_pfn(*sptep); | ||
4482 | |||
4483 | /* | ||
4484 | * Only EPT supported for now; otherwise, one would need to | ||
4485 | * find out efficiently whether the guest page tables are | ||
4486 | * also using huge pages. | ||
4487 | */ | ||
4488 | if (sp->role.direct && | ||
4489 | !kvm_is_reserved_pfn(pfn) && | ||
4490 | PageTransCompound(pfn_to_page(pfn))) { | ||
4491 | drop_spte(kvm, sptep); | ||
4492 | sptep = rmap_get_first(*rmapp, &iter); | ||
4493 | need_tlb_flush = 1; | ||
4494 | } else | ||
4495 | sptep = rmap_get_next(&iter); | ||
4496 | } | ||
4497 | |||
4498 | return need_tlb_flush; | ||
4499 | } | ||
4500 | |||
4501 | void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, | ||
4502 | struct kvm_memory_slot *memslot) | ||
4503 | { | ||
4504 | bool flush = false; | ||
4505 | unsigned long *rmapp; | ||
4506 | unsigned long last_index, index; | ||
4507 | gfn_t gfn_start, gfn_end; | ||
4508 | |||
4509 | spin_lock(&kvm->mmu_lock); | ||
4510 | |||
4511 | gfn_start = memslot->base_gfn; | ||
4512 | gfn_end = memslot->base_gfn + memslot->npages - 1; | ||
4513 | |||
4514 | if (gfn_start >= gfn_end) | ||
4515 | goto out; | ||
4516 | |||
4517 | rmapp = memslot->arch.rmap[0]; | ||
4518 | last_index = gfn_to_index(gfn_end, memslot->base_gfn, | ||
4519 | PT_PAGE_TABLE_LEVEL); | ||
4520 | |||
4521 | for (index = 0; index <= last_index; ++index, ++rmapp) { | ||
4522 | if (*rmapp) | ||
4523 | flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp); | ||
4524 | |||
4525 | if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { | ||
4526 | if (flush) { | ||
4527 | kvm_flush_remote_tlbs(kvm); | ||
4528 | flush = false; | ||
4529 | } | ||
4530 | cond_resched_lock(&kvm->mmu_lock); | ||
4531 | } | ||
4532 | } | ||
4533 | |||
4534 | if (flush) | ||
4535 | kvm_flush_remote_tlbs(kvm); | ||
4536 | |||
4537 | out: | ||
4538 | spin_unlock(&kvm->mmu_lock); | ||
4539 | } | ||
4540 | |||
4468 | void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, | 4541 | void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, |
4469 | struct kvm_memory_slot *memslot) | 4542 | struct kvm_memory_slot *memslot) |
4470 | { | 4543 | { |
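
The new kvm_mmu_zap_collapsible_sptes() walks a slot's 4K-level rmaps and drops any direct-mapped SPTE whose backing pfn is part of a transparent huge page, so the next fault can rebuild a large mapping; pending TLB flushes are issued before the lock is ceded to cond_resched_lock(). A sketch of the per-SPTE decision with stand-in predicates (the two helpers below are placeholders for kvm_is_reserved_pfn() and PageTransCompound(), not kernel APIs):

    #include <stdbool.h>

    /* Placeholders for kvm_is_reserved_pfn() and PageTransCompound(). */
    static bool pfn_is_reserved(unsigned long pfn)  { (void)pfn; return false; }
    static bool pfn_in_huge_page(unsigned long pfn) { (void)pfn; return true;  }

    /* A direct SPTE on a huge-page-backed pfn is worth dropping and refaulting. */
    static bool spte_is_collapsible(bool direct_map, unsigned long pfn)
    {
        return direct_map && !pfn_is_reserved(pfn) && pfn_in_huge_page(pfn);
    }

    int main(void)
    {
        return spte_is_collapsible(true, 0x1234) ? 0 : 1;
    }
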
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 8e6b7d869d2f..29fbf9dfdc54 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c | |||
@@ -38,7 +38,7 @@ static struct kvm_arch_event_perf_mapping { | |||
38 | }; | 38 | }; |
39 | 39 | ||
40 | /* mapping between fixed pmc index and arch_events array */ | 40 | /* mapping between fixed pmc index and arch_events array */ |
41 | int fixed_pmc_events[] = {1, 0, 7}; | 41 | static int fixed_pmc_events[] = {1, 0, 7}; |
42 | 42 | ||
43 | static bool pmc_is_gp(struct kvm_pmc *pmc) | 43 | static bool pmc_is_gp(struct kvm_pmc *pmc) |
44 | { | 44 | { |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cc618c882f90..ce741b8650f6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -1261,7 +1261,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
1261 | 1261 | ||
1262 | svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | | 1262 | svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | |
1263 | MSR_IA32_APICBASE_ENABLE; | 1263 | MSR_IA32_APICBASE_ENABLE; |
1264 | if (kvm_vcpu_is_bsp(&svm->vcpu)) | 1264 | if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) |
1265 | svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; | 1265 | svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; |
1266 | 1266 | ||
1267 | svm_init_osvw(&svm->vcpu); | 1267 | svm_init_osvw(&svm->vcpu); |
@@ -1929,14 +1929,12 @@ static int nop_on_interception(struct vcpu_svm *svm) | |||
1929 | static int halt_interception(struct vcpu_svm *svm) | 1929 | static int halt_interception(struct vcpu_svm *svm) |
1930 | { | 1930 | { |
1931 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; | 1931 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; |
1932 | skip_emulated_instruction(&svm->vcpu); | ||
1933 | return kvm_emulate_halt(&svm->vcpu); | 1932 | return kvm_emulate_halt(&svm->vcpu); |
1934 | } | 1933 | } |
1935 | 1934 | ||
1936 | static int vmmcall_interception(struct vcpu_svm *svm) | 1935 | static int vmmcall_interception(struct vcpu_svm *svm) |
1937 | { | 1936 | { |
1938 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 1937 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
1939 | skip_emulated_instruction(&svm->vcpu); | ||
1940 | kvm_emulate_hypercall(&svm->vcpu); | 1938 | kvm_emulate_hypercall(&svm->vcpu); |
1941 | return 1; | 1939 | return 1; |
1942 | } | 1940 | } |
@@ -2757,11 +2755,11 @@ static int invlpga_interception(struct vcpu_svm *svm) | |||
2757 | { | 2755 | { |
2758 | struct kvm_vcpu *vcpu = &svm->vcpu; | 2756 | struct kvm_vcpu *vcpu = &svm->vcpu; |
2759 | 2757 | ||
2760 | trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX], | 2758 | trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX), |
2761 | vcpu->arch.regs[VCPU_REGS_RAX]); | 2759 | kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); |
2762 | 2760 | ||
2763 | /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ | 2761 | /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ |
2764 | kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); | 2762 | kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); |
2765 | 2763 | ||
2766 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 2764 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
2767 | skip_emulated_instruction(&svm->vcpu); | 2765 | skip_emulated_instruction(&svm->vcpu); |
@@ -2770,12 +2768,18 @@ static int invlpga_interception(struct vcpu_svm *svm) | |||
2770 | 2768 | ||
2771 | static int skinit_interception(struct vcpu_svm *svm) | 2769 | static int skinit_interception(struct vcpu_svm *svm) |
2772 | { | 2770 | { |
2773 | trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]); | 2771 | trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); |
2774 | 2772 | ||
2775 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | 2773 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); |
2776 | return 1; | 2774 | return 1; |
2777 | } | 2775 | } |
2778 | 2776 | ||
2777 | static int wbinvd_interception(struct vcpu_svm *svm) | ||
2778 | { | ||
2779 | kvm_emulate_wbinvd(&svm->vcpu); | ||
2780 | return 1; | ||
2781 | } | ||
2782 | |||
2779 | static int xsetbv_interception(struct vcpu_svm *svm) | 2783 | static int xsetbv_interception(struct vcpu_svm *svm) |
2780 | { | 2784 | { |
2781 | u64 new_bv = kvm_read_edx_eax(&svm->vcpu); | 2785 | u64 new_bv = kvm_read_edx_eax(&svm->vcpu); |
@@ -2902,7 +2906,8 @@ static int rdpmc_interception(struct vcpu_svm *svm) | |||
2902 | return 1; | 2906 | return 1; |
2903 | } | 2907 | } |
2904 | 2908 | ||
2905 | bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) | 2909 | static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, |
2910 | unsigned long val) | ||
2906 | { | 2911 | { |
2907 | unsigned long cr0 = svm->vcpu.arch.cr0; | 2912 | unsigned long cr0 = svm->vcpu.arch.cr0; |
2908 | bool ret = false; | 2913 | bool ret = false; |
@@ -2940,7 +2945,10 @@ static int cr_interception(struct vcpu_svm *svm) | |||
2940 | return emulate_on_interception(svm); | 2945 | return emulate_on_interception(svm); |
2941 | 2946 | ||
2942 | reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; | 2947 | reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; |
2943 | cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; | 2948 | if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) |
2949 | cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0; | ||
2950 | else | ||
2951 | cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; | ||
2944 | 2952 | ||
2945 | err = 0; | 2953 | err = 0; |
2946 | if (cr >= 16) { /* mov to cr */ | 2954 | if (cr >= 16) { /* mov to cr */ |
@@ -3133,7 +3141,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
3133 | 3141 | ||
3134 | static int rdmsr_interception(struct vcpu_svm *svm) | 3142 | static int rdmsr_interception(struct vcpu_svm *svm) |
3135 | { | 3143 | { |
3136 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; | 3144 | u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); |
3137 | u64 data; | 3145 | u64 data; |
3138 | 3146 | ||
3139 | if (svm_get_msr(&svm->vcpu, ecx, &data)) { | 3147 | if (svm_get_msr(&svm->vcpu, ecx, &data)) { |
@@ -3142,8 +3150,8 @@ static int rdmsr_interception(struct vcpu_svm *svm) | |||
3142 | } else { | 3150 | } else { |
3143 | trace_kvm_msr_read(ecx, data); | 3151 | trace_kvm_msr_read(ecx, data); |
3144 | 3152 | ||
3145 | svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; | 3153 | kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff); |
3146 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; | 3154 | kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32); |
3147 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; | 3155 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
3148 | skip_emulated_instruction(&svm->vcpu); | 3156 | skip_emulated_instruction(&svm->vcpu); |
3149 | } | 3157 | } |
@@ -3246,9 +3254,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) | |||
3246 | static int wrmsr_interception(struct vcpu_svm *svm) | 3254 | static int wrmsr_interception(struct vcpu_svm *svm) |
3247 | { | 3255 | { |
3248 | struct msr_data msr; | 3256 | struct msr_data msr; |
3249 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; | 3257 | u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); |
3250 | u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | 3258 | u64 data = kvm_read_edx_eax(&svm->vcpu); |
3251 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); | ||
3252 | 3259 | ||
3253 | msr.data = data; | 3260 | msr.data = data; |
3254 | msr.index = ecx; | 3261 | msr.index = ecx; |
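
The removed lines in wrmsr_interception() show exactly what the kvm_read_edx_eax() helper must produce: a 64-bit MSR value assembled from EAX (low half) and EDX (high half). A standalone version of that composition:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t edx_eax_to_u64(uint32_t eax, uint32_t edx)
    {
        return (uint64_t)eax | ((uint64_t)edx << 32);
    }

    int main(void)
    {
        /* guest executes WRMSR with EDX=0x2, EAX=0x1 */
        printf("%#llx\n", (unsigned long long)edx_eax_to_u64(0x1, 0x2)); /* 0x200000001 */
        return 0;
    }
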
@@ -3325,7 +3332,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { | |||
3325 | [SVM_EXIT_READ_CR3] = cr_interception, | 3332 | [SVM_EXIT_READ_CR3] = cr_interception, |
3326 | [SVM_EXIT_READ_CR4] = cr_interception, | 3333 | [SVM_EXIT_READ_CR4] = cr_interception, |
3327 | [SVM_EXIT_READ_CR8] = cr_interception, | 3334 | [SVM_EXIT_READ_CR8] = cr_interception, |
3328 | [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, | 3335 | [SVM_EXIT_CR0_SEL_WRITE] = cr_interception, |
3329 | [SVM_EXIT_WRITE_CR0] = cr_interception, | 3336 | [SVM_EXIT_WRITE_CR0] = cr_interception, |
3330 | [SVM_EXIT_WRITE_CR3] = cr_interception, | 3337 | [SVM_EXIT_WRITE_CR3] = cr_interception, |
3331 | [SVM_EXIT_WRITE_CR4] = cr_interception, | 3338 | [SVM_EXIT_WRITE_CR4] = cr_interception, |
@@ -3376,7 +3383,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { | |||
3376 | [SVM_EXIT_STGI] = stgi_interception, | 3383 | [SVM_EXIT_STGI] = stgi_interception, |
3377 | [SVM_EXIT_CLGI] = clgi_interception, | 3384 | [SVM_EXIT_CLGI] = clgi_interception, |
3378 | [SVM_EXIT_SKINIT] = skinit_interception, | 3385 | [SVM_EXIT_SKINIT] = skinit_interception, |
3379 | [SVM_EXIT_WBINVD] = emulate_on_interception, | 3386 | [SVM_EXIT_WBINVD] = wbinvd_interception, |
3380 | [SVM_EXIT_MONITOR] = monitor_interception, | 3387 | [SVM_EXIT_MONITOR] = monitor_interception, |
3381 | [SVM_EXIT_MWAIT] = mwait_interception, | 3388 | [SVM_EXIT_MWAIT] = mwait_interception, |
3382 | [SVM_EXIT_XSETBV] = xsetbv_interception, | 3389 | [SVM_EXIT_XSETBV] = xsetbv_interception, |
@@ -3555,7 +3562,7 @@ static int handle_exit(struct kvm_vcpu *vcpu) | |||
3555 | 3562 | ||
3556 | if (exit_code >= ARRAY_SIZE(svm_exit_handlers) | 3563 | if (exit_code >= ARRAY_SIZE(svm_exit_handlers) |
3557 | || !svm_exit_handlers[exit_code]) { | 3564 | || !svm_exit_handlers[exit_code]) { |
3558 | WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code); | 3565 | WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code); |
3559 | kvm_queue_exception(vcpu, UD_VECTOR); | 3566 | kvm_queue_exception(vcpu, UD_VECTOR); |
3560 | return 1; | 3567 | return 1; |
3561 | } | 3568 | } |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ae4f6d35d19c..f5e8dce8046c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -2470,6 +2470,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) | |||
2470 | vmx->nested.nested_vmx_secondary_ctls_low = 0; | 2470 | vmx->nested.nested_vmx_secondary_ctls_low = 0; |
2471 | vmx->nested.nested_vmx_secondary_ctls_high &= | 2471 | vmx->nested.nested_vmx_secondary_ctls_high &= |
2472 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | 2472 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | |
2473 | SECONDARY_EXEC_RDTSCP | | ||
2473 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | 2474 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | |
2474 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | 2475 | SECONDARY_EXEC_APIC_REGISTER_VIRT | |
2475 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | 2476 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | |
@@ -3268,8 +3269,8 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, | |||
3268 | * default value. | 3269 | * default value. |
3269 | */ | 3270 | */ |
3270 | if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) | 3271 | if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) |
3271 | save->selector &= ~SELECTOR_RPL_MASK; | 3272 | save->selector &= ~SEGMENT_RPL_MASK; |
3272 | save->dpl = save->selector & SELECTOR_RPL_MASK; | 3273 | save->dpl = save->selector & SEGMENT_RPL_MASK; |
3273 | save->s = 1; | 3274 | save->s = 1; |
3274 | } | 3275 | } |
3275 | vmx_set_segment(vcpu, save, seg); | 3276 | vmx_set_segment(vcpu, save, seg); |
@@ -3842,7 +3843,7 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu) | |||
3842 | unsigned int cs_rpl; | 3843 | unsigned int cs_rpl; |
3843 | 3844 | ||
3844 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | 3845 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); |
3845 | cs_rpl = cs.selector & SELECTOR_RPL_MASK; | 3846 | cs_rpl = cs.selector & SEGMENT_RPL_MASK; |
3846 | 3847 | ||
3847 | if (cs.unusable) | 3848 | if (cs.unusable) |
3848 | return false; | 3849 | return false; |
@@ -3870,7 +3871,7 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu) | |||
3870 | unsigned int ss_rpl; | 3871 | unsigned int ss_rpl; |
3871 | 3872 | ||
3872 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); | 3873 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); |
3873 | ss_rpl = ss.selector & SELECTOR_RPL_MASK; | 3874 | ss_rpl = ss.selector & SEGMENT_RPL_MASK; |
3874 | 3875 | ||
3875 | if (ss.unusable) | 3876 | if (ss.unusable) |
3876 | return true; | 3877 | return true; |
@@ -3892,7 +3893,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) | |||
3892 | unsigned int rpl; | 3893 | unsigned int rpl; |
3893 | 3894 | ||
3894 | vmx_get_segment(vcpu, &var, seg); | 3895 | vmx_get_segment(vcpu, &var, seg); |
3895 | rpl = var.selector & SELECTOR_RPL_MASK; | 3896 | rpl = var.selector & SEGMENT_RPL_MASK; |
3896 | 3897 | ||
3897 | if (var.unusable) | 3898 | if (var.unusable) |
3898 | return true; | 3899 | return true; |
@@ -3919,7 +3920,7 @@ static bool tr_valid(struct kvm_vcpu *vcpu) | |||
3919 | 3920 | ||
3920 | if (tr.unusable) | 3921 | if (tr.unusable) |
3921 | return false; | 3922 | return false; |
3922 | if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ | 3923 | if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ |
3923 | return false; | 3924 | return false; |
3924 | if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ | 3925 | if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ |
3925 | return false; | 3926 | return false; |
@@ -3937,7 +3938,7 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu) | |||
3937 | 3938 | ||
3938 | if (ldtr.unusable) | 3939 | if (ldtr.unusable) |
3939 | return true; | 3940 | return true; |
3940 | if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ | 3941 | if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ |
3941 | return false; | 3942 | return false; |
3942 | if (ldtr.type != 2) | 3943 | if (ldtr.type != 2) |
3943 | return false; | 3944 | return false; |
@@ -3954,8 +3955,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) | |||
3954 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | 3955 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); |
3955 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); | 3956 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); |
3956 | 3957 | ||
3957 | return ((cs.selector & SELECTOR_RPL_MASK) == | 3958 | return ((cs.selector & SEGMENT_RPL_MASK) == |
3958 | (ss.selector & SELECTOR_RPL_MASK)); | 3959 | (ss.selector & SEGMENT_RPL_MASK)); |
3959 | } | 3960 | } |
3960 | 3961 | ||
3961 | /* | 3962 | /* |
@@ -4711,7 +4712,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
4711 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); | 4712 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); |
4712 | kvm_set_cr8(&vmx->vcpu, 0); | 4713 | kvm_set_cr8(&vmx->vcpu, 0); |
4713 | apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; | 4714 | apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; |
4714 | if (kvm_vcpu_is_bsp(&vmx->vcpu)) | 4715 | if (kvm_vcpu_is_reset_bsp(&vmx->vcpu)) |
4715 | apic_base_msr.data |= MSR_IA32_APICBASE_BSP; | 4716 | apic_base_msr.data |= MSR_IA32_APICBASE_BSP; |
4716 | apic_base_msr.host_initiated = true; | 4717 | apic_base_msr.host_initiated = true; |
4717 | kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); | 4718 | kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); |
@@ -5006,7 +5007,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, | |||
5006 | if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { | 5007 | if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { |
5007 | if (vcpu->arch.halt_request) { | 5008 | if (vcpu->arch.halt_request) { |
5008 | vcpu->arch.halt_request = 0; | 5009 | vcpu->arch.halt_request = 0; |
5009 | return kvm_emulate_halt(vcpu); | 5010 | return kvm_vcpu_halt(vcpu); |
5010 | } | 5011 | } |
5011 | return 1; | 5012 | return 1; |
5012 | } | 5013 | } |
@@ -5071,6 +5072,10 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
5071 | } | 5072 | } |
5072 | 5073 | ||
5073 | if (is_invalid_opcode(intr_info)) { | 5074 | if (is_invalid_opcode(intr_info)) { |
5075 | if (is_guest_mode(vcpu)) { | ||
5076 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
5077 | return 1; | ||
5078 | } | ||
5074 | er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); | 5079 | er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); |
5075 | if (er != EMULATE_DONE) | 5080 | if (er != EMULATE_DONE) |
5076 | kvm_queue_exception(vcpu, UD_VECTOR); | 5081 | kvm_queue_exception(vcpu, UD_VECTOR); |
@@ -5090,9 +5095,10 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
5090 | !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { | 5095 | !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { |
5091 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | 5096 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
5092 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; | 5097 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; |
5093 | vcpu->run->internal.ndata = 2; | 5098 | vcpu->run->internal.ndata = 3; |
5094 | vcpu->run->internal.data[0] = vect_info; | 5099 | vcpu->run->internal.data[0] = vect_info; |
5095 | vcpu->run->internal.data[1] = intr_info; | 5100 | vcpu->run->internal.data[1] = intr_info; |
5101 | vcpu->run->internal.data[2] = error_code; | ||
5096 | return 0; | 5102 | return 0; |
5097 | } | 5103 | } |
5098 | 5104 | ||
@@ -5533,13 +5539,11 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) | |||
5533 | 5539 | ||
5534 | static int handle_halt(struct kvm_vcpu *vcpu) | 5540 | static int handle_halt(struct kvm_vcpu *vcpu) |
5535 | { | 5541 | { |
5536 | skip_emulated_instruction(vcpu); | ||
5537 | return kvm_emulate_halt(vcpu); | 5542 | return kvm_emulate_halt(vcpu); |
5538 | } | 5543 | } |
5539 | 5544 | ||
5540 | static int handle_vmcall(struct kvm_vcpu *vcpu) | 5545 | static int handle_vmcall(struct kvm_vcpu *vcpu) |
5541 | { | 5546 | { |
5542 | skip_emulated_instruction(vcpu); | ||
5543 | kvm_emulate_hypercall(vcpu); | 5547 | kvm_emulate_hypercall(vcpu); |
5544 | return 1; | 5548 | return 1; |
5545 | } | 5549 | } |
@@ -5570,7 +5574,6 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu) | |||
5570 | 5574 | ||
5571 | static int handle_wbinvd(struct kvm_vcpu *vcpu) | 5575 | static int handle_wbinvd(struct kvm_vcpu *vcpu) |
5572 | { | 5576 | { |
5573 | skip_emulated_instruction(vcpu); | ||
5574 | kvm_emulate_wbinvd(vcpu); | 5577 | kvm_emulate_wbinvd(vcpu); |
5575 | return 1; | 5578 | return 1; |
5576 | } | 5579 | } |
@@ -5828,7 +5831,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) | |||
5828 | gpa_t gpa; | 5831 | gpa_t gpa; |
5829 | 5832 | ||
5830 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 5833 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
5831 | if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { | 5834 | if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { |
5832 | skip_emulated_instruction(vcpu); | 5835 | skip_emulated_instruction(vcpu); |
5833 | return 1; | 5836 | return 1; |
5834 | } | 5837 | } |
@@ -5909,7 +5912,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
5909 | 5912 | ||
5910 | if (vcpu->arch.halt_request) { | 5913 | if (vcpu->arch.halt_request) { |
5911 | vcpu->arch.halt_request = 0; | 5914 | vcpu->arch.halt_request = 0; |
5912 | ret = kvm_emulate_halt(vcpu); | 5915 | ret = kvm_vcpu_halt(vcpu); |
5913 | goto out; | 5916 | goto out; |
5914 | } | 5917 | } |
5915 | 5918 | ||
@@ -7318,21 +7321,21 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, | |||
7318 | else if (port < 0x10000) | 7321 | else if (port < 0x10000) |
7319 | bitmap = vmcs12->io_bitmap_b; | 7322 | bitmap = vmcs12->io_bitmap_b; |
7320 | else | 7323 | else |
7321 | return 1; | 7324 | return true; |
7322 | bitmap += (port & 0x7fff) / 8; | 7325 | bitmap += (port & 0x7fff) / 8; |
7323 | 7326 | ||
7324 | if (last_bitmap != bitmap) | 7327 | if (last_bitmap != bitmap) |
7325 | if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) | 7328 | if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) |
7326 | return 1; | 7329 | return true; |
7327 | if (b & (1 << (port & 7))) | 7330 | if (b & (1 << (port & 7))) |
7328 | return 1; | 7331 | return true; |
7329 | 7332 | ||
7330 | port++; | 7333 | port++; |
7331 | size--; | 7334 | size--; |
7332 | last_bitmap = bitmap; | 7335 | last_bitmap = bitmap; |
7333 | } | 7336 | } |
7334 | 7337 | ||
7335 | return 0; | 7338 | return false; |
7336 | } | 7339 | } |
7337 | 7340 | ||
7338 | /* | 7341 | /* |
@@ -7348,7 +7351,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, | |||
7348 | gpa_t bitmap; | 7351 | gpa_t bitmap; |
7349 | 7352 | ||
7350 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) | 7353 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) |
7351 | return 1; | 7354 | return true; |
7352 | 7355 | ||
7353 | /* | 7356 | /* |
7354 | * The MSR_BITMAP page is divided into four 1024-byte bitmaps, | 7357 | * The MSR_BITMAP page is divided into four 1024-byte bitmaps, |
@@ -7367,10 +7370,10 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, | |||
7367 | if (msr_index < 1024*8) { | 7370 | if (msr_index < 1024*8) { |
7368 | unsigned char b; | 7371 | unsigned char b; |
7369 | if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) | 7372 | if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) |
7370 | return 1; | 7373 | return true; |
7371 | return 1 & (b >> (msr_index & 7)); | 7374 | return 1 & (b >> (msr_index & 7)); |
7372 | } else | 7375 | } else |
7373 | return 1; /* let L1 handle the wrong parameter */ | 7376 | return true; /* let L1 handle the wrong parameter */ |
7374 | } | 7377 | } |
7375 | 7378 | ||
7376 | /* | 7379 | /* |
@@ -7392,7 +7395,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, | |||
7392 | case 0: | 7395 | case 0: |
7393 | if (vmcs12->cr0_guest_host_mask & | 7396 | if (vmcs12->cr0_guest_host_mask & |
7394 | (val ^ vmcs12->cr0_read_shadow)) | 7397 | (val ^ vmcs12->cr0_read_shadow)) |
7395 | return 1; | 7398 | return true; |
7396 | break; | 7399 | break; |
7397 | case 3: | 7400 | case 3: |
7398 | if ((vmcs12->cr3_target_count >= 1 && | 7401 | if ((vmcs12->cr3_target_count >= 1 && |
@@ -7403,37 +7406,37 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, | |||
7403 | vmcs12->cr3_target_value2 == val) || | 7406 | vmcs12->cr3_target_value2 == val) || |
7404 | (vmcs12->cr3_target_count >= 4 && | 7407 | (vmcs12->cr3_target_count >= 4 && |
7405 | vmcs12->cr3_target_value3 == val)) | 7408 | vmcs12->cr3_target_value3 == val)) |
7406 | return 0; | 7409 | return false; |
7407 | if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) | 7410 | if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) |
7408 | return 1; | 7411 | return true; |
7409 | break; | 7412 | break; |
7410 | case 4: | 7413 | case 4: |
7411 | if (vmcs12->cr4_guest_host_mask & | 7414 | if (vmcs12->cr4_guest_host_mask & |
7412 | (vmcs12->cr4_read_shadow ^ val)) | 7415 | (vmcs12->cr4_read_shadow ^ val)) |
7413 | return 1; | 7416 | return true; |
7414 | break; | 7417 | break; |
7415 | case 8: | 7418 | case 8: |
7416 | if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) | 7419 | if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) |
7417 | return 1; | 7420 | return true; |
7418 | break; | 7421 | break; |
7419 | } | 7422 | } |
7420 | break; | 7423 | break; |
7421 | case 2: /* clts */ | 7424 | case 2: /* clts */ |
7422 | if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && | 7425 | if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && |
7423 | (vmcs12->cr0_read_shadow & X86_CR0_TS)) | 7426 | (vmcs12->cr0_read_shadow & X86_CR0_TS)) |
7424 | return 1; | 7427 | return true; |
7425 | break; | 7428 | break; |
7426 | case 1: /* mov from cr */ | 7429 | case 1: /* mov from cr */ |
7427 | switch (cr) { | 7430 | switch (cr) { |
7428 | case 3: | 7431 | case 3: |
7429 | if (vmcs12->cpu_based_vm_exec_control & | 7432 | if (vmcs12->cpu_based_vm_exec_control & |
7430 | CPU_BASED_CR3_STORE_EXITING) | 7433 | CPU_BASED_CR3_STORE_EXITING) |
7431 | return 1; | 7434 | return true; |
7432 | break; | 7435 | break; |
7433 | case 8: | 7436 | case 8: |
7434 | if (vmcs12->cpu_based_vm_exec_control & | 7437 | if (vmcs12->cpu_based_vm_exec_control & |
7435 | CPU_BASED_CR8_STORE_EXITING) | 7438 | CPU_BASED_CR8_STORE_EXITING) |
7436 | return 1; | 7439 | return true; |
7437 | break; | 7440 | break; |
7438 | } | 7441 | } |
7439 | break; | 7442 | break; |
@@ -7444,14 +7447,14 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, | |||
7444 | */ | 7447 | */ |
7445 | if (vmcs12->cr0_guest_host_mask & 0xe & | 7448 | if (vmcs12->cr0_guest_host_mask & 0xe & |
7446 | (val ^ vmcs12->cr0_read_shadow)) | 7449 | (val ^ vmcs12->cr0_read_shadow)) |
7447 | return 1; | 7450 | return true; |
7448 | if ((vmcs12->cr0_guest_host_mask & 0x1) && | 7451 | if ((vmcs12->cr0_guest_host_mask & 0x1) && |
7449 | !(vmcs12->cr0_read_shadow & 0x1) && | 7452 | !(vmcs12->cr0_read_shadow & 0x1) && |
7450 | (val & 0x1)) | 7453 | (val & 0x1)) |
7451 | return 1; | 7454 | return true; |
7452 | break; | 7455 | break; |
7453 | } | 7456 | } |
7454 | return 0; | 7457 | return false; |
7455 | } | 7458 | } |
7456 | 7459 | ||
7457 | /* | 7460 | /* |
@@ -7474,48 +7477,48 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | |||
7474 | KVM_ISA_VMX); | 7477 | KVM_ISA_VMX); |
7475 | 7478 | ||
7476 | if (vmx->nested.nested_run_pending) | 7479 | if (vmx->nested.nested_run_pending) |
7477 | return 0; | 7480 | return false; |
7478 | 7481 | ||
7479 | if (unlikely(vmx->fail)) { | 7482 | if (unlikely(vmx->fail)) { |
7480 | pr_info_ratelimited("%s failed vm entry %x\n", __func__, | 7483 | pr_info_ratelimited("%s failed vm entry %x\n", __func__, |
7481 | vmcs_read32(VM_INSTRUCTION_ERROR)); | 7484 | vmcs_read32(VM_INSTRUCTION_ERROR)); |
7482 | return 1; | 7485 | return true; |
7483 | } | 7486 | } |
7484 | 7487 | ||
7485 | switch (exit_reason) { | 7488 | switch (exit_reason) { |
7486 | case EXIT_REASON_EXCEPTION_NMI: | 7489 | case EXIT_REASON_EXCEPTION_NMI: |
7487 | if (!is_exception(intr_info)) | 7490 | if (!is_exception(intr_info)) |
7488 | return 0; | 7491 | return false; |
7489 | else if (is_page_fault(intr_info)) | 7492 | else if (is_page_fault(intr_info)) |
7490 | return enable_ept; | 7493 | return enable_ept; |
7491 | else if (is_no_device(intr_info) && | 7494 | else if (is_no_device(intr_info) && |
7492 | !(vmcs12->guest_cr0 & X86_CR0_TS)) | 7495 | !(vmcs12->guest_cr0 & X86_CR0_TS)) |
7493 | return 0; | 7496 | return false; |
7494 | return vmcs12->exception_bitmap & | 7497 | return vmcs12->exception_bitmap & |
7495 | (1u << (intr_info & INTR_INFO_VECTOR_MASK)); | 7498 | (1u << (intr_info & INTR_INFO_VECTOR_MASK)); |
7496 | case EXIT_REASON_EXTERNAL_INTERRUPT: | 7499 | case EXIT_REASON_EXTERNAL_INTERRUPT: |
7497 | return 0; | 7500 | return false; |
7498 | case EXIT_REASON_TRIPLE_FAULT: | 7501 | case EXIT_REASON_TRIPLE_FAULT: |
7499 | return 1; | 7502 | return true; |
7500 | case EXIT_REASON_PENDING_INTERRUPT: | 7503 | case EXIT_REASON_PENDING_INTERRUPT: |
7501 | return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); | 7504 | return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); |
7502 | case EXIT_REASON_NMI_WINDOW: | 7505 | case EXIT_REASON_NMI_WINDOW: |
7503 | return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); | 7506 | return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); |
7504 | case EXIT_REASON_TASK_SWITCH: | 7507 | case EXIT_REASON_TASK_SWITCH: |
7505 | return 1; | 7508 | return true; |
7506 | case EXIT_REASON_CPUID: | 7509 | case EXIT_REASON_CPUID: |
7507 | if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) | 7510 | if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) |
7508 | return 0; | 7511 | return false; |
7509 | return 1; | 7512 | return true; |
7510 | case EXIT_REASON_HLT: | 7513 | case EXIT_REASON_HLT: |
7511 | return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); | 7514 | return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); |
7512 | case EXIT_REASON_INVD: | 7515 | case EXIT_REASON_INVD: |
7513 | return 1; | 7516 | return true; |
7514 | case EXIT_REASON_INVLPG: | 7517 | case EXIT_REASON_INVLPG: |
7515 | return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); | 7518 | return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); |
7516 | case EXIT_REASON_RDPMC: | 7519 | case EXIT_REASON_RDPMC: |
7517 | return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); | 7520 | return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); |
7518 | case EXIT_REASON_RDTSC: | 7521 | case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: |
7519 | return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); | 7522 | return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); |
7520 | case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: | 7523 | case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: |
7521 | case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: | 7524 | case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: |
@@ -7527,7 +7530,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | |||
7527 | * VMX instructions trap unconditionally. This allows L1 to | 7530 | * VMX instructions trap unconditionally. This allows L1 to |
7528 | * emulate them for its L2 guest, i.e., allows 3-level nesting! | 7531 | * emulate them for its L2 guest, i.e., allows 3-level nesting! |
7529 | */ | 7532 | */ |
7530 | return 1; | 7533 | return true; |
7531 | case EXIT_REASON_CR_ACCESS: | 7534 | case EXIT_REASON_CR_ACCESS: |
7532 | return nested_vmx_exit_handled_cr(vcpu, vmcs12); | 7535 | return nested_vmx_exit_handled_cr(vcpu, vmcs12); |
7533 | case EXIT_REASON_DR_ACCESS: | 7536 | case EXIT_REASON_DR_ACCESS: |
@@ -7538,7 +7541,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | |||
7538 | case EXIT_REASON_MSR_WRITE: | 7541 | case EXIT_REASON_MSR_WRITE: |
7539 | return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); | 7542 | return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); |
7540 | case EXIT_REASON_INVALID_STATE: | 7543 | case EXIT_REASON_INVALID_STATE: |
7541 | return 1; | 7544 | return true; |
7542 | case EXIT_REASON_MWAIT_INSTRUCTION: | 7545 | case EXIT_REASON_MWAIT_INSTRUCTION: |
7543 | return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); | 7546 | return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); |
7544 | case EXIT_REASON_MONITOR_INSTRUCTION: | 7547 | case EXIT_REASON_MONITOR_INSTRUCTION: |
@@ -7548,7 +7551,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | |||
7548 | nested_cpu_has2(vmcs12, | 7551 | nested_cpu_has2(vmcs12, |
7549 | SECONDARY_EXEC_PAUSE_LOOP_EXITING); | 7552 | SECONDARY_EXEC_PAUSE_LOOP_EXITING); |
7550 | case EXIT_REASON_MCE_DURING_VMENTRY: | 7553 | case EXIT_REASON_MCE_DURING_VMENTRY: |
7551 | return 0; | 7554 | return false; |
7552 | case EXIT_REASON_TPR_BELOW_THRESHOLD: | 7555 | case EXIT_REASON_TPR_BELOW_THRESHOLD: |
7553 | return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); | 7556 | return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); |
7554 | case EXIT_REASON_APIC_ACCESS: | 7557 | case EXIT_REASON_APIC_ACCESS: |
@@ -7557,7 +7560,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | |||
7557 | case EXIT_REASON_APIC_WRITE: | 7560 | case EXIT_REASON_APIC_WRITE: |
7558 | case EXIT_REASON_EOI_INDUCED: | 7561 | case EXIT_REASON_EOI_INDUCED: |
7559 | /* apic_write and eoi_induced should exit unconditionally. */ | 7562 | /* apic_write and eoi_induced should exit unconditionally. */ |
7560 | return 1; | 7563 | return true; |
7561 | case EXIT_REASON_EPT_VIOLATION: | 7564 | case EXIT_REASON_EPT_VIOLATION: |
7562 | /* | 7565 | /* |
7563 | * L0 always deals with the EPT violation. If nested EPT is | 7566 | * L0 always deals with the EPT violation. If nested EPT is |
@@ -7565,7 +7568,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | |||
7565 | * missing in the guest EPT table (EPT12), the EPT violation | 7568 | * missing in the guest EPT table (EPT12), the EPT violation |
7566 | * will be injected with nested_ept_inject_page_fault() | 7569 | * will be injected with nested_ept_inject_page_fault() |
7567 | */ | 7570 | */ |
7568 | return 0; | 7571 | return false; |
7569 | case EXIT_REASON_EPT_MISCONFIG: | 7572 | case EXIT_REASON_EPT_MISCONFIG: |
7570 | /* | 7573 | /* |
7571 | * L2 never uses directly L1's EPT, but rather L0's own EPT | 7574 | * L2 never uses directly L1's EPT, but rather L0's own EPT |
@@ -7573,11 +7576,11 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | |||
7573 | * (EPT on EPT). So any problems with the structure of the | 7576 | * (EPT on EPT). So any problems with the structure of the |
7574 | * table is L0's fault. | 7577 | * table is L0's fault. |
7575 | */ | 7578 | */ |
7576 | return 0; | 7579 | return false; |
7577 | case EXIT_REASON_WBINVD: | 7580 | case EXIT_REASON_WBINVD: |
7578 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); | 7581 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); |
7579 | case EXIT_REASON_XSETBV: | 7582 | case EXIT_REASON_XSETBV: |
7580 | return 1; | 7583 | return true; |
7581 | case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: | 7584 | case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: |
7582 | /* | 7585 | /* |
7583 | * This should never happen, since it is not possible to | 7586 | * This should never happen, since it is not possible to |
@@ -7587,7 +7590,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | |||
7587 | */ | 7590 | */ |
7588 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); | 7591 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); |
7589 | default: | 7592 | default: |
7590 | return 1; | 7593 | return true; |
7591 | } | 7594 | } |
7592 | } | 7595 | } |
7593 | 7596 | ||
@@ -8522,6 +8525,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) | |||
8522 | exec_control); | 8525 | exec_control); |
8523 | } | 8526 | } |
8524 | } | 8527 | } |
8528 | if (nested && !vmx->rdtscp_enabled) | ||
8529 | vmx->nested.nested_vmx_secondary_ctls_high &= | ||
8530 | ~SECONDARY_EXEC_RDTSCP; | ||
8525 | } | 8531 | } |
8526 | 8532 | ||
8527 | /* Exposing INVPCID only when PCID is exposed */ | 8533 | /* Exposing INVPCID only when PCID is exposed */ |
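The hunk above strips SECONDARY_EXEC_RDTSCP from the secondary controls advertised for nested VMX when the guest's CPUID does not include RDTSCP. A minimal sketch of that masking; the bit position and the helper name are illustrative assumptions, not KVM code:

```c
#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: "enable RDTSCP" is assumed here to be bit 3 of the
 * secondary processor-based VM-execution controls. */
#define SEC_EXEC_RDTSCP (1u << 3)

/* If the L1 guest's CPUID does not expose RDTSCP, remove the matching
 * secondary-execution control from the set advertised for nested VMX,
 * so L1 cannot hand its own guests a feature it does not have itself. */
uint32_t mask_nested_secondary_ctls(uint32_t ctls_high, bool guest_has_rdtscp)
{
        if (!guest_has_rdtscp)
                ctls_high &= ~SEC_EXEC_RDTSCP;
        return ctls_high;
}
```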
@@ -8622,10 +8628,11 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, | |||
8622 | struct vmcs12 *vmcs12) | 8628 | struct vmcs12 *vmcs12) |
8623 | { | 8629 | { |
8624 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 8630 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
8631 | int maxphyaddr = cpuid_maxphyaddr(vcpu); | ||
8625 | 8632 | ||
8626 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { | 8633 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { |
8627 | /* TODO: Also verify bits beyond physical address width are 0 */ | 8634 | if (!PAGE_ALIGNED(vmcs12->apic_access_addr) || |
8628 | if (!PAGE_ALIGNED(vmcs12->apic_access_addr)) | 8635 | vmcs12->apic_access_addr >> maxphyaddr) |
8629 | return false; | 8636 | return false; |
8630 | 8637 | ||
8631 | /* | 8638 | /* |
@@ -8641,8 +8648,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, | |||
8641 | } | 8648 | } |
8642 | 8649 | ||
8643 | if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { | 8650 | if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { |
8644 | /* TODO: Also verify bits beyond physical address width are 0 */ | 8651 | if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) || |
8645 | if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr)) | 8652 | vmcs12->virtual_apic_page_addr >> maxphyaddr) |
8646 | return false; | 8653 | return false; |
8647 | 8654 | ||
8648 | if (vmx->nested.virtual_apic_page) /* shouldn't happen */ | 8655 | if (vmx->nested.virtual_apic_page) /* shouldn't happen */ |
@@ -8665,7 +8672,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, | |||
8665 | } | 8672 | } |
8666 | 8673 | ||
8667 | if (nested_cpu_has_posted_intr(vmcs12)) { | 8674 | if (nested_cpu_has_posted_intr(vmcs12)) { |
8668 | if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64)) | 8675 | if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) || |
8676 | vmcs12->posted_intr_desc_addr >> maxphyaddr) | ||
8669 | return false; | 8677 | return false; |
8670 | 8678 | ||
8671 | if (vmx->nested.pi_desc_page) { /* shouldn't happen */ | 8679 | if (vmx->nested.pi_desc_page) { /* shouldn't happen */ |
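The checks added in this hunk reject guest-physical addresses (APIC-access page, virtual-APIC page, posted-interrupt descriptor) that carry bits at or above the vCPU's reported physical-address width, closing the old "verify bits beyond physical address width" TODOs. A standalone sketch of the test, not the kernel's code:

```c
#include <stdbool.h>
#include <stdint.h>

/* Illustrative sketch: a guest-physical address for the APIC-access page,
 * virtual-APIC page or posted-interrupt descriptor is rejected when it is
 * misaligned (4096 for the pages, 64 for the descriptor) or when any bit
 * at or above the CPU's reported physical-address width (maxphyaddr) is
 * set. */
bool gpa_is_usable(uint64_t gpa, unsigned int alignment, int maxphyaddr)
{
        if (gpa & (alignment - 1))      /* alignment must be a power of two */
                return false;
        if (gpa >> maxphyaddr)          /* bits >= maxphyaddr must be zero */
                return false;
        return true;
}
```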
@@ -8864,9 +8872,9 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, | |||
8864 | 8872 | ||
8865 | static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, | 8873 | static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, |
8866 | unsigned long count_field, | 8874 | unsigned long count_field, |
8867 | unsigned long addr_field, | 8875 | unsigned long addr_field) |
8868 | int maxphyaddr) | ||
8869 | { | 8876 | { |
8877 | int maxphyaddr; | ||
8870 | u64 count, addr; | 8878 | u64 count, addr; |
8871 | 8879 | ||
8872 | if (vmcs12_read_any(vcpu, count_field, &count) || | 8880 | if (vmcs12_read_any(vcpu, count_field, &count) || |
@@ -8876,6 +8884,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, | |||
8876 | } | 8884 | } |
8877 | if (count == 0) | 8885 | if (count == 0) |
8878 | return 0; | 8886 | return 0; |
8887 | maxphyaddr = cpuid_maxphyaddr(vcpu); | ||
8879 | if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || | 8888 | if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || |
8880 | (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { | 8889 | (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { |
8881 | pr_warn_ratelimited( | 8890 | pr_warn_ratelimited( |
@@ -8889,19 +8898,16 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, | |||
8889 | static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, | 8898 | static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, |
8890 | struct vmcs12 *vmcs12) | 8899 | struct vmcs12 *vmcs12) |
8891 | { | 8900 | { |
8892 | int maxphyaddr; | ||
8893 | |||
8894 | if (vmcs12->vm_exit_msr_load_count == 0 && | 8901 | if (vmcs12->vm_exit_msr_load_count == 0 && |
8895 | vmcs12->vm_exit_msr_store_count == 0 && | 8902 | vmcs12->vm_exit_msr_store_count == 0 && |
8896 | vmcs12->vm_entry_msr_load_count == 0) | 8903 | vmcs12->vm_entry_msr_load_count == 0) |
8897 | return 0; /* Fast path */ | 8904 | return 0; /* Fast path */ |
8898 | maxphyaddr = cpuid_maxphyaddr(vcpu); | ||
8899 | if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, | 8905 | if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, |
8900 | VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) || | 8906 | VM_EXIT_MSR_LOAD_ADDR) || |
8901 | nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, | 8907 | nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, |
8902 | VM_EXIT_MSR_STORE_ADDR, maxphyaddr) || | 8908 | VM_EXIT_MSR_STORE_ADDR) || |
8903 | nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, | 8909 | nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, |
8904 | VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr)) | 8910 | VM_ENTRY_MSR_LOAD_ADDR)) |
8905 | return -EINVAL; | 8911 | return -EINVAL; |
8906 | return 0; | 8912 | return 0; |
8907 | } | 8913 | } |
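nested_vmx_check_msr_switch() now looks up cpuid_maxphyaddr() itself, and only after the count == 0 fast path. Its range check on an MSR switch area can be sketched in isolation like this; the 16-byte entry mirrors the layout of struct vmx_msr_entry, everything else is illustrative:

```c
#include <stdbool.h>
#include <stdint.h>

/* 16 bytes per entry, mirroring struct vmx_msr_entry. */
struct toy_msr_entry {
        uint32_t index;
        uint32_t reserved;
        uint64_t value;
};

/* Sketch of the check on a VM-entry/VM-exit MSR switch area: a zero count
 * is always fine (fast path); otherwise the base must be 16-byte aligned
 * and both the first and the last byte of the area must sit below
 * 2^maxphyaddr. */
bool msr_switch_area_ok(uint64_t addr, uint64_t count, int maxphyaddr)
{
        if (count == 0)
                return true;
        if (addr & 0xf)
                return false;
        if (addr >> maxphyaddr)
                return false;
        if ((addr + count * sizeof(struct toy_msr_entry) - 1) >> maxphyaddr)
                return false;
        return true;
}
```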
@@ -9151,8 +9157,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
9151 | exec_control &= ~SECONDARY_EXEC_RDTSCP; | 9157 | exec_control &= ~SECONDARY_EXEC_RDTSCP; |
9152 | /* Take the following fields only from vmcs12 */ | 9158 | /* Take the following fields only from vmcs12 */ |
9153 | exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | 9159 | exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | |
9160 | SECONDARY_EXEC_RDTSCP | | ||
9154 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | 9161 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | |
9155 | SECONDARY_EXEC_APIC_REGISTER_VIRT); | 9162 | SECONDARY_EXEC_APIC_REGISTER_VIRT); |
9156 | if (nested_cpu_has(vmcs12, | 9163 | if (nested_cpu_has(vmcs12, |
9157 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) | 9164 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) |
9158 | exec_control |= vmcs12->secondary_vm_exec_control; | 9165 | exec_control |= vmcs12->secondary_vm_exec_control; |
@@ -9385,7 +9392,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | |||
9385 | } | 9392 | } |
9386 | 9393 | ||
9387 | if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { | 9394 | if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { |
9388 | /*TODO: Also verify bits beyond physical address width are 0*/ | ||
9389 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | 9395 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
9390 | return 1; | 9396 | return 1; |
9391 | } | 9397 | } |
@@ -9524,7 +9530,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | |||
9524 | vmcs12->launch_state = 1; | 9530 | vmcs12->launch_state = 1; |
9525 | 9531 | ||
9526 | if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) | 9532 | if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) |
9527 | return kvm_emulate_halt(vcpu); | 9533 | return kvm_vcpu_halt(vcpu); |
9528 | 9534 | ||
9529 | vmx->nested.nested_run_pending = 1; | 9535 | vmx->nested.nested_run_pending = 1; |
9530 | 9536 | ||
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 32bf19ef3115..e1a81267f3f6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -801,6 +801,17 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) | |||
801 | } | 801 | } |
802 | EXPORT_SYMBOL_GPL(kvm_get_cr8); | 802 | EXPORT_SYMBOL_GPL(kvm_get_cr8); |
803 | 803 | ||
804 | static void kvm_update_dr0123(struct kvm_vcpu *vcpu) | ||
805 | { | ||
806 | int i; | ||
807 | |||
808 | if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { | ||
809 | for (i = 0; i < KVM_NR_DB_REGS; i++) | ||
810 | vcpu->arch.eff_db[i] = vcpu->arch.db[i]; | ||
811 | vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; | ||
812 | } | ||
813 | } | ||
814 | |||
804 | static void kvm_update_dr6(struct kvm_vcpu *vcpu) | 815 | static void kvm_update_dr6(struct kvm_vcpu *vcpu) |
805 | { | 816 | { |
806 | if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) | 817 | if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) |
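kvm_update_dr0123() copies the architectural DR0-DR3 values into the effective set and flags them for reload whenever userspace is not driving the hardware breakpoints; the helper is called from set_debugregs and from vcpu reset later in this diff. A toy model of that pattern, with simplified names:

```c
#include <stdbool.h>
#include <stdint.h>

#define TOY_NR_DB_REGS 4

/* Toy model: when the guest's debug registers change and userspace is not
 * using hardware breakpoints itself, copy the architectural values into
 * the "effective" set and flag them so they are reloaded into DR0-DR3
 * before the next VM entry. */
struct toy_debug_state {
        uint64_t db[TOY_NR_DB_REGS];      /* architectural guest values */
        uint64_t eff_db[TOY_NR_DB_REGS];  /* values actually loaded into DR0-DR3 */
        bool     need_reload;             /* ~ KVM_DEBUGREG_RELOAD */
};

void toy_update_dr0123(struct toy_debug_state *s, bool userspace_uses_hw_bp)
{
        if (!userspace_uses_hw_bp) {
                for (int i = 0; i < TOY_NR_DB_REGS; i++)
                        s->eff_db[i] = s->db[i];
                s->need_reload = true;
        }
}
```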
@@ -1070,19 +1081,19 @@ static void update_pvclock_gtod(struct timekeeper *tk) | |||
1070 | struct pvclock_gtod_data *vdata = &pvclock_gtod_data; | 1081 | struct pvclock_gtod_data *vdata = &pvclock_gtod_data; |
1071 | u64 boot_ns; | 1082 | u64 boot_ns; |
1072 | 1083 | ||
1073 | boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot)); | 1084 | boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); |
1074 | 1085 | ||
1075 | write_seqcount_begin(&vdata->seq); | 1086 | write_seqcount_begin(&vdata->seq); |
1076 | 1087 | ||
1077 | /* copy pvclock gtod data */ | 1088 | /* copy pvclock gtod data */ |
1078 | vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode; | 1089 | vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; |
1079 | vdata->clock.cycle_last = tk->tkr.cycle_last; | 1090 | vdata->clock.cycle_last = tk->tkr_mono.cycle_last; |
1080 | vdata->clock.mask = tk->tkr.mask; | 1091 | vdata->clock.mask = tk->tkr_mono.mask; |
1081 | vdata->clock.mult = tk->tkr.mult; | 1092 | vdata->clock.mult = tk->tkr_mono.mult; |
1082 | vdata->clock.shift = tk->tkr.shift; | 1093 | vdata->clock.shift = tk->tkr_mono.shift; |
1083 | 1094 | ||
1084 | vdata->boot_ns = boot_ns; | 1095 | vdata->boot_ns = boot_ns; |
1085 | vdata->nsec_base = tk->tkr.xtime_nsec; | 1096 | vdata->nsec_base = tk->tkr_mono.xtime_nsec; |
1086 | 1097 | ||
1087 | write_seqcount_end(&vdata->seq); | 1098 | write_seqcount_end(&vdata->seq); |
1088 | } | 1099 | } |
@@ -3149,6 +3160,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, | |||
3149 | return -EINVAL; | 3160 | return -EINVAL; |
3150 | 3161 | ||
3151 | memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); | 3162 | memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); |
3163 | kvm_update_dr0123(vcpu); | ||
3152 | vcpu->arch.dr6 = dbgregs->dr6; | 3164 | vcpu->arch.dr6 = dbgregs->dr6; |
3153 | kvm_update_dr6(vcpu); | 3165 | kvm_update_dr6(vcpu); |
3154 | vcpu->arch.dr7 = dbgregs->dr7; | 3166 | vcpu->arch.dr7 = dbgregs->dr7; |
@@ -4114,8 +4126,8 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, | |||
4114 | do { | 4126 | do { |
4115 | n = min(len, 8); | 4127 | n = min(len, 8); |
4116 | if (!(vcpu->arch.apic && | 4128 | if (!(vcpu->arch.apic && |
4117 | !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) | 4129 | !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) |
4118 | && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) | 4130 | && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) |
4119 | break; | 4131 | break; |
4120 | handled += n; | 4132 | handled += n; |
4121 | addr += n; | 4133 | addr += n; |
@@ -4134,8 +4146,9 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) | |||
4134 | do { | 4146 | do { |
4135 | n = min(len, 8); | 4147 | n = min(len, 8); |
4136 | if (!(vcpu->arch.apic && | 4148 | if (!(vcpu->arch.apic && |
4137 | !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) | 4149 | !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, |
4138 | && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) | 4150 | addr, n, v)) |
4151 | && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) | ||
4139 | break; | 4152 | break; |
4140 | trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); | 4153 | trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); |
4141 | handled += n; | 4154 | handled += n; |
@@ -4475,7 +4488,8 @@ mmio: | |||
4475 | return X86EMUL_CONTINUE; | 4488 | return X86EMUL_CONTINUE; |
4476 | } | 4489 | } |
4477 | 4490 | ||
4478 | int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, | 4491 | static int emulator_read_write(struct x86_emulate_ctxt *ctxt, |
4492 | unsigned long addr, | ||
4479 | void *val, unsigned int bytes, | 4493 | void *val, unsigned int bytes, |
4480 | struct x86_exception *exception, | 4494 | struct x86_exception *exception, |
4481 | const struct read_write_emulator_ops *ops) | 4495 | const struct read_write_emulator_ops *ops) |
@@ -4538,7 +4552,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, | |||
4538 | exception, &read_emultor); | 4552 | exception, &read_emultor); |
4539 | } | 4553 | } |
4540 | 4554 | ||
4541 | int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, | 4555 | static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, |
4542 | unsigned long addr, | 4556 | unsigned long addr, |
4543 | const void *val, | 4557 | const void *val, |
4544 | unsigned int bytes, | 4558 | unsigned int bytes, |
@@ -4629,10 +4643,10 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) | |||
4629 | int r; | 4643 | int r; |
4630 | 4644 | ||
4631 | if (vcpu->arch.pio.in) | 4645 | if (vcpu->arch.pio.in) |
4632 | r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, | 4646 | r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, |
4633 | vcpu->arch.pio.size, pd); | 4647 | vcpu->arch.pio.size, pd); |
4634 | else | 4648 | else |
4635 | r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, | 4649 | r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, |
4636 | vcpu->arch.pio.port, vcpu->arch.pio.size, | 4650 | vcpu->arch.pio.port, vcpu->arch.pio.size, |
4637 | pd); | 4651 | pd); |
4638 | return r; | 4652 | return r; |
@@ -4705,7 +4719,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) | |||
4705 | kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); | 4719 | kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); |
4706 | } | 4720 | } |
4707 | 4721 | ||
4708 | int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) | 4722 | int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) |
4709 | { | 4723 | { |
4710 | if (!need_emulate_wbinvd(vcpu)) | 4724 | if (!need_emulate_wbinvd(vcpu)) |
4711 | return X86EMUL_CONTINUE; | 4725 | return X86EMUL_CONTINUE; |
@@ -4722,19 +4736,29 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) | |||
4722 | wbinvd(); | 4736 | wbinvd(); |
4723 | return X86EMUL_CONTINUE; | 4737 | return X86EMUL_CONTINUE; |
4724 | } | 4738 | } |
4739 | |||
4740 | int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) | ||
4741 | { | ||
4742 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
4743 | return kvm_emulate_wbinvd_noskip(vcpu); | ||
4744 | } | ||
4725 | EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); | 4745 | EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); |
4726 | 4746 | ||
4747 | |||
4748 | |||
4727 | static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) | 4749 | static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) |
4728 | { | 4750 | { |
4729 | kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); | 4751 | kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt)); |
4730 | } | 4752 | } |
4731 | 4753 | ||
4732 | int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) | 4754 | static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, |
4755 | unsigned long *dest) | ||
4733 | { | 4756 | { |
4734 | return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); | 4757 | return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); |
4735 | } | 4758 | } |
4736 | 4759 | ||
4737 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) | 4760 | static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, |
4761 | unsigned long value) | ||
4738 | { | 4762 | { |
4739 | 4763 | ||
4740 | return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); | 4764 | return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); |
@@ -5816,7 +5840,7 @@ void kvm_arch_exit(void) | |||
5816 | free_percpu(shared_msrs); | 5840 | free_percpu(shared_msrs); |
5817 | } | 5841 | } |
5818 | 5842 | ||
5819 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) | 5843 | int kvm_vcpu_halt(struct kvm_vcpu *vcpu) |
5820 | { | 5844 | { |
5821 | ++vcpu->stat.halt_exits; | 5845 | ++vcpu->stat.halt_exits; |
5822 | if (irqchip_in_kernel(vcpu->kvm)) { | 5846 | if (irqchip_in_kernel(vcpu->kvm)) { |
@@ -5827,6 +5851,13 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) | |||
5827 | return 0; | 5851 | return 0; |
5828 | } | 5852 | } |
5829 | } | 5853 | } |
5854 | EXPORT_SYMBOL_GPL(kvm_vcpu_halt); | ||
5855 | |||
5856 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) | ||
5857 | { | ||
5858 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
5859 | return kvm_vcpu_halt(vcpu); | ||
5860 | } | ||
5830 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); | 5861 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); |
5831 | 5862 | ||
5832 | int kvm_hv_hypercall(struct kvm_vcpu *vcpu) | 5863 | int kvm_hv_hypercall(struct kvm_vcpu *vcpu) |
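Several emulation helpers (WBINVD, HLT, and further down the hypercall path) are reshaped here so that skipping the emulated instruction happens in a thin wrapper, while callers that must not advance the guest RIP, such as a nested VMX entry landing in the HLT activity state, call the inner helper (kvm_vcpu_halt, kvm_emulate_wbinvd_noskip) directly. A self-contained toy of the wrapper pattern, with stand-in names:

```c
#include <stdio.h>

/* Toy model of the split, not KVM code: the wrapper advances the guest RIP
 * (the "skip the emulated instruction" step) and then calls the inner
 * helper, while paths that must not touch RIP call the inner helper on
 * its own. */
struct toy_vcpu {
        unsigned long rip;
        int halted;
};

static int toy_vcpu_halt(struct toy_vcpu *v)       /* ~ kvm_vcpu_halt() */
{
        v->halted = 1;
        return 1;                                   /* keep the run loop going */
}

static int toy_emulate_halt(struct toy_vcpu *v)    /* ~ kvm_emulate_halt() */
{
        v->rip += 1;                                /* ~ skip_emulated_instruction() */
        return toy_vcpu_halt(v);
}

int main(void)
{
        struct toy_vcpu v = { .rip = 0x1000, .halted = 0 };

        toy_emulate_halt(&v);
        printf("rip=%#lx halted=%d\n", v.rip, v.halted);
        return 0;
}
```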
@@ -5903,7 +5934,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) | |||
5903 | lapic_irq.dest_id = apicid; | 5934 | lapic_irq.dest_id = apicid; |
5904 | 5935 | ||
5905 | lapic_irq.delivery_mode = APIC_DM_REMRD; | 5936 | lapic_irq.delivery_mode = APIC_DM_REMRD; |
5906 | kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL); | 5937 | kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); |
5907 | } | 5938 | } |
5908 | 5939 | ||
5909 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | 5940 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) |
@@ -5911,6 +5942,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
5911 | unsigned long nr, a0, a1, a2, a3, ret; | 5942 | unsigned long nr, a0, a1, a2, a3, ret; |
5912 | int op_64_bit, r = 1; | 5943 | int op_64_bit, r = 1; |
5913 | 5944 | ||
5945 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
5946 | |||
5914 | if (kvm_hv_hypercall_enabled(vcpu->kvm)) | 5947 | if (kvm_hv_hypercall_enabled(vcpu->kvm)) |
5915 | return kvm_hv_hypercall(vcpu); | 5948 | return kvm_hv_hypercall(vcpu); |
5916 | 5949 | ||
@@ -6164,7 +6197,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, | |||
6164 | } | 6197 | } |
6165 | 6198 | ||
6166 | /* | 6199 | /* |
6167 | * Returns 1 to let __vcpu_run() continue the guest execution loop without | 6200 | * Returns 1 to let vcpu_run() continue the guest execution loop without |
6168 | * exiting to the userspace. Otherwise, the value will be returned to the | 6201 | * exiting to the userspace. Otherwise, the value will be returned to the |
6169 | * userspace. | 6202 | * userspace. |
6170 | */ | 6203 | */ |
@@ -6301,6 +6334,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
6301 | set_debugreg(vcpu->arch.eff_db[2], 2); | 6334 | set_debugreg(vcpu->arch.eff_db[2], 2); |
6302 | set_debugreg(vcpu->arch.eff_db[3], 3); | 6335 | set_debugreg(vcpu->arch.eff_db[3], 3); |
6303 | set_debugreg(vcpu->arch.dr6, 6); | 6336 | set_debugreg(vcpu->arch.dr6, 6); |
6337 | vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; | ||
6304 | } | 6338 | } |
6305 | 6339 | ||
6306 | trace_kvm_entry(vcpu->vcpu_id); | 6340 | trace_kvm_entry(vcpu->vcpu_id); |
@@ -6382,42 +6416,47 @@ out: | |||
6382 | return r; | 6416 | return r; |
6383 | } | 6417 | } |
6384 | 6418 | ||
6419 | static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) | ||
6420 | { | ||
6421 | if (!kvm_arch_vcpu_runnable(vcpu)) { | ||
6422 | srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); | ||
6423 | kvm_vcpu_block(vcpu); | ||
6424 | vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); | ||
6425 | if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) | ||
6426 | return 1; | ||
6427 | } | ||
6428 | |||
6429 | kvm_apic_accept_events(vcpu); | ||
6430 | switch(vcpu->arch.mp_state) { | ||
6431 | case KVM_MP_STATE_HALTED: | ||
6432 | vcpu->arch.pv.pv_unhalted = false; | ||
6433 | vcpu->arch.mp_state = | ||
6434 | KVM_MP_STATE_RUNNABLE; | ||
6435 | case KVM_MP_STATE_RUNNABLE: | ||
6436 | vcpu->arch.apf.halted = false; | ||
6437 | break; | ||
6438 | case KVM_MP_STATE_INIT_RECEIVED: | ||
6439 | break; | ||
6440 | default: | ||
6441 | return -EINTR; | ||
6442 | break; | ||
6443 | } | ||
6444 | return 1; | ||
6445 | } | ||
6385 | 6446 | ||
6386 | static int __vcpu_run(struct kvm_vcpu *vcpu) | 6447 | static int vcpu_run(struct kvm_vcpu *vcpu) |
6387 | { | 6448 | { |
6388 | int r; | 6449 | int r; |
6389 | struct kvm *kvm = vcpu->kvm; | 6450 | struct kvm *kvm = vcpu->kvm; |
6390 | 6451 | ||
6391 | vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); | 6452 | vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); |
6392 | 6453 | ||
6393 | r = 1; | 6454 | for (;;) { |
6394 | while (r > 0) { | ||
6395 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && | 6455 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && |
6396 | !vcpu->arch.apf.halted) | 6456 | !vcpu->arch.apf.halted) |
6397 | r = vcpu_enter_guest(vcpu); | 6457 | r = vcpu_enter_guest(vcpu); |
6398 | else { | 6458 | else |
6399 | srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); | 6459 | r = vcpu_block(kvm, vcpu); |
6400 | kvm_vcpu_block(vcpu); | ||
6401 | vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); | ||
6402 | if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { | ||
6403 | kvm_apic_accept_events(vcpu); | ||
6404 | switch(vcpu->arch.mp_state) { | ||
6405 | case KVM_MP_STATE_HALTED: | ||
6406 | vcpu->arch.pv.pv_unhalted = false; | ||
6407 | vcpu->arch.mp_state = | ||
6408 | KVM_MP_STATE_RUNNABLE; | ||
6409 | case KVM_MP_STATE_RUNNABLE: | ||
6410 | vcpu->arch.apf.halted = false; | ||
6411 | break; | ||
6412 | case KVM_MP_STATE_INIT_RECEIVED: | ||
6413 | break; | ||
6414 | default: | ||
6415 | r = -EINTR; | ||
6416 | break; | ||
6417 | } | ||
6418 | } | ||
6419 | } | ||
6420 | |||
6421 | if (r <= 0) | 6460 | if (r <= 0) |
6422 | break; | 6461 | break; |
6423 | 6462 | ||
@@ -6429,6 +6468,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
6429 | r = -EINTR; | 6468 | r = -EINTR; |
6430 | vcpu->run->exit_reason = KVM_EXIT_INTR; | 6469 | vcpu->run->exit_reason = KVM_EXIT_INTR; |
6431 | ++vcpu->stat.request_irq_exits; | 6470 | ++vcpu->stat.request_irq_exits; |
6471 | break; | ||
6432 | } | 6472 | } |
6433 | 6473 | ||
6434 | kvm_check_async_pf_completion(vcpu); | 6474 | kvm_check_async_pf_completion(vcpu); |
@@ -6437,6 +6477,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
6437 | r = -EINTR; | 6477 | r = -EINTR; |
6438 | vcpu->run->exit_reason = KVM_EXIT_INTR; | 6478 | vcpu->run->exit_reason = KVM_EXIT_INTR; |
6439 | ++vcpu->stat.signal_exits; | 6479 | ++vcpu->stat.signal_exits; |
6480 | break; | ||
6440 | } | 6481 | } |
6441 | if (need_resched()) { | 6482 | if (need_resched()) { |
6442 | srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); | 6483 | srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); |
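__vcpu_run() becomes vcpu_run() here, with the blocked-vCPU handling pulled out into vcpu_block(), which returns 1 to keep looping and a non-positive value to bail out to userspace; the loop itself becomes a plain for (;;) with explicit breaks. A rough standalone model of that control flow, with made-up states and return codes:

```c
#include <stdio.h>

/* Rough model only: toy_vcpu_block() returns 1 to keep looping and a
 * negative value to leave the loop, so toy_vcpu_run() is just for (;;)
 * plus breaks. */
enum toy_mp_state { TOY_HALTED, TOY_RUNNABLE, TOY_INIT_RECEIVED, TOY_OTHER };

static int toy_vcpu_block(enum toy_mp_state *state)
{
        switch (*state) {
        case TOY_HALTED:
                *state = TOY_RUNNABLE;  /* woken up: fall through ... */
        case TOY_RUNNABLE:              /* ... and resume as runnable */
        case TOY_INIT_RECEIVED:
                return 1;
        default:
                return -4;              /* ~ -EINTR: back to userspace */
        }
}

static int toy_vcpu_run(enum toy_mp_state *state, int runnable_iterations)
{
        int r;

        for (;;) {
                if (*state == TOY_RUNNABLE) {
                        /* ~ vcpu_enter_guest(): pretend the guest exits to
                         * userspace after a few iterations. */
                        r = (runnable_iterations-- > 0) ? 1 : 0;
                } else {
                        r = toy_vcpu_block(state);
                }
                if (r <= 0)
                        break;          /* error or exit to userspace */
        }
        return r;
}

int main(void)
{
        enum toy_mp_state s = TOY_HALTED;
        printf("toy_vcpu_run -> %d\n", toy_vcpu_run(&s, 3));
        return 0;
}
```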
@@ -6568,7 +6609,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
6568 | } else | 6609 | } else |
6569 | WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); | 6610 | WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); |
6570 | 6611 | ||
6571 | r = __vcpu_run(vcpu); | 6612 | r = vcpu_run(vcpu); |
6572 | 6613 | ||
6573 | out: | 6614 | out: |
6574 | post_kvm_run_save(vcpu); | 6615 | post_kvm_run_save(vcpu); |
@@ -7075,11 +7116,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) | |||
7075 | kvm_clear_exception_queue(vcpu); | 7116 | kvm_clear_exception_queue(vcpu); |
7076 | 7117 | ||
7077 | memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); | 7118 | memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); |
7119 | kvm_update_dr0123(vcpu); | ||
7078 | vcpu->arch.dr6 = DR6_INIT; | 7120 | vcpu->arch.dr6 = DR6_INIT; |
7079 | kvm_update_dr6(vcpu); | 7121 | kvm_update_dr6(vcpu); |
7080 | vcpu->arch.dr7 = DR7_FIXED_1; | 7122 | vcpu->arch.dr7 = DR7_FIXED_1; |
7081 | kvm_update_dr7(vcpu); | 7123 | kvm_update_dr7(vcpu); |
7082 | 7124 | ||
7125 | vcpu->arch.cr2 = 0; | ||
7126 | |||
7083 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 7127 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
7084 | vcpu->arch.apf.msr_val = 0; | 7128 | vcpu->arch.apf.msr_val = 0; |
7085 | vcpu->arch.st.msr_val = 0; | 7129 | vcpu->arch.st.msr_val = 0; |
@@ -7240,7 +7284,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
7240 | 7284 | ||
7241 | vcpu->arch.pv.pv_unhalted = false; | 7285 | vcpu->arch.pv.pv_unhalted = false; |
7242 | vcpu->arch.emulate_ctxt.ops = &emulate_ops; | 7286 | vcpu->arch.emulate_ctxt.ops = &emulate_ops; |
7243 | if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) | 7287 | if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) |
7244 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 7288 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
7245 | else | 7289 | else |
7246 | vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; | 7290 | vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; |
@@ -7288,6 +7332,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
7288 | vcpu->arch.guest_supported_xcr0 = 0; | 7332 | vcpu->arch.guest_supported_xcr0 = 0; |
7289 | vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; | 7333 | vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; |
7290 | 7334 | ||
7335 | vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); | ||
7336 | |||
7291 | kvm_async_pf_hash_reset(vcpu); | 7337 | kvm_async_pf_hash_reset(vcpu); |
7292 | kvm_pmu_init(vcpu); | 7338 | kvm_pmu_init(vcpu); |
7293 | 7339 | ||
@@ -7428,7 +7474,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, | |||
7428 | 7474 | ||
7429 | for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { | 7475 | for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { |
7430 | if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { | 7476 | if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { |
7431 | kvm_kvfree(free->arch.rmap[i]); | 7477 | kvfree(free->arch.rmap[i]); |
7432 | free->arch.rmap[i] = NULL; | 7478 | free->arch.rmap[i] = NULL; |
7433 | } | 7479 | } |
7434 | if (i == 0) | 7480 | if (i == 0) |
@@ -7436,7 +7482,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, | |||
7436 | 7482 | ||
7437 | if (!dont || free->arch.lpage_info[i - 1] != | 7483 | if (!dont || free->arch.lpage_info[i - 1] != |
7438 | dont->arch.lpage_info[i - 1]) { | 7484 | dont->arch.lpage_info[i - 1]) { |
7439 | kvm_kvfree(free->arch.lpage_info[i - 1]); | 7485 | kvfree(free->arch.lpage_info[i - 1]); |
7440 | free->arch.lpage_info[i - 1] = NULL; | 7486 | free->arch.lpage_info[i - 1] = NULL; |
7441 | } | 7487 | } |
7442 | } | 7488 | } |
@@ -7490,12 +7536,12 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, | |||
7490 | 7536 | ||
7491 | out_free: | 7537 | out_free: |
7492 | for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { | 7538 | for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { |
7493 | kvm_kvfree(slot->arch.rmap[i]); | 7539 | kvfree(slot->arch.rmap[i]); |
7494 | slot->arch.rmap[i] = NULL; | 7540 | slot->arch.rmap[i] = NULL; |
7495 | if (i == 0) | 7541 | if (i == 0) |
7496 | continue; | 7542 | continue; |
7497 | 7543 | ||
7498 | kvm_kvfree(slot->arch.lpage_info[i - 1]); | 7544 | kvfree(slot->arch.lpage_info[i - 1]); |
7499 | slot->arch.lpage_info[i - 1] = NULL; | 7545 | slot->arch.lpage_info[i - 1] = NULL; |
7500 | } | 7546 | } |
7501 | return -ENOMEM; | 7547 | return -ENOMEM; |
@@ -7618,6 +7664,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
7618 | new = id_to_memslot(kvm->memslots, mem->slot); | 7664 | new = id_to_memslot(kvm->memslots, mem->slot); |
7619 | 7665 | ||
7620 | /* | 7666 | /* |
7667 | * Dirty logging tracks sptes in 4k granularity, meaning that large | ||
7668 | * sptes have to be split. If live migration is successful, the guest | ||
7669 | * in the source machine will be destroyed and large sptes will be | ||
7670 | * created in the destination. However, if the guest continues to run | ||
7671 | * in the source machine (for example if live migration fails), small | ||
7672 | * sptes will remain around and cause bad performance. | ||
7673 | * | ||
7674 | * Scan sptes if dirty logging has been stopped, dropping those | ||
7675 | * which can be collapsed into a single large-page spte. Later | ||
7676 | * page faults will create the large-page sptes. | ||
7677 | */ | ||
7678 | if ((change != KVM_MR_DELETE) && | ||
7679 | (old->flags & KVM_MEM_LOG_DIRTY_PAGES) && | ||
7680 | !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) | ||
7681 | kvm_mmu_zap_collapsible_sptes(kvm, new); | ||
7682 | |||
7683 | /* | ||
7621 | * Set up write protection and/or dirty logging for the new slot. | 7684 | * Set up write protection and/or dirty logging for the new slot. |
7622 | * | 7685 | * |
7623 | * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have | 7686 | * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have |
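The new comment and call in kvm_arch_commit_memory_region() collapse small sptes back into large ones once dirty logging is switched off for a slot that is not being deleted. The trigger condition, as a small sketch with an assumed flag constant:

```c
#include <stdbool.h>

#define TOY_MEM_LOG_DIRTY_PAGES 0x1u   /* assumed stand-in for KVM_MEM_LOG_DIRTY_PAGES */

/* Sketch of the trigger added above: rebuild large mappings only when the
 * slot survives (not a delete) and dirty logging was on for the old slot
 * but is off for the new one. */
bool should_zap_collapsible_sptes(bool deleting,
                                  unsigned int old_flags,
                                  unsigned int new_flags)
{
        return !deleting &&
               (old_flags & TOY_MEM_LOG_DIRTY_PAGES) &&
               !(new_flags & TOY_MEM_LOG_DIRTY_PAGES);
}
```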
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ac4453d8520e..717908b16037 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -868,7 +868,8 @@ static void __init lguest_init_IRQ(void) | |||
868 | /* Some systems map "vectors" to interrupts weirdly. Not us! */ | 868 | /* Some systems map "vectors" to interrupts weirdly. Not us! */ |
869 | __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); | 869 | __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); |
870 | if (i != SYSCALL_VECTOR) | 870 | if (i != SYSCALL_VECTOR) |
871 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); | 871 | set_intr_gate(i, irq_entries_start + |
872 | 8 * (i - FIRST_EXTERNAL_VECTOR)); | ||
872 | } | 873 | } |
873 | 874 | ||
874 | /* | 875 | /* |
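The lguest interrupt setup now points each IDT entry at the generic irq_entries_start array instead of the removed interrupt[] table, with the entry stubs assumed to sit 8 bytes apart. The address arithmetic in isolation, as an illustrative sketch:

```c
#include <stdint.h>

/* Address arithmetic only, with the 8-byte stub stride as an assumption:
 * the IDT entry for external vector i points at the i-th stub inside the
 * irq_entries_start array. */
uintptr_t toy_stub_for_vector(uintptr_t irq_entries_start,
                              unsigned int vector,
                              unsigned int first_external_vector)
{
        return irq_entries_start + 8u * (vector - first_external_vector);
}
```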
@@ -1076,6 +1077,7 @@ static void lguest_load_sp0(struct tss_struct *tss, | |||
1076 | { | 1077 | { |
1077 | lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, | 1078 | lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, |
1078 | THREAD_SIZE / PAGE_SIZE); | 1079 | THREAD_SIZE / PAGE_SIZE); |
1080 | tss->x86_tss.sp0 = thread->sp0; | ||
1079 | } | 1081 | } |
1080 | 1082 | ||
1081 | /* Let's just say, I wouldn't do debugging under a Guest. */ | 1083 | /* Let's just say, I wouldn't do debugging under a Guest. */ |
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index f5cc9eb1d51b..082a85167a5b 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S | |||
@@ -13,16 +13,6 @@ | |||
13 | #include <asm/alternative-asm.h> | 13 | #include <asm/alternative-asm.h> |
14 | #include <asm/dwarf2.h> | 14 | #include <asm/dwarf2.h> |
15 | 15 | ||
16 | .macro SAVE reg | ||
17 | pushl_cfi %\reg | ||
18 | CFI_REL_OFFSET \reg, 0 | ||
19 | .endm | ||
20 | |||
21 | .macro RESTORE reg | ||
22 | popl_cfi %\reg | ||
23 | CFI_RESTORE \reg | ||
24 | .endm | ||
25 | |||
26 | .macro read64 reg | 16 | .macro read64 reg |
27 | movl %ebx, %eax | 17 | movl %ebx, %eax |
28 | movl %ecx, %edx | 18 | movl %ecx, %edx |
@@ -67,10 +57,10 @@ ENDPROC(atomic64_xchg_cx8) | |||
67 | .macro addsub_return func ins insc | 57 | .macro addsub_return func ins insc |
68 | ENTRY(atomic64_\func\()_return_cx8) | 58 | ENTRY(atomic64_\func\()_return_cx8) |
69 | CFI_STARTPROC | 59 | CFI_STARTPROC |
70 | SAVE ebp | 60 | pushl_cfi_reg ebp |
71 | SAVE ebx | 61 | pushl_cfi_reg ebx |
72 | SAVE esi | 62 | pushl_cfi_reg esi |
73 | SAVE edi | 63 | pushl_cfi_reg edi |
74 | 64 | ||
75 | movl %eax, %esi | 65 | movl %eax, %esi |
76 | movl %edx, %edi | 66 | movl %edx, %edi |
@@ -89,10 +79,10 @@ ENTRY(atomic64_\func\()_return_cx8) | |||
89 | 10: | 79 | 10: |
90 | movl %ebx, %eax | 80 | movl %ebx, %eax |
91 | movl %ecx, %edx | 81 | movl %ecx, %edx |
92 | RESTORE edi | 82 | popl_cfi_reg edi |
93 | RESTORE esi | 83 | popl_cfi_reg esi |
94 | RESTORE ebx | 84 | popl_cfi_reg ebx |
95 | RESTORE ebp | 85 | popl_cfi_reg ebp |
96 | ret | 86 | ret |
97 | CFI_ENDPROC | 87 | CFI_ENDPROC |
98 | ENDPROC(atomic64_\func\()_return_cx8) | 88 | ENDPROC(atomic64_\func\()_return_cx8) |
@@ -104,7 +94,7 @@ addsub_return sub sub sbb | |||
104 | .macro incdec_return func ins insc | 94 | .macro incdec_return func ins insc |
105 | ENTRY(atomic64_\func\()_return_cx8) | 95 | ENTRY(atomic64_\func\()_return_cx8) |
106 | CFI_STARTPROC | 96 | CFI_STARTPROC |
107 | SAVE ebx | 97 | pushl_cfi_reg ebx |
108 | 98 | ||
109 | read64 %esi | 99 | read64 %esi |
110 | 1: | 100 | 1: |
@@ -119,7 +109,7 @@ ENTRY(atomic64_\func\()_return_cx8) | |||
119 | 10: | 109 | 10: |
120 | movl %ebx, %eax | 110 | movl %ebx, %eax |
121 | movl %ecx, %edx | 111 | movl %ecx, %edx |
122 | RESTORE ebx | 112 | popl_cfi_reg ebx |
123 | ret | 113 | ret |
124 | CFI_ENDPROC | 114 | CFI_ENDPROC |
125 | ENDPROC(atomic64_\func\()_return_cx8) | 115 | ENDPROC(atomic64_\func\()_return_cx8) |
@@ -130,7 +120,7 @@ incdec_return dec sub sbb | |||
130 | 120 | ||
131 | ENTRY(atomic64_dec_if_positive_cx8) | 121 | ENTRY(atomic64_dec_if_positive_cx8) |
132 | CFI_STARTPROC | 122 | CFI_STARTPROC |
133 | SAVE ebx | 123 | pushl_cfi_reg ebx |
134 | 124 | ||
135 | read64 %esi | 125 | read64 %esi |
136 | 1: | 126 | 1: |
@@ -146,18 +136,18 @@ ENTRY(atomic64_dec_if_positive_cx8) | |||
146 | 2: | 136 | 2: |
147 | movl %ebx, %eax | 137 | movl %ebx, %eax |
148 | movl %ecx, %edx | 138 | movl %ecx, %edx |
149 | RESTORE ebx | 139 | popl_cfi_reg ebx |
150 | ret | 140 | ret |
151 | CFI_ENDPROC | 141 | CFI_ENDPROC |
152 | ENDPROC(atomic64_dec_if_positive_cx8) | 142 | ENDPROC(atomic64_dec_if_positive_cx8) |
153 | 143 | ||
154 | ENTRY(atomic64_add_unless_cx8) | 144 | ENTRY(atomic64_add_unless_cx8) |
155 | CFI_STARTPROC | 145 | CFI_STARTPROC |
156 | SAVE ebp | 146 | pushl_cfi_reg ebp |
157 | SAVE ebx | 147 | pushl_cfi_reg ebx |
158 | /* these just push these two parameters on the stack */ | 148 | /* these just push these two parameters on the stack */ |
159 | SAVE edi | 149 | pushl_cfi_reg edi |
160 | SAVE ecx | 150 | pushl_cfi_reg ecx |
161 | 151 | ||
162 | movl %eax, %ebp | 152 | movl %eax, %ebp |
163 | movl %edx, %edi | 153 | movl %edx, %edi |
@@ -179,8 +169,8 @@ ENTRY(atomic64_add_unless_cx8) | |||
179 | 3: | 169 | 3: |
180 | addl $8, %esp | 170 | addl $8, %esp |
181 | CFI_ADJUST_CFA_OFFSET -8 | 171 | CFI_ADJUST_CFA_OFFSET -8 |
182 | RESTORE ebx | 172 | popl_cfi_reg ebx |
183 | RESTORE ebp | 173 | popl_cfi_reg ebp |
184 | ret | 174 | ret |
185 | 4: | 175 | 4: |
186 | cmpl %edx, 4(%esp) | 176 | cmpl %edx, 4(%esp) |
@@ -192,7 +182,7 @@ ENDPROC(atomic64_add_unless_cx8) | |||
192 | 182 | ||
193 | ENTRY(atomic64_inc_not_zero_cx8) | 183 | ENTRY(atomic64_inc_not_zero_cx8) |
194 | CFI_STARTPROC | 184 | CFI_STARTPROC |
195 | SAVE ebx | 185 | pushl_cfi_reg ebx |
196 | 186 | ||
197 | read64 %esi | 187 | read64 %esi |
198 | 1: | 188 | 1: |
@@ -209,7 +199,7 @@ ENTRY(atomic64_inc_not_zero_cx8) | |||
209 | 199 | ||
210 | movl $1, %eax | 200 | movl $1, %eax |
211 | 3: | 201 | 3: |
212 | RESTORE ebx | 202 | popl_cfi_reg ebx |
213 | ret | 203 | ret |
214 | CFI_ENDPROC | 204 | CFI_ENDPROC |
215 | ENDPROC(atomic64_inc_not_zero_cx8) | 205 | ENDPROC(atomic64_inc_not_zero_cx8) |
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index e78b8eee6615..9bc944a91274 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S | |||
@@ -51,10 +51,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) | |||
51 | */ | 51 | */ |
52 | ENTRY(csum_partial) | 52 | ENTRY(csum_partial) |
53 | CFI_STARTPROC | 53 | CFI_STARTPROC |
54 | pushl_cfi %esi | 54 | pushl_cfi_reg esi |
55 | CFI_REL_OFFSET esi, 0 | 55 | pushl_cfi_reg ebx |
56 | pushl_cfi %ebx | ||
57 | CFI_REL_OFFSET ebx, 0 | ||
58 | movl 20(%esp),%eax # Function arg: unsigned int sum | 56 | movl 20(%esp),%eax # Function arg: unsigned int sum |
59 | movl 16(%esp),%ecx # Function arg: int len | 57 | movl 16(%esp),%ecx # Function arg: int len |
60 | movl 12(%esp),%esi # Function arg: unsigned char *buff | 58 | movl 12(%esp),%esi # Function arg: unsigned char *buff |
@@ -127,14 +125,12 @@ ENTRY(csum_partial) | |||
127 | 6: addl %ecx,%eax | 125 | 6: addl %ecx,%eax |
128 | adcl $0, %eax | 126 | adcl $0, %eax |
129 | 7: | 127 | 7: |
130 | testl $1, 12(%esp) | 128 | testb $1, 12(%esp) |
131 | jz 8f | 129 | jz 8f |
132 | roll $8, %eax | 130 | roll $8, %eax |
133 | 8: | 131 | 8: |
134 | popl_cfi %ebx | 132 | popl_cfi_reg ebx |
135 | CFI_RESTORE ebx | 133 | popl_cfi_reg esi |
136 | popl_cfi %esi | ||
137 | CFI_RESTORE esi | ||
138 | ret | 134 | ret |
139 | CFI_ENDPROC | 135 | CFI_ENDPROC |
140 | ENDPROC(csum_partial) | 136 | ENDPROC(csum_partial) |
@@ -145,10 +141,8 @@ ENDPROC(csum_partial) | |||
145 | 141 | ||
146 | ENTRY(csum_partial) | 142 | ENTRY(csum_partial) |
147 | CFI_STARTPROC | 143 | CFI_STARTPROC |
148 | pushl_cfi %esi | 144 | pushl_cfi_reg esi |
149 | CFI_REL_OFFSET esi, 0 | 145 | pushl_cfi_reg ebx |
150 | pushl_cfi %ebx | ||
151 | CFI_REL_OFFSET ebx, 0 | ||
152 | movl 20(%esp),%eax # Function arg: unsigned int sum | 146 | movl 20(%esp),%eax # Function arg: unsigned int sum |
153 | movl 16(%esp),%ecx # Function arg: int len | 147 | movl 16(%esp),%ecx # Function arg: int len |
154 | movl 12(%esp),%esi # Function arg: const unsigned char *buf | 148 | movl 12(%esp),%esi # Function arg: const unsigned char *buf |
@@ -251,14 +245,12 @@ ENTRY(csum_partial) | |||
251 | addl %ebx,%eax | 245 | addl %ebx,%eax |
252 | adcl $0,%eax | 246 | adcl $0,%eax |
253 | 80: | 247 | 80: |
254 | testl $1, 12(%esp) | 248 | testb $1, 12(%esp) |
255 | jz 90f | 249 | jz 90f |
256 | roll $8, %eax | 250 | roll $8, %eax |
257 | 90: | 251 | 90: |
258 | popl_cfi %ebx | 252 | popl_cfi_reg ebx |
259 | CFI_RESTORE ebx | 253 | popl_cfi_reg esi |
260 | popl_cfi %esi | ||
261 | CFI_RESTORE esi | ||
262 | ret | 254 | ret |
263 | CFI_ENDPROC | 255 | CFI_ENDPROC |
264 | ENDPROC(csum_partial) | 256 | ENDPROC(csum_partial) |
@@ -298,12 +290,9 @@ ENTRY(csum_partial_copy_generic) | |||
298 | CFI_STARTPROC | 290 | CFI_STARTPROC |
299 | subl $4,%esp | 291 | subl $4,%esp |
300 | CFI_ADJUST_CFA_OFFSET 4 | 292 | CFI_ADJUST_CFA_OFFSET 4 |
301 | pushl_cfi %edi | 293 | pushl_cfi_reg edi |
302 | CFI_REL_OFFSET edi, 0 | 294 | pushl_cfi_reg esi |
303 | pushl_cfi %esi | 295 | pushl_cfi_reg ebx |
304 | CFI_REL_OFFSET esi, 0 | ||
305 | pushl_cfi %ebx | ||
306 | CFI_REL_OFFSET ebx, 0 | ||
307 | movl ARGBASE+16(%esp),%eax # sum | 296 | movl ARGBASE+16(%esp),%eax # sum |
308 | movl ARGBASE+12(%esp),%ecx # len | 297 | movl ARGBASE+12(%esp),%ecx # len |
309 | movl ARGBASE+4(%esp),%esi # src | 298 | movl ARGBASE+4(%esp),%esi # src |
@@ -412,12 +401,9 @@ DST( movb %cl, (%edi) ) | |||
412 | 401 | ||
413 | .previous | 402 | .previous |
414 | 403 | ||
415 | popl_cfi %ebx | 404 | popl_cfi_reg ebx |
416 | CFI_RESTORE ebx | 405 | popl_cfi_reg esi |
417 | popl_cfi %esi | 406 | popl_cfi_reg edi |
418 | CFI_RESTORE esi | ||
419 | popl_cfi %edi | ||
420 | CFI_RESTORE edi | ||
421 | popl_cfi %ecx # equivalent to addl $4,%esp | 407 | popl_cfi %ecx # equivalent to addl $4,%esp |
422 | ret | 408 | ret |
423 | CFI_ENDPROC | 409 | CFI_ENDPROC |
@@ -441,12 +427,9 @@ ENDPROC(csum_partial_copy_generic) | |||
441 | 427 | ||
442 | ENTRY(csum_partial_copy_generic) | 428 | ENTRY(csum_partial_copy_generic) |
443 | CFI_STARTPROC | 429 | CFI_STARTPROC |
444 | pushl_cfi %ebx | 430 | pushl_cfi_reg ebx |
445 | CFI_REL_OFFSET ebx, 0 | 431 | pushl_cfi_reg edi |
446 | pushl_cfi %edi | 432 | pushl_cfi_reg esi |
447 | CFI_REL_OFFSET edi, 0 | ||
448 | pushl_cfi %esi | ||
449 | CFI_REL_OFFSET esi, 0 | ||
450 | movl ARGBASE+4(%esp),%esi #src | 433 | movl ARGBASE+4(%esp),%esi #src |
451 | movl ARGBASE+8(%esp),%edi #dst | 434 | movl ARGBASE+8(%esp),%edi #dst |
452 | movl ARGBASE+12(%esp),%ecx #len | 435 | movl ARGBASE+12(%esp),%ecx #len |
@@ -506,12 +489,9 @@ DST( movb %dl, (%edi) ) | |||
506 | jmp 7b | 489 | jmp 7b |
507 | .previous | 490 | .previous |
508 | 491 | ||
509 | popl_cfi %esi | 492 | popl_cfi_reg esi |
510 | CFI_RESTORE esi | 493 | popl_cfi_reg edi |
511 | popl_cfi %edi | 494 | popl_cfi_reg ebx |
512 | CFI_RESTORE edi | ||
513 | popl_cfi %ebx | ||
514 | CFI_RESTORE ebx | ||
515 | ret | 495 | ret |
516 | CFI_ENDPROC | 496 | CFI_ENDPROC |
517 | ENDPROC(csum_partial_copy_generic) | 497 | ENDPROC(csum_partial_copy_generic) |
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index f2145cfa12a6..e67e579c93bd 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S | |||
@@ -1,31 +1,35 @@ | |||
1 | #include <linux/linkage.h> | 1 | #include <linux/linkage.h> |
2 | #include <asm/dwarf2.h> | 2 | #include <asm/dwarf2.h> |
3 | #include <asm/cpufeature.h> | ||
3 | #include <asm/alternative-asm.h> | 4 | #include <asm/alternative-asm.h> |
4 | 5 | ||
5 | /* | 6 | /* |
6 | * Zero a page. | 7 | * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is |
7 | * rdi page | 8 | * recommended to use this when possible and we do use them by default. |
8 | */ | 9 | * If enhanced REP MOVSB/STOSB is not available, try to use fast string. |
9 | ENTRY(clear_page_c) | 10 | * Otherwise, use original. |
11 | */ | ||
12 | |||
13 | /* | ||
14 | * Zero a page. | ||
15 | * %rdi - page | ||
16 | */ | ||
17 | ENTRY(clear_page) | ||
10 | CFI_STARTPROC | 18 | CFI_STARTPROC |
19 | |||
20 | ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ | ||
21 | "jmp clear_page_c_e", X86_FEATURE_ERMS | ||
22 | |||
11 | movl $4096/8,%ecx | 23 | movl $4096/8,%ecx |
12 | xorl %eax,%eax | 24 | xorl %eax,%eax |
13 | rep stosq | 25 | rep stosq |
14 | ret | 26 | ret |
15 | CFI_ENDPROC | 27 | CFI_ENDPROC |
16 | ENDPROC(clear_page_c) | 28 | ENDPROC(clear_page) |
17 | 29 | ||
18 | ENTRY(clear_page_c_e) | 30 | ENTRY(clear_page_orig) |
19 | CFI_STARTPROC | 31 | CFI_STARTPROC |
20 | movl $4096,%ecx | ||
21 | xorl %eax,%eax | ||
22 | rep stosb | ||
23 | ret | ||
24 | CFI_ENDPROC | ||
25 | ENDPROC(clear_page_c_e) | ||
26 | 32 | ||
27 | ENTRY(clear_page) | ||
28 | CFI_STARTPROC | ||
29 | xorl %eax,%eax | 33 | xorl %eax,%eax |
30 | movl $4096/64,%ecx | 34 | movl $4096/64,%ecx |
31 | .p2align 4 | 35 | .p2align 4 |
@@ -45,29 +49,13 @@ ENTRY(clear_page) | |||
45 | nop | 49 | nop |
46 | ret | 50 | ret |
47 | CFI_ENDPROC | 51 | CFI_ENDPROC |
48 | .Lclear_page_end: | 52 | ENDPROC(clear_page_orig) |
49 | ENDPROC(clear_page) | ||
50 | |||
51 | /* | ||
52 | * Some CPUs support enhanced REP MOVSB/STOSB instructions. | ||
53 | * It is recommended to use this when possible. | ||
54 | * If enhanced REP MOVSB/STOSB is not available, try to use fast string. | ||
55 | * Otherwise, use original function. | ||
56 | * | ||
57 | */ | ||
58 | 53 | ||
59 | #include <asm/cpufeature.h> | 54 | ENTRY(clear_page_c_e) |
60 | 55 | CFI_STARTPROC | |
61 | .section .altinstr_replacement,"ax" | 56 | movl $4096,%ecx |
62 | 1: .byte 0xeb /* jmp <disp8> */ | 57 | xorl %eax,%eax |
63 | .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ | 58 | rep stosb |
64 | 2: .byte 0xeb /* jmp <disp8> */ | 59 | ret |
65 | .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ | 60 | CFI_ENDPROC |
66 | 3: | 61 | ENDPROC(clear_page_c_e) |
67 | .previous | ||
68 | .section .altinstructions,"a" | ||
69 | altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ | ||
70 | .Lclear_page_end-clear_page, 2b-1b | ||
71 | altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ | ||
72 | .Lclear_page_end-clear_page,3b-2b | ||
73 | .previous | ||
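clear_page() now opens with an ALTERNATIVE_2 that is patched at boot: the default jump to clear_page_orig is NOPped out on REP_GOOD CPUs and becomes a jump to clear_page_c_e when ERMS is present, replacing the old hand-rolled .altinstructions entries. As a rough userspace analogue only (the kernel rewrites the instruction bytes in place rather than dispatching through a function pointer, and the feature probes below are assumed stubs):

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define TOY_PAGE_SIZE 4096

/* All three variants just zero a page here; in the kernel they differ in
 * the instructions used (unrolled stores, REP STOSQ, REP STOSB). */
static void clear_orig(void *p) { memset(p, 0, TOY_PAGE_SIZE); }
static void clear_rep(void *p)  { memset(p, 0, TOY_PAGE_SIZE); }
static void clear_erms(void *p) { memset(p, 0, TOY_PAGE_SIZE); }

/* Assumed stand-ins for the X86_FEATURE_REP_GOOD / X86_FEATURE_ERMS tests. */
static bool cpu_has_rep_good(void) { return true; }
static bool cpu_has_erms(void)     { return false; }

static void (*pick_clear_page(void))(void *)
{
        if (cpu_has_erms())
                return clear_erms;      /* REP STOSB */
        if (cpu_has_rep_good())
                return clear_rep;       /* REP STOSQ */
        return clear_orig;              /* unrolled loop */
}

int main(void)
{
        static char page[TOY_PAGE_SIZE];

        pick_clear_page()(page);
        printf("first byte: %d\n", page[0]);
        return 0;
}
```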
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 176cca67212b..8239dbcbf984 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S | |||
@@ -2,23 +2,26 @@ | |||
2 | 2 | ||
3 | #include <linux/linkage.h> | 3 | #include <linux/linkage.h> |
4 | #include <asm/dwarf2.h> | 4 | #include <asm/dwarf2.h> |
5 | #include <asm/cpufeature.h> | ||
5 | #include <asm/alternative-asm.h> | 6 | #include <asm/alternative-asm.h> |
6 | 7 | ||
8 | /* | ||
9 | * Some CPUs run faster using the string copy instructions (sane microcode). | ||
10 | * It is also a lot simpler. Use this when possible. But, don't use streaming | ||
11 | * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the | ||
12 | * prefetch distance based on SMP/UP. | ||
13 | */ | ||
7 | ALIGN | 14 | ALIGN |
8 | copy_page_rep: | 15 | ENTRY(copy_page) |
9 | CFI_STARTPROC | 16 | CFI_STARTPROC |
17 | ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD | ||
10 | movl $4096/8, %ecx | 18 | movl $4096/8, %ecx |
11 | rep movsq | 19 | rep movsq |
12 | ret | 20 | ret |
13 | CFI_ENDPROC | 21 | CFI_ENDPROC |
14 | ENDPROC(copy_page_rep) | 22 | ENDPROC(copy_page) |
15 | |||
16 | /* | ||
17 | * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. | ||
18 | * Could vary the prefetch distance based on SMP/UP. | ||
19 | */ | ||
20 | 23 | ||
21 | ENTRY(copy_page) | 24 | ENTRY(copy_page_regs) |
22 | CFI_STARTPROC | 25 | CFI_STARTPROC |
23 | subq $2*8, %rsp | 26 | subq $2*8, %rsp |
24 | CFI_ADJUST_CFA_OFFSET 2*8 | 27 | CFI_ADJUST_CFA_OFFSET 2*8 |
@@ -90,21 +93,5 @@ ENTRY(copy_page) | |||
90 | addq $2*8, %rsp | 93 | addq $2*8, %rsp |
91 | CFI_ADJUST_CFA_OFFSET -2*8 | 94 | CFI_ADJUST_CFA_OFFSET -2*8 |
92 | ret | 95 | ret |
93 | .Lcopy_page_end: | ||
94 | CFI_ENDPROC | 96 | CFI_ENDPROC |
95 | ENDPROC(copy_page) | 97 | ENDPROC(copy_page_regs) |
96 | |||
97 | /* Some CPUs run faster using the string copy instructions. | ||
98 | It is also a lot simpler. Use this when possible */ | ||
99 | |||
100 | #include <asm/cpufeature.h> | ||
101 | |||
102 | .section .altinstr_replacement,"ax" | ||
103 | 1: .byte 0xeb /* jmp <disp8> */ | ||
104 | .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */ | ||
105 | 2: | ||
106 | .previous | ||
107 | .section .altinstructions,"a" | ||
108 | altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ | ||
109 | .Lcopy_page_end-copy_page, 2b-1b | ||
110 | .previous | ||
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index dee945d55594..fa997dfaef24 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S | |||
@@ -8,9 +8,6 @@ | |||
8 | 8 | ||
9 | #include <linux/linkage.h> | 9 | #include <linux/linkage.h> |
10 | #include <asm/dwarf2.h> | 10 | #include <asm/dwarf2.h> |
11 | |||
12 | #define FIX_ALIGNMENT 1 | ||
13 | |||
14 | #include <asm/current.h> | 11 | #include <asm/current.h> |
15 | #include <asm/asm-offsets.h> | 12 | #include <asm/asm-offsets.h> |
16 | #include <asm/thread_info.h> | 13 | #include <asm/thread_info.h> |
@@ -19,33 +16,7 @@ | |||
19 | #include <asm/asm.h> | 16 | #include <asm/asm.h> |
20 | #include <asm/smap.h> | 17 | #include <asm/smap.h> |
21 | 18 | ||
22 | /* | ||
23 | * By placing feature2 after feature1 in altinstructions section, we logically | ||
24 | * implement: | ||
25 | * If CPU has feature2, jmp to alt2 is used | ||
26 | * else if CPU has feature1, jmp to alt1 is used | ||
27 | * else jmp to orig is used. | ||
28 | */ | ||
29 | .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 | ||
30 | 0: | ||
31 | .byte 0xe9 /* 32bit jump */ | ||
32 | .long \orig-1f /* by default jump to orig */ | ||
33 | 1: | ||
34 | .section .altinstr_replacement,"ax" | ||
35 | 2: .byte 0xe9 /* near jump with 32bit immediate */ | ||
36 | .long \alt1-1b /* offset */ /* or alternatively to alt1 */ | ||
37 | 3: .byte 0xe9 /* near jump with 32bit immediate */ | ||
38 | .long \alt2-1b /* offset */ /* or alternatively to alt2 */ | ||
39 | .previous | ||
40 | |||
41 | .section .altinstructions,"a" | ||
42 | altinstruction_entry 0b,2b,\feature1,5,5 | ||
43 | altinstruction_entry 0b,3b,\feature2,5,5 | ||
44 | .previous | ||
45 | .endm | ||
46 | |||
47 | .macro ALIGN_DESTINATION | 19 | .macro ALIGN_DESTINATION |
48 | #ifdef FIX_ALIGNMENT | ||
49 | /* check for bad alignment of destination */ | 20 | /* check for bad alignment of destination */ |
50 | movl %edi,%ecx | 21 | movl %edi,%ecx |
51 | andl $7,%ecx | 22 | andl $7,%ecx |
@@ -67,7 +38,6 @@ | |||
67 | 38 | ||
68 | _ASM_EXTABLE(100b,103b) | 39 | _ASM_EXTABLE(100b,103b) |
69 | _ASM_EXTABLE(101b,103b) | 40 | _ASM_EXTABLE(101b,103b) |
70 | #endif | ||
71 | .endm | 41 | .endm |
72 | 42 | ||
73 | /* Standard copy_to_user with segment limit checking */ | 43 | /* Standard copy_to_user with segment limit checking */ |
@@ -79,9 +49,11 @@ ENTRY(_copy_to_user) | |||
79 | jc bad_to_user | 49 | jc bad_to_user |
80 | cmpq TI_addr_limit(%rax),%rcx | 50 | cmpq TI_addr_limit(%rax),%rcx |
81 | ja bad_to_user | 51 | ja bad_to_user |
82 | ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ | 52 | ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ |
83 | copy_user_generic_unrolled,copy_user_generic_string, \ | 53 | "jmp copy_user_generic_string", \ |
84 | copy_user_enhanced_fast_string | 54 | X86_FEATURE_REP_GOOD, \ |
55 | "jmp copy_user_enhanced_fast_string", \ | ||
56 | X86_FEATURE_ERMS | ||
85 | CFI_ENDPROC | 57 | CFI_ENDPROC |
86 | ENDPROC(_copy_to_user) | 58 | ENDPROC(_copy_to_user) |
87 | 59 | ||
@@ -94,9 +66,11 @@ ENTRY(_copy_from_user) | |||
94 | jc bad_from_user | 66 | jc bad_from_user |
95 | cmpq TI_addr_limit(%rax),%rcx | 67 | cmpq TI_addr_limit(%rax),%rcx |
96 | ja bad_from_user | 68 | ja bad_from_user |
97 | ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ | 69 | ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ |
98 | copy_user_generic_unrolled,copy_user_generic_string, \ | 70 | "jmp copy_user_generic_string", \ |
99 | copy_user_enhanced_fast_string | 71 | X86_FEATURE_REP_GOOD, \ |
72 | "jmp copy_user_enhanced_fast_string", \ | ||
73 | X86_FEATURE_ERMS | ||
100 | CFI_ENDPROC | 74 | CFI_ENDPROC |
101 | ENDPROC(_copy_from_user) | 75 | ENDPROC(_copy_from_user) |
102 | 76 | ||
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 2419d5fefae3..9734182966f3 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S | |||
@@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic) | |||
196 | 196 | ||
197 | /* handle last odd byte */ | 197 | /* handle last odd byte */ |
198 | .Lhandle_1: | 198 | .Lhandle_1: |
199 | testl $1, %r10d | 199 | testb $1, %r10b |
200 | jz .Lende | 200 | jz .Lende |
201 | xorl %ebx, %ebx | 201 | xorl %ebx, %ebx |
202 | source | 202 | source |
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 1313ae6b478b..8f72b334aea0 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c | |||
@@ -52,6 +52,13 @@ | |||
52 | */ | 52 | */ |
53 | void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) | 53 | void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) |
54 | { | 54 | { |
55 | /* | ||
56 | * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid | ||
57 | * even if the input buffer is long enough to hold them. | ||
58 | */ | ||
59 | if (buf_len > MAX_INSN_SIZE) | ||
60 | buf_len = MAX_INSN_SIZE; | ||
61 | |||
55 | memset(insn, 0, sizeof(*insn)); | 62 | memset(insn, 0, sizeof(*insn)); |
56 | insn->kaddr = kaddr; | 63 | insn->kaddr = kaddr; |
57 | insn->end_kaddr = kaddr + buf_len; | 64 | insn->end_kaddr = kaddr + buf_len; |
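The clamp added to insn_init() keeps the decoder from reading past the architectural 15-byte instruction limit even when the caller hands in a larger buffer. A standalone sketch of the same idea, with toy types:

```c
#include <string.h>

#define TOY_MAX_INSN_SIZE 15    /* architectural limit for one x86 instruction */

struct toy_insn {
        const void *kaddr;
        const void *end_kaddr;
};

/* Sketch of the clamp: never let the decoder look more than
 * TOY_MAX_INSN_SIZE bytes past the start of the instruction, even when the
 * caller's buffer is larger, since a longer instruction would be invalid
 * anyway. */
void toy_insn_init(struct toy_insn *insn, const void *kaddr, int buf_len)
{
        if (buf_len > TOY_MAX_INSN_SIZE)
                buf_len = TOY_MAX_INSN_SIZE;

        memset(insn, 0, sizeof(*insn));
        insn->kaddr = kaddr;
        insn->end_kaddr = (const char *)kaddr + buf_len;
}
```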
@@ -164,6 +171,12 @@ found: | |||
164 | /* VEX.W overrides opnd_size */ | 171 | /* VEX.W overrides opnd_size */ |
165 | insn->opnd_bytes = 8; | 172 | insn->opnd_bytes = 8; |
166 | } else { | 173 | } else { |
174 | /* | ||
175 | * For VEX2, fake VEX3-like byte#2. | ||
176 | * Makes it easier to decode vex.W, vex.vvvv, | ||
177 | * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. | ||
178 | */ | ||
179 | insn->vex_prefix.bytes[2] = b2 & 0x7f; | ||
167 | insn->vex_prefix.nbytes = 2; | 180 | insn->vex_prefix.nbytes = 2; |
168 | insn->next_byte += 2; | 181 | insn->next_byte += 2; |
169 | } | 182 | } |
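The new comment explains why a 2-byte VEX prefix can be turned into a fake VEX3-style byte. A small illustration of that bit trick, as I understand the encoding (the layout claim is an assumption stated in the comment, not taken from the diff):

```c
#include <stdint.h>
#include <stdio.h>

/* Sketch of the 2-byte VEX trick: the payload byte of a VEX2 prefix carries
 * vex.R, vex.vvvv, vex.L and vex.pp in the same bit positions that byte 2
 * of a VEX3 prefix uses, except that VEX3 has vex.W in bit 7.  Masking with
 * 0x7f therefore yields a VEX3-style byte with vex.W == 0, so one decode
 * path can serve both encodings. */
uint8_t toy_fake_vex3_byte2(uint8_t vex2_payload)
{
        return vex2_payload & 0x7f;     /* clear bit 7 -> vex.W = 0 */
}

int main(void)
{
        printf("0x%02x -> 0x%02x\n", 0xfbu, (unsigned int)toy_fake_vex3_byte2(0xfb));
        return 0;
}
```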
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 89b53c9968e7..b046664f5a1c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S | |||
@@ -1,12 +1,20 @@ | |||
1 | /* Copyright 2002 Andi Kleen */ | 1 | /* Copyright 2002 Andi Kleen */ |
2 | 2 | ||
3 | #include <linux/linkage.h> | 3 | #include <linux/linkage.h> |
4 | |||
5 | #include <asm/cpufeature.h> | 4 | #include <asm/cpufeature.h> |
6 | #include <asm/dwarf2.h> | 5 | #include <asm/dwarf2.h> |
7 | #include <asm/alternative-asm.h> | 6 | #include <asm/alternative-asm.h> |
8 | 7 | ||
9 | /* | 8 | /* |
9 | * We build a jump to memcpy_orig by default which gets NOPped out on | ||
10 | * the majority of x86 CPUs which set REP_GOOD. In addition, on CPUs which | ||
11 | * have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are changed | ||
12 | * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. | ||
13 | */ | ||
14 | |||
15 | .weak memcpy | ||
16 | |||
17 | /* | ||
10 | * memcpy - Copy a memory block. | 18 | * memcpy - Copy a memory block. |
11 | * | 19 | * |
12 | * Input: | 20 | * Input: |
@@ -17,15 +25,11 @@ | |||
17 | * Output: | 25 | * Output: |
18 | * rax original destination | 26 | * rax original destination |
19 | */ | 27 | */ |
28 | ENTRY(__memcpy) | ||
29 | ENTRY(memcpy) | ||
30 | ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ | ||
31 | "jmp memcpy_erms", X86_FEATURE_ERMS | ||
20 | 32 | ||
21 | /* | ||
22 | * memcpy_c() - fast string ops (REP MOVSQ) based variant. | ||
23 | * | ||
24 | * This gets patched over the unrolled variant (below) via the | ||
25 | * alternative instructions framework: | ||
26 | */ | ||
27 | .section .altinstr_replacement, "ax", @progbits | ||
28 | .Lmemcpy_c: | ||
29 | movq %rdi, %rax | 33 | movq %rdi, %rax |
30 | movq %rdx, %rcx | 34 | movq %rdx, %rcx |
31 | shrq $3, %rcx | 35 | shrq $3, %rcx |
@@ -34,29 +38,21 @@ | |||
34 | movl %edx, %ecx | 38 | movl %edx, %ecx |
35 | rep movsb | 39 | rep movsb |
36 | ret | 40 | ret |
37 | .Lmemcpy_e: | 41 | ENDPROC(memcpy) |
38 | .previous | 42 | ENDPROC(__memcpy) |
39 | 43 | ||
40 | /* | 44 | /* |
41 | * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than | 45 | * memcpy_erms() - enhanced fast string memcpy. This is faster and |
42 | * memcpy_c. Use memcpy_c_e when possible. | 46 | * simpler than memcpy. Use memcpy_erms when possible. |
43 | * | ||
44 | * This gets patched over the unrolled variant (below) via the | ||
45 | * alternative instructions framework: | ||
46 | */ | 47 | */ |
47 | .section .altinstr_replacement, "ax", @progbits | 48 | ENTRY(memcpy_erms) |
48 | .Lmemcpy_c_e: | ||
49 | movq %rdi, %rax | 49 | movq %rdi, %rax |
50 | movq %rdx, %rcx | 50 | movq %rdx, %rcx |
51 | rep movsb | 51 | rep movsb |
52 | ret | 52 | ret |
53 | .Lmemcpy_e_e: | 53 | ENDPROC(memcpy_erms) |
54 | .previous | ||
55 | |||
56 | .weak memcpy | ||
57 | 54 | ||
58 | ENTRY(__memcpy) | 55 | ENTRY(memcpy_orig) |
59 | ENTRY(memcpy) | ||
60 | CFI_STARTPROC | 56 | CFI_STARTPROC |
61 | movq %rdi, %rax | 57 | movq %rdi, %rax |
62 | 58 | ||
@@ -183,26 +179,4 @@ ENTRY(memcpy) | |||
183 | .Lend: | 179 | .Lend: |
184 | retq | 180 | retq |
185 | CFI_ENDPROC | 181 | CFI_ENDPROC |
186 | ENDPROC(memcpy) | 182 | ENDPROC(memcpy_orig) |
187 | ENDPROC(__memcpy) | ||
188 | |||
189 | /* | ||
190 | * Some CPUs are adding enhanced REP MOVSB/STOSB feature | ||
191 | * If the feature is supported, memcpy_c_e() is the first choice. | ||
192 | * If enhanced rep movsb copy is not available, use fast string copy | ||
193 | * memcpy_c() when possible. This is faster and code is simpler than | ||
194 | * original memcpy(). | ||
195 | * Otherwise, original memcpy() is used. | ||
196 | * In .altinstructions section, ERMS feature is placed after REG_GOOD | ||
197 | * feature to implement the right patch order. | ||
198 | * | ||
199 | * Replace only beginning, memcpy is used to apply alternatives, | ||
200 | * so it is silly to overwrite itself with nops - reboot is the | ||
201 | * only outcome... | ||
202 | */ | ||
203 | .section .altinstructions, "a" | ||
204 | altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ | ||
205 | .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c | ||
206 | altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ | ||
207 | .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e | ||
208 | .previous | ||
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 9c4b530575da..0f8a0d0331b9 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S | |||
@@ -5,7 +5,6 @@ | |||
5 | * This assembly file is re-written from memmove_64.c file. | 5 | * This assembly file is re-written from memmove_64.c file. |
6 | * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> | 6 | * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> |
7 | */ | 7 | */ |
8 | #define _STRING_C | ||
9 | #include <linux/linkage.h> | 8 | #include <linux/linkage.h> |
10 | #include <asm/dwarf2.h> | 9 | #include <asm/dwarf2.h> |
11 | #include <asm/cpufeature.h> | 10 | #include <asm/cpufeature.h> |
@@ -44,6 +43,8 @@ ENTRY(__memmove) | |||
44 | jg 2f | 43 | jg 2f |
45 | 44 | ||
46 | .Lmemmove_begin_forward: | 45 | .Lmemmove_begin_forward: |
46 | ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS | ||
47 | |||
47 | /* | 48 | /* |
48 | * the movsq instruction has a high startup latency, | 49 | * the movsq instruction has a high startup latency, |
49 | * so we handle small sizes with general registers. | 50 | * so we handle small sizes with general registers. |
@@ -207,21 +208,5 @@ ENTRY(__memmove) | |||
207 | 13: | 208 | 13: |
208 | retq | 209 | retq |
209 | CFI_ENDPROC | 210 | CFI_ENDPROC |
210 | |||
211 | .section .altinstr_replacement,"ax" | ||
212 | .Lmemmove_begin_forward_efs: | ||
213 | /* Forward moving data. */ | ||
214 | movq %rdx, %rcx | ||
215 | rep movsb | ||
216 | retq | ||
217 | .Lmemmove_end_forward_efs: | ||
218 | .previous | ||
219 | |||
220 | .section .altinstructions,"a" | ||
221 | altinstruction_entry .Lmemmove_begin_forward, \ | ||
222 | .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ | ||
223 | .Lmemmove_end_forward-.Lmemmove_begin_forward, \ | ||
224 | .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs | ||
225 | .previous | ||
226 | ENDPROC(__memmove) | 211 | ENDPROC(__memmove) |
227 | ENDPROC(memmove) | 212 | ENDPROC(memmove) |
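Note that the ALTERNATIVE line patches the rep-movsb shortcut only into the forward-copy path; the backward path is kept because a low-to-high byte copy is only safe when the destination does not start inside the source. A small C sketch of that condition (illustrative; the kernel makes a comparable comparison just above the "jg 2f" shown in the hunk):

#include <stddef.h>

/* Forward (low-to-high) copying is safe when the regions do not overlap,
 * or when the destination lies at or below the source. */
int forward_copy_is_safe(const void *dst, const void *src, size_t len)
{
	const char *d = dst, *s = src;

	return d <= s || d >= s + len;
}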
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 6f44935c6a60..93118fb23976 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S | |||
@@ -5,19 +5,30 @@ | |||
5 | #include <asm/cpufeature.h> | 5 | #include <asm/cpufeature.h> |
6 | #include <asm/alternative-asm.h> | 6 | #include <asm/alternative-asm.h> |
7 | 7 | ||
8 | .weak memset | ||
9 | |||
8 | /* | 10 | /* |
9 | * ISO C memset - set a memory block to a byte value. This function uses fast | 11 | * ISO C memset - set a memory block to a byte value. This function uses fast |
10 | * string to get better performance than the original function. The code is | 12 | * string to get better performance than the original function. The code is |
11 | * simpler and shorter than the original function as well. | 13 | * simpler and shorter than the original function as well. |
12 | * | 14 | * |
13 | * rdi destination | 15 | * rdi destination |
14 | * rsi value (char) | 16 | * rsi value (char) |
15 | * rdx count (bytes) | 17 | * rdx count (bytes) |
16 | * | 18 | * |
17 | * rax original destination | 19 | * rax original destination |
18 | */ | 20 | */ |
19 | .section .altinstr_replacement, "ax", @progbits | 21 | ENTRY(memset) |
20 | .Lmemset_c: | 22 | ENTRY(__memset) |
23 | /* | ||
24 | * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature. It is | ||
25 | * recommended to use it when possible. If it is not available, use the | ||
26 | * fast string instructions. | ||
27 | * Otherwise, use the original memset function. | ||
28 | */ | ||
29 | ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ | ||
30 | "jmp memset_erms", X86_FEATURE_ERMS | ||
31 | |||
21 | movq %rdi,%r9 | 32 | movq %rdi,%r9 |
22 | movq %rdx,%rcx | 33 | movq %rdx,%rcx |
23 | andl $7,%edx | 34 | andl $7,%edx |
@@ -31,8 +42,8 @@ | |||
31 | rep stosb | 42 | rep stosb |
32 | movq %r9,%rax | 43 | movq %r9,%rax |
33 | ret | 44 | ret |
34 | .Lmemset_e: | 45 | ENDPROC(memset) |
35 | .previous | 46 | ENDPROC(__memset) |
36 | 47 | ||
37 | /* | 48 | /* |
38 | * ISO C memset - set a memory block to a byte value. This function uses | 49 | * ISO C memset - set a memory block to a byte value. This function uses |
@@ -45,21 +56,16 @@ | |||
45 | * | 56 | * |
46 | * rax original destination | 57 | * rax original destination |
47 | */ | 58 | */ |
48 | .section .altinstr_replacement, "ax", @progbits | 59 | ENTRY(memset_erms) |
49 | .Lmemset_c_e: | ||
50 | movq %rdi,%r9 | 60 | movq %rdi,%r9 |
51 | movb %sil,%al | 61 | movb %sil,%al |
52 | movq %rdx,%rcx | 62 | movq %rdx,%rcx |
53 | rep stosb | 63 | rep stosb |
54 | movq %r9,%rax | 64 | movq %r9,%rax |
55 | ret | 65 | ret |
56 | .Lmemset_e_e: | 66 | ENDPROC(memset_erms) |
57 | .previous | ||
58 | |||
59 | .weak memset | ||
60 | 67 | ||
61 | ENTRY(memset) | 68 | ENTRY(memset_orig) |
62 | ENTRY(__memset) | ||
63 | CFI_STARTPROC | 69 | CFI_STARTPROC |
64 | movq %rdi,%r10 | 70 | movq %rdi,%r10 |
65 | 71 | ||
@@ -134,23 +140,4 @@ ENTRY(__memset) | |||
134 | jmp .Lafter_bad_alignment | 140 | jmp .Lafter_bad_alignment |
135 | .Lfinal: | 141 | .Lfinal: |
136 | CFI_ENDPROC | 142 | CFI_ENDPROC |
137 | ENDPROC(memset) | 143 | ENDPROC(memset_orig) |
138 | ENDPROC(__memset) | ||
139 | |||
140 | /* Some CPUs support enhanced REP MOVSB/STOSB feature. | ||
141 | * It is recommended to use this when possible. | ||
142 | * | ||
143 | * If enhanced REP MOVSB/STOSB feature is not available, use fast string | ||
144 | * instructions. | ||
145 | * | ||
146 | * Otherwise, use original memset function. | ||
147 | * | ||
148 | * In .altinstructions section, ERMS feature is placed after REG_GOOD | ||
149 | * feature to implement the right patch order. | ||
150 | */ | ||
151 | .section .altinstructions,"a" | ||
152 | altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ | ||
153 | .Lfinal-__memset,.Lmemset_e-.Lmemset_c | ||
154 | altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ | ||
155 | .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e | ||
156 | .previous | ||
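The ALTERNATIVE_2 near the top of __memset replaces a jmp at patch time rather than branching at run time, but its effect can be pictured as a three-way dispatch. A C sketch under that reading (the feature helpers and the fast-string symbol name are assumed; memset_erms and memset_orig are the symbols from the hunk):

#include <stddef.h>

void *memset_erms(void *s, int c, size_t n);      /* rep stosb body above */
void *memset_orig(void *s, int c, size_t n);      /* unrolled fallback above */
void *memset_rep_stosq(void *s, int c, size_t n); /* assumed name for the inline fast-string body */

int cpu_has_erms(void);      /* assumed feature helpers */
int cpu_has_rep_good(void);

void *memset_dispatch(void *s, int c, size_t n)
{
	/* ERMS wins when present, then REP_GOOD, then the unrolled copy. */
	if (cpu_has_erms())
		return memset_erms(s, c, n);
	if (cpu_has_rep_good())
		return memset_rep_stosq(s, c, n);
	return memset_orig(s, c, n);
}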
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index f6d13eefad10..3ca5218fbece 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S | |||
@@ -14,8 +14,8 @@ | |||
14 | .macro op_safe_regs op | 14 | .macro op_safe_regs op |
15 | ENTRY(\op\()_safe_regs) | 15 | ENTRY(\op\()_safe_regs) |
16 | CFI_STARTPROC | 16 | CFI_STARTPROC |
17 | pushq_cfi %rbx | 17 | pushq_cfi_reg rbx |
18 | pushq_cfi %rbp | 18 | pushq_cfi_reg rbp |
19 | movq %rdi, %r10 /* Save pointer */ | 19 | movq %rdi, %r10 /* Save pointer */ |
20 | xorl %r11d, %r11d /* Return value */ | 20 | xorl %r11d, %r11d /* Return value */ |
21 | movl (%rdi), %eax | 21 | movl (%rdi), %eax |
@@ -35,8 +35,8 @@ ENTRY(\op\()_safe_regs) | |||
35 | movl %ebp, 20(%r10) | 35 | movl %ebp, 20(%r10) |
36 | movl %esi, 24(%r10) | 36 | movl %esi, 24(%r10) |
37 | movl %edi, 28(%r10) | 37 | movl %edi, 28(%r10) |
38 | popq_cfi %rbp | 38 | popq_cfi_reg rbp |
39 | popq_cfi %rbx | 39 | popq_cfi_reg rbx |
40 | ret | 40 | ret |
41 | 3: | 41 | 3: |
42 | CFI_RESTORE_STATE | 42 | CFI_RESTORE_STATE |
@@ -53,10 +53,10 @@ ENDPROC(\op\()_safe_regs) | |||
53 | .macro op_safe_regs op | 53 | .macro op_safe_regs op |
54 | ENTRY(\op\()_safe_regs) | 54 | ENTRY(\op\()_safe_regs) |
55 | CFI_STARTPROC | 55 | CFI_STARTPROC |
56 | pushl_cfi %ebx | 56 | pushl_cfi_reg ebx |
57 | pushl_cfi %ebp | 57 | pushl_cfi_reg ebp |
58 | pushl_cfi %esi | 58 | pushl_cfi_reg esi |
59 | pushl_cfi %edi | 59 | pushl_cfi_reg edi |
60 | pushl_cfi $0 /* Return value */ | 60 | pushl_cfi $0 /* Return value */ |
61 | pushl_cfi %eax | 61 | pushl_cfi %eax |
62 | movl 4(%eax), %ecx | 62 | movl 4(%eax), %ecx |
@@ -80,10 +80,10 @@ ENTRY(\op\()_safe_regs) | |||
80 | movl %esi, 24(%eax) | 80 | movl %esi, 24(%eax) |
81 | movl %edi, 28(%eax) | 81 | movl %edi, 28(%eax) |
82 | popl_cfi %eax | 82 | popl_cfi %eax |
83 | popl_cfi %edi | 83 | popl_cfi_reg edi |
84 | popl_cfi %esi | 84 | popl_cfi_reg esi |
85 | popl_cfi %ebp | 85 | popl_cfi_reg ebp |
86 | popl_cfi %ebx | 86 | popl_cfi_reg ebx |
87 | ret | 87 | ret |
88 | 3: | 88 | 3: |
89 | CFI_RESTORE_STATE | 89 | CFI_RESTORE_STATE |
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index 5dff5f042468..2322abe4da3b 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S | |||
@@ -34,10 +34,10 @@ | |||
34 | */ | 34 | */ |
35 | 35 | ||
36 | #define save_common_regs \ | 36 | #define save_common_regs \ |
37 | pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0 | 37 | pushl_cfi_reg ecx |
38 | 38 | ||
39 | #define restore_common_regs \ | 39 | #define restore_common_regs \ |
40 | popl_cfi %ecx; CFI_RESTORE ecx | 40 | popl_cfi_reg ecx |
41 | 41 | ||
42 | /* Avoid uglifying the argument copying x86-64 needs to do. */ | 42 | /* Avoid uglifying the argument copying x86-64 needs to do. */ |
43 | .macro movq src, dst | 43 | .macro movq src, dst |
@@ -64,22 +64,22 @@ | |||
64 | */ | 64 | */ |
65 | 65 | ||
66 | #define save_common_regs \ | 66 | #define save_common_regs \ |
67 | pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ | 67 | pushq_cfi_reg rdi; \ |
68 | pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ | 68 | pushq_cfi_reg rsi; \ |
69 | pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ | 69 | pushq_cfi_reg rcx; \ |
70 | pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ | 70 | pushq_cfi_reg r8; \ |
71 | pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ | 71 | pushq_cfi_reg r9; \ |
72 | pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ | 72 | pushq_cfi_reg r10; \ |
73 | pushq_cfi %r11; CFI_REL_OFFSET r11, 0 | 73 | pushq_cfi_reg r11 |
74 | 74 | ||
75 | #define restore_common_regs \ | 75 | #define restore_common_regs \ |
76 | popq_cfi %r11; CFI_RESTORE r11; \ | 76 | popq_cfi_reg r11; \ |
77 | popq_cfi %r10; CFI_RESTORE r10; \ | 77 | popq_cfi_reg r10; \ |
78 | popq_cfi %r9; CFI_RESTORE r9; \ | 78 | popq_cfi_reg r9; \ |
79 | popq_cfi %r8; CFI_RESTORE r8; \ | 79 | popq_cfi_reg r8; \ |
80 | popq_cfi %rcx; CFI_RESTORE rcx; \ | 80 | popq_cfi_reg rcx; \ |
81 | popq_cfi %rsi; CFI_RESTORE rsi; \ | 81 | popq_cfi_reg rsi; \ |
82 | popq_cfi %rdi; CFI_RESTORE rdi | 82 | popq_cfi_reg rdi |
83 | 83 | ||
84 | #endif | 84 | #endif |
85 | 85 | ||
@@ -87,12 +87,10 @@ | |||
87 | ENTRY(call_rwsem_down_read_failed) | 87 | ENTRY(call_rwsem_down_read_failed) |
88 | CFI_STARTPROC | 88 | CFI_STARTPROC |
89 | save_common_regs | 89 | save_common_regs |
90 | __ASM_SIZE(push,_cfi) %__ASM_REG(dx) | 90 | __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) |
91 | CFI_REL_OFFSET __ASM_REG(dx), 0 | ||
92 | movq %rax,%rdi | 91 | movq %rax,%rdi |
93 | call rwsem_down_read_failed | 92 | call rwsem_down_read_failed |
94 | __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) | 93 | __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) |
95 | CFI_RESTORE __ASM_REG(dx) | ||
96 | restore_common_regs | 94 | restore_common_regs |
97 | ret | 95 | ret |
98 | CFI_ENDPROC | 96 | CFI_ENDPROC |
@@ -124,12 +122,10 @@ ENDPROC(call_rwsem_wake) | |||
124 | ENTRY(call_rwsem_downgrade_wake) | 122 | ENTRY(call_rwsem_downgrade_wake) |
125 | CFI_STARTPROC | 123 | CFI_STARTPROC |
126 | save_common_regs | 124 | save_common_regs |
127 | __ASM_SIZE(push,_cfi) %__ASM_REG(dx) | 125 | __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) |
128 | CFI_REL_OFFSET __ASM_REG(dx), 0 | ||
129 | movq %rax,%rdi | 126 | movq %rax,%rdi |
130 | call rwsem_downgrade_wake | 127 | call rwsem_downgrade_wake |
131 | __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) | 128 | __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) |
132 | CFI_RESTORE __ASM_REG(dx) | ||
133 | restore_common_regs | 129 | restore_common_regs |
134 | ret | 130 | ret |
135 | CFI_ENDPROC | 131 | CFI_ENDPROC |
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S index e28cdaf5ac2c..5eb715087b80 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/lib/thunk_32.S | |||
@@ -13,12 +13,9 @@ | |||
13 | .globl \name | 13 | .globl \name |
14 | \name: | 14 | \name: |
15 | CFI_STARTPROC | 15 | CFI_STARTPROC |
16 | pushl_cfi %eax | 16 | pushl_cfi_reg eax |
17 | CFI_REL_OFFSET eax, 0 | 17 | pushl_cfi_reg ecx |
18 | pushl_cfi %ecx | 18 | pushl_cfi_reg edx |
19 | CFI_REL_OFFSET ecx, 0 | ||
20 | pushl_cfi %edx | ||
21 | CFI_REL_OFFSET edx, 0 | ||
22 | 19 | ||
23 | .if \put_ret_addr_in_eax | 20 | .if \put_ret_addr_in_eax |
24 | /* Place EIP in the arg1 */ | 21 | /* Place EIP in the arg1 */ |
@@ -26,12 +23,9 @@ | |||
26 | .endif | 23 | .endif |
27 | 24 | ||
28 | call \func | 25 | call \func |
29 | popl_cfi %edx | 26 | popl_cfi_reg edx |
30 | CFI_RESTORE edx | 27 | popl_cfi_reg ecx |
31 | popl_cfi %ecx | 28 | popl_cfi_reg eax |
32 | CFI_RESTORE ecx | ||
33 | popl_cfi %eax | ||
34 | CFI_RESTORE eax | ||
35 | ret | 29 | ret |
36 | CFI_ENDPROC | 30 | CFI_ENDPROC |
37 | _ASM_NOKPROBE(\name) | 31 | _ASM_NOKPROBE(\name) |
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index b30b5ebd614a..f89ba4e93025 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S | |||
@@ -17,9 +17,18 @@ | |||
17 | CFI_STARTPROC | 17 | CFI_STARTPROC |
18 | 18 | ||
19 | /* this one pushes 9 elems, the next one would be %rIP */ | 19 | /* this one pushes 9 elems, the next one would be %rIP */ |
20 | SAVE_ARGS | 20 | pushq_cfi_reg rdi |
21 | pushq_cfi_reg rsi | ||
22 | pushq_cfi_reg rdx | ||
23 | pushq_cfi_reg rcx | ||
24 | pushq_cfi_reg rax | ||
25 | pushq_cfi_reg r8 | ||
26 | pushq_cfi_reg r9 | ||
27 | pushq_cfi_reg r10 | ||
28 | pushq_cfi_reg r11 | ||
21 | 29 | ||
22 | .if \put_ret_addr_in_rdi | 30 | .if \put_ret_addr_in_rdi |
31 | /* 9*8(%rsp) is return addr on stack */ | ||
23 | movq_cfi_restore 9*8, rdi | 32 | movq_cfi_restore 9*8, rdi |
24 | .endif | 33 | .endif |
25 | 34 | ||
@@ -45,11 +54,22 @@ | |||
45 | #endif | 54 | #endif |
46 | #endif | 55 | #endif |
47 | 56 | ||
48 | /* SAVE_ARGS below is used only for the .cfi directives it contains. */ | 57 | #if defined(CONFIG_TRACE_IRQFLAGS) \ |
58 | || defined(CONFIG_DEBUG_LOCK_ALLOC) \ | ||
59 | || defined(CONFIG_PREEMPT) | ||
49 | CFI_STARTPROC | 60 | CFI_STARTPROC |
50 | SAVE_ARGS | 61 | CFI_ADJUST_CFA_OFFSET 9*8 |
51 | restore: | 62 | restore: |
52 | RESTORE_ARGS | 63 | popq_cfi_reg r11 |
64 | popq_cfi_reg r10 | ||
65 | popq_cfi_reg r9 | ||
66 | popq_cfi_reg r8 | ||
67 | popq_cfi_reg rax | ||
68 | popq_cfi_reg rcx | ||
69 | popq_cfi_reg rdx | ||
70 | popq_cfi_reg rsi | ||
71 | popq_cfi_reg rdi | ||
53 | ret | 72 | ret |
54 | CFI_ENDPROC | 73 | CFI_ENDPROC |
55 | _ASM_NOKPROBE(restore) | 74 | _ASM_NOKPROBE(restore) |
75 | #endif | ||
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 1a2be7c6895d..816488c0b97e 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt | |||
@@ -273,6 +273,9 @@ dd: ESC | |||
273 | de: ESC | 273 | de: ESC |
274 | df: ESC | 274 | df: ESC |
275 | # 0xe0 - 0xef | 275 | # 0xe0 - 0xef |
276 | # Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix | ||
277 | # in 64-bit mode. AMD CPUs accept the 0x66 prefix; it causes RIP truncation | ||
278 | # to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD. | ||
276 | e0: LOOPNE/LOOPNZ Jb (f64) | 279 | e0: LOOPNE/LOOPNZ Jb (f64) |
277 | e1: LOOPE/LOOPZ Jb (f64) | 280 | e1: LOOPE/LOOPZ Jb (f64) |
278 | e2: LOOP Jb (f64) | 281 | e2: LOOP Jb (f64) |
@@ -281,6 +284,10 @@ e4: IN AL,Ib | |||
281 | e5: IN eAX,Ib | 284 | e5: IN eAX,Ib |
282 | e6: OUT Ib,AL | 285 | e6: OUT Ib,AL |
283 | e7: OUT Ib,eAX | 286 | e7: OUT Ib,eAX |
287 | # With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset | ||
288 | # in "near" jumps and calls is 16-bit. For CALL, | ||
289 | # push of return address is 16-bit wide, RSP is decremented by 2 | ||
290 | # but is not truncated to 16 bits, unlike RIP. | ||
284 | e8: CALL Jz (f64) | 291 | e8: CALL Jz (f64) |
285 | e9: JMP-near Jz (f64) | 292 | e9: JMP-near Jz (f64) |
286 | ea: JMP-far Ap (i64) | 293 | ea: JMP-far Ap (i64) |
@@ -456,6 +463,7 @@ AVXcode: 1 | |||
456 | 7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) | 463 | 7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) |
457 | 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) | 464 | 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) |
458 | # 0x0f 0x80-0x8f | 465 | # 0x0f 0x80-0x8f |
466 | # Note: "forced64" is Intel CPU behavior (see comment about CALL insn). | ||
459 | 80: JO Jz (f64) | 467 | 80: JO Jz (f64) |
460 | 81: JNO Jz (f64) | 468 | 81: JNO Jz (f64) |
461 | 82: JB/JC/JNAE Jz (f64) | 469 | 82: JB/JC/JNAE Jz (f64) |
@@ -842,6 +850,7 @@ EndTable | |||
842 | GrpTable: Grp5 | 850 | GrpTable: Grp5 |
843 | 0: INC Ev | 851 | 0: INC Ev |
844 | 1: DEC Ev | 852 | 1: DEC Ev |
853 | # Note: "forced64" is Intel CPU behavior (see comment about CALL insn). | ||
845 | 2: CALLN Ev (f64) | 854 | 2: CALLN Ev (f64) |
846 | 3: CALLF Ep | 855 | 3: CALLF Ep |
847 | 4: JMPN Ev (f64) | 856 | 4: JMPN Ev (f64) |
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index c4cc74006c61..a482d105172b 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o | |||
32 | obj-$(CONFIG_ACPI_NUMA) += srat.o | 32 | obj-$(CONFIG_ACPI_NUMA) += srat.o |
33 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o | 33 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o |
34 | 34 | ||
35 | obj-$(CONFIG_MEMTEST) += memtest.o | ||
36 | |||
37 | obj-$(CONFIG_X86_INTEL_MPX) += mpx.o | 35 | obj-$(CONFIG_X86_INTEL_MPX) += mpx.o |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ede025fb46f1..181c53bac3a7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -59,7 +59,7 @@ static nokprobe_inline int kprobes_fault(struct pt_regs *regs) | |||
59 | int ret = 0; | 59 | int ret = 0; |
60 | 60 | ||
61 | /* kprobe_running() needs smp_processor_id() */ | 61 | /* kprobe_running() needs smp_processor_id() */ |
62 | if (kprobes_built_in() && !user_mode_vm(regs)) { | 62 | if (kprobes_built_in() && !user_mode(regs)) { |
63 | preempt_disable(); | 63 | preempt_disable(); |
64 | if (kprobe_running() && kprobe_fault_handler(regs, 14)) | 64 | if (kprobe_running() && kprobe_fault_handler(regs, 14)) |
65 | ret = 1; | 65 | ret = 1; |
@@ -148,7 +148,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) | |||
148 | instr = (void *)convert_ip_to_linear(current, regs); | 148 | instr = (void *)convert_ip_to_linear(current, regs); |
149 | max_instr = instr + 15; | 149 | max_instr = instr + 15; |
150 | 150 | ||
151 | if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) | 151 | if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) |
152 | return 0; | 152 | return 0; |
153 | 153 | ||
154 | while (instr < max_instr) { | 154 | while (instr < max_instr) { |
@@ -1035,7 +1035,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) | |||
1035 | if (error_code & PF_USER) | 1035 | if (error_code & PF_USER) |
1036 | return false; | 1036 | return false; |
1037 | 1037 | ||
1038 | if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) | 1038 | if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) |
1039 | return false; | 1039 | return false; |
1040 | 1040 | ||
1041 | return true; | 1041 | return true; |
@@ -1140,7 +1140,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, | |||
1140 | * User-mode registers count as a user access even for any | 1140 | * User-mode registers count as a user access even for any |
1141 | * potential system fault or CPU buglet: | 1141 | * potential system fault or CPU buglet: |
1142 | */ | 1142 | */ |
1143 | if (user_mode_vm(regs)) { | 1143 | if (user_mode(regs)) { |
1144 | local_irq_enable(); | 1144 | local_irq_enable(); |
1145 | error_code |= PF_USER; | 1145 | error_code |= PF_USER; |
1146 | flags |= FAULT_FLAG_USER; | 1146 | flags |= FAULT_FLAG_USER; |
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a110efca6d06..1d553186c434 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
@@ -29,29 +29,33 @@ | |||
29 | 29 | ||
30 | /* | 30 | /* |
31 | * Tables translating between page_cache_type_t and pte encoding. | 31 | * Tables translating between page_cache_type_t and pte encoding. |
32 | * Minimal supported modes are defined statically, modified if more supported | 32 | * |
33 | * cache modes are available. | 33 | * Minimal supported modes are defined statically; they are modified |
34 | * Index into __cachemode2pte_tbl is the cachemode. | 34 | * during bootup if more supported cache modes are available. |
35 | * Index into __pte2cachemode_tbl are the caching attribute bits of the pte | 35 | * |
36 | * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. | 36 | * Index into __cachemode2pte_tbl[] is the cachemode. |
37 | * | ||
38 | * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte | ||
39 | * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. | ||
37 | */ | 40 | */ |
38 | uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { | 41 | uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { |
39 | [_PAGE_CACHE_MODE_WB] = 0, | 42 | [_PAGE_CACHE_MODE_WB ] = 0 | 0 , |
40 | [_PAGE_CACHE_MODE_WC] = _PAGE_PWT, | 43 | [_PAGE_CACHE_MODE_WC ] = _PAGE_PWT | 0 , |
41 | [_PAGE_CACHE_MODE_UC_MINUS] = _PAGE_PCD, | 44 | [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, |
42 | [_PAGE_CACHE_MODE_UC] = _PAGE_PCD | _PAGE_PWT, | 45 | [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, |
43 | [_PAGE_CACHE_MODE_WT] = _PAGE_PCD, | 46 | [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, |
44 | [_PAGE_CACHE_MODE_WP] = _PAGE_PCD, | 47 | [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, |
45 | }; | 48 | }; |
46 | EXPORT_SYMBOL(__cachemode2pte_tbl); | 49 | EXPORT_SYMBOL(__cachemode2pte_tbl); |
50 | |||
47 | uint8_t __pte2cachemode_tbl[8] = { | 51 | uint8_t __pte2cachemode_tbl[8] = { |
48 | [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB, | 52 | [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, |
49 | [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC, | 53 | [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WC, |
50 | [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS, | 54 | [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, |
51 | [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC, | 55 | [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, |
52 | [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB, | 56 | [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, |
53 | [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, | 57 | [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, |
54 | [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, | 58 | [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, |
55 | [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, | 59 | [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, |
56 | }; | 60 | }; |
57 | EXPORT_SYMBOL(__pte2cachemode_tbl); | 61 | EXPORT_SYMBOL(__pte2cachemode_tbl); |
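The realigned tables are easier to scan, but the index encoding they rely on is still plain bit packing: PWT, PCD and PAT land in index bits 0, 1 and 2, as the comment says. A standalone sketch of that mapping (the _PAGE_* values are the architectural 4K-PTE bit positions; __pte2cm_idx() in the kernel remains the authoritative helper):

#include <stdio.h>

#define _PAGE_PWT  (1UL << 3)	/* page write-through */
#define _PAGE_PCD  (1UL << 4)	/* page cache disable */
#define _PAGE_PAT  (1UL << 7)	/* PAT selector bit (4K PTEs) */

static unsigned int pte_cachebits_to_index(unsigned long prot)
{
	return (unsigned int)((!!(prot & _PAGE_PWT) << 0) |
			      (!!(prot & _PAGE_PCD) << 1) |
			      (!!(prot & _PAGE_PAT) << 2));
}

int main(void)
{
	/* UC is PWT|PCD, so it lands at index 3 of __pte2cachemode_tbl[]. */
	printf("index(UC)       = %u\n",
	       pte_cachebits_to_index(_PAGE_PWT | _PAGE_PCD));
	printf("index(UC_MINUS) = %u\n",
	       pte_cachebits_to_index(_PAGE_PCD));
	return 0;
}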
@@ -131,21 +135,7 @@ void __init early_alloc_pgt_buf(void) | |||
131 | 135 | ||
132 | int after_bootmem; | 136 | int after_bootmem; |
133 | 137 | ||
134 | int direct_gbpages | 138 | early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES); |
135 | #ifdef CONFIG_DIRECT_GBPAGES | ||
136 | = 1 | ||
137 | #endif | ||
138 | ; | ||
139 | |||
140 | static void __init init_gbpages(void) | ||
141 | { | ||
142 | #ifdef CONFIG_X86_64 | ||
143 | if (direct_gbpages && cpu_has_gbpages) | ||
144 | printk(KERN_INFO "Using GB pages for direct mapping\n"); | ||
145 | else | ||
146 | direct_gbpages = 0; | ||
147 | #endif | ||
148 | } | ||
149 | 139 | ||
150 | struct map_range { | 140 | struct map_range { |
151 | unsigned long start; | 141 | unsigned long start; |
@@ -157,16 +147,12 @@ static int page_size_mask; | |||
157 | 147 | ||
158 | static void __init probe_page_size_mask(void) | 148 | static void __init probe_page_size_mask(void) |
159 | { | 149 | { |
160 | init_gbpages(); | ||
161 | |||
162 | #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) | 150 | #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) |
163 | /* | 151 | /* |
164 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | 152 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. |
165 | * This will simplify cpa(), which otherwise needs to support splitting | 153 | * This will simplify cpa(), which otherwise needs to support splitting |
166 | * large pages into small in interrupt context, etc. | 154 | * large pages into small in interrupt context, etc. |
167 | */ | 155 | */ |
168 | if (direct_gbpages) | ||
169 | page_size_mask |= 1 << PG_LEVEL_1G; | ||
170 | if (cpu_has_pse) | 156 | if (cpu_has_pse) |
171 | page_size_mask |= 1 << PG_LEVEL_2M; | 157 | page_size_mask |= 1 << PG_LEVEL_2M; |
172 | #endif | 158 | #endif |
@@ -179,6 +165,15 @@ static void __init probe_page_size_mask(void) | |||
179 | if (cpu_has_pge) { | 165 | if (cpu_has_pge) { |
180 | cr4_set_bits_and_update_boot(X86_CR4_PGE); | 166 | cr4_set_bits_and_update_boot(X86_CR4_PGE); |
181 | __supported_pte_mask |= _PAGE_GLOBAL; | 167 | __supported_pte_mask |= _PAGE_GLOBAL; |
168 | } else | ||
169 | __supported_pte_mask &= ~_PAGE_GLOBAL; | ||
170 | |||
171 | /* Enable 1 GB linear kernel mappings if available: */ | ||
172 | if (direct_gbpages && cpu_has_gbpages) { | ||
173 | printk(KERN_INFO "Using GB pages for direct mapping\n"); | ||
174 | page_size_mask |= 1 << PG_LEVEL_1G; | ||
175 | } else { | ||
176 | direct_gbpages = 0; | ||
182 | } | 177 | } |
183 | } | 178 | } |
184 | 179 | ||
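The new early_param_on_off() one-liner stands in for the pair of hand-written boot-parameter handlers that init_64.c drops further below. A sketch of what it presumably expands to (the macro itself is defined outside this diff, so treat this as an approximation rather than the exact expansion):

/* Approximate expansion, mirroring the removed init_64.c handlers: */
int direct_gbpages = IS_ENABLED(CONFIG_X86_DIRECT_GBPAGES);

static int __init parse_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_gbpages_on);

static int __init parse_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_gbpages_off);

Either way, probe_page_size_mask() now only honours direct_gbpages when the CPU actually advertises gbpages support, as the hunk above shows.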
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 30eb05ae7061..3fba623e3ba5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -130,20 +130,6 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, | |||
130 | return 0; | 130 | return 0; |
131 | } | 131 | } |
132 | 132 | ||
133 | static int __init parse_direct_gbpages_off(char *arg) | ||
134 | { | ||
135 | direct_gbpages = 0; | ||
136 | return 0; | ||
137 | } | ||
138 | early_param("nogbpages", parse_direct_gbpages_off); | ||
139 | |||
140 | static int __init parse_direct_gbpages_on(char *arg) | ||
141 | { | ||
142 | direct_gbpages = 1; | ||
143 | return 0; | ||
144 | } | ||
145 | early_param("gbpages", parse_direct_gbpages_on); | ||
146 | |||
147 | /* | 133 | /* |
148 | * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the | 134 | * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the |
149 | * physical space so we can cache the place of the first one and move | 135 | * physical space so we can cache the place of the first one and move |
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index fdf617c00e2f..5ead4d6cf3a7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
@@ -67,8 +67,13 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, | |||
67 | 67 | ||
68 | /* | 68 | /* |
69 | * Remap an arbitrary physical address space into the kernel virtual | 69 | * Remap an arbitrary physical address space into the kernel virtual |
70 | * address space. Needed when the kernel wants to access high addresses | 70 | * address space. It transparently creates kernel huge I/O mapping when |
71 | * directly. | 71 | * the physical address is aligned by a huge page size (1GB or 2MB) and |
72 | * the requested size is at least the huge page size. | ||
73 | * | ||
74 | * NOTE: MTRRs can override PAT memory types with a 4KB granularity. | ||
75 | * Therefore, the mapping code falls back to smaller pages, down to 4KB, | ||
76 | * when a mapping range is covered by a non-WB MTRR type. | ||
72 | * | 77 | * |
73 | * NOTE! We need to allow non-page-aligned mappings too: we will obviously | 78 | * NOTE! We need to allow non-page-aligned mappings too: we will obviously |
74 | * have to convert them into an offset in a page-aligned mapping, but the | 79 | * have to convert them into an offset in a page-aligned mapping, but the |
@@ -326,6 +331,20 @@ void iounmap(volatile void __iomem *addr) | |||
326 | } | 331 | } |
327 | EXPORT_SYMBOL(iounmap); | 332 | EXPORT_SYMBOL(iounmap); |
328 | 333 | ||
334 | int arch_ioremap_pud_supported(void) | ||
335 | { | ||
336 | #ifdef CONFIG_X86_64 | ||
337 | return cpu_has_gbpages; | ||
338 | #else | ||
339 | return 0; | ||
340 | #endif | ||
341 | } | ||
342 | |||
343 | int arch_ioremap_pmd_supported(void) | ||
344 | { | ||
345 | return cpu_has_pse; | ||
346 | } | ||
347 | |||
329 | /* | 348 | /* |
330 | * Convert a physical pointer to a virtual kernel pointer for /dev/mem | 349 | * Convert a physical pointer to a virtual kernel pointer for /dev/mem |
331 | * access | 350 | * access |
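arch_ioremap_pud_supported() and arch_ioremap_pmd_supported() give the generic mapping code a cheap capability check before it attempts the huge mappings described in the updated comment. A hypothetical caller-side sketch (the can_use_* helpers are invented for illustration; the alignment and size rules follow the comment above):

/* Only attempt a huge entry when the CPU supports it and the physical
 * range is aligned to, and at least as large as, the huge page size. */
static bool can_use_pud_mapping(phys_addr_t phys, unsigned long size)
{
	return arch_ioremap_pud_supported() &&
	       size >= PUD_SIZE && IS_ALIGNED(phys, PUD_SIZE);
}

static bool can_use_pmd_mapping(phys_addr_t phys, unsigned long size)
{
	return arch_ioremap_pmd_supported() &&
	       size >= PMD_SIZE && IS_ALIGNED(phys, PMD_SIZE);
}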
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c deleted file mode 100644 index 1e9da795767a..000000000000 --- a/arch/x86/mm/memtest.c +++ /dev/null | |||
@@ -1,118 +0,0 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/string.h> | ||
4 | #include <linux/types.h> | ||
5 | #include <linux/mm.h> | ||
6 | #include <linux/smp.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/pfn.h> | ||
9 | #include <linux/memblock.h> | ||
10 | |||
11 | static u64 patterns[] __initdata = { | ||
12 | /* The first entry has to be 0 to leave memtest with zeroed memory */ | ||
13 | 0, | ||
14 | 0xffffffffffffffffULL, | ||
15 | 0x5555555555555555ULL, | ||
16 | 0xaaaaaaaaaaaaaaaaULL, | ||
17 | 0x1111111111111111ULL, | ||
18 | 0x2222222222222222ULL, | ||
19 | 0x4444444444444444ULL, | ||
20 | 0x8888888888888888ULL, | ||
21 | 0x3333333333333333ULL, | ||
22 | 0x6666666666666666ULL, | ||
23 | 0x9999999999999999ULL, | ||
24 | 0xccccccccccccccccULL, | ||
25 | 0x7777777777777777ULL, | ||
26 | 0xbbbbbbbbbbbbbbbbULL, | ||
27 | 0xddddddddddddddddULL, | ||
28 | 0xeeeeeeeeeeeeeeeeULL, | ||
29 | 0x7a6c7258554e494cULL, /* yeah ;-) */ | ||
30 | }; | ||
31 | |||
32 | static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) | ||
33 | { | ||
34 | printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", | ||
35 | (unsigned long long) pattern, | ||
36 | (unsigned long long) start_bad, | ||
37 | (unsigned long long) end_bad); | ||
38 | memblock_reserve(start_bad, end_bad - start_bad); | ||
39 | } | ||
40 | |||
41 | static void __init memtest(u64 pattern, u64 start_phys, u64 size) | ||
42 | { | ||
43 | u64 *p, *start, *end; | ||
44 | u64 start_bad, last_bad; | ||
45 | u64 start_phys_aligned; | ||
46 | const size_t incr = sizeof(pattern); | ||
47 | |||
48 | start_phys_aligned = ALIGN(start_phys, incr); | ||
49 | start = __va(start_phys_aligned); | ||
50 | end = start + (size - (start_phys_aligned - start_phys)) / incr; | ||
51 | start_bad = 0; | ||
52 | last_bad = 0; | ||
53 | |||
54 | for (p = start; p < end; p++) | ||
55 | *p = pattern; | ||
56 | |||
57 | for (p = start; p < end; p++, start_phys_aligned += incr) { | ||
58 | if (*p == pattern) | ||
59 | continue; | ||
60 | if (start_phys_aligned == last_bad + incr) { | ||
61 | last_bad += incr; | ||
62 | continue; | ||
63 | } | ||
64 | if (start_bad) | ||
65 | reserve_bad_mem(pattern, start_bad, last_bad + incr); | ||
66 | start_bad = last_bad = start_phys_aligned; | ||
67 | } | ||
68 | if (start_bad) | ||
69 | reserve_bad_mem(pattern, start_bad, last_bad + incr); | ||
70 | } | ||
71 | |||
72 | static void __init do_one_pass(u64 pattern, u64 start, u64 end) | ||
73 | { | ||
74 | u64 i; | ||
75 | phys_addr_t this_start, this_end; | ||
76 | |||
77 | for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { | ||
78 | this_start = clamp_t(phys_addr_t, this_start, start, end); | ||
79 | this_end = clamp_t(phys_addr_t, this_end, start, end); | ||
80 | if (this_start < this_end) { | ||
81 | printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", | ||
82 | (unsigned long long)this_start, | ||
83 | (unsigned long long)this_end, | ||
84 | (unsigned long long)cpu_to_be64(pattern)); | ||
85 | memtest(pattern, this_start, this_end - this_start); | ||
86 | } | ||
87 | } | ||
88 | } | ||
89 | |||
90 | /* default is disabled */ | ||
91 | static int memtest_pattern __initdata; | ||
92 | |||
93 | static int __init parse_memtest(char *arg) | ||
94 | { | ||
95 | if (arg) | ||
96 | memtest_pattern = simple_strtoul(arg, NULL, 0); | ||
97 | else | ||
98 | memtest_pattern = ARRAY_SIZE(patterns); | ||
99 | |||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | early_param("memtest", parse_memtest); | ||
104 | |||
105 | void __init early_memtest(unsigned long start, unsigned long end) | ||
106 | { | ||
107 | unsigned int i; | ||
108 | unsigned int idx = 0; | ||
109 | |||
110 | if (!memtest_pattern) | ||
111 | return; | ||
112 | |||
113 | printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); | ||
114 | for (i = memtest_pattern-1; i < UINT_MAX; --i) { | ||
115 | idx = i % ARRAY_SIZE(patterns); | ||
116 | do_one_pass(patterns[idx], start, end); | ||
117 | } | ||
118 | } | ||
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index df4552bd239e..9d518d693b4b 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c | |||
@@ -65,24 +65,23 @@ static int mmap_is_legacy(void) | |||
65 | return sysctl_legacy_va_layout; | 65 | return sysctl_legacy_va_layout; |
66 | } | 66 | } |
67 | 67 | ||
68 | static unsigned long mmap_rnd(void) | 68 | unsigned long arch_mmap_rnd(void) |
69 | { | 69 | { |
70 | unsigned long rnd = 0; | 70 | unsigned long rnd; |
71 | 71 | ||
72 | /* | 72 | /* |
73 | * 8 bits of randomness in 32bit mmaps, 20 address space bits | 73 | * 8 bits of randomness in 32bit mmaps, 20 address space bits |
74 | * 28 bits of randomness in 64bit mmaps, 40 address space bits | 74 | * 28 bits of randomness in 64bit mmaps, 40 address space bits |
75 | */ | 75 | */ |
76 | if (current->flags & PF_RANDOMIZE) { | 76 | if (mmap_is_ia32()) |
77 | if (mmap_is_ia32()) | 77 | rnd = (unsigned long)get_random_int() % (1<<8); |
78 | rnd = get_random_int() % (1<<8); | 78 | else |
79 | else | 79 | rnd = (unsigned long)get_random_int() % (1<<28); |
80 | rnd = get_random_int() % (1<<28); | 80 | |
81 | } | ||
82 | return rnd << PAGE_SHIFT; | 81 | return rnd << PAGE_SHIFT; |
83 | } | 82 | } |
84 | 83 | ||
85 | static unsigned long mmap_base(void) | 84 | static unsigned long mmap_base(unsigned long rnd) |
86 | { | 85 | { |
87 | unsigned long gap = rlimit(RLIMIT_STACK); | 86 | unsigned long gap = rlimit(RLIMIT_STACK); |
88 | 87 | ||
@@ -91,19 +90,19 @@ static unsigned long mmap_base(void) | |||
91 | else if (gap > MAX_GAP) | 90 | else if (gap > MAX_GAP) |
92 | gap = MAX_GAP; | 91 | gap = MAX_GAP; |
93 | 92 | ||
94 | return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); | 93 | return PAGE_ALIGN(TASK_SIZE - gap - rnd); |
95 | } | 94 | } |
96 | 95 | ||
97 | /* | 96 | /* |
98 | * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 | 97 | * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 |
99 | * does, but not when emulating X86_32 | 98 | * does, but not when emulating X86_32 |
100 | */ | 99 | */ |
101 | static unsigned long mmap_legacy_base(void) | 100 | static unsigned long mmap_legacy_base(unsigned long rnd) |
102 | { | 101 | { |
103 | if (mmap_is_ia32()) | 102 | if (mmap_is_ia32()) |
104 | return TASK_UNMAPPED_BASE; | 103 | return TASK_UNMAPPED_BASE; |
105 | else | 104 | else |
106 | return TASK_UNMAPPED_BASE + mmap_rnd(); | 105 | return TASK_UNMAPPED_BASE + rnd; |
107 | } | 106 | } |
108 | 107 | ||
109 | /* | 108 | /* |
@@ -112,13 +111,18 @@ static unsigned long mmap_legacy_base(void) | |||
112 | */ | 111 | */ |
113 | void arch_pick_mmap_layout(struct mm_struct *mm) | 112 | void arch_pick_mmap_layout(struct mm_struct *mm) |
114 | { | 113 | { |
115 | mm->mmap_legacy_base = mmap_legacy_base(); | 114 | unsigned long random_factor = 0UL; |
116 | mm->mmap_base = mmap_base(); | 115 | |
116 | if (current->flags & PF_RANDOMIZE) | ||
117 | random_factor = arch_mmap_rnd(); | ||
118 | |||
119 | mm->mmap_legacy_base = mmap_legacy_base(random_factor); | ||
117 | 120 | ||
118 | if (mmap_is_legacy()) { | 121 | if (mmap_is_legacy()) { |
119 | mm->mmap_base = mm->mmap_legacy_base; | 122 | mm->mmap_base = mm->mmap_legacy_base; |
120 | mm->get_unmapped_area = arch_get_unmapped_area; | 123 | mm->get_unmapped_area = arch_get_unmapped_area; |
121 | } else { | 124 | } else { |
125 | mm->mmap_base = mmap_base(random_factor); | ||
122 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | 126 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; |
123 | } | 127 | } |
124 | } | 128 | } |
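A quick, standalone check of the arithmetic in the comment (PAGE_SHIFT is 12 on x86): 8 random bits shifted left by 12 randomize 20 bits of address space, and 28 bits randomize 40.

#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;	/* x86 PAGE_SHIFT */
	unsigned long long max32 = ((1ULL << 8)  - 1) << page_shift;
	unsigned long long max64 = ((1ULL << 28) - 1) << page_shift;

	/* Largest possible randomization offsets added to the mmap base */
	printf("32-bit: up to %#llx (20 address bits)\n", max32);
	printf("64-bit: up to %#llx (40 address bits)\n", max64);
	return 0;
}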
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index cd4785bbacb9..4053bb58bf92 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c | |||
@@ -482,9 +482,16 @@ static void __init numa_clear_kernel_node_hotplug(void) | |||
482 | &memblock.reserved, mb->nid); | 482 | &memblock.reserved, mb->nid); |
483 | } | 483 | } |
484 | 484 | ||
485 | /* Mark all kernel nodes. */ | 485 | /* |
486 | * Mark all kernel nodes. | ||
487 | * | ||
488 | * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo | ||
489 | * may not include all the memblock.reserved memory ranges because | ||
490 | * trim_snb_memory() reserves specific pages for Sandy Bridge graphics. | ||
491 | */ | ||
486 | for_each_memblock(reserved, r) | 492 | for_each_memblock(reserved, r) |
487 | node_set(r->nid, numa_kernel_nodes); | 493 | if (r->nid != MAX_NUMNODES) |
494 | node_set(r->nid, numa_kernel_nodes); | ||
488 | 495 | ||
489 | /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ | 496 | /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ |
490 | for (i = 0; i < numa_meminfo.nr_blks; i++) { | 497 | for (i = 0; i < numa_meminfo.nr_blks; i++) { |
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 536ea2fb6e33..89af288ec674 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -81,11 +81,9 @@ void arch_report_meminfo(struct seq_file *m) | |||
81 | seq_printf(m, "DirectMap4M: %8lu kB\n", | 81 | seq_printf(m, "DirectMap4M: %8lu kB\n", |
82 | direct_pages_count[PG_LEVEL_2M] << 12); | 82 | direct_pages_count[PG_LEVEL_2M] << 12); |
83 | #endif | 83 | #endif |
84 | #ifdef CONFIG_X86_64 | ||
85 | if (direct_gbpages) | 84 | if (direct_gbpages) |
86 | seq_printf(m, "DirectMap1G: %8lu kB\n", | 85 | seq_printf(m, "DirectMap1G: %8lu kB\n", |
87 | direct_pages_count[PG_LEVEL_1G] << 20); | 86 | direct_pages_count[PG_LEVEL_1G] << 20); |
88 | #endif | ||
89 | } | 87 | } |
90 | #else | 88 | #else |
91 | static inline void split_page_count(int level) { } | 89 | static inline void split_page_count(int level) { } |
@@ -1654,13 +1652,11 @@ int set_memory_ro(unsigned long addr, int numpages) | |||
1654 | { | 1652 | { |
1655 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); | 1653 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); |
1656 | } | 1654 | } |
1657 | EXPORT_SYMBOL_GPL(set_memory_ro); | ||
1658 | 1655 | ||
1659 | int set_memory_rw(unsigned long addr, int numpages) | 1656 | int set_memory_rw(unsigned long addr, int numpages) |
1660 | { | 1657 | { |
1661 | return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); | 1658 | return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); |
1662 | } | 1659 | } |
1663 | EXPORT_SYMBOL_GPL(set_memory_rw); | ||
1664 | 1660 | ||
1665 | int set_memory_np(unsigned long addr, int numpages) | 1661 | int set_memory_np(unsigned long addr, int numpages) |
1666 | { | 1662 | { |
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 7ac68698406c..35af6771a95a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c | |||
@@ -610,7 +610,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, | |||
610 | } | 610 | } |
611 | 611 | ||
612 | #ifdef CONFIG_STRICT_DEVMEM | 612 | #ifdef CONFIG_STRICT_DEVMEM |
613 | /* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/ | 613 | /* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */ |
614 | static inline int range_is_allowed(unsigned long pfn, unsigned long size) | 614 | static inline int range_is_allowed(unsigned long pfn, unsigned long size) |
615 | { | 615 | { |
616 | return 1; | 616 | return 1; |
@@ -628,8 +628,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) | |||
628 | 628 | ||
629 | while (cursor < to) { | 629 | while (cursor < to) { |
630 | if (!devmem_is_allowed(pfn)) { | 630 | if (!devmem_is_allowed(pfn)) { |
631 | printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n", | 631 | printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", |
632 | current->comm, from, to - 1); | 632 | current->comm, from, to - 1); |
633 | return 0; | 633 | return 0; |
634 | } | 634 | } |
635 | cursor += PAGE_SIZE; | 635 | cursor += PAGE_SIZE; |
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7b22adaad4f1..0b97d2c75df3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <asm/pgtable.h> | 4 | #include <asm/pgtable.h> |
5 | #include <asm/tlb.h> | 5 | #include <asm/tlb.h> |
6 | #include <asm/fixmap.h> | 6 | #include <asm/fixmap.h> |
7 | #include <asm/mtrr.h> | ||
7 | 8 | ||
8 | #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO | 9 | #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO |
9 | 10 | ||
@@ -58,7 +59,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) | |||
58 | tlb_remove_page(tlb, pte); | 59 | tlb_remove_page(tlb, pte); |
59 | } | 60 | } |
60 | 61 | ||
61 | #if PAGETABLE_LEVELS > 2 | 62 | #if CONFIG_PGTABLE_LEVELS > 2 |
62 | void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) | 63 | void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) |
63 | { | 64 | { |
64 | struct page *page = virt_to_page(pmd); | 65 | struct page *page = virt_to_page(pmd); |
@@ -74,14 +75,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) | |||
74 | tlb_remove_page(tlb, page); | 75 | tlb_remove_page(tlb, page); |
75 | } | 76 | } |
76 | 77 | ||
77 | #if PAGETABLE_LEVELS > 3 | 78 | #if CONFIG_PGTABLE_LEVELS > 3 |
78 | void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) | 79 | void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) |
79 | { | 80 | { |
80 | paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); | 81 | paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); |
81 | tlb_remove_page(tlb, virt_to_page(pud)); | 82 | tlb_remove_page(tlb, virt_to_page(pud)); |
82 | } | 83 | } |
83 | #endif /* PAGETABLE_LEVELS > 3 */ | 84 | #endif /* CONFIG_PGTABLE_LEVELS > 3 */ |
84 | #endif /* PAGETABLE_LEVELS > 2 */ | 85 | #endif /* CONFIG_PGTABLE_LEVELS > 2 */ |
85 | 86 | ||
86 | static inline void pgd_list_add(pgd_t *pgd) | 87 | static inline void pgd_list_add(pgd_t *pgd) |
87 | { | 88 | { |
@@ -117,9 +118,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) | |||
117 | /* If the pgd points to a shared pagetable level (either the | 118 | /* If the pgd points to a shared pagetable level (either the |
118 | ptes in non-PAE, or shared PMD in PAE), then just copy the | 119 | ptes in non-PAE, or shared PMD in PAE), then just copy the |
119 | references from swapper_pg_dir. */ | 120 | references from swapper_pg_dir. */ |
120 | if (PAGETABLE_LEVELS == 2 || | 121 | if (CONFIG_PGTABLE_LEVELS == 2 || |
121 | (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || | 122 | (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || |
122 | PAGETABLE_LEVELS == 4) { | 123 | CONFIG_PGTABLE_LEVELS == 4) { |
123 | clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, | 124 | clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, |
124 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | 125 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, |
125 | KERNEL_PGD_PTRS); | 126 | KERNEL_PGD_PTRS); |
@@ -275,12 +276,87 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) | |||
275 | } | 276 | } |
276 | } | 277 | } |
277 | 278 | ||
279 | /* | ||
280 | * Xen paravirt assumes that the pgd table occupies one whole page, and the | ||
281 | * 64-bit kernel makes the same assumption. | ||
282 | * | ||
283 | * A PAE kernel that is not running as a Xen domain, however, only needs | ||
284 | * to allocate 32 bytes for its pgd instead of one page. | ||
285 | */ | ||
286 | #ifdef CONFIG_X86_PAE | ||
287 | |||
288 | #include <linux/slab.h> | ||
289 | |||
290 | #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) | ||
291 | #define PGD_ALIGN 32 | ||
292 | |||
293 | static struct kmem_cache *pgd_cache; | ||
294 | |||
295 | static int __init pgd_cache_init(void) | ||
296 | { | ||
297 | /* | ||
298 | * When a PAE kernel is running as a Xen domain, it does not use a | ||
299 | * shared kernel pmd, which in turn requires a whole page for the pgd. | ||
300 | */ | ||
301 | if (!SHARED_KERNEL_PMD) | ||
302 | return 0; | ||
303 | |||
304 | /* | ||
305 | * When a PAE kernel is not running as a Xen domain, it uses a shared | ||
306 | * kernel pmd, which does not require a whole page for the pgd; 32 bytes | ||
307 | * are enough. At boot time we therefore create a 32-byte slab cache | ||
308 | * for pgd table allocations. | ||
309 | */ | ||
310 | pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, | ||
311 | SLAB_PANIC, NULL); | ||
312 | if (!pgd_cache) | ||
313 | return -ENOMEM; | ||
314 | |||
315 | return 0; | ||
316 | } | ||
317 | core_initcall(pgd_cache_init); | ||
318 | |||
319 | static inline pgd_t *_pgd_alloc(void) | ||
320 | { | ||
321 | /* | ||
322 | * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. | ||
323 | * We allocate one page for pgd. | ||
324 | */ | ||
325 | if (!SHARED_KERNEL_PMD) | ||
326 | return (pgd_t *)__get_free_page(PGALLOC_GFP); | ||
327 | |||
328 | /* | ||
329 | * Now PAE kernel is not running as a Xen domain. We can allocate | ||
330 | * a 32-byte slab for pgd to save memory space. | ||
331 | */ | ||
332 | return kmem_cache_alloc(pgd_cache, PGALLOC_GFP); | ||
333 | } | ||
334 | |||
335 | static inline void _pgd_free(pgd_t *pgd) | ||
336 | { | ||
337 | if (!SHARED_KERNEL_PMD) | ||
338 | free_page((unsigned long)pgd); | ||
339 | else | ||
340 | kmem_cache_free(pgd_cache, pgd); | ||
341 | } | ||
342 | #else | ||
343 | static inline pgd_t *_pgd_alloc(void) | ||
344 | { | ||
345 | return (pgd_t *)__get_free_page(PGALLOC_GFP); | ||
346 | } | ||
347 | |||
348 | static inline void _pgd_free(pgd_t *pgd) | ||
349 | { | ||
350 | free_page((unsigned long)pgd); | ||
351 | } | ||
352 | #endif /* CONFIG_X86_PAE */ | ||
353 | |||
278 | pgd_t *pgd_alloc(struct mm_struct *mm) | 354 | pgd_t *pgd_alloc(struct mm_struct *mm) |
279 | { | 355 | { |
280 | pgd_t *pgd; | 356 | pgd_t *pgd; |
281 | pmd_t *pmds[PREALLOCATED_PMDS]; | 357 | pmd_t *pmds[PREALLOCATED_PMDS]; |
282 | 358 | ||
283 | pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); | 359 | pgd = _pgd_alloc(); |
284 | 360 | ||
285 | if (pgd == NULL) | 361 | if (pgd == NULL) |
286 | goto out; | 362 | goto out; |
@@ -310,7 +386,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) | |||
310 | out_free_pmds: | 386 | out_free_pmds: |
311 | free_pmds(mm, pmds); | 387 | free_pmds(mm, pmds); |
312 | out_free_pgd: | 388 | out_free_pgd: |
313 | free_page((unsigned long)pgd); | 389 | _pgd_free(pgd); |
314 | out: | 390 | out: |
315 | return NULL; | 391 | return NULL; |
316 | } | 392 | } |
@@ -320,7 +396,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) | |||
320 | pgd_mop_up_pmds(mm, pgd); | 396 | pgd_mop_up_pmds(mm, pgd); |
321 | pgd_dtor(pgd); | 397 | pgd_dtor(pgd); |
322 | paravirt_pgd_free(mm, pgd); | 398 | paravirt_pgd_free(mm, pgd); |
323 | free_page((unsigned long)pgd); | 399 | _pgd_free(pgd); |
324 | } | 400 | } |
325 | 401 | ||
326 | /* | 402 | /* |
@@ -485,3 +561,67 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, | |||
485 | { | 561 | { |
486 | __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); | 562 | __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); |
487 | } | 563 | } |
564 | |||
565 | #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP | ||
566 | int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) | ||
567 | { | ||
568 | u8 mtrr; | ||
569 | |||
570 | /* | ||
571 | * Do not use a huge page when the range is covered by non-WB type | ||
572 | * of MTRRs. | ||
573 | */ | ||
574 | mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE); | ||
575 | if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) | ||
576 | return 0; | ||
577 | |||
578 | prot = pgprot_4k_2_large(prot); | ||
579 | |||
580 | set_pte((pte_t *)pud, pfn_pte( | ||
581 | (u64)addr >> PAGE_SHIFT, | ||
582 | __pgprot(pgprot_val(prot) | _PAGE_PSE))); | ||
583 | |||
584 | return 1; | ||
585 | } | ||
586 | |||
587 | int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) | ||
588 | { | ||
589 | u8 mtrr; | ||
590 | |||
591 | /* | ||
592 | * Do not use a huge page when the range is covered by non-WB type | ||
593 | * of MTRRs. | ||
594 | */ | ||
595 | mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE); | ||
596 | if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) | ||
597 | return 0; | ||
598 | |||
599 | prot = pgprot_4k_2_large(prot); | ||
600 | |||
601 | set_pte((pte_t *)pmd, pfn_pte( | ||
602 | (u64)addr >> PAGE_SHIFT, | ||
603 | __pgprot(pgprot_val(prot) | _PAGE_PSE))); | ||
604 | |||
605 | return 1; | ||
606 | } | ||
607 | |||
608 | int pud_clear_huge(pud_t *pud) | ||
609 | { | ||
610 | if (pud_large(*pud)) { | ||
611 | pud_clear(pud); | ||
612 | return 1; | ||
613 | } | ||
614 | |||
615 | return 0; | ||
616 | } | ||
617 | |||
618 | int pmd_clear_huge(pmd_t *pmd) | ||
619 | { | ||
620 | if (pmd_large(*pmd)) { | ||
621 | pmd_clear(pmd); | ||
622 | return 1; | ||
623 | } | ||
624 | |||
625 | return 0; | ||
626 | } | ||
627 | #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ | ||
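pud_set_huge()/pmd_set_huge() return 1 when a huge entry was installed and 0 when the caller has to fall back to smaller pages (for example when a non-WB MTRR overlaps the range). A hypothetical generic caller, just to show the convention (the function name is invented, not part of this diff):

static int try_map_pmd_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot)
{
	if (!arch_ioremap_pmd_supported())
		return 0;

	/* Returns 0 when MTRR coverage forces 4K pages instead. */
	return pmd_set_huge(pmd, phys, prot);
}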
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 5d04be5efb64..4e664bdb535a 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c | |||
@@ -111,7 +111,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) | |||
111 | { | 111 | { |
112 | struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); | 112 | struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); |
113 | 113 | ||
114 | if (!user_mode_vm(regs)) { | 114 | if (!user_mode(regs)) { |
115 | unsigned long stack = kernel_stack_pointer(regs); | 115 | unsigned long stack = kernel_stack_pointer(regs); |
116 | if (depth) | 116 | if (depth) |
117 | dump_trace(NULL, regs, (unsigned long *)stack, 0, | 117 | dump_trace(NULL, regs, (unsigned long *)stack, 0, |
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 2fb384724ebb..8fd6f44aee83 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c | |||
@@ -490,7 +490,9 @@ void pcibios_scan_root(int busnum) | |||
490 | if (!bus) { | 490 | if (!bus) { |
491 | pci_free_resource_list(&resources); | 491 | pci_free_resource_list(&resources); |
492 | kfree(sd); | 492 | kfree(sd); |
493 | return; | ||
493 | } | 494 | } |
495 | pci_bus_add_devices(bus); | ||
494 | } | 496 | } |
495 | 497 | ||
496 | void __init pcibios_set_cache_line_size(void) | 498 | void __init pcibios_set_cache_line_size(void) |
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index d143d216d52b..d7f997f7c26d 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c | |||
@@ -67,7 +67,7 @@ void __init efi_bgrt_init(void) | |||
67 | 67 | ||
68 | image = efi_lookup_mapped_addr(bgrt_tab->image_address); | 68 | image = efi_lookup_mapped_addr(bgrt_tab->image_address); |
69 | if (!image) { | 69 | if (!image) { |
70 | image = early_memremap(bgrt_tab->image_address, | 70 | image = early_ioremap(bgrt_tab->image_address, |
71 | sizeof(bmp_header)); | 71 | sizeof(bmp_header)); |
72 | ioremapped = true; | 72 | ioremapped = true; |
73 | if (!image) { | 73 | if (!image) { |
@@ -89,7 +89,7 @@ void __init efi_bgrt_init(void) | |||
89 | } | 89 | } |
90 | 90 | ||
91 | if (ioremapped) { | 91 | if (ioremapped) { |
92 | image = early_memremap(bgrt_tab->image_address, | 92 | image = early_ioremap(bgrt_tab->image_address, |
93 | bmp_header.size); | 93 | bmp_header.size); |
94 | if (!image) { | 94 | if (!image) { |
95 | pr_err("Ignoring BGRT: failed to map image memory\n"); | 95 | pr_err("Ignoring BGRT: failed to map image memory\n"); |
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index dbc8627a5cdf..02744df576d5 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c | |||
@@ -85,12 +85,20 @@ static efi_status_t __init phys_efi_set_virtual_address_map( | |||
85 | efi_memory_desc_t *virtual_map) | 85 | efi_memory_desc_t *virtual_map) |
86 | { | 86 | { |
87 | efi_status_t status; | 87 | efi_status_t status; |
88 | unsigned long flags; | ||
89 | pgd_t *save_pgd; | ||
88 | 90 | ||
89 | efi_call_phys_prolog(); | 91 | save_pgd = efi_call_phys_prolog(); |
92 | |||
93 | /* Disable interrupts around EFI calls: */ | ||
94 | local_irq_save(flags); | ||
90 | status = efi_call_phys(efi_phys.set_virtual_address_map, | 95 | status = efi_call_phys(efi_phys.set_virtual_address_map, |
91 | memory_map_size, descriptor_size, | 96 | memory_map_size, descriptor_size, |
92 | descriptor_version, virtual_map); | 97 | descriptor_version, virtual_map); |
93 | efi_call_phys_epilog(); | 98 | local_irq_restore(flags); |
99 | |||
100 | efi_call_phys_epilog(save_pgd); | ||
101 | |||
94 | return status; | 102 | return status; |
95 | } | 103 | } |
96 | 104 | ||
@@ -491,7 +499,8 @@ void __init efi_init(void) | |||
491 | if (efi_memmap_init()) | 499 | if (efi_memmap_init()) |
492 | return; | 500 | return; |
493 | 501 | ||
494 | print_efi_memmap(); | 502 | if (efi_enabled(EFI_DBG)) |
503 | print_efi_memmap(); | ||
495 | } | 504 | } |
496 | 505 | ||
497 | void __init efi_late_init(void) | 506 | void __init efi_late_init(void) |
@@ -939,6 +948,8 @@ static int __init arch_parse_efi_cmdline(char *str) | |||
939 | { | 948 | { |
940 | if (parse_option_str(str, "old_map")) | 949 | if (parse_option_str(str, "old_map")) |
941 | set_bit(EFI_OLD_MEMMAP, &efi.flags); | 950 | set_bit(EFI_OLD_MEMMAP, &efi.flags); |
951 | if (parse_option_str(str, "debug")) | ||
952 | set_bit(EFI_DBG, &efi.flags); | ||
942 | 953 | ||
943 | return 0; | 954 | return 0; |
944 | } | 955 | } |
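With print_efi_memmap() now gated on EFI_DBG, the firmware memory map is only dumped when the corresponding option is passed on the kernel command line. Assuming the parser above is wired to the usual efi= parameter (its registration is not part of this hunk), usage would look like:

	efi=debug

Since parse_option_str() scans a comma-separated list, "debug" can presumably be combined with the existing "old_map" option in the same efi= string.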
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 40e7cda52936..ed5b67338294 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c | |||
@@ -33,11 +33,10 @@ | |||
33 | 33 | ||
34 | /* | 34 | /* |
35 | * To make EFI call EFI runtime service in physical addressing mode we need | 35 | * To make EFI call EFI runtime service in physical addressing mode we need |
36 | * prolog/epilog before/after the invocation to disable interrupt, to | 36 | * prolog/epilog before/after the invocation to claim the EFI runtime service |
37 | * claim EFI runtime service handler exclusively and to duplicate a memory in | 37 | * handler exclusively and to duplicate a memory mapping in low memory space, |
38 | * low memory space say 0 - 3G. | 38 | * say 0 - 3G. |
39 | */ | 39 | */ |
40 | static unsigned long efi_rt_eflags; | ||
41 | 40 | ||
42 | void efi_sync_low_kernel_mappings(void) {} | 41 | void efi_sync_low_kernel_mappings(void) {} |
43 | void __init efi_dump_pagetable(void) {} | 42 | void __init efi_dump_pagetable(void) {} |
@@ -57,21 +56,24 @@ void __init efi_map_region(efi_memory_desc_t *md) | |||
57 | void __init efi_map_region_fixed(efi_memory_desc_t *md) {} | 56 | void __init efi_map_region_fixed(efi_memory_desc_t *md) {} |
58 | void __init parse_efi_setup(u64 phys_addr, u32 data_len) {} | 57 | void __init parse_efi_setup(u64 phys_addr, u32 data_len) {} |
59 | 58 | ||
60 | void __init efi_call_phys_prolog(void) | 59 | pgd_t * __init efi_call_phys_prolog(void) |
61 | { | 60 | { |
62 | struct desc_ptr gdt_descr; | 61 | struct desc_ptr gdt_descr; |
62 | pgd_t *save_pgd; | ||
63 | 63 | ||
64 | local_irq_save(efi_rt_eflags); | 64 | /* Current pgd is swapper_pg_dir, we'll restore it later: */ |
65 | 65 | save_pgd = swapper_pg_dir; | |
66 | load_cr3(initial_page_table); | 66 | load_cr3(initial_page_table); |
67 | __flush_tlb_all(); | 67 | __flush_tlb_all(); |
68 | 68 | ||
69 | gdt_descr.address = __pa(get_cpu_gdt_table(0)); | 69 | gdt_descr.address = __pa(get_cpu_gdt_table(0)); |
70 | gdt_descr.size = GDT_SIZE - 1; | 70 | gdt_descr.size = GDT_SIZE - 1; |
71 | load_gdt(&gdt_descr); | 71 | load_gdt(&gdt_descr); |
72 | |||
73 | return save_pgd; | ||
72 | } | 74 | } |
73 | 75 | ||
74 | void __init efi_call_phys_epilog(void) | 76 | void __init efi_call_phys_epilog(pgd_t *save_pgd) |
75 | { | 77 | { |
76 | struct desc_ptr gdt_descr; | 78 | struct desc_ptr gdt_descr; |
77 | 79 | ||
@@ -79,10 +81,8 @@ void __init efi_call_phys_epilog(void) | |||
79 | gdt_descr.size = GDT_SIZE - 1; | 81 | gdt_descr.size = GDT_SIZE - 1; |
80 | load_gdt(&gdt_descr); | 82 | load_gdt(&gdt_descr); |
81 | 83 | ||
82 | load_cr3(swapper_pg_dir); | 84 | load_cr3(save_pgd); |
83 | __flush_tlb_all(); | 85 | __flush_tlb_all(); |
84 | |||
85 | local_irq_restore(efi_rt_eflags); | ||
86 | } | 86 | } |
87 | 87 | ||
88 | void __init efi_runtime_mkexec(void) | 88 | void __init efi_runtime_mkexec(void) |
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 17e80d829df0..a0ac0f9c307f 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c | |||
@@ -41,9 +41,6 @@ | |||
41 | #include <asm/realmode.h> | 41 | #include <asm/realmode.h> |
42 | #include <asm/time.h> | 42 | #include <asm/time.h> |
43 | 43 | ||
44 | static pgd_t *save_pgd __initdata; | ||
45 | static unsigned long efi_flags __initdata; | ||
46 | |||
47 | /* | 44 | /* |
48 | * We allocate runtime services regions bottom-up, starting from -4G, i.e. | 45 | * We allocate runtime services regions bottom-up, starting from -4G, i.e. |
49 | * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. | 46 | * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. |
@@ -78,17 +75,18 @@ static void __init early_code_mapping_set_exec(int executable) | |||
78 | } | 75 | } |
79 | } | 76 | } |
80 | 77 | ||
81 | void __init efi_call_phys_prolog(void) | 78 | pgd_t * __init efi_call_phys_prolog(void) |
82 | { | 79 | { |
83 | unsigned long vaddress; | 80 | unsigned long vaddress; |
81 | pgd_t *save_pgd; | ||
82 | |||
84 | int pgd; | 83 | int pgd; |
85 | int n_pgds; | 84 | int n_pgds; |
86 | 85 | ||
87 | if (!efi_enabled(EFI_OLD_MEMMAP)) | 86 | if (!efi_enabled(EFI_OLD_MEMMAP)) |
88 | return; | 87 | return NULL; |
89 | 88 | ||
90 | early_code_mapping_set_exec(1); | 89 | early_code_mapping_set_exec(1); |
91 | local_irq_save(efi_flags); | ||
92 | 90 | ||
93 | n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); | 91 | n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); |
94 | save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL); | 92 | save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL); |
@@ -99,24 +97,29 @@ void __init efi_call_phys_prolog(void) | |||
99 | set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress)); | 97 | set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress)); |
100 | } | 98 | } |
101 | __flush_tlb_all(); | 99 | __flush_tlb_all(); |
100 | |||
101 | return save_pgd; | ||
102 | } | 102 | } |
103 | 103 | ||
104 | void __init efi_call_phys_epilog(void) | 104 | void __init efi_call_phys_epilog(pgd_t *save_pgd) |
105 | { | 105 | { |
106 | /* | 106 | /* |
107 | * After the lock is released, the original page table is restored. | 107 | * After the lock is released, the original page table is restored. |
108 | */ | 108 | */ |
109 | int pgd; | 109 | int pgd_idx; |
110 | int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); | 110 | int nr_pgds; |
111 | 111 | ||
112 | if (!efi_enabled(EFI_OLD_MEMMAP)) | 112 | if (!save_pgd) |
113 | return; | 113 | return; |
114 | 114 | ||
115 | for (pgd = 0; pgd < n_pgds; pgd++) | 115 | nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); |
116 | set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]); | 116 | |
117 | for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) | ||
118 | set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]); | ||
119 | |||
117 | kfree(save_pgd); | 120 | kfree(save_pgd); |
121 | |||
118 | __flush_tlb_all(); | 122 | __flush_tlb_all(); |
119 | local_irq_restore(efi_flags); | ||
120 | early_code_mapping_set_exec(0); | 123 | early_code_mapping_set_exec(0); |
121 | } | 124 | } |
122 | 125 | ||
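The prolog/epilog rework above (both the 32-bit and 64-bit variants) changes the helpers' contract: efi_call_phys_prolog() now returns the pgd to restore and no longer touches IRQ state, and efi_call_phys_epilog() takes that pgd back. A minimal sketch of how a caller is expected to pair them, with interrupt masking moved to the call site; the function name and the efi_phys/efi_call_phys usage are sketched from the surrounding EFI code and may not match the file exactly:

static efi_status_t __init example_set_virtual_address_map(
		unsigned long memory_map_size,
		unsigned long descriptor_size,
		u32 descriptor_version,
		efi_memory_desc_t *virtual_map)
{
	efi_status_t status;
	unsigned long flags;
	pgd_t *save_pgd;

	/* Switch to the low mappings and remember what to restore: */
	save_pgd = efi_call_phys_prolog();

	/* Interrupt masking is now the caller's job, not the prolog's: */
	local_irq_save(flags);
	status = efi_call_phys(efi_phys.set_virtual_address_map,
			       memory_map_size, descriptor_size,
			       descriptor_version, virtual_map);
	local_irq_restore(flags);

	/* Put the original page tables back; on 64-bit a NULL save_pgd
	 * (no EFI_OLD_MEMMAP) makes the epilog a no-op: */
	efi_call_phys_epilog(save_pgd);

	return status;
}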
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index c9a0838890e2..278e4da4222f 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c | |||
@@ -11,6 +11,7 @@ | |||
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <asm-generic/sections.h> | 13 | #include <asm-generic/sections.h> |
14 | #include <asm/cpu_device_id.h> | ||
14 | #include <asm/imr.h> | 15 | #include <asm/imr.h> |
15 | #include <linux/init.h> | 16 | #include <linux/init.h> |
16 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
@@ -101,6 +102,12 @@ static void __init imr_self_test(void) | |||
101 | } | 102 | } |
102 | } | 103 | } |
103 | 104 | ||
105 | static const struct x86_cpu_id imr_ids[] __initconst = { | ||
106 | { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ | ||
107 | {} | ||
108 | }; | ||
109 | MODULE_DEVICE_TABLE(x86cpu, imr_ids); | ||
110 | |||
104 | /** | 111 | /** |
105 | * imr_self_test_init - entry point for IMR driver. | 112 | * imr_self_test_init - entry point for IMR driver. |
106 | * | 113 | * |
@@ -108,7 +115,8 @@ static void __init imr_self_test(void) | |||
108 | */ | 115 | */ |
109 | static int __init imr_self_test_init(void) | 116 | static int __init imr_self_test_init(void) |
110 | { | 117 | { |
111 | imr_self_test(); | 118 | if (x86_match_cpu(imr_ids)) |
119 | imr_self_test(); | ||
112 | return 0; | 120 | return 0; |
113 | } | 121 | } |
114 | 122 | ||
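The self-test is now gated on an x86_cpu_id table via x86_match_cpu(). A small sketch of that pattern in isolation; the "example" names are made up, and the vendor/family/model triple is the Quark X1000 entry used in the hunk above:

#include <asm/cpu_device_id.h>
#include <linux/init.h>
#include <linux/module.h>

static const struct x86_cpu_id example_cpu_ids[] __initconst = {
	{ X86_VENDOR_INTEL, 5, 9 },	/* vendor, family, model */
	{}				/* terminating empty entry */
};
MODULE_DEVICE_TABLE(x86cpu, example_cpu_ids);

static int __init example_init(void)
{
	/* x86_match_cpu() returns the matching table entry, or NULL if
	 * the running CPU is not listed. */
	if (!x86_match_cpu(example_cpu_ids))
		return -ENODEV;

	/* ... hardware-specific setup would go here ... */
	return 0;
}
module_init(example_init);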
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 9a2e590dd202..7fa8b3b53bc0 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c | |||
@@ -61,7 +61,7 @@ static void battery_status_changed(void) | |||
61 | 61 | ||
62 | if (psy) { | 62 | if (psy) { |
63 | power_supply_changed(psy); | 63 | power_supply_changed(psy); |
64 | put_device(psy->dev); | 64 | power_supply_put(psy); |
65 | } | 65 | } |
66 | } | 66 | } |
67 | 67 | ||
@@ -71,7 +71,7 @@ static void ac_status_changed(void) | |||
71 | 71 | ||
72 | if (psy) { | 72 | if (psy) { |
73 | power_supply_changed(psy); | 73 | power_supply_changed(psy); |
74 | put_device(psy->dev); | 74 | power_supply_put(psy); |
75 | } | 75 | } |
76 | } | 76 | } |
77 | 77 | ||
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 08e350e757dc..55130846ac87 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c | |||
@@ -83,7 +83,7 @@ static void battery_status_changed(void) | |||
83 | 83 | ||
84 | if (psy) { | 84 | if (psy) { |
85 | power_supply_changed(psy); | 85 | power_supply_changed(psy); |
86 | put_device(psy->dev); | 86 | power_supply_put(psy); |
87 | } | 87 | } |
88 | } | 88 | } |
89 | 89 | ||
@@ -93,7 +93,7 @@ static void ac_status_changed(void) | |||
93 | 93 | ||
94 | if (psy) { | 94 | if (psy) { |
95 | power_supply_changed(psy); | 95 | power_supply_changed(psy); |
96 | put_device(psy->dev); | 96 | power_supply_put(psy); |
97 | } | 97 | } |
98 | } | 98 | } |
99 | 99 | ||
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 994798548b1a..3b6ec42718e4 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c | |||
@@ -415,7 +415,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) | |||
415 | struct reset_args reset_args; | 415 | struct reset_args reset_args; |
416 | 416 | ||
417 | reset_args.sender = sender; | 417 | reset_args.sender = sender; |
418 | cpus_clear(*mask); | 418 | cpumask_clear(mask); |
419 | /* find a single cpu for each uvhub in this distribution mask */ | 419 | /* find a single cpu for each uvhub in this distribution mask */ |
420 | maskbits = sizeof(struct pnmask) * BITSPERBYTE; | 420 | maskbits = sizeof(struct pnmask) * BITSPERBYTE; |
421 | /* each bit is a pnode relative to the partition base pnode */ | 421 | /* each bit is a pnode relative to the partition base pnode */ |
@@ -425,7 +425,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) | |||
425 | continue; | 425 | continue; |
426 | apnode = pnode + bcp->partition_base_pnode; | 426 | apnode = pnode + bcp->partition_base_pnode; |
427 | cpu = pnode_to_first_cpu(apnode, smaster); | 427 | cpu = pnode_to_first_cpu(apnode, smaster); |
428 | cpu_set(cpu, *mask); | 428 | cpumask_set_cpu(cpu, mask); |
429 | } | 429 | } |
430 | 430 | ||
431 | /* IPI all cpus; preemption is already disabled */ | 431 | /* IPI all cpus; preemption is already disabled */ |
@@ -1126,7 +1126,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, | |||
1126 | /* don't actually do a shootdown of the local cpu */ | 1126 | /* don't actually do a shootdown of the local cpu */ |
1127 | cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); | 1127 | cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); |
1128 | 1128 | ||
1129 | if (cpu_isset(cpu, *cpumask)) | 1129 | if (cpumask_test_cpu(cpu, cpumask)) |
1130 | stat->s_ntargself++; | 1130 | stat->s_ntargself++; |
1131 | 1131 | ||
1132 | bau_desc = bcp->descriptor_base; | 1132 | bau_desc = bcp->descriptor_base; |
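These tlb_uv.c hunks replace the old value-based cpumask helpers (cpus_clear(), cpu_set(), cpu_isset()) with the pointer-based cpumask API. A minimal sketch of the replacement accessors; the function and its purpose are invented for illustration:

#include <linux/cpumask.h>
#include <linux/printk.h>

static int count_remote_targets(struct cpumask *mask,
				const struct cpumask *candidates, int self)
{
	int cpu;

	cpumask_clear(mask);			/* was: cpus_clear(*mask) */

	for_each_cpu(cpu, candidates) {
		if (cpu == self)
			continue;
		cpumask_set_cpu(cpu, mask);	/* was: cpu_set(cpu, *mask) */
	}

	if (cpumask_test_cpu(self, candidates))	/* was: cpu_isset(self, *candidates) */
		pr_debug("local cpu excluded from the target set\n");

	return cpumask_weight(mask);		/* number of remote targets */
}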
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 3e32ed5648a0..757678fb26e1 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c | |||
@@ -134,7 +134,7 @@ static void do_fpu_end(void) | |||
134 | static void fix_processor_context(void) | 134 | static void fix_processor_context(void) |
135 | { | 135 | { |
136 | int cpu = smp_processor_id(); | 136 | int cpu = smp_processor_id(); |
137 | struct tss_struct *t = &per_cpu(init_tss, cpu); | 137 | struct tss_struct *t = &per_cpu(cpu_tss, cpu); |
138 | #ifdef CONFIG_X86_64 | 138 | #ifdef CONFIG_X86_64 |
139 | struct desc_struct *desc = get_cpu_gdt_table(cpu); | 139 | struct desc_struct *desc = get_cpu_gdt_table(cpu); |
140 | tss_desc tss; | 140 | tss_desc tss; |
diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index 3323c2745248..a55abb9f6c5e 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile | |||
@@ -19,6 +19,9 @@ quiet_cmd_syshdr = SYSHDR $@ | |||
19 | quiet_cmd_systbl = SYSTBL $@ | 19 | quiet_cmd_systbl = SYSTBL $@ |
20 | cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ | 20 | cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ |
21 | 21 | ||
22 | quiet_cmd_hypercalls = HYPERCALLS $@ | ||
23 | cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^) | ||
24 | |||
22 | syshdr_abi_unistd_32 := i386 | 25 | syshdr_abi_unistd_32 := i386 |
23 | $(uapi)/unistd_32.h: $(syscall32) $(syshdr) | 26 | $(uapi)/unistd_32.h: $(syscall32) $(syshdr) |
24 | $(call if_changed,syshdr) | 27 | $(call if_changed,syshdr) |
@@ -47,10 +50,16 @@ $(out)/syscalls_32.h: $(syscall32) $(systbl) | |||
47 | $(out)/syscalls_64.h: $(syscall64) $(systbl) | 50 | $(out)/syscalls_64.h: $(syscall64) $(systbl) |
48 | $(call if_changed,systbl) | 51 | $(call if_changed,systbl) |
49 | 52 | ||
53 | $(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh | ||
54 | $(call if_changed,hypercalls) | ||
55 | |||
56 | $(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h | ||
57 | |||
50 | uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h | 58 | uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h |
51 | syshdr-y += syscalls_32.h | 59 | syshdr-y += syscalls_32.h |
52 | syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h | 60 | syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h |
53 | syshdr-$(CONFIG_X86_64) += syscalls_64.h | 61 | syshdr-$(CONFIG_X86_64) += syscalls_64.h |
62 | syshdr-$(CONFIG_XEN) += xen-hypercalls.h | ||
54 | 63 | ||
55 | targets += $(uapisyshdr-y) $(syshdr-y) | 64 | targets += $(uapisyshdr-y) $(syshdr-y) |
56 | 65 | ||
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index b3560ece1c9f..ef8187f9d28d 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl | |||
@@ -119,7 +119,7 @@ | |||
119 | 110 i386 iopl sys_iopl | 119 | 110 i386 iopl sys_iopl |
120 | 111 i386 vhangup sys_vhangup | 120 | 111 i386 vhangup sys_vhangup |
121 | 112 i386 idle | 121 | 112 i386 idle |
122 | 113 i386 vm86old sys_vm86old sys32_vm86_warning | 122 | 113 i386 vm86old sys_vm86old sys_ni_syscall |
123 | 114 i386 wait4 sys_wait4 compat_sys_wait4 | 123 | 114 i386 wait4 sys_wait4 compat_sys_wait4 |
124 | 115 i386 swapoff sys_swapoff | 124 | 115 i386 swapoff sys_swapoff |
125 | 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo | 125 | 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo |
@@ -172,7 +172,7 @@ | |||
172 | 163 i386 mremap sys_mremap | 172 | 163 i386 mremap sys_mremap |
173 | 164 i386 setresuid sys_setresuid16 | 173 | 164 i386 setresuid sys_setresuid16 |
174 | 165 i386 getresuid sys_getresuid16 | 174 | 165 i386 getresuid sys_getresuid16 |
175 | 166 i386 vm86 sys_vm86 sys32_vm86_warning | 175 | 166 i386 vm86 sys_vm86 sys_ni_syscall |
176 | 167 i386 query_module | 176 | 167 i386 query_module |
177 | 168 i386 poll sys_poll | 177 | 168 i386 poll sys_poll |
178 | 169 i386 nfsservctl | 178 | 169 i386 nfsservctl |
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 8d656fbb57aa..9ef32d5f1b19 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl | |||
@@ -178,7 +178,7 @@ | |||
178 | 169 common reboot sys_reboot | 178 | 169 common reboot sys_reboot |
179 | 170 common sethostname sys_sethostname | 179 | 170 common sethostname sys_sethostname |
180 | 171 common setdomainname sys_setdomainname | 180 | 171 common setdomainname sys_setdomainname |
181 | 172 common iopl stub_iopl | 181 | 172 common iopl sys_iopl |
182 | 173 common ioperm sys_ioperm | 182 | 173 common ioperm sys_ioperm |
183 | 174 64 create_module | 183 | 174 64 create_module |
184 | 175 common init_module sys_init_module | 184 | 175 common init_module sys_init_module |
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index eafa324eb7a5..acb384d24669 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile | |||
@@ -21,7 +21,6 @@ obj-$(CONFIG_BINFMT_ELF) += elfcore.o | |||
21 | 21 | ||
22 | subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o | 22 | subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o |
23 | subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o | 23 | subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o |
24 | subarch-$(CONFIG_HIGHMEM) += ../mm/highmem_32.o | ||
25 | 24 | ||
26 | else | 25 | else |
27 | 26 | ||
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 2d7d9a1f5b53..7e8a1a650435 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h | |||
@@ -36,22 +36,11 @@ | |||
36 | #endif /* CONFIG_X86_PPRO_FENCE */ | 36 | #endif /* CONFIG_X86_PPRO_FENCE */ |
37 | #define dma_wmb() barrier() | 37 | #define dma_wmb() barrier() |
38 | 38 | ||
39 | #ifdef CONFIG_SMP | ||
40 | |||
41 | #define smp_mb() mb() | ||
42 | #define smp_rmb() dma_rmb() | ||
43 | #define smp_wmb() barrier() | ||
44 | #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) | ||
45 | |||
46 | #else /* CONFIG_SMP */ | ||
47 | |||
48 | #define smp_mb() barrier() | 39 | #define smp_mb() barrier() |
49 | #define smp_rmb() barrier() | 40 | #define smp_rmb() barrier() |
50 | #define smp_wmb() barrier() | 41 | #define smp_wmb() barrier() |
51 | #define set_mb(var, value) do { var = value; barrier(); } while (0) | 42 | #define set_mb(var, value) do { var = value; barrier(); } while (0) |
52 | 43 | ||
53 | #endif /* CONFIG_SMP */ | ||
54 | |||
55 | #define read_barrier_depends() do { } while (0) | 44 | #define read_barrier_depends() do { } while (0) |
56 | #define smp_read_barrier_depends() do { } while (0) | 45 | #define smp_read_barrier_depends() do { } while (0) |
57 | 46 | ||
@@ -64,8 +53,8 @@ | |||
64 | */ | 53 | */ |
65 | static inline void rdtsc_barrier(void) | 54 | static inline void rdtsc_barrier(void) |
66 | { | 55 | { |
67 | alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); | 56 | alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, |
68 | alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); | 57 | "lfence", X86_FEATURE_LFENCE_RDTSC); |
69 | } | 58 | } |
70 | 59 | ||
71 | #endif | 60 | #endif |
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 25a1022dd793..0a656b727b1a 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h | |||
@@ -210,7 +210,7 @@ extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu); | |||
210 | 210 | ||
211 | #define ELF_EXEC_PAGESIZE 4096 | 211 | #define ELF_EXEC_PAGESIZE 4096 |
212 | 212 | ||
213 | #define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) | 213 | #define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) |
214 | 214 | ||
215 | extern long elf_aux_hwcap; | 215 | extern long elf_aux_hwcap; |
216 | #define ELF_HWCAP (elf_aux_hwcap) | 216 | #define ELF_HWCAP (elf_aux_hwcap) |
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 8e08176f0bcb..5c0b711d2433 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c | |||
@@ -8,9 +8,7 @@ | |||
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <asm/unistd.h> | 9 | #include <asm/unistd.h> |
10 | #include <os.h> | 10 | #include <os.h> |
11 | #include <proc_mm.h> | ||
12 | #include <skas.h> | 11 | #include <skas.h> |
13 | #include <skas_ptrace.h> | ||
14 | #include <sysdep/tls.h> | 12 | #include <sysdep/tls.h> |
15 | 13 | ||
16 | extern int modify_ldt(int func, void *ptr, unsigned long bytecount); | 14 | extern int modify_ldt(int func, void *ptr, unsigned long bytecount); |
@@ -19,105 +17,20 @@ static long write_ldt_entry(struct mm_id *mm_idp, int func, | |||
19 | struct user_desc *desc, void **addr, int done) | 17 | struct user_desc *desc, void **addr, int done) |
20 | { | 18 | { |
21 | long res; | 19 | long res; |
22 | 20 | void *stub_addr; | |
23 | if (proc_mm) { | 21 | res = syscall_stub_data(mm_idp, (unsigned long *)desc, |
24 | /* | 22 | (sizeof(*desc) + sizeof(long) - 1) & |
25 | * This is a special handling for the case, that the mm to | 23 | ~(sizeof(long) - 1), |
26 | * modify isn't current->active_mm. | 24 | addr, &stub_addr); |
27 | * If this is called directly by modify_ldt, | 25 | if (!res) { |
28 | * (current->active_mm->context.skas.u == mm_idp) | 26 | unsigned long args[] = { func, |
29 | * will be true. So no call to __switch_mm(mm_idp) is done. | 27 | (unsigned long)stub_addr, |
30 | * If this is called in case of init_new_ldt or PTRACE_LDT, | 28 | sizeof(*desc), |
31 | * mm_idp won't belong to current->active_mm, but child->mm. | 29 | 0, 0, 0 }; |
32 | * So we need to switch child's mm into our userspace, then | 30 | res = run_syscall_stub(mm_idp, __NR_modify_ldt, args, |
33 | * later switch back. | 31 | 0, addr, done); |
34 | * | ||
35 | * Note: I'm unsure: should interrupts be disabled here? | ||
36 | */ | ||
37 | if (!current->active_mm || current->active_mm == &init_mm || | ||
38 | mm_idp != ¤t->active_mm->context.id) | ||
39 | __switch_mm(mm_idp); | ||
40 | } | ||
41 | |||
42 | if (ptrace_ldt) { | ||
43 | struct ptrace_ldt ldt_op = (struct ptrace_ldt) { | ||
44 | .func = func, | ||
45 | .ptr = desc, | ||
46 | .bytecount = sizeof(*desc)}; | ||
47 | u32 cpu; | ||
48 | int pid; | ||
49 | |||
50 | if (!proc_mm) | ||
51 | pid = mm_idp->u.pid; | ||
52 | else { | ||
53 | cpu = get_cpu(); | ||
54 | pid = userspace_pid[cpu]; | ||
55 | } | ||
56 | |||
57 | res = os_ptrace_ldt(pid, 0, (unsigned long) &ldt_op); | ||
58 | |||
59 | if (proc_mm) | ||
60 | put_cpu(); | ||
61 | } | ||
62 | else { | ||
63 | void *stub_addr; | ||
64 | res = syscall_stub_data(mm_idp, (unsigned long *)desc, | ||
65 | (sizeof(*desc) + sizeof(long) - 1) & | ||
66 | ~(sizeof(long) - 1), | ||
67 | addr, &stub_addr); | ||
68 | if (!res) { | ||
69 | unsigned long args[] = { func, | ||
70 | (unsigned long)stub_addr, | ||
71 | sizeof(*desc), | ||
72 | 0, 0, 0 }; | ||
73 | res = run_syscall_stub(mm_idp, __NR_modify_ldt, args, | ||
74 | 0, addr, done); | ||
75 | } | ||
76 | } | 32 | } |
77 | 33 | ||
78 | if (proc_mm) { | ||
79 | /* | ||
80 | * This is the second part of special handling, that makes | ||
81 | * PTRACE_LDT possible to implement. | ||
82 | */ | ||
83 | if (current->active_mm && current->active_mm != &init_mm && | ||
84 | mm_idp != ¤t->active_mm->context.id) | ||
85 | __switch_mm(¤t->active_mm->context.id); | ||
86 | } | ||
87 | |||
88 | return res; | ||
89 | } | ||
90 | |||
91 | static long read_ldt_from_host(void __user * ptr, unsigned long bytecount) | ||
92 | { | ||
93 | int res, n; | ||
94 | struct ptrace_ldt ptrace_ldt = (struct ptrace_ldt) { | ||
95 | .func = 0, | ||
96 | .bytecount = bytecount, | ||
97 | .ptr = kmalloc(bytecount, GFP_KERNEL)}; | ||
98 | u32 cpu; | ||
99 | |||
100 | if (ptrace_ldt.ptr == NULL) | ||
101 | return -ENOMEM; | ||
102 | |||
103 | /* | ||
104 | * This is called from sys_modify_ldt only, so userspace_pid gives | ||
105 | * us the right number | ||
106 | */ | ||
107 | |||
108 | cpu = get_cpu(); | ||
109 | res = os_ptrace_ldt(userspace_pid[cpu], 0, (unsigned long) &ptrace_ldt); | ||
110 | put_cpu(); | ||
111 | if (res < 0) | ||
112 | goto out; | ||
113 | |||
114 | n = copy_to_user(ptr, ptrace_ldt.ptr, res); | ||
115 | if (n != 0) | ||
116 | res = -EFAULT; | ||
117 | |||
118 | out: | ||
119 | kfree(ptrace_ldt.ptr); | ||
120 | |||
121 | return res; | 34 | return res; |
122 | } | 35 | } |
123 | 36 | ||
@@ -145,9 +58,6 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) | |||
145 | bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; | 58 | bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; |
146 | err = bytecount; | 59 | err = bytecount; |
147 | 60 | ||
148 | if (ptrace_ldt) | ||
149 | return read_ldt_from_host(ptr, bytecount); | ||
150 | |||
151 | mutex_lock(&ldt->lock); | 61 | mutex_lock(&ldt->lock); |
152 | if (ldt->entry_count <= LDT_DIRECT_ENTRIES) { | 62 | if (ldt->entry_count <= LDT_DIRECT_ENTRIES) { |
153 | size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES; | 63 | size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES; |
@@ -229,17 +139,11 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func) | |||
229 | goto out; | 139 | goto out; |
230 | } | 140 | } |
231 | 141 | ||
232 | if (!ptrace_ldt) | 142 | mutex_lock(&ldt->lock); |
233 | mutex_lock(&ldt->lock); | ||
234 | 143 | ||
235 | err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1); | 144 | err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1); |
236 | if (err) | 145 | if (err) |
237 | goto out_unlock; | 146 | goto out_unlock; |
238 | else if (ptrace_ldt) { | ||
239 | /* With PTRACE_LDT available, this is used as a flag only */ | ||
240 | ldt->entry_count = 1; | ||
241 | goto out; | ||
242 | } | ||
243 | 147 | ||
244 | if (ldt_info.entry_number >= ldt->entry_count && | 148 | if (ldt_info.entry_number >= ldt->entry_count && |
245 | ldt_info.entry_number >= LDT_DIRECT_ENTRIES) { | 149 | ldt_info.entry_number >= LDT_DIRECT_ENTRIES) { |
@@ -393,91 +297,56 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm) | |||
393 | int i; | 297 | int i; |
394 | long page, err=0; | 298 | long page, err=0; |
395 | void *addr = NULL; | 299 | void *addr = NULL; |
396 | struct proc_mm_op copy; | ||
397 | 300 | ||
398 | 301 | ||
399 | if (!ptrace_ldt) | 302 | mutex_init(&new_mm->arch.ldt.lock); |
400 | mutex_init(&new_mm->arch.ldt.lock); | ||
401 | 303 | ||
402 | if (!from_mm) { | 304 | if (!from_mm) { |
403 | memset(&desc, 0, sizeof(desc)); | 305 | memset(&desc, 0, sizeof(desc)); |
404 | /* | 306 | /* |
405 | * We have to initialize a clean ldt. | 307 | * Now we try to retrieve info about the ldt, we |
308 | * inherited from the host. All ldt-entries found | ||
309 | * will be reset in the following loop | ||
406 | */ | 310 | */ |
407 | if (proc_mm) { | 311 | ldt_get_host_info(); |
408 | /* | 312 | for (num_p=host_ldt_entries; *num_p != -1; num_p++) { |
409 | * If the new mm was created using proc_mm, host's | 313 | desc.entry_number = *num_p; |
410 | * default-ldt currently is assigned, which normally | 314 | err = write_ldt_entry(&new_mm->id, 1, &desc, |
411 | * contains the call-gates for lcall7 and lcall27. | 315 | &addr, *(num_p + 1) == -1); |
412 | * To remove these gates, we simply write an empty | 316 | if (err) |
413 | * entry as number 0 to the host. | 317 | break; |
414 | */ | ||
415 | err = write_ldt_entry(&new_mm->id, 1, &desc, &addr, 1); | ||
416 | } | ||
417 | else{ | ||
418 | /* | ||
419 | * Now we try to retrieve info about the ldt, we | ||
420 | * inherited from the host. All ldt-entries found | ||
421 | * will be reset in the following loop | ||
422 | */ | ||
423 | ldt_get_host_info(); | ||
424 | for (num_p=host_ldt_entries; *num_p != -1; num_p++) { | ||
425 | desc.entry_number = *num_p; | ||
426 | err = write_ldt_entry(&new_mm->id, 1, &desc, | ||
427 | &addr, *(num_p + 1) == -1); | ||
428 | if (err) | ||
429 | break; | ||
430 | } | ||
431 | } | 318 | } |
432 | new_mm->arch.ldt.entry_count = 0; | 319 | new_mm->arch.ldt.entry_count = 0; |
433 | 320 | ||
434 | goto out; | 321 | goto out; |
435 | } | 322 | } |
436 | 323 | ||
437 | if (proc_mm) { | 324 | /* |
438 | /* | 325 | * Our local LDT is used to supply the data for |
439 | * We have a valid from_mm, so we now have to copy the LDT of | 326 | * modify_ldt(READLDT), if PTRACE_LDT isn't available, |
440 | * from_mm to new_mm, because using proc_mm an new mm with | 327 | * i.e., we have to use the stub for modify_ldt, which |
441 | * an empty/default LDT was created in new_mm() | 328 | * can't handle the big read buffer of up to 64kB. |
442 | */ | 329 | */ |
443 | copy = ((struct proc_mm_op) { .op = MM_COPY_SEGMENTS, | 330 | mutex_lock(&from_mm->arch.ldt.lock); |
444 | .u = | 331 | if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES) |
445 | { .copy_segments = | 332 | memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries, |
446 | from_mm->id.u.mm_fd } } ); | 333 | sizeof(new_mm->arch.ldt.u.entries)); |
447 | i = os_write_file(new_mm->id.u.mm_fd, ©, sizeof(copy)); | 334 | else { |
448 | if (i != sizeof(copy)) | 335 | i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; |
449 | printk(KERN_ERR "new_mm : /proc/mm copy_segments " | 336 | while (i-->0) { |
450 | "failed, err = %d\n", -i); | 337 | page = __get_free_page(GFP_KERNEL|__GFP_ZERO); |
451 | } | 338 | if (!page) { |
452 | 339 | err = -ENOMEM; | |
453 | if (!ptrace_ldt) { | 340 | break; |
454 | /* | ||
455 | * Our local LDT is used to supply the data for | ||
456 | * modify_ldt(READLDT), if PTRACE_LDT isn't available, | ||
457 | * i.e., we have to use the stub for modify_ldt, which | ||
458 | * can't handle the big read buffer of up to 64kB. | ||
459 | */ | ||
460 | mutex_lock(&from_mm->arch.ldt.lock); | ||
461 | if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES) | ||
462 | memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries, | ||
463 | sizeof(new_mm->arch.ldt.u.entries)); | ||
464 | else { | ||
465 | i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; | ||
466 | while (i-->0) { | ||
467 | page = __get_free_page(GFP_KERNEL|__GFP_ZERO); | ||
468 | if (!page) { | ||
469 | err = -ENOMEM; | ||
470 | break; | ||
471 | } | ||
472 | new_mm->arch.ldt.u.pages[i] = | ||
473 | (struct ldt_entry *) page; | ||
474 | memcpy(new_mm->arch.ldt.u.pages[i], | ||
475 | from_mm->arch.ldt.u.pages[i], PAGE_SIZE); | ||
476 | } | 341 | } |
342 | new_mm->arch.ldt.u.pages[i] = | ||
343 | (struct ldt_entry *) page; | ||
344 | memcpy(new_mm->arch.ldt.u.pages[i], | ||
345 | from_mm->arch.ldt.u.pages[i], PAGE_SIZE); | ||
477 | } | 346 | } |
478 | new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count; | ||
479 | mutex_unlock(&from_mm->arch.ldt.lock); | ||
480 | } | 347 | } |
348 | new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count; | ||
349 | mutex_unlock(&from_mm->arch.ldt.lock); | ||
481 | 350 | ||
482 | out: | 351 | out: |
483 | return err; | 352 | return err; |
@@ -488,7 +357,7 @@ void free_ldt(struct mm_context *mm) | |||
488 | { | 357 | { |
489 | int i; | 358 | int i; |
490 | 359 | ||
491 | if (!ptrace_ldt && mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) { | 360 | if (mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) { |
492 | i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; | 361 | i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; |
493 | while (i-- > 0) | 362 | while (i-- > 0) |
494 | free_page((long) mm->arch.ldt.u.pages[i]); | 363 | free_page((long) mm->arch.ldt.u.pages[i]); |
diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h index a26086b8a800..b6f2437ec29c 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_32.h +++ b/arch/x86/um/shared/sysdep/faultinfo_32.h | |||
@@ -27,9 +27,6 @@ struct faultinfo { | |||
27 | /* This is Page Fault */ | 27 | /* This is Page Fault */ |
28 | #define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) | 28 | #define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) |
29 | 29 | ||
30 | /* SKAS3 has no trap_no on i386, but get_skas_faultinfo() sets it to 0. */ | ||
31 | #define SEGV_MAYBE_FIXABLE(fi) ((fi)->trap_no == 0 && ptrace_faultinfo) | ||
32 | |||
33 | #define PTRACE_FULL_FAULTINFO 0 | 30 | #define PTRACE_FULL_FAULTINFO 0 |
34 | 31 | ||
35 | #endif | 32 | #endif |
diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h index f811cbe15d62..ee88f88974ea 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_64.h +++ b/arch/x86/um/shared/sysdep/faultinfo_64.h | |||
@@ -27,9 +27,6 @@ struct faultinfo { | |||
27 | /* This is Page Fault */ | 27 | /* This is Page Fault */ |
28 | #define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) | 28 | #define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) |
29 | 29 | ||
30 | /* No broken SKAS API, which doesn't pass trap_no, here. */ | ||
31 | #define SEGV_MAYBE_FIXABLE(fi) 0 | ||
32 | |||
33 | #define PTRACE_FULL_FAULTINFO 1 | 30 | #define PTRACE_FULL_FAULTINFO 1 |
34 | 31 | ||
35 | #endif | 32 | #endif |
diff --git a/arch/x86/um/shared/sysdep/skas_ptrace.h b/arch/x86/um/shared/sysdep/skas_ptrace.h deleted file mode 100644 index 453febe98993..000000000000 --- a/arch/x86/um/shared/sysdep/skas_ptrace.h +++ /dev/null | |||
@@ -1,22 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) | ||
3 | * Licensed under the GPL | ||
4 | */ | ||
5 | |||
6 | #ifndef __SYSDEP_X86_SKAS_PTRACE_H | ||
7 | #define __SYSDEP_X86_SKAS_PTRACE_H | ||
8 | |||
9 | struct ptrace_faultinfo { | ||
10 | int is_write; | ||
11 | unsigned long addr; | ||
12 | }; | ||
13 | |||
14 | struct ptrace_ldt { | ||
15 | int func; | ||
16 | void *ptr; | ||
17 | unsigned long bytecount; | ||
18 | }; | ||
19 | |||
20 | #define PTRACE_LDT 54 | ||
21 | |||
22 | #endif | ||
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 0c8c32bfd792..592491d1d70d 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c | |||
@@ -549,13 +549,6 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, | |||
549 | if (err) | 549 | if (err) |
550 | return err; | 550 | return err; |
551 | 551 | ||
552 | /* Set up registers for signal handler */ | ||
553 | { | ||
554 | struct exec_domain *ed = current_thread_info()->exec_domain; | ||
555 | if (unlikely(ed && ed->signal_invmap && sig < 32)) | ||
556 | sig = ed->signal_invmap[sig]; | ||
557 | } | ||
558 | |||
559 | PT_REGS_SP(regs) = (unsigned long) frame; | 552 | PT_REGS_SP(regs) = (unsigned long) frame; |
560 | PT_REGS_DI(regs) = sig; | 553 | PT_REGS_DI(regs) = sig; |
561 | /* In case the signal handler was declared without prototypes */ | 554 | /* In case the signal handler was declared without prototypes */ |
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 5cdfa9db2217..a75d8700472a 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c | |||
@@ -16,7 +16,7 @@ | |||
16 | */ | 16 | */ |
17 | 17 | ||
18 | /* Not going to be implemented by UML, since we have no hardware. */ | 18 | /* Not going to be implemented by UML, since we have no hardware. */ |
19 | #define stub_iopl sys_ni_syscall | 19 | #define sys_iopl sys_ni_syscall |
20 | #define sys_ioperm sys_ni_syscall | 20 | #define sys_ioperm sys_ni_syscall |
21 | 21 | ||
22 | /* | 22 | /* |
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 7b9be9822724..275a3a8b78af 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile | |||
@@ -51,7 +51,7 @@ VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ | |||
51 | $(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE | 51 | $(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE |
52 | $(call if_changed,vdso) | 52 | $(call if_changed,vdso) |
53 | 53 | ||
54 | HOST_EXTRACFLAGS += -I$(srctree)/tools/include | 54 | HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi |
55 | hostprogs-y += vdso2c | 55 | hostprogs-y += vdso2c |
56 | 56 | ||
57 | quiet_cmd_vdso2c = VDSO2C $@ | 57 | quiet_cmd_vdso2c = VDSO2C $@ |
@@ -206,4 +206,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE | |||
206 | PHONY += vdso_install $(vdso_img_insttargets) | 206 | PHONY += vdso_install $(vdso_img_insttargets) |
207 | vdso_install: $(vdso_img_insttargets) FORCE | 207 | vdso_install: $(vdso_img_insttargets) FORCE |
208 | 208 | ||
209 | clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* | 209 | clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so* |
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 9793322751e0..40d2473836c9 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c | |||
@@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int *mode) | |||
82 | cycle_t ret; | 82 | cycle_t ret; |
83 | u64 last; | 83 | u64 last; |
84 | u32 version; | 84 | u32 version; |
85 | u32 migrate_count; | ||
85 | u8 flags; | 86 | u8 flags; |
86 | unsigned cpu, cpu1; | 87 | unsigned cpu, cpu1; |
87 | 88 | ||
88 | 89 | ||
89 | /* | 90 | /* |
90 | * Note: hypervisor must guarantee that: | 91 | * When looping to get a consistent (time-info, tsc) pair, we |
91 | * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. | 92 | * also need to deal with the possibility we can switch vcpus, |
92 | * 2. that per-CPU pvclock time info is updated if the | 93 | * so make sure we always re-fetch time-info for the current vcpu. |
93 | * underlying CPU changes. | ||
94 | * 3. that version is increased whenever underlying CPU | ||
95 | * changes. | ||
96 | * | ||
97 | */ | 94 | */ |
98 | do { | 95 | do { |
99 | cpu = __getcpu() & VGETCPU_CPU_MASK; | 96 | cpu = __getcpu() & VGETCPU_CPU_MASK; |
@@ -102,20 +99,27 @@ static notrace cycle_t vread_pvclock(int *mode) | |||
102 | * __getcpu() calls (Gleb). | 99 | * __getcpu() calls (Gleb). |
103 | */ | 100 | */ |
104 | 101 | ||
105 | pvti = get_pvti(cpu); | 102 | /* Make sure migrate_count will change if we leave the VCPU. */ |
103 | do { | ||
104 | pvti = get_pvti(cpu); | ||
105 | migrate_count = pvti->migrate_count; | ||
106 | |||
107 | cpu1 = cpu; | ||
108 | cpu = __getcpu() & VGETCPU_CPU_MASK; | ||
109 | } while (unlikely(cpu != cpu1)); | ||
106 | 110 | ||
107 | version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); | 111 | version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); |
108 | 112 | ||
109 | /* | 113 | /* |
110 | * Test we're still on the cpu as well as the version. | 114 | * Test we're still on the cpu as well as the version. |
111 | * We could have been migrated just after the first | 115 | * - We must read TSC of pvti's VCPU. |
112 | * vgetcpu but before fetching the version, so we | 116 | * - KVM doesn't follow the versioning protocol, so data could |
113 | * wouldn't notice a version change. | 117 | * change before version if we left the VCPU. |
114 | */ | 118 | */ |
115 | cpu1 = __getcpu() & VGETCPU_CPU_MASK; | 119 | smp_rmb(); |
116 | } while (unlikely(cpu != cpu1 || | 120 | } while (unlikely((pvti->pvti.version & 1) || |
117 | (pvti->pvti.version & 1) || | 121 | pvti->pvti.version != version || |
118 | pvti->pvti.version != version)); | 122 | pvti->migrate_count != migrate_count)); |
119 | 123 | ||
120 | if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) | 124 | if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) |
121 | *mode = VCLOCK_NONE; | 125 | *mode = VCLOCK_NONE; |
diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/vdso/vdso32/syscall.S index 5415b5613d55..6b286bb5251c 100644 --- a/arch/x86/vdso/vdso32/syscall.S +++ b/arch/x86/vdso/vdso32/syscall.S | |||
@@ -19,8 +19,6 @@ __kernel_vsyscall: | |||
19 | .Lpush_ebp: | 19 | .Lpush_ebp: |
20 | movl %ecx, %ebp | 20 | movl %ecx, %ebp |
21 | syscall | 21 | syscall |
22 | movl $__USER32_DS, %ecx | ||
23 | movl %ecx, %ss | ||
24 | movl %ebp, %ecx | 22 | movl %ebp, %ecx |
25 | popl %ebp | 23 | popl %ebp |
26 | .Lpop_ebp: | 24 | .Lpop_ebp: |
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 7005ced5d1ad..70e060ad879a 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <xen/xen.h> | 7 | #include <xen/xen.h> |
8 | #include <xen/interface/physdev.h> | 8 | #include <xen/interface/physdev.h> |
9 | #include "xen-ops.h" | 9 | #include "xen-ops.h" |
10 | #include "smp.h" | ||
10 | 11 | ||
11 | static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) | 12 | static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) |
12 | { | 13 | { |
@@ -28,7 +29,186 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) | |||
28 | return 0xfd; | 29 | return 0xfd; |
29 | } | 30 | } |
30 | 31 | ||
32 | static unsigned long xen_set_apic_id(unsigned int x) | ||
33 | { | ||
34 | WARN_ON(1); | ||
35 | return x; | ||
36 | } | ||
37 | |||
38 | static unsigned int xen_get_apic_id(unsigned long x) | ||
39 | { | ||
40 | return ((x)>>24) & 0xFFu; | ||
41 | } | ||
42 | |||
43 | static u32 xen_apic_read(u32 reg) | ||
44 | { | ||
45 | struct xen_platform_op op = { | ||
46 | .cmd = XENPF_get_cpuinfo, | ||
47 | .interface_version = XENPF_INTERFACE_VERSION, | ||
48 | .u.pcpu_info.xen_cpuid = 0, | ||
49 | }; | ||
50 | int ret = 0; | ||
51 | |||
52 | /* Shouldn't need this as APIC is turned off for PV, and we only | ||
53 | * get called on the bootup processor. But just in case. */ | ||
54 | if (!xen_initial_domain() || smp_processor_id()) | ||
55 | return 0; | ||
56 | |||
57 | if (reg == APIC_LVR) | ||
58 | return 0x10; | ||
59 | #ifdef CONFIG_X86_32 | ||
60 | if (reg == APIC_LDR) | ||
61 | return SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); | ||
62 | #endif | ||
63 | if (reg != APIC_ID) | ||
64 | return 0; | ||
65 | |||
66 | ret = HYPERVISOR_dom0_op(&op); | ||
67 | if (ret) | ||
68 | return 0; | ||
69 | |||
70 | return op.u.pcpu_info.apic_id << 24; | ||
71 | } | ||
72 | |||
73 | static void xen_apic_write(u32 reg, u32 val) | ||
74 | { | ||
75 | /* Warn to see if there are any stray references */ | ||
76 | WARN(1,"register: %x, value: %x\n", reg, val); | ||
77 | } | ||
78 | |||
79 | static u64 xen_apic_icr_read(void) | ||
80 | { | ||
81 | return 0; | ||
82 | } | ||
83 | |||
84 | static void xen_apic_icr_write(u32 low, u32 id) | ||
85 | { | ||
86 | /* Warn to see if there are any stray references */ | ||
87 | WARN_ON(1); | ||
88 | } | ||
89 | |||
90 | static u32 xen_safe_apic_wait_icr_idle(void) | ||
91 | { | ||
92 | return 0; | ||
93 | } | ||
94 | |||
95 | static int xen_apic_probe_pv(void) | ||
96 | { | ||
97 | if (xen_pv_domain()) | ||
98 | return 1; | ||
99 | |||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | static int xen_madt_oem_check(char *oem_id, char *oem_table_id) | ||
104 | { | ||
105 | return xen_pv_domain(); | ||
106 | } | ||
107 | |||
108 | static int xen_id_always_valid(int apicid) | ||
109 | { | ||
110 | return 1; | ||
111 | } | ||
112 | |||
113 | static int xen_id_always_registered(void) | ||
114 | { | ||
115 | return 1; | ||
116 | } | ||
117 | |||
118 | static int xen_phys_pkg_id(int initial_apic_id, int index_msb) | ||
119 | { | ||
120 | return initial_apic_id >> index_msb; | ||
121 | } | ||
122 | |||
123 | #ifdef CONFIG_X86_32 | ||
124 | static int xen_x86_32_early_logical_apicid(int cpu) | ||
125 | { | ||
126 | /* Match with APIC_LDR read. Otherwise setup_local_APIC complains. */ | ||
127 | return 1 << cpu; | ||
128 | } | ||
129 | #endif | ||
130 | |||
131 | static void xen_noop(void) | ||
132 | { | ||
133 | } | ||
134 | |||
135 | static void xen_silent_inquire(int apicid) | ||
136 | { | ||
137 | } | ||
138 | |||
139 | static struct apic xen_pv_apic = { | ||
140 | .name = "Xen PV", | ||
141 | .probe = xen_apic_probe_pv, | ||
142 | .acpi_madt_oem_check = xen_madt_oem_check, | ||
143 | .apic_id_valid = xen_id_always_valid, | ||
144 | .apic_id_registered = xen_id_always_registered, | ||
145 | |||
146 | /* .irq_delivery_mode - used in native_compose_msi_msg only */ | ||
147 | /* .irq_dest_mode - used in native_compose_msi_msg only */ | ||
148 | |||
149 | .target_cpus = default_target_cpus, | ||
150 | .disable_esr = 0, | ||
151 | /* .dest_logical - default_send_IPI_ use it but we use our own. */ | ||
152 | .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ | ||
153 | |||
154 | .vector_allocation_domain = flat_vector_allocation_domain, | ||
155 | .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */ | ||
156 | |||
157 | .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ | ||
158 | .setup_apic_routing = NULL, | ||
159 | .cpu_present_to_apicid = default_cpu_present_to_apicid, | ||
160 | .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */ | ||
161 | .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */ | ||
162 | .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */ | ||
163 | |||
164 | .get_apic_id = xen_get_apic_id, | ||
165 | .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */ | ||
166 | .apic_id_mask = 0xFF << 24, /* Used by verify_local_APIC. Match with what xen_get_apic_id does. */ | ||
167 | |||
168 | .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, | ||
169 | |||
170 | #ifdef CONFIG_SMP | ||
171 | .send_IPI_mask = xen_send_IPI_mask, | ||
172 | .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself, | ||
173 | .send_IPI_allbutself = xen_send_IPI_allbutself, | ||
174 | .send_IPI_all = xen_send_IPI_all, | ||
175 | .send_IPI_self = xen_send_IPI_self, | ||
176 | #endif | ||
177 | /* .wait_for_init_deassert- used by AP bootup - smp_callin which we don't use */ | ||
178 | .inquire_remote_apic = xen_silent_inquire, | ||
179 | |||
180 | .read = xen_apic_read, | ||
181 | .write = xen_apic_write, | ||
182 | .eoi_write = xen_apic_write, | ||
183 | |||
184 | .icr_read = xen_apic_icr_read, | ||
185 | .icr_write = xen_apic_icr_write, | ||
186 | .wait_icr_idle = xen_noop, | ||
187 | .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle, | ||
188 | |||
189 | #ifdef CONFIG_X86_32 | ||
190 | /* generic_processor_info and setup_local_APIC. */ | ||
191 | .x86_32_early_logical_apicid = xen_x86_32_early_logical_apicid, | ||
192 | #endif | ||
193 | }; | ||
194 | |||
195 | static void __init xen_apic_check(void) | ||
196 | { | ||
197 | if (apic == &xen_pv_apic) | ||
198 | return; | ||
199 | |||
200 | pr_info("Switched APIC routing from %s to %s.\n", apic->name, | ||
201 | xen_pv_apic.name); | ||
202 | apic = &xen_pv_apic; | ||
203 | } | ||
31 | void __init xen_init_apic(void) | 204 | void __init xen_init_apic(void) |
32 | { | 205 | { |
33 | x86_io_apic_ops.read = xen_io_apic_read; | 206 | x86_io_apic_ops.read = xen_io_apic_read; |
207 | /* On PV guests the APIC CPUID bit is disabled so none of the | ||
208 | * routines end up executing. */ | ||
209 | if (!xen_initial_domain()) | ||
210 | apic = &xen_pv_apic; | ||
211 | |||
212 | x86_platform.apic_post_init = xen_apic_check; | ||
34 | } | 213 | } |
214 | apic_driver(xen_pv_apic); | ||
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5240f563076d..94578efd3067 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss, | |||
912 | mcs = xen_mc_entry(0); | 912 | mcs = xen_mc_entry(0); |
913 | MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); | 913 | MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); |
914 | xen_mc_issue(PARAVIRT_LAZY_CPU); | 914 | xen_mc_issue(PARAVIRT_LAZY_CPU); |
915 | tss->x86_tss.sp0 = thread->sp0; | ||
915 | } | 916 | } |
916 | 917 | ||
917 | static void xen_set_iopl_mask(unsigned mask) | 918 | static void xen_set_iopl_mask(unsigned mask) |
@@ -927,92 +928,6 @@ static void xen_io_delay(void) | |||
927 | { | 928 | { |
928 | } | 929 | } |
929 | 930 | ||
930 | #ifdef CONFIG_X86_LOCAL_APIC | ||
931 | static unsigned long xen_set_apic_id(unsigned int x) | ||
932 | { | ||
933 | WARN_ON(1); | ||
934 | return x; | ||
935 | } | ||
936 | static unsigned int xen_get_apic_id(unsigned long x) | ||
937 | { | ||
938 | return ((x)>>24) & 0xFFu; | ||
939 | } | ||
940 | static u32 xen_apic_read(u32 reg) | ||
941 | { | ||
942 | struct xen_platform_op op = { | ||
943 | .cmd = XENPF_get_cpuinfo, | ||
944 | .interface_version = XENPF_INTERFACE_VERSION, | ||
945 | .u.pcpu_info.xen_cpuid = 0, | ||
946 | }; | ||
947 | int ret = 0; | ||
948 | |||
949 | /* Shouldn't need this as APIC is turned off for PV, and we only | ||
950 | * get called on the bootup processor. But just in case. */ | ||
951 | if (!xen_initial_domain() || smp_processor_id()) | ||
952 | return 0; | ||
953 | |||
954 | if (reg == APIC_LVR) | ||
955 | return 0x10; | ||
956 | |||
957 | if (reg != APIC_ID) | ||
958 | return 0; | ||
959 | |||
960 | ret = HYPERVISOR_dom0_op(&op); | ||
961 | if (ret) | ||
962 | return 0; | ||
963 | |||
964 | return op.u.pcpu_info.apic_id << 24; | ||
965 | } | ||
966 | |||
967 | static void xen_apic_write(u32 reg, u32 val) | ||
968 | { | ||
969 | /* Warn to see if there's any stray references */ | ||
970 | WARN_ON(1); | ||
971 | } | ||
972 | |||
973 | static u64 xen_apic_icr_read(void) | ||
974 | { | ||
975 | return 0; | ||
976 | } | ||
977 | |||
978 | static void xen_apic_icr_write(u32 low, u32 id) | ||
979 | { | ||
980 | /* Warn to see if there's any stray references */ | ||
981 | WARN_ON(1); | ||
982 | } | ||
983 | |||
984 | static void xen_apic_wait_icr_idle(void) | ||
985 | { | ||
986 | return; | ||
987 | } | ||
988 | |||
989 | static u32 xen_safe_apic_wait_icr_idle(void) | ||
990 | { | ||
991 | return 0; | ||
992 | } | ||
993 | |||
994 | static void set_xen_basic_apic_ops(void) | ||
995 | { | ||
996 | apic->read = xen_apic_read; | ||
997 | apic->write = xen_apic_write; | ||
998 | apic->icr_read = xen_apic_icr_read; | ||
999 | apic->icr_write = xen_apic_icr_write; | ||
1000 | apic->wait_icr_idle = xen_apic_wait_icr_idle; | ||
1001 | apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle; | ||
1002 | apic->set_apic_id = xen_set_apic_id; | ||
1003 | apic->get_apic_id = xen_get_apic_id; | ||
1004 | |||
1005 | #ifdef CONFIG_SMP | ||
1006 | apic->send_IPI_allbutself = xen_send_IPI_allbutself; | ||
1007 | apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself; | ||
1008 | apic->send_IPI_mask = xen_send_IPI_mask; | ||
1009 | apic->send_IPI_all = xen_send_IPI_all; | ||
1010 | apic->send_IPI_self = xen_send_IPI_self; | ||
1011 | #endif | ||
1012 | } | ||
1013 | |||
1014 | #endif | ||
1015 | |||
1016 | static void xen_clts(void) | 931 | static void xen_clts(void) |
1017 | { | 932 | { |
1018 | struct multicall_space mcs; | 933 | struct multicall_space mcs; |
@@ -1618,7 +1533,7 @@ asmlinkage __visible void __init xen_start_kernel(void) | |||
1618 | /* | 1533 | /* |
1619 | * set up the basic apic ops. | 1534 | * set up the basic apic ops. |
1620 | */ | 1535 | */ |
1621 | set_xen_basic_apic_ops(); | 1536 | xen_init_apic(); |
1622 | #endif | 1537 | #endif |
1623 | 1538 | ||
1624 | if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { | 1539 | if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { |
@@ -1731,8 +1646,6 @@ asmlinkage __visible void __init xen_start_kernel(void) | |||
1731 | if (HYPERVISOR_dom0_op(&op) == 0) | 1646 | if (HYPERVISOR_dom0_op(&op) == 0) |
1732 | boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; | 1647 | boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; |
1733 | 1648 | ||
1734 | xen_init_apic(); | ||
1735 | |||
1736 | /* Make sure ACS will be enabled */ | 1649 | /* Make sure ACS will be enabled */ |
1737 | pci_request_acs(); | 1650 | pci_request_acs(); |
1738 | 1651 | ||
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index adca9e2b6553..dd151b2045b0 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd) | |||
502 | } | 502 | } |
503 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); | 503 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); |
504 | 504 | ||
505 | #if PAGETABLE_LEVELS == 4 | 505 | #if CONFIG_PGTABLE_LEVELS == 4 |
506 | __visible pudval_t xen_pud_val(pud_t pud) | 506 | __visible pudval_t xen_pud_val(pud_t pud) |
507 | { | 507 | { |
508 | return pte_mfn_to_pfn(pud.pud); | 508 | return pte_mfn_to_pfn(pud.pud); |
@@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) | |||
589 | 589 | ||
590 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 590 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
591 | } | 591 | } |
592 | #endif /* PAGETABLE_LEVELS == 4 */ | 592 | #endif /* CONFIG_PGTABLE_LEVELS == 4 */ |
593 | 593 | ||
594 | /* | 594 | /* |
595 | * (Yet another) pagetable walker. This one is intended for pinning a | 595 | * (Yet another) pagetable walker. This one is intended for pinning a |
@@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn) | |||
1628 | xen_release_ptpage(pfn, PT_PMD); | 1628 | xen_release_ptpage(pfn, PT_PMD); |
1629 | } | 1629 | } |
1630 | 1630 | ||
1631 | #if PAGETABLE_LEVELS == 4 | 1631 | #if CONFIG_PGTABLE_LEVELS == 4 |
1632 | static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) | 1632 | static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) |
1633 | { | 1633 | { |
1634 | xen_alloc_ptpage(mm, pfn, PT_PUD); | 1634 | xen_alloc_ptpage(mm, pfn, PT_PUD); |
@@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void) | |||
2046 | pv_mmu_ops.set_pte = xen_set_pte; | 2046 | pv_mmu_ops.set_pte = xen_set_pte; |
2047 | pv_mmu_ops.set_pmd = xen_set_pmd; | 2047 | pv_mmu_ops.set_pmd = xen_set_pmd; |
2048 | pv_mmu_ops.set_pud = xen_set_pud; | 2048 | pv_mmu_ops.set_pud = xen_set_pud; |
2049 | #if PAGETABLE_LEVELS == 4 | 2049 | #if CONFIG_PGTABLE_LEVELS == 4 |
2050 | pv_mmu_ops.set_pgd = xen_set_pgd; | 2050 | pv_mmu_ops.set_pgd = xen_set_pgd; |
2051 | #endif | 2051 | #endif |
2052 | 2052 | ||
@@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void) | |||
2056 | pv_mmu_ops.alloc_pmd = xen_alloc_pmd; | 2056 | pv_mmu_ops.alloc_pmd = xen_alloc_pmd; |
2057 | pv_mmu_ops.release_pte = xen_release_pte; | 2057 | pv_mmu_ops.release_pte = xen_release_pte; |
2058 | pv_mmu_ops.release_pmd = xen_release_pmd; | 2058 | pv_mmu_ops.release_pmd = xen_release_pmd; |
2059 | #if PAGETABLE_LEVELS == 4 | 2059 | #if CONFIG_PGTABLE_LEVELS == 4 |
2060 | pv_mmu_ops.alloc_pud = xen_alloc_pud; | 2060 | pv_mmu_ops.alloc_pud = xen_alloc_pud; |
2061 | pv_mmu_ops.release_pud = xen_release_pud; | 2061 | pv_mmu_ops.release_pud = xen_release_pud; |
2062 | #endif | 2062 | #endif |
@@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { | |||
2122 | .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), | 2122 | .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), |
2123 | .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), | 2123 | .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), |
2124 | 2124 | ||
2125 | #if PAGETABLE_LEVELS == 4 | 2125 | #if CONFIG_PGTABLE_LEVELS == 4 |
2126 | .pud_val = PV_CALLEE_SAVE(xen_pud_val), | 2126 | .pud_val = PV_CALLEE_SAVE(xen_pud_val), |
2127 | .make_pud = PV_CALLEE_SAVE(xen_make_pud), | 2127 | .make_pud = PV_CALLEE_SAVE(xen_make_pud), |
2128 | .set_pgd = xen_set_pgd_hyper, | 2128 | .set_pgd = xen_set_pgd_hyper, |
2129 | 2129 | ||
2130 | .alloc_pud = xen_alloc_pmd_init, | 2130 | .alloc_pud = xen_alloc_pmd_init, |
2131 | .release_pud = xen_release_pmd_init, | 2131 | .release_pud = xen_release_pmd_init, |
2132 | #endif /* PAGETABLE_LEVELS == 4 */ | 2132 | #endif /* CONFIG_PGTABLE_LEVELS == 4 */ |
2133 | 2133 | ||
2134 | .activate_mm = xen_activate_mm, | 2134 | .activate_mm = xen_activate_mm, |
2135 | .dup_mmap = xen_dup_mmap, | 2135 | .dup_mmap = xen_dup_mmap, |
@@ -2436,99 +2436,11 @@ void __init xen_hvm_init_mmu_ops(void) | |||
2436 | } | 2436 | } |
2437 | #endif | 2437 | #endif |
2438 | 2438 | ||
2439 | #ifdef CONFIG_XEN_PVH | ||
2440 | /* | ||
2441 | * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user | ||
2442 | * space creating new guest on pvh dom0 and needing to map domU pages. | ||
2443 | */ | ||
2444 | static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn, | ||
2445 | unsigned int domid) | ||
2446 | { | ||
2447 | int rc, err = 0; | ||
2448 | xen_pfn_t gpfn = lpfn; | ||
2449 | xen_ulong_t idx = fgfn; | ||
2450 | |||
2451 | struct xen_add_to_physmap_range xatp = { | ||
2452 | .domid = DOMID_SELF, | ||
2453 | .foreign_domid = domid, | ||
2454 | .size = 1, | ||
2455 | .space = XENMAPSPACE_gmfn_foreign, | ||
2456 | }; | ||
2457 | set_xen_guest_handle(xatp.idxs, &idx); | ||
2458 | set_xen_guest_handle(xatp.gpfns, &gpfn); | ||
2459 | set_xen_guest_handle(xatp.errs, &err); | ||
2460 | |||
2461 | rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); | ||
2462 | if (rc < 0) | ||
2463 | return rc; | ||
2464 | return err; | ||
2465 | } | ||
2466 | |||
2467 | static int xlate_remove_from_p2m(unsigned long spfn, int count) | ||
2468 | { | ||
2469 | struct xen_remove_from_physmap xrp; | ||
2470 | int i, rc; | ||
2471 | |||
2472 | for (i = 0; i < count; i++) { | ||
2473 | xrp.domid = DOMID_SELF; | ||
2474 | xrp.gpfn = spfn+i; | ||
2475 | rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); | ||
2476 | if (rc) | ||
2477 | break; | ||
2478 | } | ||
2479 | return rc; | ||
2480 | } | ||
2481 | |||
2482 | struct xlate_remap_data { | ||
2483 | unsigned long fgfn; /* foreign domain's gfn */ | ||
2484 | pgprot_t prot; | ||
2485 | domid_t domid; | ||
2486 | int index; | ||
2487 | struct page **pages; | ||
2488 | }; | ||
2489 | |||
2490 | static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, | ||
2491 | void *data) | ||
2492 | { | ||
2493 | int rc; | ||
2494 | struct xlate_remap_data *remap = data; | ||
2495 | unsigned long pfn = page_to_pfn(remap->pages[remap->index++]); | ||
2496 | pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot)); | ||
2497 | |||
2498 | rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid); | ||
2499 | if (rc) | ||
2500 | return rc; | ||
2501 | native_set_pte(ptep, pteval); | ||
2502 | |||
2503 | return 0; | ||
2504 | } | ||
2505 | |||
2506 | static int xlate_remap_gfn_range(struct vm_area_struct *vma, | ||
2507 | unsigned long addr, unsigned long mfn, | ||
2508 | int nr, pgprot_t prot, unsigned domid, | ||
2509 | struct page **pages) | ||
2510 | { | ||
2511 | int err; | ||
2512 | struct xlate_remap_data pvhdata; | ||
2513 | |||
2514 | BUG_ON(!pages); | ||
2515 | |||
2516 | pvhdata.fgfn = mfn; | ||
2517 | pvhdata.prot = prot; | ||
2518 | pvhdata.domid = domid; | ||
2519 | pvhdata.index = 0; | ||
2520 | pvhdata.pages = pages; | ||
2521 | err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, | ||
2522 | xlate_map_pte_fn, &pvhdata); | ||
2523 | flush_tlb_all(); | ||
2524 | return err; | ||
2525 | } | ||
2526 | #endif | ||
2527 | |||
2528 | #define REMAP_BATCH_SIZE 16 | 2439 | #define REMAP_BATCH_SIZE 16 |
2529 | 2440 | ||
2530 | struct remap_data { | 2441 | struct remap_data { |
2531 | unsigned long mfn; | 2442 | xen_pfn_t *mfn; |
2443 | bool contiguous; | ||
2532 | pgprot_t prot; | 2444 | pgprot_t prot; |
2533 | struct mmu_update *mmu_update; | 2445 | struct mmu_update *mmu_update; |
2534 | }; | 2446 | }; |
@@ -2537,7 +2449,14 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, | |||
2537 | unsigned long addr, void *data) | 2449 | unsigned long addr, void *data) |
2538 | { | 2450 | { |
2539 | struct remap_data *rmd = data; | 2451 | struct remap_data *rmd = data; |
2540 | pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); | 2452 | pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot)); |
2453 | |||
2454 | /* If we have a contigious range, just update the mfn itself, | ||
2455 | else update pointer to be "next mfn". */ | ||
2456 | if (rmd->contiguous) | ||
2457 | (*rmd->mfn)++; | ||
2458 | else | ||
2459 | rmd->mfn++; | ||
2541 | 2460 | ||
2542 | rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; | 2461 | rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; |
2543 | rmd->mmu_update->val = pte_val_ma(pte); | 2462 | rmd->mmu_update->val = pte_val_ma(pte); |
@@ -2546,26 +2465,26 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, | |||
2546 | return 0; | 2465 | return 0; |
2547 | } | 2466 | } |
2548 | 2467 | ||
2549 | int xen_remap_domain_mfn_range(struct vm_area_struct *vma, | 2468 | static int do_remap_mfn(struct vm_area_struct *vma, |
2550 | unsigned long addr, | 2469 | unsigned long addr, |
2551 | xen_pfn_t mfn, int nr, | 2470 | xen_pfn_t *mfn, int nr, |
2552 | pgprot_t prot, unsigned domid, | 2471 | int *err_ptr, pgprot_t prot, |
2553 | struct page **pages) | 2472 | unsigned domid, |
2554 | 2473 | struct page **pages) | |
2555 | { | 2474 | { |
2475 | int err = 0; | ||
2556 | struct remap_data rmd; | 2476 | struct remap_data rmd; |
2557 | struct mmu_update mmu_update[REMAP_BATCH_SIZE]; | 2477 | struct mmu_update mmu_update[REMAP_BATCH_SIZE]; |
2558 | int batch; | ||
2559 | unsigned long range; | 2478 | unsigned long range; |
2560 | int err = 0; | 2479 | int mapped = 0; |
2561 | 2480 | ||
2562 | BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); | 2481 | BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); |
2563 | 2482 | ||
2564 | if (xen_feature(XENFEAT_auto_translated_physmap)) { | 2483 | if (xen_feature(XENFEAT_auto_translated_physmap)) { |
2565 | #ifdef CONFIG_XEN_PVH | 2484 | #ifdef CONFIG_XEN_PVH |
2566 | /* We need to update the local page tables and the xen HAP */ | 2485 | /* We need to update the local page tables and the xen HAP */ |
2567 | return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, | 2486 | return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr, |
2568 | domid, pages); | 2487 | prot, domid, pages); |
2569 | #else | 2488 | #else |
2570 | return -EINVAL; | 2489 | return -EINVAL; |
2571 | #endif | 2490 | #endif |
@@ -2573,9 +2492,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, | |||
2573 | 2492 | ||
2574 | rmd.mfn = mfn; | 2493 | rmd.mfn = mfn; |
2575 | rmd.prot = prot; | 2494 | rmd.prot = prot; |
2495 | /* We use err_ptr to indicate whether we are doing a contiguous | ||
2496 | * mapping or a discontiguous mapping. */ | ||
2497 | rmd.contiguous = !err_ptr; | ||
2576 | 2498 | ||
2577 | while (nr) { | 2499 | while (nr) { |
2578 | batch = min(REMAP_BATCH_SIZE, nr); | 2500 | int index = 0; |
2501 | int done = 0; | ||
2502 | int batch = min(REMAP_BATCH_SIZE, nr); | ||
2503 | int batch_left = batch; | ||
2579 | range = (unsigned long)batch << PAGE_SHIFT; | 2504 | range = (unsigned long)batch << PAGE_SHIFT; |
2580 | 2505 | ||
2581 | rmd.mmu_update = mmu_update; | 2506 | rmd.mmu_update = mmu_update; |
@@ -2584,23 +2509,72 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, | |||
2584 | if (err) | 2509 | if (err) |
2585 | goto out; | 2510 | goto out; |
2586 | 2511 | ||
2587 | err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); | 2512 | /* We record the error for each page that fails, but |
2588 | if (err < 0) | 2513 | * continue mapping until the whole set is done */ |
2589 | goto out; | 2514 | do { |
2515 | int i; | ||
2516 | |||
2517 | err = HYPERVISOR_mmu_update(&mmu_update[index], | ||
2518 | batch_left, &done, domid); | ||
2519 | |||
2520 | /* | ||
2521 | * @err_ptr may be the same buffer as @mfn, so | ||
2522 | * only clear it after each chunk of @mfn is | ||
2523 | * used. | ||
2524 | */ | ||
2525 | if (err_ptr) { | ||
2526 | for (i = index; i < index + done; i++) | ||
2527 | err_ptr[i] = 0; | ||
2528 | } | ||
2529 | if (err < 0) { | ||
2530 | if (!err_ptr) | ||
2531 | goto out; | ||
2532 | err_ptr[i] = err; | ||
2533 | done++; /* Skip failed frame. */ | ||
2534 | } else | ||
2535 | mapped += done; | ||
2536 | batch_left -= done; | ||
2537 | index += done; | ||
2538 | } while (batch_left); | ||
2590 | 2539 | ||
2591 | nr -= batch; | 2540 | nr -= batch; |
2592 | addr += range; | 2541 | addr += range; |
2542 | if (err_ptr) | ||
2543 | err_ptr += batch; | ||
2593 | } | 2544 | } |
2594 | |||
2595 | err = 0; | ||
2596 | out: | 2545 | out: |
2597 | 2546 | ||
2598 | xen_flush_tlb_all(); | 2547 | xen_flush_tlb_all(); |
2599 | 2548 | ||
2600 | return err; | 2549 | return err < 0 ? err : mapped; |
2550 | } | ||
2551 | |||
2552 | int xen_remap_domain_mfn_range(struct vm_area_struct *vma, | ||
2553 | unsigned long addr, | ||
2554 | xen_pfn_t mfn, int nr, | ||
2555 | pgprot_t prot, unsigned domid, | ||
2556 | struct page **pages) | ||
2557 | { | ||
2558 | return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages); | ||
2601 | } | 2559 | } |
2602 | EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); | 2560 | EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); |
2603 | 2561 | ||
2562 | int xen_remap_domain_mfn_array(struct vm_area_struct *vma, | ||
2563 | unsigned long addr, | ||
2564 | xen_pfn_t *mfn, int nr, | ||
2565 | int *err_ptr, pgprot_t prot, | ||
2566 | unsigned domid, struct page **pages) | ||
2567 | { | ||
2568 | /* We BUG_ON because it's a programmer error to pass a NULL err_ptr, | ||
2569 | * and it is otherwise very hard to track down the actual cause | ||
2570 | * when the wrong memory ends up mapped in. | ||
2571 | */ | ||
2572 | BUG_ON(err_ptr == NULL); | ||
2573 | return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages); | ||
2574 | } | ||
2575 | EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array); | ||
2576 | |||
2577 | |||
2604 | /* Returns: 0 success */ | 2578 | /* Returns: 0 success */ |
2605 | int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, | 2579 | int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, |
2606 | int numpgs, struct page **pages) | 2580 | int numpgs, struct page **pages) |
@@ -2609,22 +2583,7 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, | |||
2609 | return 0; | 2583 | return 0; |
2610 | 2584 | ||
2611 | #ifdef CONFIG_XEN_PVH | 2585 | #ifdef CONFIG_XEN_PVH |
2612 | while (numpgs--) { | 2586 | return xen_xlate_unmap_gfn_range(vma, numpgs, pages); |
2613 | /* | ||
2614 | * The mmu has already cleaned up the process mmu | ||
2615 | * resources at this point (lookup_address will return | ||
2616 | * NULL). | ||
2617 | */ | ||
2618 | unsigned long pfn = page_to_pfn(pages[numpgs]); | ||
2619 | |||
2620 | xlate_remove_from_p2m(pfn, 1); | ||
2621 | } | ||
2622 | /* | ||
2623 | * We don't need to flush tlbs because as part of | ||
2624 | * xlate_remove_from_p2m, the hypervisor will do tlb flushes | ||
2625 | * after removing the p2m entries from the EPT/NPT | ||
2626 | */ | ||
2627 | return 0; | ||
2628 | #else | 2587 | #else |
2629 | return -EINVAL; | 2588 | return -EINVAL; |
2630 | #endif | 2589 | #endif |
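
The mmu.c changes above fold the old single-range remap into do_remap_mfn(), which batches HYPERVISOR_mmu_update calls and, when a non-NULL err_ptr is passed, records a per-frame error and keeps going instead of aborting the whole request. A minimal caller sketch for the new array interface follows; it assumes a PV guest (so pages may be NULL), and map_foreign_batch() plus its parameters are hypothetical names, not part of this patch.

#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <xen/xen-ops.h>

/* Hypothetical caller: remap 'nr' foreign frames and log per-frame failures. */
static int map_foreign_batch(struct vm_area_struct *vma, unsigned long addr,
			     xen_pfn_t *mfns, int nr, unsigned int domid)
{
	int *errs;
	int i, mapped;

	errs = kcalloc(nr, sizeof(*errs), GFP_KERNEL);
	if (!errs)
		return -ENOMEM;

	/* Negative return: setup failure; otherwise the number of frames mapped. */
	mapped = xen_remap_domain_mfn_array(vma, addr, mfns, nr, errs,
					    vma->vm_page_prot, domid, NULL);
	for (i = 0; mapped >= 0 && i < nr; i++)
		if (errs[i])
			pr_warn("frame %d not mapped: %d\n", i, errs[i]);

	kfree(errs);
	return mapped < 0 ? mapped : 0;
}

Note that do_remap_mfn() reuses err_ptr to tell the two modes apart: a NULL err_ptr selects the old contiguous behaviour (xen_remap_domain_mfn_range), a non-NULL one selects the per-frame array behaviour.
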
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 08e8489c47f1..86484384492e 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
@@ -90,14 +90,10 @@ static void cpu_bringup(void) | |||
90 | 90 | ||
91 | set_cpu_online(cpu, true); | 91 | set_cpu_online(cpu, true); |
92 | 92 | ||
93 | this_cpu_write(cpu_state, CPU_ONLINE); | 93 | cpu_set_state_online(cpu); /* Implies full memory barrier. */ |
94 | |||
95 | wmb(); | ||
96 | 94 | ||
97 | /* We can take interrupts now: we're officially "up". */ | 95 | /* We can take interrupts now: we're officially "up". */ |
98 | local_irq_enable(); | 96 | local_irq_enable(); |
99 | |||
100 | wmb(); /* make sure everything is out */ | ||
101 | } | 97 | } |
102 | 98 | ||
103 | /* | 99 | /* |
@@ -445,21 +441,19 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) | |||
445 | { | 441 | { |
446 | int rc; | 442 | int rc; |
447 | 443 | ||
448 | per_cpu(current_task, cpu) = idle; | 444 | common_cpu_up(cpu, idle); |
449 | #ifdef CONFIG_X86_32 | ||
450 | irq_ctx_init(cpu); | ||
451 | #else | ||
452 | clear_tsk_thread_flag(idle, TIF_FORK); | ||
453 | #endif | ||
454 | per_cpu(kernel_stack, cpu) = | ||
455 | (unsigned long)task_stack_page(idle) - | ||
456 | KERNEL_STACK_OFFSET + THREAD_SIZE; | ||
457 | 445 | ||
458 | xen_setup_runstate_info(cpu); | 446 | xen_setup_runstate_info(cpu); |
459 | xen_setup_timer(cpu); | 447 | xen_setup_timer(cpu); |
460 | xen_init_lock_cpu(cpu); | 448 | xen_init_lock_cpu(cpu); |
461 | 449 | ||
462 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; | 450 | /* |
451 | * PV VCPUs are always successfully taken down (see 'while' loop | ||
452 | * in xen_cpu_die()), so -EBUSY is an error. | ||
453 | */ | ||
454 | rc = cpu_check_up_prepare(cpu); | ||
455 | if (rc) | ||
456 | return rc; | ||
463 | 457 | ||
464 | /* make sure interrupts start blocked */ | 458 | /* make sure interrupts start blocked */ |
465 | per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; | 459 | per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; |
@@ -468,10 +462,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) | |||
468 | if (rc) | 462 | if (rc) |
469 | return rc; | 463 | return rc; |
470 | 464 | ||
471 | if (num_online_cpus() == 1) | ||
472 | /* Just in case we booted with a single CPU. */ | ||
473 | alternatives_enable_smp(); | ||
474 | |||
475 | rc = xen_smp_intr_init(cpu); | 465 | rc = xen_smp_intr_init(cpu); |
476 | if (rc) | 466 | if (rc) |
477 | return rc; | 467 | return rc; |
@@ -479,10 +469,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) | |||
479 | rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); | 469 | rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); |
480 | BUG_ON(rc); | 470 | BUG_ON(rc); |
481 | 471 | ||
482 | while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { | 472 | while (cpu_report_state(cpu) != CPU_ONLINE) |
483 | HYPERVISOR_sched_op(SCHEDOP_yield, NULL); | 473 | HYPERVISOR_sched_op(SCHEDOP_yield, NULL); |
484 | barrier(); | ||
485 | } | ||
486 | 474 | ||
487 | return 0; | 475 | return 0; |
488 | } | 476 | } |
@@ -511,11 +499,11 @@ static void xen_cpu_die(unsigned int cpu) | |||
511 | schedule_timeout(HZ/10); | 499 | schedule_timeout(HZ/10); |
512 | } | 500 | } |
513 | 501 | ||
514 | cpu_die_common(cpu); | 502 | if (common_cpu_die(cpu) == 0) { |
515 | 503 | xen_smp_intr_free(cpu); | |
516 | xen_smp_intr_free(cpu); | 504 | xen_uninit_lock_cpu(cpu); |
517 | xen_uninit_lock_cpu(cpu); | 505 | xen_teardown_timer(cpu); |
518 | xen_teardown_timer(cpu); | 506 | } |
519 | } | 507 | } |
520 | 508 | ||
521 | static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ | 509 | static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ |
@@ -747,6 +735,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) | |||
747 | static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) | 735 | static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) |
748 | { | 736 | { |
749 | int rc; | 737 | int rc; |
738 | |||
739 | /* | ||
740 | * This can happen if the CPU was offlined earlier and | ||
741 | * offlining timed out in common_cpu_die(). | ||
742 | */ | ||
743 | if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { | ||
744 | xen_smp_intr_free(cpu); | ||
745 | xen_uninit_lock_cpu(cpu); | ||
746 | } | ||
747 | |||
750 | /* | 748 | /* |
751 | * xen_smp_intr_init() needs to run before native_cpu_up() | 749 | * xen_smp_intr_init() needs to run before native_cpu_up() |
752 | * so that IPI vectors are set up on the booting CPU before | 750 | * so that IPI vectors are set up on the booting CPU before |
@@ -768,12 +766,6 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) | |||
768 | return rc; | 766 | return rc; |
769 | } | 767 | } |
770 | 768 | ||
771 | static void xen_hvm_cpu_die(unsigned int cpu) | ||
772 | { | ||
773 | xen_cpu_die(cpu); | ||
774 | native_cpu_die(cpu); | ||
775 | } | ||
776 | |||
777 | void __init xen_hvm_smp_init(void) | 769 | void __init xen_hvm_smp_init(void) |
778 | { | 770 | { |
779 | if (!xen_have_vector_callback) | 771 | if (!xen_have_vector_callback) |
@@ -781,7 +773,7 @@ void __init xen_hvm_smp_init(void) | |||
781 | smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; | 773 | smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; |
782 | smp_ops.smp_send_reschedule = xen_smp_send_reschedule; | 774 | smp_ops.smp_send_reschedule = xen_smp_send_reschedule; |
783 | smp_ops.cpu_up = xen_hvm_cpu_up; | 775 | smp_ops.cpu_up = xen_hvm_cpu_up; |
784 | smp_ops.cpu_die = xen_hvm_cpu_die; | 776 | smp_ops.cpu_die = xen_cpu_die; |
785 | smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; | 777 | smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; |
786 | smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; | 778 | smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; |
787 | smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; | 779 | smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; |
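
The smp.c hunks above replace Xen's private cpu_state bookkeeping with the generic hotplug state machine: common_cpu_up(), cpu_check_up_prepare() and cpu_set_state_online() on the way up, common_cpu_die() and cpu_report_state() on the way down. A rough sketch of the resulting pairing is below; example_cpu_up(), example_cpu_die() and free_percpu_irqs() are hypothetical stand-ins for arch code, not functions from this patch.

/* Hypothetical teardown side: only free per-cpu resources if the CPU
 * really went down; a timeout leaves it in CPU_DEAD_FROZEN. */
static void example_cpu_die(unsigned int cpu)
{
	if (common_cpu_die(cpu) == 0)
		free_percpu_irqs(cpu);
}

/* Hypothetical bring-up side. */
static int example_cpu_up(unsigned int cpu, struct task_struct *idle)
{
	int rc;

	/* A previous offline may have timed out; finish its cleanup first. */
	if (cpu_report_state(cpu) == CPU_DEAD_FROZEN)
		free_percpu_irqs(cpu);

	common_cpu_up(cpu, idle);

	rc = cpu_check_up_prepare(cpu);
	if (rc)
		return rc;

	/* ...arch-specific bring-up of the CPU goes here... */

	while (cpu_report_state(cpu) != CPU_ONLINE)
		cpu_relax();

	return 0;
}

The freshly booted CPU reports in via cpu_set_state_online(), which implies a full memory barrier, which is why the explicit wmb() calls in cpu_bringup() can go away.
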
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index c4df9dbd63b7..d9497698645a 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c | |||
@@ -1,5 +1,5 @@ | |||
1 | #include <linux/types.h> | 1 | #include <linux/types.h> |
2 | #include <linux/clockchips.h> | 2 | #include <linux/tick.h> |
3 | 3 | ||
4 | #include <xen/interface/xen.h> | 4 | #include <xen/interface/xen.h> |
5 | #include <xen/grant_table.h> | 5 | #include <xen/grant_table.h> |
@@ -81,17 +81,14 @@ void xen_arch_post_suspend(int cancelled) | |||
81 | 81 | ||
82 | static void xen_vcpu_notify_restore(void *data) | 82 | static void xen_vcpu_notify_restore(void *data) |
83 | { | 83 | { |
84 | unsigned long reason = (unsigned long)data; | ||
85 | |||
86 | /* Boot processor notified via generic timekeeping_resume() */ | 84 | /* Boot processor notified via generic timekeeping_resume() */ |
87 | if ( smp_processor_id() == 0) | 85 | if (smp_processor_id() == 0) |
88 | return; | 86 | return; |
89 | 87 | ||
90 | clockevents_notify(reason, NULL); | 88 | tick_resume_local(); |
91 | } | 89 | } |
92 | 90 | ||
93 | void xen_arch_resume(void) | 91 | void xen_arch_resume(void) |
94 | { | 92 | { |
95 | on_each_cpu(xen_vcpu_notify_restore, | 93 | on_each_cpu(xen_vcpu_notify_restore, NULL, 1); |
96 | (void *)CLOCK_EVT_NOTIFY_RESUME, 1); | ||
97 | } | 94 | } |
diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c index 520022d1a181..a702ec2f5931 100644 --- a/arch/x86/xen/trace.c +++ b/arch/x86/xen/trace.c | |||
@@ -1,54 +1,12 @@ | |||
1 | #include <linux/ftrace.h> | 1 | #include <linux/ftrace.h> |
2 | #include <xen/interface/xen.h> | 2 | #include <xen/interface/xen.h> |
3 | #include <xen/interface/xen-mca.h> | ||
3 | 4 | ||
4 | #define N(x) [__HYPERVISOR_##x] = "("#x")" | 5 | #define HYPERCALL(x) [__HYPERVISOR_##x] = "("#x")", |
5 | static const char *xen_hypercall_names[] = { | 6 | static const char *xen_hypercall_names[] = { |
6 | N(set_trap_table), | 7 | #include <asm/xen-hypercalls.h> |
7 | N(mmu_update), | ||
8 | N(set_gdt), | ||
9 | N(stack_switch), | ||
10 | N(set_callbacks), | ||
11 | N(fpu_taskswitch), | ||
12 | N(sched_op_compat), | ||
13 | N(dom0_op), | ||
14 | N(set_debugreg), | ||
15 | N(get_debugreg), | ||
16 | N(update_descriptor), | ||
17 | N(memory_op), | ||
18 | N(multicall), | ||
19 | N(update_va_mapping), | ||
20 | N(set_timer_op), | ||
21 | N(event_channel_op_compat), | ||
22 | N(xen_version), | ||
23 | N(console_io), | ||
24 | N(physdev_op_compat), | ||
25 | N(grant_table_op), | ||
26 | N(vm_assist), | ||
27 | N(update_va_mapping_otherdomain), | ||
28 | N(iret), | ||
29 | N(vcpu_op), | ||
30 | N(set_segment_base), | ||
31 | N(mmuext_op), | ||
32 | N(acm_op), | ||
33 | N(nmi_op), | ||
34 | N(sched_op), | ||
35 | N(callback_op), | ||
36 | N(xenoprof_op), | ||
37 | N(event_channel_op), | ||
38 | N(physdev_op), | ||
39 | N(hvm_op), | ||
40 | |||
41 | /* Architecture-specific hypercall definitions. */ | ||
42 | N(arch_0), | ||
43 | N(arch_1), | ||
44 | N(arch_2), | ||
45 | N(arch_3), | ||
46 | N(arch_4), | ||
47 | N(arch_5), | ||
48 | N(arch_6), | ||
49 | N(arch_7), | ||
50 | }; | 8 | }; |
51 | #undef N | 9 | #undef HYPERCALL |
52 | 10 | ||
53 | static const char *xen_hypercall_name(unsigned op) | 11 | static const char *xen_hypercall_name(unsigned op) |
54 | { | 12 | { |
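
trace.c now builds the hypercall name table by re-expanding the generated <asm/xen-hypercalls.h> list through a local HYPERCALL() macro instead of maintaining the N(...) array by hand; the xen-head.S hunk further down consumes the same list. A self-contained userspace sketch of the underlying X-macro idea follows; the HYPERCALL_LIST macro and the explicit numbers are illustrative only (the real header supplies the numbers via the __HYPERVISOR_* constants).

#include <stdio.h>

/* Illustrative stand-in for the generated header: one entry per hypercall. */
#define HYPERCALL_LIST(X)	\
	X(set_trap_table, 0)	\
	X(mmu_update, 1)	\
	X(set_gdt, 2)

/* Expansion: a name table indexed by hypercall number, as in trace.c. */
#define HYPERCALL(name, nr) [nr] = "(" #name ")",
static const char *hypercall_names[] = {
	HYPERCALL_LIST(HYPERCALL)
};
#undef HYPERCALL

int main(void)
{
	printf("%s\n", hypercall_names[1]);	/* prints "(mmu_update)" */
	return 0;
}

Expanding one list at every include site keeps the name table, the hypercall page layout, and any future consumer in sync without manual editing.
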
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 53adefda4275..985fc3ee0973 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S | |||
@@ -68,11 +68,11 @@ ENTRY(xen_sysret64) | |||
68 | * We're already on the usermode stack at this point, but | 68 | * We're already on the usermode stack at this point, but |
69 | * still with the kernel gs, so we can easily switch back | 69 | * still with the kernel gs, so we can easily switch back |
70 | */ | 70 | */ |
71 | movq %rsp, PER_CPU_VAR(old_rsp) | 71 | movq %rsp, PER_CPU_VAR(rsp_scratch) |
72 | movq PER_CPU_VAR(kernel_stack), %rsp | 72 | movq PER_CPU_VAR(kernel_stack), %rsp |
73 | 73 | ||
74 | pushq $__USER_DS | 74 | pushq $__USER_DS |
75 | pushq PER_CPU_VAR(old_rsp) | 75 | pushq PER_CPU_VAR(rsp_scratch) |
76 | pushq %r11 | 76 | pushq %r11 |
77 | pushq $__USER_CS | 77 | pushq $__USER_CS |
78 | pushq %rcx | 78 | pushq %rcx |
@@ -87,11 +87,11 @@ ENTRY(xen_sysret32) | |||
87 | * We're already on the usermode stack at this point, but | 87 | * We're already on the usermode stack at this point, but |
88 | * still with the kernel gs, so we can easily switch back | 88 | * still with the kernel gs, so we can easily switch back |
89 | */ | 89 | */ |
90 | movq %rsp, PER_CPU_VAR(old_rsp) | 90 | movq %rsp, PER_CPU_VAR(rsp_scratch) |
91 | movq PER_CPU_VAR(kernel_stack), %rsp | 91 | movq PER_CPU_VAR(kernel_stack), %rsp |
92 | 92 | ||
93 | pushq $__USER32_DS | 93 | pushq $__USER32_DS |
94 | pushq PER_CPU_VAR(old_rsp) | 94 | pushq PER_CPU_VAR(rsp_scratch) |
95 | pushq %r11 | 95 | pushq %r11 |
96 | pushq $__USER32_CS | 96 | pushq $__USER32_CS |
97 | pushq %rcx | 97 | pushq %rcx |
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 674b222544b7..8afdfccf6086 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S | |||
@@ -12,6 +12,8 @@ | |||
12 | 12 | ||
13 | #include <xen/interface/elfnote.h> | 13 | #include <xen/interface/elfnote.h> |
14 | #include <xen/interface/features.h> | 14 | #include <xen/interface/features.h> |
15 | #include <xen/interface/xen.h> | ||
16 | #include <xen/interface/xen-mca.h> | ||
15 | #include <asm/xen/interface.h> | 17 | #include <asm/xen/interface.h> |
16 | 18 | ||
17 | #ifdef CONFIG_XEN_PVH | 19 | #ifdef CONFIG_XEN_PVH |
@@ -85,59 +87,14 @@ ENTRY(xen_pvh_early_cpu_init) | |||
85 | .pushsection .text | 87 | .pushsection .text |
86 | .balign PAGE_SIZE | 88 | .balign PAGE_SIZE |
87 | ENTRY(hypercall_page) | 89 | ENTRY(hypercall_page) |
88 | #define NEXT_HYPERCALL(x) \ | 90 | .skip PAGE_SIZE |
89 | ENTRY(xen_hypercall_##x) \ | 91 | |
90 | .skip 32 | 92 | #define HYPERCALL(n) \ |
91 | 93 | .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ | |
92 | NEXT_HYPERCALL(set_trap_table) | 94 | .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 |
93 | NEXT_HYPERCALL(mmu_update) | 95 | #include <asm/xen-hypercalls.h> |
94 | NEXT_HYPERCALL(set_gdt) | 96 | #undef HYPERCALL |
95 | NEXT_HYPERCALL(stack_switch) | 97 | |
96 | NEXT_HYPERCALL(set_callbacks) | ||
97 | NEXT_HYPERCALL(fpu_taskswitch) | ||
98 | NEXT_HYPERCALL(sched_op_compat) | ||
99 | NEXT_HYPERCALL(platform_op) | ||
100 | NEXT_HYPERCALL(set_debugreg) | ||
101 | NEXT_HYPERCALL(get_debugreg) | ||
102 | NEXT_HYPERCALL(update_descriptor) | ||
103 | NEXT_HYPERCALL(ni) | ||
104 | NEXT_HYPERCALL(memory_op) | ||
105 | NEXT_HYPERCALL(multicall) | ||
106 | NEXT_HYPERCALL(update_va_mapping) | ||
107 | NEXT_HYPERCALL(set_timer_op) | ||
108 | NEXT_HYPERCALL(event_channel_op_compat) | ||
109 | NEXT_HYPERCALL(xen_version) | ||
110 | NEXT_HYPERCALL(console_io) | ||
111 | NEXT_HYPERCALL(physdev_op_compat) | ||
112 | NEXT_HYPERCALL(grant_table_op) | ||
113 | NEXT_HYPERCALL(vm_assist) | ||
114 | NEXT_HYPERCALL(update_va_mapping_otherdomain) | ||
115 | NEXT_HYPERCALL(iret) | ||
116 | NEXT_HYPERCALL(vcpu_op) | ||
117 | NEXT_HYPERCALL(set_segment_base) | ||
118 | NEXT_HYPERCALL(mmuext_op) | ||
119 | NEXT_HYPERCALL(xsm_op) | ||
120 | NEXT_HYPERCALL(nmi_op) | ||
121 | NEXT_HYPERCALL(sched_op) | ||
122 | NEXT_HYPERCALL(callback_op) | ||
123 | NEXT_HYPERCALL(xenoprof_op) | ||
124 | NEXT_HYPERCALL(event_channel_op) | ||
125 | NEXT_HYPERCALL(physdev_op) | ||
126 | NEXT_HYPERCALL(hvm_op) | ||
127 | NEXT_HYPERCALL(sysctl) | ||
128 | NEXT_HYPERCALL(domctl) | ||
129 | NEXT_HYPERCALL(kexec_op) | ||
130 | NEXT_HYPERCALL(tmem_op) /* 38 */ | ||
131 | ENTRY(xen_hypercall_rsvr) | ||
132 | .skip 320 | ||
133 | NEXT_HYPERCALL(mca) /* 48 */ | ||
134 | NEXT_HYPERCALL(arch_1) | ||
135 | NEXT_HYPERCALL(arch_2) | ||
136 | NEXT_HYPERCALL(arch_3) | ||
137 | NEXT_HYPERCALL(arch_4) | ||
138 | NEXT_HYPERCALL(arch_5) | ||
139 | NEXT_HYPERCALL(arch_6) | ||
140 | .balign PAGE_SIZE | ||
141 | .popsection | 98 | .popsection |
142 | 99 | ||
143 | ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") | 100 | ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") |
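
With the per-hypercall ENTRY()/.skip stubs removed, hypercall_page is now a single blank page that the hypervisor fills at boot, and each xen_hypercall_* symbol is defined by .equ as an alias at a fixed 32-byte offset within it. In C terms the layout amounts to the sketch below; xen_hypercall_entry() is a hypothetical helper written only to make the offset arithmetic explicit.

#include <asm/page.h>			/* PAGE_SIZE */

extern char hypercall_page[PAGE_SIZE];	/* defined in xen-head.S, populated by Xen */

/* Hypothetical helper: address of the stub for hypercall number 'op'. */
static inline void *xen_hypercall_entry(unsigned int op)
{
	return hypercall_page + op * 32;	/* one 32-byte slot per hypercall */
}
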