diff options
author | Len Brown <len.brown@intel.com> | 2010-08-15 01:06:31 -0400 |
---|---|---|
committer | Len Brown <len.brown@intel.com> | 2010-08-15 01:06:31 -0400 |
commit | 95ee46aa8698f2000647dfb362400fadbb5807cf (patch) | |
tree | e5a05c7297f997e191c73091934e42e3195c0e40 /arch/x86 | |
parent | cfa806f059801dbe7e435745eb2e187c8bfe1e7f (diff) | |
parent | 92fa5bd9a946b6e7aab6764e7312e4e3d9bed295 (diff) |
Merge branch 'linus' into release
Conflicts:
drivers/acpi/debug.c
Signed-off-by: Len Brown <len.brown@intel.com>
Diffstat (limited to 'arch/x86')
202 files changed, 6510 insertions, 3148 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dcb0593b4a66..a84fc34c8f77 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -55,6 +55,7 @@ config X86 | |||
55 | select HAVE_HW_BREAKPOINT | 55 | select HAVE_HW_BREAKPOINT |
56 | select HAVE_MIXED_BREAKPOINTS_REGS | 56 | select HAVE_MIXED_BREAKPOINTS_REGS |
57 | select PERF_EVENTS | 57 | select PERF_EVENTS |
58 | select HAVE_PERF_EVENTS_NMI | ||
58 | select ANON_INODES | 59 | select ANON_INODES |
59 | select HAVE_ARCH_KMEMCHECK | 60 | select HAVE_ARCH_KMEMCHECK |
60 | select HAVE_USER_RETURN_NOTIFIER | 61 | select HAVE_USER_RETURN_NOTIFIER |
@@ -72,9 +73,6 @@ config ARCH_DEFCONFIG | |||
72 | default "arch/x86/configs/i386_defconfig" if X86_32 | 73 | default "arch/x86/configs/i386_defconfig" if X86_32 |
73 | default "arch/x86/configs/x86_64_defconfig" if X86_64 | 74 | default "arch/x86/configs/x86_64_defconfig" if X86_64 |
74 | 75 | ||
75 | config GENERIC_TIME | ||
76 | def_bool y | ||
77 | |||
78 | config GENERIC_CMOS_UPDATE | 76 | config GENERIC_CMOS_UPDATE |
79 | def_bool y | 77 | def_bool y |
80 | 78 | ||
@@ -2046,7 +2044,7 @@ config SCx200 | |||
2046 | 2044 | ||
2047 | config SCx200HR_TIMER | 2045 | config SCx200HR_TIMER |
2048 | tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" | 2046 | tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" |
2049 | depends on SCx200 && GENERIC_TIME | 2047 | depends on SCx200 |
2050 | default y | 2048 | default y |
2051 | ---help--- | 2049 | ---help--- |
2052 | This driver provides a clocksource built upon the on-chip | 2050 | This driver provides a clocksource built upon the on-chip |
@@ -2062,6 +2060,15 @@ config OLPC | |||
2062 | Add support for detecting the unique features of the OLPC | 2060 | Add support for detecting the unique features of the OLPC |
2063 | XO hardware. | 2061 | XO hardware. |
2064 | 2062 | ||
2063 | config OLPC_OPENFIRMWARE | ||
2064 | bool "Support for OLPC's Open Firmware" | ||
2065 | depends on !X86_64 && !X86_PAE | ||
2066 | default y if OLPC | ||
2067 | help | ||
2068 | This option adds support for the implementation of Open Firmware | ||
2069 | that is used on the OLPC XO-1 Children's Machine. | ||
2070 | If unsure, say N here. | ||
2071 | |||
2065 | endif # X86_32 | 2072 | endif # X86_32 |
2066 | 2073 | ||
2067 | config K8_NB | 2074 | config K8_NB |
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index ec749c2bfdd7..f7cb086b4add 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile | |||
@@ -26,10 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage | |||
26 | targets += fdimage fdimage144 fdimage288 image.iso mtools.conf | 26 | targets += fdimage fdimage144 fdimage288 image.iso mtools.conf |
27 | subdir- := compressed | 27 | subdir- := compressed |
28 | 28 | ||
29 | setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o | 29 | setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o |
30 | setup-y += header.o main.o mca.o memory.o pm.o pmjump.o | 30 | setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o |
31 | setup-y += printf.o regs.o string.o tty.o video.o video-mode.o | 31 | setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o |
32 | setup-y += version.o | 32 | setup-y += video-mode.o version.o |
33 | setup-$(CONFIG_X86_APM_BOOT) += apm.o | 33 | setup-$(CONFIG_X86_APM_BOOT) += apm.o |
34 | 34 | ||
35 | # The link order of the video-*.o modules can matter. In particular, | 35 | # The link order of the video-*.o modules can matter. In particular, |
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 98239d2658f2..c7093bd9f2d3 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h | |||
@@ -28,6 +28,7 @@ | |||
28 | #include "bitops.h" | 28 | #include "bitops.h" |
29 | #include <asm/cpufeature.h> | 29 | #include <asm/cpufeature.h> |
30 | #include <asm/processor-flags.h> | 30 | #include <asm/processor-flags.h> |
31 | #include "ctype.h" | ||
31 | 32 | ||
32 | /* Useful macros */ | 33 | /* Useful macros */ |
33 | #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) | 34 | #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) |
@@ -37,6 +38,8 @@ | |||
37 | extern struct setup_header hdr; | 38 | extern struct setup_header hdr; |
38 | extern struct boot_params boot_params; | 39 | extern struct boot_params boot_params; |
39 | 40 | ||
41 | #define cpu_relax() asm volatile("rep; nop") | ||
42 | |||
40 | /* Basic port I/O */ | 43 | /* Basic port I/O */ |
41 | static inline void outb(u8 v, u16 port) | 44 | static inline void outb(u8 v, u16 port) |
42 | { | 45 | { |
@@ -198,11 +201,6 @@ static inline int memcmp_gs(const void *s1, addr_t s2, size_t len) | |||
198 | return diff; | 201 | return diff; |
199 | } | 202 | } |
200 | 203 | ||
201 | static inline int isdigit(int ch) | ||
202 | { | ||
203 | return (ch >= '0') && (ch <= '9'); | ||
204 | } | ||
205 | |||
206 | /* Heap -- available for dynamic lists. */ | 204 | /* Heap -- available for dynamic lists. */ |
207 | extern char _end[]; | 205 | extern char _end[]; |
208 | extern char *HEAP; | 206 | extern char *HEAP; |
@@ -287,8 +285,18 @@ struct biosregs { | |||
287 | void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); | 285 | void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); |
288 | 286 | ||
289 | /* cmdline.c */ | 287 | /* cmdline.c */ |
290 | int cmdline_find_option(const char *option, char *buffer, int bufsize); | 288 | int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize); |
291 | int cmdline_find_option_bool(const char *option); | 289 | int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option); |
290 | static inline int cmdline_find_option(const char *option, char *buffer, int bufsize) | ||
291 | { | ||
292 | return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize); | ||
293 | } | ||
294 | |||
295 | static inline int cmdline_find_option_bool(const char *option) | ||
296 | { | ||
297 | return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option); | ||
298 | } | ||
299 | |||
292 | 300 | ||
293 | /* cpu.c, cpucheck.c */ | 301 | /* cpu.c, cpucheck.c */ |
294 | struct cpu_features { | 302 | struct cpu_features { |
@@ -300,6 +308,10 @@ extern struct cpu_features cpu; | |||
300 | int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); | 308 | int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); |
301 | int validate_cpu(void); | 309 | int validate_cpu(void); |
302 | 310 | ||
311 | /* early_serial_console.c */ | ||
312 | extern int early_serial_base; | ||
313 | void console_init(void); | ||
314 | |||
303 | /* edd.c */ | 315 | /* edd.c */ |
304 | void query_edd(void); | 316 | void query_edd(void); |
305 | 317 | ||
@@ -329,8 +341,10 @@ void initregs(struct biosregs *regs); | |||
329 | 341 | ||
330 | /* string.c */ | 342 | /* string.c */ |
331 | int strcmp(const char *str1, const char *str2); | 343 | int strcmp(const char *str1, const char *str2); |
344 | int strncmp(const char *cs, const char *ct, size_t count); | ||
332 | size_t strnlen(const char *s, size_t maxlen); | 345 | size_t strnlen(const char *s, size_t maxlen); |
333 | unsigned int atou(const char *s); | 346 | unsigned int atou(const char *s); |
347 | unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); | ||
334 | 348 | ||
335 | /* tty.c */ | 349 | /* tty.c */ |
336 | void puts(const char *); | 350 | void puts(const char *); |
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index a1d35634bce0..6b3b6f708c04 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c | |||
@@ -27,9 +27,8 @@ static inline int myisspace(u8 c) | |||
27 | * Returns the length of the argument (regardless of if it was | 27 | * Returns the length of the argument (regardless of if it was |
28 | * truncated to fit in the buffer), or -1 on not found. | 28 | * truncated to fit in the buffer), or -1 on not found. |
29 | */ | 29 | */ |
30 | int cmdline_find_option(const char *option, char *buffer, int bufsize) | 30 | int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize) |
31 | { | 31 | { |
32 | u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr; | ||
33 | addr_t cptr; | 32 | addr_t cptr; |
34 | char c; | 33 | char c; |
35 | int len = -1; | 34 | int len = -1; |
@@ -100,9 +99,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize) | |||
100 | * Returns the position of that option (starts counting with 1) | 99 | * Returns the position of that option (starts counting with 1) |
101 | * or 0 on not found | 100 | * or 0 on not found |
102 | */ | 101 | */ |
103 | int cmdline_find_option_bool(const char *option) | 102 | int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option) |
104 | { | 103 | { |
105 | u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr; | ||
106 | addr_t cptr; | 104 | addr_t cptr; |
107 | char c; | 105 | char c; |
108 | int pos = 0, wstart = 0; | 106 | int pos = 0, wstart = 0; |
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fbb47daf2459..0c229551eead 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile | |||
@@ -4,7 +4,7 @@ | |||
4 | # create a compressed vmlinux image from the original vmlinux | 4 | # create a compressed vmlinux image from the original vmlinux |
5 | # | 5 | # |
6 | 6 | ||
7 | targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o | 7 | targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o |
8 | 8 | ||
9 | KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 | 9 | KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 |
10 | KBUILD_CFLAGS += -fno-strict-aliasing -fPIC | 10 | KBUILD_CFLAGS += -fno-strict-aliasing -fPIC |
@@ -23,7 +23,7 @@ LDFLAGS_vmlinux := -T | |||
23 | 23 | ||
24 | hostprogs-y := mkpiggy | 24 | hostprogs-y := mkpiggy |
25 | 25 | ||
26 | $(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE | 26 | $(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE |
27 | $(call if_changed,ld) | 27 | $(call if_changed,ld) |
28 | @: | 28 | @: |
29 | 29 | ||
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c new file mode 100644 index 000000000000..cb62f786990d --- /dev/null +++ b/arch/x86/boot/compressed/cmdline.c | |||
@@ -0,0 +1,21 @@ | |||
1 | #include "misc.h" | ||
2 | |||
3 | static unsigned long fs; | ||
4 | static inline void set_fs(unsigned long seg) | ||
5 | { | ||
6 | fs = seg << 4; /* shift it back */ | ||
7 | } | ||
8 | typedef unsigned long addr_t; | ||
9 | static inline char rdfs8(addr_t addr) | ||
10 | { | ||
11 | return *((char *)(fs + addr)); | ||
12 | } | ||
13 | #include "../cmdline.c" | ||
14 | int cmdline_find_option(const char *option, char *buffer, int bufsize) | ||
15 | { | ||
16 | return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize); | ||
17 | } | ||
18 | int cmdline_find_option_bool(const char *option) | ||
19 | { | ||
20 | return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option); | ||
21 | } | ||
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c new file mode 100644 index 000000000000..261e81fb9582 --- /dev/null +++ b/arch/x86/boot/compressed/early_serial_console.c | |||
@@ -0,0 +1,5 @@ | |||
1 | #include "misc.h" | ||
2 | |||
3 | int early_serial_base; | ||
4 | |||
5 | #include "../early_serial_console.c" | ||
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index f543b70ffae2..67a655a39ce4 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S | |||
@@ -124,6 +124,19 @@ relocated: | |||
124 | rep stosl | 124 | rep stosl |
125 | 125 | ||
126 | /* | 126 | /* |
127 | * Adjust our own GOT | ||
128 | */ | ||
129 | leal _got(%ebx), %edx | ||
130 | leal _egot(%ebx), %ecx | ||
131 | 1: | ||
132 | cmpl %ecx, %edx | ||
133 | jae 2f | ||
134 | addl %ebx, (%edx) | ||
135 | addl $4, %edx | ||
136 | jmp 1b | ||
137 | 2: | ||
138 | |||
139 | /* | ||
127 | * Do the decompression, and jump to the new kernel.. | 140 | * Do the decompression, and jump to the new kernel.. |
128 | */ | 141 | */ |
129 | leal z_extract_offset_negative(%ebx), %ebp | 142 | leal z_extract_offset_negative(%ebx), %ebp |
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index faff0dc9c06a..52f85a196fa0 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S | |||
@@ -280,6 +280,19 @@ relocated: | |||
280 | rep stosq | 280 | rep stosq |
281 | 281 | ||
282 | /* | 282 | /* |
283 | * Adjust our own GOT | ||
284 | */ | ||
285 | leaq _got(%rip), %rdx | ||
286 | leaq _egot(%rip), %rcx | ||
287 | 1: | ||
288 | cmpq %rcx, %rdx | ||
289 | jae 2f | ||
290 | addq %rbx, (%rdx) | ||
291 | addq $8, %rdx | ||
292 | jmp 1b | ||
293 | 2: | ||
294 | |||
295 | /* | ||
283 | * Do the decompression, and jump to the new kernel.. | 296 | * Do the decompression, and jump to the new kernel.. |
284 | */ | 297 | */ |
285 | pushq %rsi /* Save the real mode argument */ | 298 | pushq %rsi /* Save the real mode argument */ |
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 51e240779a44..8f7bef8e9fff 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c | |||
@@ -9,23 +9,7 @@ | |||
9 | * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 | 9 | * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 |
10 | */ | 10 | */ |
11 | 11 | ||
12 | /* | 12 | #include "misc.h" |
13 | * we have to be careful, because no indirections are allowed here, and | ||
14 | * paravirt_ops is a kind of one. As it will only run in baremetal anyway, | ||
15 | * we just keep it from happening | ||
16 | */ | ||
17 | #undef CONFIG_PARAVIRT | ||
18 | #ifdef CONFIG_X86_32 | ||
19 | #define _ASM_X86_DESC_H 1 | ||
20 | #endif | ||
21 | |||
22 | #include <linux/linkage.h> | ||
23 | #include <linux/screen_info.h> | ||
24 | #include <linux/elf.h> | ||
25 | #include <linux/io.h> | ||
26 | #include <asm/page.h> | ||
27 | #include <asm/boot.h> | ||
28 | #include <asm/bootparam.h> | ||
29 | 13 | ||
30 | /* WARNING!! | 14 | /* WARNING!! |
31 | * This code is compiled with -fPIC and it is relocated dynamically | 15 | * This code is compiled with -fPIC and it is relocated dynamically |
@@ -123,15 +107,13 @@ static void error(char *m); | |||
123 | /* | 107 | /* |
124 | * This is set up by the setup-routine at boot-time | 108 | * This is set up by the setup-routine at boot-time |
125 | */ | 109 | */ |
126 | static struct boot_params *real_mode; /* Pointer to real-mode data */ | 110 | struct boot_params *real_mode; /* Pointer to real-mode data */ |
127 | static int quiet; | 111 | static int quiet; |
112 | static int debug; | ||
128 | 113 | ||
129 | void *memset(void *s, int c, size_t n); | 114 | void *memset(void *s, int c, size_t n); |
130 | void *memcpy(void *dest, const void *src, size_t n); | 115 | void *memcpy(void *dest, const void *src, size_t n); |
131 | 116 | ||
132 | static void __putstr(int, const char *); | ||
133 | #define putstr(__x) __putstr(0, __x) | ||
134 | |||
135 | #ifdef CONFIG_X86_64 | 117 | #ifdef CONFIG_X86_64 |
136 | #define memptr long | 118 | #define memptr long |
137 | #else | 119 | #else |
@@ -170,7 +152,21 @@ static void scroll(void) | |||
170 | vidmem[i] = ' '; | 152 | vidmem[i] = ' '; |
171 | } | 153 | } |
172 | 154 | ||
173 | static void __putstr(int error, const char *s) | 155 | #define XMTRDY 0x20 |
156 | |||
157 | #define TXR 0 /* Transmit register (WRITE) */ | ||
158 | #define LSR 5 /* Line Status */ | ||
159 | static void serial_putchar(int ch) | ||
160 | { | ||
161 | unsigned timeout = 0xffff; | ||
162 | |||
163 | while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) | ||
164 | cpu_relax(); | ||
165 | |||
166 | outb(ch, early_serial_base + TXR); | ||
167 | } | ||
168 | |||
169 | void __putstr(int error, const char *s) | ||
174 | { | 170 | { |
175 | int x, y, pos; | 171 | int x, y, pos; |
176 | char c; | 172 | char c; |
@@ -179,6 +175,14 @@ static void __putstr(int error, const char *s) | |||
179 | if (!error) | 175 | if (!error) |
180 | return; | 176 | return; |
181 | #endif | 177 | #endif |
178 | if (early_serial_base) { | ||
179 | const char *str = s; | ||
180 | while (*str) { | ||
181 | if (*str == '\n') | ||
182 | serial_putchar('\r'); | ||
183 | serial_putchar(*str++); | ||
184 | } | ||
185 | } | ||
182 | 186 | ||
183 | if (real_mode->screen_info.orig_video_mode == 0 && | 187 | if (real_mode->screen_info.orig_video_mode == 0 && |
184 | lines == 0 && cols == 0) | 188 | lines == 0 && cols == 0) |
@@ -305,8 +309,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, | |||
305 | { | 309 | { |
306 | real_mode = rmode; | 310 | real_mode = rmode; |
307 | 311 | ||
308 | if (real_mode->hdr.loadflags & QUIET_FLAG) | 312 | if (cmdline_find_option_bool("quiet")) |
309 | quiet = 1; | 313 | quiet = 1; |
314 | if (cmdline_find_option_bool("debug")) | ||
315 | debug = 1; | ||
310 | 316 | ||
311 | if (real_mode->screen_info.orig_video_mode == 7) { | 317 | if (real_mode->screen_info.orig_video_mode == 7) { |
312 | vidmem = (char *) 0xb0000; | 318 | vidmem = (char *) 0xb0000; |
@@ -319,6 +325,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, | |||
319 | lines = real_mode->screen_info.orig_video_lines; | 325 | lines = real_mode->screen_info.orig_video_lines; |
320 | cols = real_mode->screen_info.orig_video_cols; | 326 | cols = real_mode->screen_info.orig_video_cols; |
321 | 327 | ||
328 | console_init(); | ||
329 | if (debug) | ||
330 | putstr("early console in decompress_kernel\n"); | ||
331 | |||
322 | free_mem_ptr = heap; /* Heap */ | 332 | free_mem_ptr = heap; /* Heap */ |
323 | free_mem_end_ptr = heap + BOOT_HEAP_SIZE; | 333 | free_mem_end_ptr = heap + BOOT_HEAP_SIZE; |
324 | 334 | ||
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h new file mode 100644 index 000000000000..3f19c81a6203 --- /dev/null +++ b/arch/x86/boot/compressed/misc.h | |||
@@ -0,0 +1,39 @@ | |||
1 | #ifndef BOOT_COMPRESSED_MISC_H | ||
2 | #define BOOT_COMPRESSED_MISC_H | ||
3 | |||
4 | /* | ||
5 | * we have to be careful, because no indirections are allowed here, and | ||
6 | * paravirt_ops is a kind of one. As it will only run in baremetal anyway, | ||
7 | * we just keep it from happening | ||
8 | */ | ||
9 | #undef CONFIG_PARAVIRT | ||
10 | #ifdef CONFIG_X86_32 | ||
11 | #define _ASM_X86_DESC_H 1 | ||
12 | #endif | ||
13 | |||
14 | #include <linux/linkage.h> | ||
15 | #include <linux/screen_info.h> | ||
16 | #include <linux/elf.h> | ||
17 | #include <linux/io.h> | ||
18 | #include <asm/page.h> | ||
19 | #include <asm/boot.h> | ||
20 | #include <asm/bootparam.h> | ||
21 | |||
22 | #define BOOT_BOOT_H | ||
23 | #include "../ctype.h" | ||
24 | |||
25 | /* misc.c */ | ||
26 | extern struct boot_params *real_mode; /* Pointer to real-mode data */ | ||
27 | void __putstr(int error, const char *s); | ||
28 | #define putstr(__x) __putstr(0, __x) | ||
29 | #define puts(__x) __putstr(0, __x) | ||
30 | |||
31 | /* cmdline.c */ | ||
32 | int cmdline_find_option(const char *option, char *buffer, int bufsize); | ||
33 | int cmdline_find_option_bool(const char *option); | ||
34 | |||
35 | /* early_serial_console.c */ | ||
36 | extern int early_serial_base; | ||
37 | void console_init(void); | ||
38 | |||
39 | #endif | ||
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c new file mode 100644 index 000000000000..19b3e693cd72 --- /dev/null +++ b/arch/x86/boot/compressed/string.c | |||
@@ -0,0 +1,2 @@ | |||
1 | #include "misc.h" | ||
2 | #include "../string.c" | ||
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 5ddabceee124..34d047c98284 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S | |||
@@ -41,6 +41,12 @@ SECTIONS | |||
41 | *(.rodata.*) | 41 | *(.rodata.*) |
42 | _erodata = . ; | 42 | _erodata = . ; |
43 | } | 43 | } |
44 | .got : { | ||
45 | _got = .; | ||
46 | KEEP(*(.got.plt)) | ||
47 | KEEP(*(.got)) | ||
48 | _egot = .; | ||
49 | } | ||
44 | .data : { | 50 | .data : { |
45 | _data = . ; | 51 | _data = . ; |
46 | *(.data) | 52 | *(.data) |
diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h new file mode 100644 index 000000000000..25e13403193c --- /dev/null +++ b/arch/x86/boot/ctype.h | |||
@@ -0,0 +1,21 @@ | |||
1 | #ifndef BOOT_ISDIGIT_H | ||
2 | |||
3 | #define BOOT_ISDIGIT_H | ||
4 | |||
5 | static inline int isdigit(int ch) | ||
6 | { | ||
7 | return (ch >= '0') && (ch <= '9'); | ||
8 | } | ||
9 | |||
10 | static inline int isxdigit(int ch) | ||
11 | { | ||
12 | if (isdigit(ch)) | ||
13 | return true; | ||
14 | |||
15 | if ((ch >= 'a') && (ch <= 'f')) | ||
16 | return true; | ||
17 | |||
18 | return (ch >= 'A') && (ch <= 'F'); | ||
19 | } | ||
20 | |||
21 | #endif | ||
diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c new file mode 100644 index 000000000000..030f4b93e255 --- /dev/null +++ b/arch/x86/boot/early_serial_console.c | |||
@@ -0,0 +1,139 @@ | |||
1 | #include "boot.h" | ||
2 | |||
3 | #define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ | ||
4 | |||
5 | #define XMTRDY 0x20 | ||
6 | |||
7 | #define DLAB 0x80 | ||
8 | |||
9 | #define TXR 0 /* Transmit register (WRITE) */ | ||
10 | #define RXR 0 /* Receive register (READ) */ | ||
11 | #define IER 1 /* Interrupt Enable */ | ||
12 | #define IIR 2 /* Interrupt ID */ | ||
13 | #define FCR 2 /* FIFO control */ | ||
14 | #define LCR 3 /* Line control */ | ||
15 | #define MCR 4 /* Modem control */ | ||
16 | #define LSR 5 /* Line Status */ | ||
17 | #define MSR 6 /* Modem Status */ | ||
18 | #define DLL 0 /* Divisor Latch Low */ | ||
19 | #define DLH 1 /* Divisor latch High */ | ||
20 | |||
21 | #define DEFAULT_BAUD 9600 | ||
22 | |||
23 | static void early_serial_init(int port, int baud) | ||
24 | { | ||
25 | unsigned char c; | ||
26 | unsigned divisor; | ||
27 | |||
28 | outb(0x3, port + LCR); /* 8n1 */ | ||
29 | outb(0, port + IER); /* no interrupt */ | ||
30 | outb(0, port + FCR); /* no fifo */ | ||
31 | outb(0x3, port + MCR); /* DTR + RTS */ | ||
32 | |||
33 | divisor = 115200 / baud; | ||
34 | c = inb(port + LCR); | ||
35 | outb(c | DLAB, port + LCR); | ||
36 | outb(divisor & 0xff, port + DLL); | ||
37 | outb((divisor >> 8) & 0xff, port + DLH); | ||
38 | outb(c & ~DLAB, port + LCR); | ||
39 | |||
40 | early_serial_base = port; | ||
41 | } | ||
42 | |||
43 | static void parse_earlyprintk(void) | ||
44 | { | ||
45 | int baud = DEFAULT_BAUD; | ||
46 | char arg[32]; | ||
47 | int pos = 0; | ||
48 | int port = 0; | ||
49 | |||
50 | if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { | ||
51 | char *e; | ||
52 | |||
53 | if (!strncmp(arg, "serial", 6)) { | ||
54 | port = DEFAULT_SERIAL_PORT; | ||
55 | pos += 6; | ||
56 | } | ||
57 | |||
58 | if (arg[pos] == ',') | ||
59 | pos++; | ||
60 | |||
61 | if (!strncmp(arg, "ttyS", 4)) { | ||
62 | static const int bases[] = { 0x3f8, 0x2f8 }; | ||
63 | int idx = 0; | ||
64 | |||
65 | if (!strncmp(arg + pos, "ttyS", 4)) | ||
66 | pos += 4; | ||
67 | |||
68 | if (arg[pos++] == '1') | ||
69 | idx = 1; | ||
70 | |||
71 | port = bases[idx]; | ||
72 | } | ||
73 | |||
74 | if (arg[pos] == ',') | ||
75 | pos++; | ||
76 | |||
77 | baud = simple_strtoull(arg + pos, &e, 0); | ||
78 | if (baud == 0 || arg + pos == e) | ||
79 | baud = DEFAULT_BAUD; | ||
80 | } | ||
81 | |||
82 | if (port) | ||
83 | early_serial_init(port, baud); | ||
84 | } | ||
85 | |||
86 | #define BASE_BAUD (1843200/16) | ||
87 | static unsigned int probe_baud(int port) | ||
88 | { | ||
89 | unsigned char lcr, dll, dlh; | ||
90 | unsigned int quot; | ||
91 | |||
92 | lcr = inb(port + LCR); | ||
93 | outb(lcr | DLAB, port + LCR); | ||
94 | dll = inb(port + DLL); | ||
95 | dlh = inb(port + DLH); | ||
96 | outb(lcr, port + LCR); | ||
97 | quot = (dlh << 8) | dll; | ||
98 | |||
99 | return BASE_BAUD / quot; | ||
100 | } | ||
101 | |||
102 | static void parse_console_uart8250(void) | ||
103 | { | ||
104 | char optstr[64], *options; | ||
105 | int baud = DEFAULT_BAUD; | ||
106 | int port = 0; | ||
107 | |||
108 | /* | ||
109 | * console=uart8250,io,0x3f8,115200n8 | ||
110 | * need to make sure it is last one console ! | ||
111 | */ | ||
112 | if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) | ||
113 | return; | ||
114 | |||
115 | options = optstr; | ||
116 | |||
117 | if (!strncmp(options, "uart8250,io,", 12)) | ||
118 | port = simple_strtoull(options + 12, &options, 0); | ||
119 | else if (!strncmp(options, "uart,io,", 8)) | ||
120 | port = simple_strtoull(options + 8, &options, 0); | ||
121 | else | ||
122 | return; | ||
123 | |||
124 | if (options && (options[0] == ',')) | ||
125 | baud = simple_strtoull(options + 1, &options, 0); | ||
126 | else | ||
127 | baud = probe_baud(port); | ||
128 | |||
129 | if (port) | ||
130 | early_serial_init(port, baud); | ||
131 | } | ||
132 | |||
133 | void console_init(void) | ||
134 | { | ||
135 | parse_earlyprintk(); | ||
136 | |||
137 | if (!early_serial_base) | ||
138 | parse_console_uart8250(); | ||
139 | } | ||
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 140172b895bd..40358c8905be 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c | |||
@@ -130,6 +130,11 @@ void main(void) | |||
130 | /* First, copy the boot header into the "zeropage" */ | 130 | /* First, copy the boot header into the "zeropage" */ |
131 | copy_boot_params(); | 131 | copy_boot_params(); |
132 | 132 | ||
133 | /* Initialize the early-boot console */ | ||
134 | console_init(); | ||
135 | if (cmdline_find_option_bool("debug")) | ||
136 | puts("early console in setup code\n"); | ||
137 | |||
133 | /* End of heap check */ | 138 | /* End of heap check */ |
134 | init_heap(); | 139 | init_heap(); |
135 | 140 | ||
@@ -168,10 +173,6 @@ void main(void) | |||
168 | /* Set the video mode */ | 173 | /* Set the video mode */ |
169 | set_video(); | 174 | set_video(); |
170 | 175 | ||
171 | /* Parse command line for 'quiet' and pass it to decompressor. */ | ||
172 | if (cmdline_find_option_bool("quiet")) | ||
173 | boot_params.hdr.loadflags |= QUIET_FLAG; | ||
174 | |||
175 | /* Do the last things and invoke protected mode */ | 176 | /* Do the last things and invoke protected mode */ |
176 | go_to_protected_mode(); | 177 | go_to_protected_mode(); |
177 | } | 178 | } |
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index 50e47cdbdddd..cdac91ca55d3 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c | |||
@@ -34,7 +34,7 @@ static int skip_atoi(const char **s) | |||
34 | #define SMALL 32 /* Must be 32 == 0x20 */ | 34 | #define SMALL 32 /* Must be 32 == 0x20 */ |
35 | #define SPECIAL 64 /* 0x */ | 35 | #define SPECIAL 64 /* 0x */ |
36 | 36 | ||
37 | #define do_div(n,base) ({ \ | 37 | #define __do_div(n, base) ({ \ |
38 | int __res; \ | 38 | int __res; \ |
39 | __res = ((unsigned long) n) % (unsigned) base; \ | 39 | __res = ((unsigned long) n) % (unsigned) base; \ |
40 | n = ((unsigned long) n) / (unsigned) base; \ | 40 | n = ((unsigned long) n) / (unsigned) base; \ |
@@ -83,7 +83,7 @@ static char *number(char *str, long num, int base, int size, int precision, | |||
83 | tmp[i++] = '0'; | 83 | tmp[i++] = '0'; |
84 | else | 84 | else |
85 | while (num != 0) | 85 | while (num != 0) |
86 | tmp[i++] = (digits[do_div(num, base)] | locase); | 86 | tmp[i++] = (digits[__do_div(num, base)] | locase); |
87 | if (i > precision) | 87 | if (i > precision) |
88 | precision = i; | 88 | precision = i; |
89 | size -= precision; | 89 | size -= precision; |
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index f94b7a0c2abf..3cbc4058dd26 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c | |||
@@ -30,6 +30,22 @@ int strcmp(const char *str1, const char *str2) | |||
30 | return 0; | 30 | return 0; |
31 | } | 31 | } |
32 | 32 | ||
33 | int strncmp(const char *cs, const char *ct, size_t count) | ||
34 | { | ||
35 | unsigned char c1, c2; | ||
36 | |||
37 | while (count) { | ||
38 | c1 = *cs++; | ||
39 | c2 = *ct++; | ||
40 | if (c1 != c2) | ||
41 | return c1 < c2 ? -1 : 1; | ||
42 | if (!c1) | ||
43 | break; | ||
44 | count--; | ||
45 | } | ||
46 | return 0; | ||
47 | } | ||
48 | |||
33 | size_t strnlen(const char *s, size_t maxlen) | 49 | size_t strnlen(const char *s, size_t maxlen) |
34 | { | 50 | { |
35 | const char *es = s; | 51 | const char *es = s; |
@@ -48,3 +64,50 @@ unsigned int atou(const char *s) | |||
48 | i = i * 10 + (*s++ - '0'); | 64 | i = i * 10 + (*s++ - '0'); |
49 | return i; | 65 | return i; |
50 | } | 66 | } |
67 | |||
68 | /* Works only for digits and letters, but small and fast */ | ||
69 | #define TOLOWER(x) ((x) | 0x20) | ||
70 | |||
71 | static unsigned int simple_guess_base(const char *cp) | ||
72 | { | ||
73 | if (cp[0] == '0') { | ||
74 | if (TOLOWER(cp[1]) == 'x' && isxdigit(cp[2])) | ||
75 | return 16; | ||
76 | else | ||
77 | return 8; | ||
78 | } else { | ||
79 | return 10; | ||
80 | } | ||
81 | } | ||
82 | |||
83 | /** | ||
84 | * simple_strtoull - convert a string to an unsigned long long | ||
85 | * @cp: The start of the string | ||
86 | * @endp: A pointer to the end of the parsed string will be placed here | ||
87 | * @base: The number base to use | ||
88 | */ | ||
89 | |||
90 | unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) | ||
91 | { | ||
92 | unsigned long long result = 0; | ||
93 | |||
94 | if (!base) | ||
95 | base = simple_guess_base(cp); | ||
96 | |||
97 | if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x') | ||
98 | cp += 2; | ||
99 | |||
100 | while (isxdigit(*cp)) { | ||
101 | unsigned int value; | ||
102 | |||
103 | value = isdigit(*cp) ? *cp - '0' : TOLOWER(*cp) - 'a' + 10; | ||
104 | if (value >= base) | ||
105 | break; | ||
106 | result = result * base + value; | ||
107 | cp++; | ||
108 | } | ||
109 | if (endp) | ||
110 | *endp = (char *)cp; | ||
111 | |||
112 | return result; | ||
113 | } | ||
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index 01ec69c901c7..def2451f46ae 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c | |||
@@ -10,23 +10,36 @@ | |||
10 | * ----------------------------------------------------------------------- */ | 10 | * ----------------------------------------------------------------------- */ |
11 | 11 | ||
12 | /* | 12 | /* |
13 | * Very simple screen I/O | 13 | * Very simple screen and serial I/O |
14 | * XXX: Probably should add very simple serial I/O? | ||
15 | */ | 14 | */ |
16 | 15 | ||
17 | #include "boot.h" | 16 | #include "boot.h" |
18 | 17 | ||
18 | int early_serial_base; | ||
19 | |||
20 | #define XMTRDY 0x20 | ||
21 | |||
22 | #define TXR 0 /* Transmit register (WRITE) */ | ||
23 | #define LSR 5 /* Line Status */ | ||
24 | |||
19 | /* | 25 | /* |
20 | * These functions are in .inittext so they can be used to signal | 26 | * These functions are in .inittext so they can be used to signal |
21 | * error during initialization. | 27 | * error during initialization. |
22 | */ | 28 | */ |
23 | 29 | ||
24 | void __attribute__((section(".inittext"))) putchar(int ch) | 30 | static void __attribute__((section(".inittext"))) serial_putchar(int ch) |
25 | { | 31 | { |
26 | struct biosregs ireg; | 32 | unsigned timeout = 0xffff; |
27 | 33 | ||
28 | if (ch == '\n') | 34 | while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) |
29 | putchar('\r'); /* \n -> \r\n */ | 35 | cpu_relax(); |
36 | |||
37 | outb(ch, early_serial_base + TXR); | ||
38 | } | ||
39 | |||
40 | static void __attribute__((section(".inittext"))) bios_putchar(int ch) | ||
41 | { | ||
42 | struct biosregs ireg; | ||
30 | 43 | ||
31 | initregs(&ireg); | 44 | initregs(&ireg); |
32 | ireg.bx = 0x0007; | 45 | ireg.bx = 0x0007; |
@@ -36,6 +49,17 @@ void __attribute__((section(".inittext"))) putchar(int ch) | |||
36 | intcall(0x10, &ireg, NULL); | 49 | intcall(0x10, &ireg, NULL); |
37 | } | 50 | } |
38 | 51 | ||
52 | void __attribute__((section(".inittext"))) putchar(int ch) | ||
53 | { | ||
54 | if (ch == '\n') | ||
55 | putchar('\r'); /* \n -> \r\n */ | ||
56 | |||
57 | bios_putchar(ch); | ||
58 | |||
59 | if (early_serial_base != 0) | ||
60 | serial_putchar(ch); | ||
61 | } | ||
62 | |||
39 | void __attribute__((section(".inittext"))) puts(const char *str) | 63 | void __attribute__((section(".inittext"))) puts(const char *str) |
40 | { | 64 | { |
41 | while (*str) | 65 | while (*str) |
@@ -112,3 +136,4 @@ int getchar_timeout(void) | |||
112 | 136 | ||
113 | return 0; /* Timeout! */ | 137 | return 0; /* Timeout! */ |
114 | } | 138 | } |
139 | |||
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index d28fad19654a..e3a32431ca1e 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig | |||
@@ -1471,6 +1471,7 @@ CONFIG_HWMON=y | |||
1471 | # CONFIG_SENSORS_GL518SM is not set | 1471 | # CONFIG_SENSORS_GL518SM is not set |
1472 | # CONFIG_SENSORS_GL520SM is not set | 1472 | # CONFIG_SENSORS_GL520SM is not set |
1473 | # CONFIG_SENSORS_CORETEMP is not set | 1473 | # CONFIG_SENSORS_CORETEMP is not set |
1474 | # CONFIG_SENSORS_PKGTEMP is not set | ||
1474 | # CONFIG_SENSORS_IT87 is not set | 1475 | # CONFIG_SENSORS_IT87 is not set |
1475 | # CONFIG_SENSORS_LM63 is not set | 1476 | # CONFIG_SENSORS_LM63 is not set |
1476 | # CONFIG_SENSORS_LM75 is not set | 1477 | # CONFIG_SENSORS_LM75 is not set |
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 6c86acd847a4..4251f8372050 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig | |||
@@ -1456,6 +1456,7 @@ CONFIG_HWMON=y | |||
1456 | # CONFIG_SENSORS_GL518SM is not set | 1456 | # CONFIG_SENSORS_GL518SM is not set |
1457 | # CONFIG_SENSORS_GL520SM is not set | 1457 | # CONFIG_SENSORS_GL520SM is not set |
1458 | # CONFIG_SENSORS_CORETEMP is not set | 1458 | # CONFIG_SENSORS_CORETEMP is not set |
1459 | # CONFIG_SENSORS_PKGTEMP is not set | ||
1459 | # CONFIG_SENSORS_IT87 is not set | 1460 | # CONFIG_SENSORS_IT87 is not set |
1460 | # CONFIG_SENSORS_LM63 is not set | 1461 | # CONFIG_SENSORS_LM63 is not set |
1461 | # CONFIG_SENSORS_LM75 is not set | 1462 | # CONFIG_SENSORS_LM75 is not set |
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e790bc1fbfa3..b86feabed69b 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S | |||
@@ -842,4 +842,7 @@ ia32_sys_call_table: | |||
842 | .quad compat_sys_rt_tgsigqueueinfo /* 335 */ | 842 | .quad compat_sys_rt_tgsigqueueinfo /* 335 */ |
843 | .quad sys_perf_event_open | 843 | .quad sys_perf_event_open |
844 | .quad compat_sys_recvmmsg | 844 | .quad compat_sys_recvmmsg |
845 | .quad sys_fanotify_init | ||
846 | .quad sys32_fanotify_mark | ||
847 | .quad sys_prlimit64 /* 340 */ | ||
845 | ia32_syscall_end: | 848 | ia32_syscall_end: |
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 626be156d88d..849813f398e7 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c | |||
@@ -51,7 +51,7 @@ | |||
51 | #define AA(__x) ((unsigned long)(__x)) | 51 | #define AA(__x) ((unsigned long)(__x)) |
52 | 52 | ||
53 | 53 | ||
54 | asmlinkage long sys32_truncate64(char __user *filename, | 54 | asmlinkage long sys32_truncate64(const char __user *filename, |
55 | unsigned long offset_low, | 55 | unsigned long offset_low, |
56 | unsigned long offset_high) | 56 | unsigned long offset_high) |
57 | { | 57 | { |
@@ -96,7 +96,7 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) | |||
96 | return 0; | 96 | return 0; |
97 | } | 97 | } |
98 | 98 | ||
99 | asmlinkage long sys32_stat64(char __user *filename, | 99 | asmlinkage long sys32_stat64(const char __user *filename, |
100 | struct stat64 __user *statbuf) | 100 | struct stat64 __user *statbuf) |
101 | { | 101 | { |
102 | struct kstat stat; | 102 | struct kstat stat; |
@@ -107,7 +107,7 @@ asmlinkage long sys32_stat64(char __user *filename, | |||
107 | return ret; | 107 | return ret; |
108 | } | 108 | } |
109 | 109 | ||
110 | asmlinkage long sys32_lstat64(char __user *filename, | 110 | asmlinkage long sys32_lstat64(const char __user *filename, |
111 | struct stat64 __user *statbuf) | 111 | struct stat64 __user *statbuf) |
112 | { | 112 | { |
113 | struct kstat stat; | 113 | struct kstat stat; |
@@ -126,7 +126,7 @@ asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) | |||
126 | return ret; | 126 | return ret; |
127 | } | 127 | } |
128 | 128 | ||
129 | asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename, | 129 | asmlinkage long sys32_fstatat(unsigned int dfd, const char __user *filename, |
130 | struct stat64 __user *statbuf, int flag) | 130 | struct stat64 __user *statbuf, int flag) |
131 | { | 131 | { |
132 | struct kstat stat; | 132 | struct kstat stat; |
@@ -408,8 +408,8 @@ asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, | |||
408 | ((loff_t)AA(poshi) << 32) | AA(poslo)); | 408 | ((loff_t)AA(poshi) << 32) | AA(poslo)); |
409 | } | 409 | } |
410 | 410 | ||
411 | asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, | 411 | asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf, |
412 | u32 poslo, u32 poshi) | 412 | u32 count, u32 poslo, u32 poshi) |
413 | { | 413 | { |
414 | return sys_pwrite64(fd, ubuf, count, | 414 | return sys_pwrite64(fd, ubuf, count, |
415 | ((loff_t)AA(poshi) << 32) | AA(poslo)); | 415 | ((loff_t)AA(poshi) << 32) | AA(poslo)); |
@@ -449,7 +449,7 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd, | |||
449 | return ret; | 449 | return ret; |
450 | } | 450 | } |
451 | 451 | ||
452 | asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, | 452 | asmlinkage long sys32_execve(const char __user *name, compat_uptr_t __user *argv, |
453 | compat_uptr_t __user *envp, struct pt_regs *regs) | 453 | compat_uptr_t __user *envp, struct pt_regs *regs) |
454 | { | 454 | { |
455 | long error; | 455 | long error; |
@@ -546,3 +546,12 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, | |||
546 | return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, | 546 | return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, |
547 | ((u64)len_hi << 32) | len_lo); | 547 | ((u64)len_hi << 32) | len_lo); |
548 | } | 548 | } |
549 | |||
550 | asmlinkage long sys32_fanotify_mark(int fanotify_fd, unsigned int flags, | ||
551 | u32 mask_lo, u32 mask_hi, | ||
552 | int fd, const char __user *pathname) | ||
553 | { | ||
554 | return sys_fanotify_mark(fanotify_fd, flags, | ||
555 | ((u64)mask_hi << 32) | mask_lo, | ||
556 | fd, pathname); | ||
557 | } | ||
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index aa2c39d968fc..92091de11113 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h | |||
@@ -134,7 +134,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) | |||
134 | boot_cpu_data.x86_model <= 0x05 && | 134 | boot_cpu_data.x86_model <= 0x05 && |
135 | boot_cpu_data.x86_mask < 0x0A) | 135 | boot_cpu_data.x86_mask < 0x0A) |
136 | return 1; | 136 | return 1; |
137 | else if (boot_cpu_has(X86_FEATURE_AMDC1E)) | 137 | else if (c1e_detected) |
138 | return 1; | 138 | return 1; |
139 | else | 139 | else |
140 | return max_cstate; | 140 | return max_cstate; |
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 03b6bb5394a0..bc6abb7bc7ee 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h | |||
@@ -45,10 +45,9 @@ | |||
45 | struct alt_instr { | 45 | struct alt_instr { |
46 | u8 *instr; /* original instruction */ | 46 | u8 *instr; /* original instruction */ |
47 | u8 *replacement; | 47 | u8 *replacement; |
48 | u8 cpuid; /* cpuid bit set for replacement */ | 48 | u16 cpuid; /* cpuid bit set for replacement */ |
49 | u8 instrlen; /* length of original instruction */ | 49 | u8 instrlen; /* length of original instruction */ |
50 | u8 replacementlen; /* length of new instruction, <= instrlen */ | 50 | u8 replacementlen; /* length of new instruction, <= instrlen */ |
51 | u8 pad1; | ||
52 | #ifdef CONFIG_X86_64 | 51 | #ifdef CONFIG_X86_64 |
53 | u32 pad2; | 52 | u32 pad2; |
54 | #endif | 53 | #endif |
@@ -86,9 +85,11 @@ static inline int alternatives_text_reserved(void *start, void *end) | |||
86 | _ASM_ALIGN "\n" \ | 85 | _ASM_ALIGN "\n" \ |
87 | _ASM_PTR "661b\n" /* label */ \ | 86 | _ASM_PTR "661b\n" /* label */ \ |
88 | _ASM_PTR "663f\n" /* new instruction */ \ | 87 | _ASM_PTR "663f\n" /* new instruction */ \ |
89 | " .byte " __stringify(feature) "\n" /* feature bit */ \ | 88 | " .word " __stringify(feature) "\n" /* feature bit */ \ |
90 | " .byte 662b-661b\n" /* sourcelen */ \ | 89 | " .byte 662b-661b\n" /* sourcelen */ \ |
91 | " .byte 664f-663f\n" /* replacementlen */ \ | 90 | " .byte 664f-663f\n" /* replacementlen */ \ |
91 | ".previous\n" \ | ||
92 | ".section .discard,\"aw\",@progbits\n" \ | ||
92 | " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ | 93 | " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ |
93 | ".previous\n" \ | 94 | ".previous\n" \ |
94 | ".section .altinstr_replacement, \"ax\"\n" \ | 95 | ".section .altinstr_replacement, \"ax\"\n" \ |
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index c74a2eebe570..a69b1ac9eaf8 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h | |||
@@ -55,7 +55,6 @@ extern unsigned long apbt_quick_calibrate(void); | |||
55 | extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); | 55 | extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); |
56 | extern void apbt_setup_secondary_clock(void); | 56 | extern void apbt_setup_secondary_clock(void); |
57 | extern unsigned int boot_cpu_id; | 57 | extern unsigned int boot_cpu_id; |
58 | extern int disable_apbt_percpu; | ||
59 | 58 | ||
60 | extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); | 59 | extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); |
61 | extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); | 60 | extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); |
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 6be33d83c716..8e6218550e77 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h | |||
@@ -70,6 +70,14 @@ struct sys_desc_table { | |||
70 | __u8 table[14]; | 70 | __u8 table[14]; |
71 | }; | 71 | }; |
72 | 72 | ||
73 | /* Gleaned from OFW's set-parameters in cpu/x86/pc/linux.fth */ | ||
74 | struct olpc_ofw_header { | ||
75 | __u32 ofw_magic; /* OFW signature */ | ||
76 | __u32 ofw_version; | ||
77 | __u32 cif_handler; /* callback into OFW */ | ||
78 | __u32 irq_desc_table; | ||
79 | } __attribute__((packed)); | ||
80 | |||
73 | struct efi_info { | 81 | struct efi_info { |
74 | __u32 efi_loader_signature; | 82 | __u32 efi_loader_signature; |
75 | __u32 efi_systab; | 83 | __u32 efi_systab; |
@@ -92,7 +100,8 @@ struct boot_params { | |||
92 | __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ | 100 | __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ |
93 | __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ | 101 | __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ |
94 | struct sys_desc_table sys_desc_table; /* 0x0a0 */ | 102 | struct sys_desc_table sys_desc_table; /* 0x0a0 */ |
95 | __u8 _pad4[144]; /* 0x0b0 */ | 103 | struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */ |
104 | __u8 _pad4[128]; /* 0x0c0 */ | ||
96 | struct edid_info edid_info; /* 0x140 */ | 105 | struct edid_info edid_info; /* 0x140 */ |
97 | struct efi_info efi_info; /* 0x1c0 */ | 106 | struct efi_info efi_info; /* 0x1c0 */ |
98 | __u32 alt_mem_k; /* 0x1e0 */ | 107 | __u32 alt_mem_k; /* 0x1e0 */ |
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 8859e12dd3cf..284a6e8f7ce1 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h | |||
@@ -11,38 +11,42 @@ | |||
11 | extern void __xchg_wrong_size(void); | 11 | extern void __xchg_wrong_size(void); |
12 | 12 | ||
13 | /* | 13 | /* |
14 | * Note: no "lock" prefix even on SMP: xchg always implies lock anyway | 14 | * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. |
15 | * Note 2: xchg has side effect, so that attribute volatile is necessary, | 15 | * Since this is generally used to protect other memory information, we |
16 | * but generally the primitive is invalid, *ptr is output argument. --ANK | 16 | * use "asm volatile" and "memory" clobbers to prevent gcc from moving |
17 | * information around. | ||
17 | */ | 18 | */ |
18 | |||
19 | struct __xchg_dummy { | ||
20 | unsigned long a[100]; | ||
21 | }; | ||
22 | #define __xg(x) ((struct __xchg_dummy *)(x)) | ||
23 | |||
24 | #define __xchg(x, ptr, size) \ | 19 | #define __xchg(x, ptr, size) \ |
25 | ({ \ | 20 | ({ \ |
26 | __typeof(*(ptr)) __x = (x); \ | 21 | __typeof(*(ptr)) __x = (x); \ |
27 | switch (size) { \ | 22 | switch (size) { \ |
28 | case 1: \ | 23 | case 1: \ |
29 | asm volatile("xchgb %b0,%1" \ | 24 | { \ |
30 | : "=q" (__x) \ | 25 | volatile u8 *__ptr = (volatile u8 *)(ptr); \ |
31 | : "m" (*__xg(ptr)), "0" (__x) \ | 26 | asm volatile("xchgb %0,%1" \ |
27 | : "=q" (__x), "+m" (*__ptr) \ | ||
28 | : "0" (__x) \ | ||
32 | : "memory"); \ | 29 | : "memory"); \ |
33 | break; \ | 30 | break; \ |
31 | } \ | ||
34 | case 2: \ | 32 | case 2: \ |
35 | asm volatile("xchgw %w0,%1" \ | 33 | { \ |
36 | : "=r" (__x) \ | 34 | volatile u16 *__ptr = (volatile u16 *)(ptr); \ |
37 | : "m" (*__xg(ptr)), "0" (__x) \ | 35 | asm volatile("xchgw %0,%1" \ |
36 | : "=r" (__x), "+m" (*__ptr) \ | ||
37 | : "0" (__x) \ | ||
38 | : "memory"); \ | 38 | : "memory"); \ |
39 | break; \ | 39 | break; \ |
40 | } \ | ||
40 | case 4: \ | 41 | case 4: \ |
42 | { \ | ||
43 | volatile u32 *__ptr = (volatile u32 *)(ptr); \ | ||
41 | asm volatile("xchgl %0,%1" \ | 44 | asm volatile("xchgl %0,%1" \ |
42 | : "=r" (__x) \ | 45 | : "=r" (__x), "+m" (*__ptr) \ |
43 | : "m" (*__xg(ptr)), "0" (__x) \ | 46 | : "0" (__x) \ |
44 | : "memory"); \ | 47 | : "memory"); \ |
45 | break; \ | 48 | break; \ |
49 | } \ | ||
46 | default: \ | 50 | default: \ |
47 | __xchg_wrong_size(); \ | 51 | __xchg_wrong_size(); \ |
48 | } \ | 52 | } \ |
@@ -53,60 +57,33 @@ struct __xchg_dummy { | |||
53 | __xchg((v), (ptr), sizeof(*ptr)) | 57 | __xchg((v), (ptr), sizeof(*ptr)) |
54 | 58 | ||
55 | /* | 59 | /* |
56 | * The semantics of XCHGCMP8B are a bit strange, this is why | 60 | * CMPXCHG8B only writes to the target if we had the previous |
57 | * there is a loop and the loading of %%eax and %%edx has to | 61 | * value in registers, otherwise it acts as a read and gives us the |
58 | * be inside. This inlines well in most cases, the cached | 62 | * "new previous" value. That is why there is a loop. Preloading |
59 | * cost is around ~38 cycles. (in the future we might want | 63 | * EDX:EAX is a performance optimization: in the common case it means |
60 | * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that | 64 | * we need only one locked operation. |
61 | * might have an implicit FPU-save as a cost, so it's not | ||
62 | * clear which path to go.) | ||
63 | * | 65 | * |
64 | * cmpxchg8b must be used with the lock prefix here to allow | 66 | * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very |
65 | * the instruction to be executed atomically, see page 3-102 | 67 | * least an FPU save and/or %cr0.ts manipulation. |
66 | * of the instruction set reference 24319102.pdf. We need | 68 | * |
67 | * the reader side to see the coherent 64bit value. | 69 | * cmpxchg8b must be used with the lock prefix here to allow the |
70 | * instruction to be executed atomically. We need to have the reader | ||
71 | * side to see the coherent 64bit value. | ||
68 | */ | 72 | */ |
69 | static inline void __set_64bit(unsigned long long *ptr, | 73 | static inline void set_64bit(volatile u64 *ptr, u64 value) |
70 | unsigned int low, unsigned int high) | ||
71 | { | 74 | { |
75 | u32 low = value; | ||
76 | u32 high = value >> 32; | ||
77 | u64 prev = *ptr; | ||
78 | |||
72 | asm volatile("\n1:\t" | 79 | asm volatile("\n1:\t" |
73 | "movl (%0), %%eax\n\t" | 80 | LOCK_PREFIX "cmpxchg8b %0\n\t" |
74 | "movl 4(%0), %%edx\n\t" | ||
75 | LOCK_PREFIX "cmpxchg8b (%0)\n\t" | ||
76 | "jnz 1b" | 81 | "jnz 1b" |
77 | : /* no outputs */ | 82 | : "=m" (*ptr), "+A" (prev) |
78 | : "D"(ptr), | 83 | : "b" (low), "c" (high) |
79 | "b"(low), | 84 | : "memory"); |
80 | "c"(high) | ||
81 | : "ax", "dx", "memory"); | ||
82 | } | ||
83 | |||
84 | static inline void __set_64bit_constant(unsigned long long *ptr, | ||
85 | unsigned long long value) | ||
86 | { | ||
87 | __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32)); | ||
88 | } | ||
89 | |||
90 | #define ll_low(x) *(((unsigned int *)&(x)) + 0) | ||
91 | #define ll_high(x) *(((unsigned int *)&(x)) + 1) | ||
92 | |||
93 | static inline void __set_64bit_var(unsigned long long *ptr, | ||
94 | unsigned long long value) | ||
95 | { | ||
96 | __set_64bit(ptr, ll_low(value), ll_high(value)); | ||
97 | } | 85 | } |
98 | 86 | ||
99 | #define set_64bit(ptr, value) \ | ||
100 | (__builtin_constant_p((value)) \ | ||
101 | ? __set_64bit_constant((ptr), (value)) \ | ||
102 | : __set_64bit_var((ptr), (value))) | ||
103 | |||
104 | #define _set_64bit(ptr, value) \ | ||
105 | (__builtin_constant_p(value) \ | ||
106 | ? __set_64bit(ptr, (unsigned int)(value), \ | ||
107 | (unsigned int)((value) >> 32)) \ | ||
108 | : __set_64bit(ptr, ll_low((value)), ll_high((value)))) | ||
109 | |||
110 | extern void __cmpxchg_wrong_size(void); | 87 | extern void __cmpxchg_wrong_size(void); |
111 | 88 | ||
112 | /* | 89 | /* |
@@ -121,23 +98,32 @@ extern void __cmpxchg_wrong_size(void); | |||
121 | __typeof__(*(ptr)) __new = (new); \ | 98 | __typeof__(*(ptr)) __new = (new); \ |
122 | switch (size) { \ | 99 | switch (size) { \ |
123 | case 1: \ | 100 | case 1: \ |
124 | asm volatile(lock "cmpxchgb %b1,%2" \ | 101 | { \ |
125 | : "=a"(__ret) \ | 102 | volatile u8 *__ptr = (volatile u8 *)(ptr); \ |
126 | : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ | 103 | asm volatile(lock "cmpxchgb %2,%1" \ |
104 | : "=a" (__ret), "+m" (*__ptr) \ | ||
105 | : "q" (__new), "0" (__old) \ | ||
127 | : "memory"); \ | 106 | : "memory"); \ |
128 | break; \ | 107 | break; \ |
108 | } \ | ||
129 | case 2: \ | 109 | case 2: \ |
130 | asm volatile(lock "cmpxchgw %w1,%2" \ | 110 | { \ |
131 | : "=a"(__ret) \ | 111 | volatile u16 *__ptr = (volatile u16 *)(ptr); \ |
132 | : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ | 112 | asm volatile(lock "cmpxchgw %2,%1" \ |
113 | : "=a" (__ret), "+m" (*__ptr) \ | ||
114 | : "r" (__new), "0" (__old) \ | ||
133 | : "memory"); \ | 115 | : "memory"); \ |
134 | break; \ | 116 | break; \ |
117 | } \ | ||
135 | case 4: \ | 118 | case 4: \ |
136 | asm volatile(lock "cmpxchgl %1,%2" \ | 119 | { \ |
137 | : "=a"(__ret) \ | 120 | volatile u32 *__ptr = (volatile u32 *)(ptr); \ |
138 | : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ | 121 | asm volatile(lock "cmpxchgl %2,%1" \ |
122 | : "=a" (__ret), "+m" (*__ptr) \ | ||
123 | : "r" (__new), "0" (__old) \ | ||
139 | : "memory"); \ | 124 | : "memory"); \ |
140 | break; \ | 125 | break; \ |
126 | } \ | ||
141 | default: \ | 127 | default: \ |
142 | __cmpxchg_wrong_size(); \ | 128 | __cmpxchg_wrong_size(); \ |
143 | } \ | 129 | } \ |
@@ -175,32 +161,28 @@ extern void __cmpxchg_wrong_size(void); | |||
175 | (unsigned long long)(n))) | 161 | (unsigned long long)(n))) |
176 | #endif | 162 | #endif |
177 | 163 | ||
178 | static inline unsigned long long __cmpxchg64(volatile void *ptr, | 164 | static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) |
179 | unsigned long long old, | ||
180 | unsigned long long new) | ||
181 | { | 165 | { |
182 | unsigned long long prev; | 166 | u64 prev; |
183 | asm volatile(LOCK_PREFIX "cmpxchg8b %3" | 167 | asm volatile(LOCK_PREFIX "cmpxchg8b %1" |
184 | : "=A"(prev) | 168 | : "=A" (prev), |
185 | : "b"((unsigned long)new), | 169 | "+m" (*ptr) |
186 | "c"((unsigned long)(new >> 32)), | 170 | : "b" ((u32)new), |
187 | "m"(*__xg(ptr)), | 171 | "c" ((u32)(new >> 32)), |
188 | "0"(old) | 172 | "0" (old) |
189 | : "memory"); | 173 | : "memory"); |
190 | return prev; | 174 | return prev; |
191 | } | 175 | } |
192 | 176 | ||
193 | static inline unsigned long long __cmpxchg64_local(volatile void *ptr, | 177 | static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) |
194 | unsigned long long old, | ||
195 | unsigned long long new) | ||
196 | { | 178 | { |
197 | unsigned long long prev; | 179 | u64 prev; |
198 | asm volatile("cmpxchg8b %3" | 180 | asm volatile("cmpxchg8b %1" |
199 | : "=A"(prev) | 181 | : "=A" (prev), |
200 | : "b"((unsigned long)new), | 182 | "+m" (*ptr) |
201 | "c"((unsigned long)(new >> 32)), | 183 | : "b" ((u32)new), |
202 | "m"(*__xg(ptr)), | 184 | "c" ((u32)(new >> 32)), |
203 | "0"(old) | 185 | "0" (old) |
204 | : "memory"); | 186 | : "memory"); |
205 | return prev; | 187 | return prev; |
206 | } | 188 | } |
@@ -264,8 +246,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, | |||
264 | * to simulate the cmpxchg8b on the 80386 and 80486 CPU. | 246 | * to simulate the cmpxchg8b on the 80386 and 80486 CPU. |
265 | */ | 247 | */ |
266 | 248 | ||
267 | extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); | ||
268 | |||
269 | #define cmpxchg64(ptr, o, n) \ | 249 | #define cmpxchg64(ptr, o, n) \ |
270 | ({ \ | 250 | ({ \ |
271 | __typeof__(*(ptr)) __ret; \ | 251 | __typeof__(*(ptr)) __ret; \ |
@@ -283,20 +263,20 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); | |||
283 | __ret; }) | 263 | __ret; }) |
284 | 264 | ||
285 | 265 | ||
286 | 266 | #define cmpxchg64_local(ptr, o, n) \ | |
287 | #define cmpxchg64_local(ptr, o, n) \ | 267 | ({ \ |
288 | ({ \ | 268 | __typeof__(*(ptr)) __ret; \ |
289 | __typeof__(*(ptr)) __ret; \ | 269 | __typeof__(*(ptr)) __old = (o); \ |
290 | if (likely(boot_cpu_data.x86 > 4)) \ | 270 | __typeof__(*(ptr)) __new = (n); \ |
291 | __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \ | 271 | alternative_io("call cmpxchg8b_emu", \ |
292 | (unsigned long long)(o), \ | 272 | "cmpxchg8b (%%esi)" , \ |
293 | (unsigned long long)(n)); \ | 273 | X86_FEATURE_CX8, \ |
294 | else \ | 274 | "=A" (__ret), \ |
295 | __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ | 275 | "S" ((ptr)), "0" (__old), \ |
296 | (unsigned long long)(o), \ | 276 | "b" ((unsigned int)__new), \ |
297 | (unsigned long long)(n)); \ | 277 | "c" ((unsigned int)(__new>>32)) \ |
298 | __ret; \ | 278 | : "memory"); \ |
299 | }) | 279 | __ret; }) |
300 | 280 | ||
301 | #endif | 281 | #endif |
302 | 282 | ||
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 485ae415faec..423ae58aa020 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h | |||
@@ -3,51 +3,60 @@ | |||
3 | 3 | ||
4 | #include <asm/alternative.h> /* Provides LOCK_PREFIX */ | 4 | #include <asm/alternative.h> /* Provides LOCK_PREFIX */ |
5 | 5 | ||
6 | #define __xg(x) ((volatile long *)(x)) | 6 | static inline void set_64bit(volatile u64 *ptr, u64 val) |
7 | |||
8 | static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) | ||
9 | { | 7 | { |
10 | *ptr = val; | 8 | *ptr = val; |
11 | } | 9 | } |
12 | 10 | ||
13 | #define _set_64bit set_64bit | ||
14 | |||
15 | extern void __xchg_wrong_size(void); | 11 | extern void __xchg_wrong_size(void); |
16 | extern void __cmpxchg_wrong_size(void); | 12 | extern void __cmpxchg_wrong_size(void); |
17 | 13 | ||
18 | /* | 14 | /* |
19 | * Note: no "lock" prefix even on SMP: xchg always implies lock anyway | 15 | * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. |
20 | * Note 2: xchg has side effect, so that attribute volatile is necessary, | 16 | * Since this is generally used to protect other memory information, we |
21 | * but generally the primitive is invalid, *ptr is output argument. --ANK | 17 | * use "asm volatile" and "memory" clobbers to prevent gcc from moving |
18 | * information around. | ||
22 | */ | 19 | */ |
23 | #define __xchg(x, ptr, size) \ | 20 | #define __xchg(x, ptr, size) \ |
24 | ({ \ | 21 | ({ \ |
25 | __typeof(*(ptr)) __x = (x); \ | 22 | __typeof(*(ptr)) __x = (x); \ |
26 | switch (size) { \ | 23 | switch (size) { \ |
27 | case 1: \ | 24 | case 1: \ |
28 | asm volatile("xchgb %b0,%1" \ | 25 | { \ |
29 | : "=q" (__x) \ | 26 | volatile u8 *__ptr = (volatile u8 *)(ptr); \ |
30 | : "m" (*__xg(ptr)), "0" (__x) \ | 27 | asm volatile("xchgb %0,%1" \ |
28 | : "=q" (__x), "+m" (*__ptr) \ | ||
29 | : "0" (__x) \ | ||
31 | : "memory"); \ | 30 | : "memory"); \ |
32 | break; \ | 31 | break; \ |
32 | } \ | ||
33 | case 2: \ | 33 | case 2: \ |
34 | asm volatile("xchgw %w0,%1" \ | 34 | { \ |
35 | : "=r" (__x) \ | 35 | volatile u16 *__ptr = (volatile u16 *)(ptr); \ |
36 | : "m" (*__xg(ptr)), "0" (__x) \ | 36 | asm volatile("xchgw %0,%1" \ |
37 | : "=r" (__x), "+m" (*__ptr) \ | ||
38 | : "0" (__x) \ | ||
37 | : "memory"); \ | 39 | : "memory"); \ |
38 | break; \ | 40 | break; \ |
41 | } \ | ||
39 | case 4: \ | 42 | case 4: \ |
40 | asm volatile("xchgl %k0,%1" \ | 43 | { \ |
41 | : "=r" (__x) \ | 44 | volatile u32 *__ptr = (volatile u32 *)(ptr); \ |
42 | : "m" (*__xg(ptr)), "0" (__x) \ | 45 | asm volatile("xchgl %0,%1" \ |
46 | : "=r" (__x), "+m" (*__ptr) \ | ||
47 | : "0" (__x) \ | ||
43 | : "memory"); \ | 48 | : "memory"); \ |
44 | break; \ | 49 | break; \ |
50 | } \ | ||
45 | case 8: \ | 51 | case 8: \ |
52 | { \ | ||
53 | volatile u64 *__ptr = (volatile u64 *)(ptr); \ | ||
46 | asm volatile("xchgq %0,%1" \ | 54 | asm volatile("xchgq %0,%1" \ |
47 | : "=r" (__x) \ | 55 | : "=r" (__x), "+m" (*__ptr) \ |
48 | : "m" (*__xg(ptr)), "0" (__x) \ | 56 | : "0" (__x) \ |
49 | : "memory"); \ | 57 | : "memory"); \ |
50 | break; \ | 58 | break; \ |
59 | } \ | ||
51 | default: \ | 60 | default: \ |
52 | __xchg_wrong_size(); \ | 61 | __xchg_wrong_size(); \ |
53 | } \ | 62 | } \ |
@@ -71,29 +80,41 @@ extern void __cmpxchg_wrong_size(void); | |||
71 | __typeof__(*(ptr)) __new = (new); \ | 80 | __typeof__(*(ptr)) __new = (new); \ |
72 | switch (size) { \ | 81 | switch (size) { \ |
73 | case 1: \ | 82 | case 1: \ |
74 | asm volatile(lock "cmpxchgb %b1,%2" \ | 83 | { \ |
75 | : "=a"(__ret) \ | 84 | volatile u8 *__ptr = (volatile u8 *)(ptr); \ |
76 | : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ | 85 | asm volatile(lock "cmpxchgb %2,%1" \ |
86 | : "=a" (__ret), "+m" (*__ptr) \ | ||
87 | : "q" (__new), "0" (__old) \ | ||
77 | : "memory"); \ | 88 | : "memory"); \ |
78 | break; \ | 89 | break; \ |
90 | } \ | ||
79 | case 2: \ | 91 | case 2: \ |
80 | asm volatile(lock "cmpxchgw %w1,%2" \ | 92 | { \ |
81 | : "=a"(__ret) \ | 93 | volatile u16 *__ptr = (volatile u16 *)(ptr); \ |
82 | : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ | 94 | asm volatile(lock "cmpxchgw %2,%1" \ |
95 | : "=a" (__ret), "+m" (*__ptr) \ | ||
96 | : "r" (__new), "0" (__old) \ | ||
83 | : "memory"); \ | 97 | : "memory"); \ |
84 | break; \ | 98 | break; \ |
99 | } \ | ||
85 | case 4: \ | 100 | case 4: \ |
86 | asm volatile(lock "cmpxchgl %k1,%2" \ | 101 | { \ |
87 | : "=a"(__ret) \ | 102 | volatile u32 *__ptr = (volatile u32 *)(ptr); \ |
88 | : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ | 103 | asm volatile(lock "cmpxchgl %2,%1" \ |
104 | : "=a" (__ret), "+m" (*__ptr) \ | ||
105 | : "r" (__new), "0" (__old) \ | ||
89 | : "memory"); \ | 106 | : "memory"); \ |
90 | break; \ | 107 | break; \ |
108 | } \ | ||
91 | case 8: \ | 109 | case 8: \ |
92 | asm volatile(lock "cmpxchgq %1,%2" \ | 110 | { \ |
93 | : "=a"(__ret) \ | 111 | volatile u64 *__ptr = (volatile u64 *)(ptr); \ |
94 | : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ | 112 | asm volatile(lock "cmpxchgq %2,%1" \ |
113 | : "=a" (__ret), "+m" (*__ptr) \ | ||
114 | : "r" (__new), "0" (__old) \ | ||
95 | : "memory"); \ | 115 | : "memory"); \ |
96 | break; \ | 116 | break; \ |
117 | } \ | ||
97 | default: \ | 118 | default: \ |
98 | __cmpxchg_wrong_size(); \ | 119 | __cmpxchg_wrong_size(); \ |
99 | } \ | 120 | } \ |
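
The change above drops the __xg() cast and makes the memory location an explicit "+m" operand, so gcc knows it is both read and written. A minimal stand-alone analogue of the same constraint shape (an assumed user-space test for gcc on x86-64, with the made-up name my_xchg, not kernel code):

#include <stdio.h>

static unsigned long my_xchg(volatile unsigned long *ptr, unsigned long val)
{
	asm volatile("xchgq %0,%1"
		     : "=r" (val), "+m" (*ptr)	/* *ptr is read and written */
		     : "0" (val)
		     : "memory");
	return val;				/* previous contents of *ptr */
}

int main(void)
{
	volatile unsigned long v = 1;
	unsigned long old = my_xchg(&v, 5);

	printf("old=%lu new=%lu\n", old, (unsigned long)v);
	return 0;
}
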
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 468145914389..781a50b29a49 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h | |||
@@ -6,7 +6,7 @@ | |||
6 | 6 | ||
7 | #include <asm/required-features.h> | 7 | #include <asm/required-features.h> |
8 | 8 | ||
9 | #define NCAPINTS 9 /* N 32-bit words worth of info */ | 9 | #define NCAPINTS 10 /* N 32-bit words worth of info */ |
10 | 10 | ||
11 | /* | 11 | /* |
12 | * Note: If the comment begins with a quoted string, that string is used | 12 | * Note: If the comment begins with a quoted string, that string is used |
@@ -89,7 +89,7 @@ | |||
89 | #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */ | 89 | #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */ |
90 | #define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */ | 90 | #define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */ |
91 | #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ | 91 | #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ |
92 | #define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ | 92 | /* 21 available, was AMD_C1E */ |
93 | #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ | 93 | #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ |
94 | #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ | 94 | #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ |
95 | #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ | 95 | #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ |
@@ -124,6 +124,8 @@ | |||
124 | #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ | 124 | #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ |
125 | #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ | 125 | #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ |
126 | #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ | 126 | #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ |
127 | #define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */ | ||
128 | #define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */ | ||
127 | #define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ | 129 | #define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ |
128 | 130 | ||
129 | /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ | 131 | /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ |
@@ -157,22 +159,29 @@ | |||
157 | 159 | ||
158 | /* | 160 | /* |
159 | * Auxiliary flags: Linux defined - For features scattered in various | 161 | * Auxiliary flags: Linux defined - For features scattered in various |
160 | * CPUID levels like 0x6, 0xA etc | 162 | * CPUID levels like 0x6, 0xA etc, word 7 |
161 | */ | 163 | */ |
162 | #define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ | 164 | #define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ |
163 | #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ | 165 | #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ |
164 | #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ | 166 | #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ |
167 | #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ | ||
168 | #define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ | ||
169 | #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ | ||
170 | #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ | ||
165 | 171 | ||
166 | /* Virtualization flags: Linux defined */ | 172 | /* Virtualization flags: Linux defined, word 8 */ |
167 | #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ | 173 | #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ |
168 | #define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ | 174 | #define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ |
169 | #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ | 175 | #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ |
170 | #define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ | 176 | #define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ |
171 | #define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ | 177 | #define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ |
172 | #define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */ | 178 | #define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */ |
173 | #define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */ | 179 | #define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ |
174 | #define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ | 180 | #define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ |
175 | #define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ | 181 | #define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ |
182 | |||
183 | /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ | ||
184 | #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ | ||
176 | 185 | ||
177 | #if defined(__KERNEL__) && !defined(__ASSEMBLY__) | 186 | #if defined(__KERNEL__) && !defined(__ASSEMBLY__) |
178 | 187 | ||
@@ -194,7 +203,9 @@ extern const char * const x86_power_flags[32]; | |||
194 | (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \ | 203 | (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \ |
195 | (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \ | 204 | (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \ |
196 | (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ | 205 | (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ |
197 | (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \ | 206 | (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ |
207 | (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ | ||
208 | (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ | ||
198 | ? 1 : \ | 209 | ? 1 : \ |
199 | test_cpu_cap(c, bit)) | 210 | test_cpu_cap(c, bit)) |
200 | 211 | ||
@@ -291,7 +302,7 @@ extern const char * const x86_power_flags[32]; | |||
291 | * patch the target code for additional performance. | 302 | * patch the target code for additional performance. |
292 | * | 303 | * |
293 | */ | 304 | */ |
294 | static __always_inline __pure bool __static_cpu_has(u8 bit) | 305 | static __always_inline __pure bool __static_cpu_has(u16 bit) |
295 | { | 306 | { |
296 | #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) | 307 | #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) |
297 | asm goto("1: jmp %l[t_no]\n" | 308 | asm goto("1: jmp %l[t_no]\n" |
@@ -300,11 +311,11 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) | |||
300 | _ASM_ALIGN "\n" | 311 | _ASM_ALIGN "\n" |
301 | _ASM_PTR "1b\n" | 312 | _ASM_PTR "1b\n" |
302 | _ASM_PTR "0\n" /* no replacement */ | 313 | _ASM_PTR "0\n" /* no replacement */ |
303 | " .byte %P0\n" /* feature bit */ | 314 | " .word %P0\n" /* feature bit */ |
304 | " .byte 2b - 1b\n" /* source len */ | 315 | " .byte 2b - 1b\n" /* source len */ |
305 | " .byte 0\n" /* replacement len */ | 316 | " .byte 0\n" /* replacement len */ |
306 | " .byte 0xff + 0 - (2b-1b)\n" /* padding */ | ||
307 | ".previous\n" | 317 | ".previous\n" |
318 | /* skipping size check since replacement size = 0 */ | ||
308 | : : "i" (bit) : : t_no); | 319 | : : "i" (bit) : : t_no); |
309 | return true; | 320 | return true; |
310 | t_no: | 321 | t_no: |
@@ -318,10 +329,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) | |||
318 | _ASM_ALIGN "\n" | 329 | _ASM_ALIGN "\n" |
319 | _ASM_PTR "1b\n" | 330 | _ASM_PTR "1b\n" |
320 | _ASM_PTR "3f\n" | 331 | _ASM_PTR "3f\n" |
321 | " .byte %P1\n" /* feature bit */ | 332 | " .word %P1\n" /* feature bit */ |
322 | " .byte 2b - 1b\n" /* source len */ | 333 | " .byte 2b - 1b\n" /* source len */ |
323 | " .byte 4f - 3f\n" /* replacement len */ | 334 | " .byte 4f - 3f\n" /* replacement len */ |
324 | " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */ | 335 | ".previous\n" |
336 | ".section .discard,\"aw\",@progbits\n" | ||
337 | " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ | ||
325 | ".previous\n" | 338 | ".previous\n" |
326 | ".section .altinstr_replacement,\"ax\"\n" | 339 | ".section .altinstr_replacement,\"ax\"\n" |
327 | "3: movb $1,%0\n" | 340 | "3: movb $1,%0\n" |
@@ -337,7 +350,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) | |||
337 | ( \ | 350 | ( \ |
338 | __builtin_constant_p(boot_cpu_has(bit)) ? \ | 351 | __builtin_constant_p(boot_cpu_has(bit)) ? \ |
339 | boot_cpu_has(bit) : \ | 352 | boot_cpu_has(bit) : \ |
340 | (__builtin_constant_p(bit) && !((bit) & ~0xff)) ? \ | 353 | __builtin_constant_p(bit) ? \ |
341 | __static_cpu_has(bit) : \ | 354 | __static_cpu_has(bit) : \ |
342 | boot_cpu_has(bit) \ | 355 | boot_cpu_has(bit) \ |
343 | ) | 356 | ) |
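
With NCAPINTS grown to 10, feature numbers can exceed 255, which is why __static_cpu_has() now takes a u16 and the alternatives record emits .word instead of .byte. A worked mapping from feature number to capability word/bit, derived from the defines above (the two helper macros are only for illustration, they are not in the header):

/* Illustration: how a feature number indexes x86_capability[]. */
#define ILLUSTRATIVE_WORD(bit)	((bit) >> 5)	/* which 32-bit word */
#define ILLUSTRATIVE_BIT(bit)	((bit) & 31)	/* bit inside that word */

/* X86_FEATURE_XSAVEOPT = 7*32 + 4 = 228 -> word 7, bit 4 */
/* X86_FEATURE_FSGSBASE = 9*32 + 0 = 288 -> word 9, bit 0 (no longer fits in a u8) */
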
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index ac91eed21061..d4c419f883a0 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h | |||
@@ -54,7 +54,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) | |||
54 | 54 | ||
55 | #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) | 55 | #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) |
56 | #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) | 56 | #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) |
57 | #define dma_is_consistent(d, h) (1) | ||
58 | 57 | ||
59 | extern int dma_supported(struct device *hwdev, u64 mask); | 58 | extern int dma_supported(struct device *hwdev, u64 mask); |
60 | extern int dma_set_mask(struct device *dev, u64 mask); | 59 | extern int dma_set_mask(struct device *dev, u64 mask); |
@@ -87,13 +86,6 @@ dma_cache_sync(struct device *dev, void *vaddr, size_t size, | |||
87 | flush_write_buffers(); | 86 | flush_write_buffers(); |
88 | } | 87 | } |
89 | 88 | ||
90 | static inline int dma_get_cache_alignment(void) | ||
91 | { | ||
92 | /* no easy way to get cache size on all x86, so return the | ||
93 | * maximum possible, to be safe */ | ||
94 | return boot_cpu_data.x86_clflush_size; | ||
95 | } | ||
96 | |||
97 | static inline unsigned long dma_alloc_coherent_mask(struct device *dev, | 89 | static inline unsigned long dma_alloc_coherent_mask(struct device *dev, |
98 | gfp_t gfp) | 90 | gfp_t gfp) |
99 | { | 91 | { |
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index a726650fc80f..8caac76ac324 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h | |||
@@ -61,7 +61,7 @@ void *kmap(struct page *page); | |||
61 | void kunmap(struct page *page); | 61 | void kunmap(struct page *page); |
62 | void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); | 62 | void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); |
63 | void *kmap_atomic(struct page *page, enum km_type type); | 63 | void *kmap_atomic(struct page *page, enum km_type type); |
64 | void kunmap_atomic(void *kvaddr, enum km_type type); | 64 | void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); |
65 | void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); | 65 | void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); |
66 | void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); | 66 | void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); |
67 | struct page *kmap_atomic_to_page(void *ptr); | 67 | struct page *kmap_atomic_to_page(void *ptr); |
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 942255310e6a..528a11e8d3e3 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h | |||
@@ -20,10 +20,10 @@ struct arch_hw_breakpoint { | |||
20 | #include <linux/list.h> | 20 | #include <linux/list.h> |
21 | 21 | ||
22 | /* Available HW breakpoint length encodings */ | 22 | /* Available HW breakpoint length encodings */ |
23 | #define X86_BREAKPOINT_LEN_X 0x00 | ||
23 | #define X86_BREAKPOINT_LEN_1 0x40 | 24 | #define X86_BREAKPOINT_LEN_1 0x40 |
24 | #define X86_BREAKPOINT_LEN_2 0x44 | 25 | #define X86_BREAKPOINT_LEN_2 0x44 |
25 | #define X86_BREAKPOINT_LEN_4 0x4c | 26 | #define X86_BREAKPOINT_LEN_4 0x4c |
26 | #define X86_BREAKPOINT_LEN_EXECUTE 0x40 | ||
27 | 27 | ||
28 | #ifdef CONFIG_X86_64 | 28 | #ifdef CONFIG_X86_64 |
29 | #define X86_BREAKPOINT_LEN_8 0x48 | 29 | #define X86_BREAKPOINT_LEN_8 0x48 |
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 70abda7058c8..ff2546ce7178 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h | |||
@@ -45,5 +45,6 @@ extern const struct hypervisor_x86 *x86_hyper; | |||
45 | /* Recognized hypervisors */ | 45 | /* Recognized hypervisors */ |
46 | extern const struct hypervisor_x86 x86_hyper_vmware; | 46 | extern const struct hypervisor_x86 x86_hyper_vmware; |
47 | extern const struct hypervisor_x86 x86_hyper_ms_hyperv; | 47 | extern const struct hypervisor_x86 x86_hyper_ms_hyperv; |
48 | extern const struct hypervisor_x86 x86_hyper_xen_hvm; | ||
48 | 49 | ||
49 | #endif | 50 | #endif |
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c991b3a7b904..a73a8d5a5e69 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h | |||
@@ -31,7 +31,6 @@ extern void mxcsr_feature_mask_init(void); | |||
31 | extern int init_fpu(struct task_struct *child); | 31 | extern int init_fpu(struct task_struct *child); |
32 | extern asmlinkage void math_state_restore(void); | 32 | extern asmlinkage void math_state_restore(void); |
33 | extern void __math_state_restore(void); | 33 | extern void __math_state_restore(void); |
34 | extern void init_thread_xstate(void); | ||
35 | extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); | 34 | extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); |
36 | 35 | ||
37 | extern user_regset_active_fn fpregs_active, xfpregs_active; | 36 | extern user_regset_active_fn fpregs_active, xfpregs_active; |
@@ -58,11 +57,25 @@ extern int restore_i387_xstate_ia32(void __user *buf); | |||
58 | 57 | ||
59 | #define X87_FSW_ES (1 << 7) /* Exception Summary */ | 58 | #define X87_FSW_ES (1 << 7) /* Exception Summary */ |
60 | 59 | ||
60 | static __always_inline __pure bool use_xsaveopt(void) | ||
61 | { | ||
62 | return static_cpu_has(X86_FEATURE_XSAVEOPT); | ||
63 | } | ||
64 | |||
61 | static __always_inline __pure bool use_xsave(void) | 65 | static __always_inline __pure bool use_xsave(void) |
62 | { | 66 | { |
63 | return static_cpu_has(X86_FEATURE_XSAVE); | 67 | return static_cpu_has(X86_FEATURE_XSAVE); |
64 | } | 68 | } |
65 | 69 | ||
70 | extern void __sanitize_i387_state(struct task_struct *); | ||
71 | |||
72 | static inline void sanitize_i387_state(struct task_struct *tsk) | ||
73 | { | ||
74 | if (!use_xsaveopt()) | ||
75 | return; | ||
76 | __sanitize_i387_state(tsk); | ||
77 | } | ||
78 | |||
66 | #ifdef CONFIG_X86_64 | 79 | #ifdef CONFIG_X86_64 |
67 | 80 | ||
68 | /* Ignore delayed exceptions from user space */ | 81 | /* Ignore delayed exceptions from user space */ |
@@ -127,6 +140,15 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) | |||
127 | { | 140 | { |
128 | int err; | 141 | int err; |
129 | 142 | ||
143 | /* | ||
144 | * Clear the bytes not touched by the fxsave and reserved | ||
145 | * for the SW usage. | ||
146 | */ | ||
147 | err = __clear_user(&fx->sw_reserved, | ||
148 | sizeof(struct _fpx_sw_bytes)); | ||
149 | if (unlikely(err)) | ||
150 | return -EFAULT; | ||
151 | |||
130 | asm volatile("1: rex64/fxsave (%[fx])\n\t" | 152 | asm volatile("1: rex64/fxsave (%[fx])\n\t" |
131 | "2:\n" | 153 | "2:\n" |
132 | ".section .fixup,\"ax\"\n" | 154 | ".section .fixup,\"ax\"\n" |
@@ -482,6 +504,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src) | |||
482 | memcpy(dst->state, src->state, xstate_size); | 504 | memcpy(dst->state, src->state, xstate_size); |
483 | } | 505 | } |
484 | 506 | ||
507 | extern void fpu_finit(struct fpu *fpu); | ||
508 | |||
485 | #endif /* __ASSEMBLY__ */ | 509 | #endif /* __ASSEMBLY__ */ |
486 | 510 | ||
487 | #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 | 511 | #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 |
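
sanitize_i387_state() above follows the usual fast-path wrapper idiom: the static_cpu_has()-based test is inlined (and patched by alternatives), so hardware without XSAVEOPT pays essentially nothing, while the real work stays out of line. A generic sketch of the same shape, using hypothetical names:

/* Hypothetical names; same structure as sanitize_i387_state() above. */
extern void __scrub_state_slow(struct task_struct *tsk);	/* rare, out of line */

static inline void scrub_state(struct task_struct *tsk)
{
	if (!use_xsaveopt())		/* near-free test on non-XSAVEOPT systems */
		return;
	__scrub_state_slow(tsk);
}
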
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 4470c9ad4a3e..29f66793cc55 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h | |||
@@ -1,6 +1,12 @@ | |||
1 | #ifndef _ASM_X86_INTEL_SCU_IPC_H_ | 1 | #ifndef _ASM_X86_INTEL_SCU_IPC_H_ |
2 | #define _ASM_X86_INTEL_SCU_IPC_H_ | 2 | #define _ASM_X86_INTEL_SCU_IPC_H_ |
3 | 3 | ||
4 | #define IPCMSG_VRTC 0xFA /* Set vRTC device */ | ||
5 | |||
6 | /* Command id associated with message IPCMSG_VRTC */ | ||
7 | #define IPC_CMD_VRTC_SETTIME 1 /* Set time */ | ||
8 | #define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */ | ||
9 | |||
4 | /* Read single register */ | 10 | /* Read single register */ |
5 | int intel_scu_ipc_ioread8(u16 addr, u8 *data); | 11 | int intel_scu_ipc_ioread8(u16 addr, u8 *data); |
6 | 12 | ||
@@ -28,20 +34,6 @@ int intel_scu_ipc_writev(u16 *addr, u8 *data, int len); | |||
28 | /* Update single register based on the mask */ | 34 | /* Update single register based on the mask */ |
29 | int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); | 35 | int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); |
30 | 36 | ||
31 | /* | ||
32 | * Indirect register read | ||
33 | * Can be used when SCCB(System Controller Configuration Block) register | ||
34 | * HRIM(Honor Restricted IPC Messages) is set (bit 23) | ||
35 | */ | ||
36 | int intel_scu_ipc_register_read(u32 addr, u32 *data); | ||
37 | |||
38 | /* | ||
39 | * Indirect register write | ||
40 | * Can be used when SCCB(System Controller Configuration Block) register | ||
41 | * HRIM(Honor Restricted IPC Messages) is set (bit 23) | ||
42 | */ | ||
43 | int intel_scu_ipc_register_write(u32 addr, u32 data); | ||
44 | |||
45 | /* Issue commands to the SCU with or without data */ | 37 | /* Issue commands to the SCU with or without data */ |
46 | int intel_scu_ipc_simple_command(int cmd, int sub); | 38 | int intel_scu_ipc_simple_command(int cmd, int sub); |
47 | int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, | 39 | int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, |
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 8767d99c4f64..e2ca30092557 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h | |||
@@ -125,6 +125,9 @@ | |||
125 | */ | 125 | */ |
126 | #define MCE_SELF_VECTOR 0xeb | 126 | #define MCE_SELF_VECTOR 0xeb |
127 | 127 | ||
128 | /* Xen vector callback to receive events in a HVM domain */ | ||
129 | #define XEN_HVM_EVTCHN_CALLBACK 0xe9 | ||
130 | |||
128 | #define NR_VECTORS 256 | 131 | #define NR_VECTORS 256 |
129 | 132 | ||
130 | #define FPU_IRQ 13 | 133 | #define FPU_IRQ 13 |
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index fa7c0b974761..5bdfca86581b 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h | |||
@@ -33,5 +33,11 @@ extern void __show_regs(struct pt_regs *regs, int all); | |||
33 | extern void show_regs(struct pt_regs *regs); | 33 | extern void show_regs(struct pt_regs *regs); |
34 | extern unsigned long oops_begin(void); | 34 | extern unsigned long oops_begin(void); |
35 | extern void oops_end(unsigned long, struct pt_regs *, int signr); | 35 | extern void oops_end(unsigned long, struct pt_regs *, int signr); |
36 | #ifdef CONFIG_KEXEC | ||
37 | extern int in_crash_kexec; | ||
38 | #else | ||
39 | /* no crash dump is ever in progress if no crash kernel can be kexec'd */ | ||
40 | #define in_crash_kexec 0 | ||
41 | #endif | ||
36 | 42 | ||
37 | #endif /* _ASM_X86_KDEBUG_H */ | 43 | #endif /* _ASM_X86_KDEBUG_H */ |
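
Defining in_crash_kexec to a literal 0 when CONFIG_KEXEC is off lets callers test it without #ifdefs; the compiler simply discards the branch. A hypothetical caller, for illustration:

/* Compiles (and folds away) whether or not CONFIG_KEXEC is set. */
static void maybe_dump_state(struct pt_regs *regs)
{
	if (in_crash_kexec)
		return;			/* stay quiet while a crash dump is in progress */
	show_regs(regs);
}
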
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 006da3687cdc..396f5b5fc4d7 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h | |||
@@ -39,9 +39,11 @@ enum regnames { | |||
39 | GDB_FS, /* 14 */ | 39 | GDB_FS, /* 14 */ |
40 | GDB_GS, /* 15 */ | 40 | GDB_GS, /* 15 */ |
41 | }; | 41 | }; |
42 | #define GDB_ORIG_AX 41 | ||
43 | #define DBG_MAX_REG_NUM 16 | ||
42 | #define NUMREGBYTES ((GDB_GS+1)*4) | 44 | #define NUMREGBYTES ((GDB_GS+1)*4) |
43 | #else /* ! CONFIG_X86_32 */ | 45 | #else /* ! CONFIG_X86_32 */ |
44 | enum regnames64 { | 46 | enum regnames { |
45 | GDB_AX, /* 0 */ | 47 | GDB_AX, /* 0 */ |
46 | GDB_BX, /* 1 */ | 48 | GDB_BX, /* 1 */ |
47 | GDB_CX, /* 2 */ | 49 | GDB_CX, /* 2 */ |
@@ -59,15 +61,15 @@ enum regnames64 { | |||
59 | GDB_R14, /* 14 */ | 61 | GDB_R14, /* 14 */ |
60 | GDB_R15, /* 15 */ | 62 | GDB_R15, /* 15 */ |
61 | GDB_PC, /* 16 */ | 63 | GDB_PC, /* 16 */ |
64 | GDB_PS, /* 17 */ | ||
65 | GDB_CS, /* 18 */ | ||
66 | GDB_SS, /* 19 */ | ||
62 | }; | 67 | }; |
63 | 68 | #define GDB_ORIG_AX 57 | |
64 | enum regnames32 { | 69 | #define DBG_MAX_REG_NUM 20 |
65 | GDB_PS = 34, | 70 | /* 17 64 bit regs and 3 32 bit regs */ |
66 | GDB_CS, | 71 | #define NUMREGBYTES ((17 * 8) + (3 * 4)) |
67 | GDB_SS, | 72 | #endif /* ! CONFIG_X86_32 */ |
68 | }; | ||
69 | #define NUMREGBYTES ((GDB_SS+1)*4) | ||
70 | #endif /* CONFIG_X86_32 */ | ||
71 | 73 | ||
72 | static inline void arch_kgdb_breakpoint(void) | 74 | static inline void arch_kgdb_breakpoint(void) |
73 | { | 75 | { |
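
For reference, the new register-image sizes work out as: x86-64 carries 17 registers at 8 bytes plus 3 at 4 bytes, so NUMREGBYTES = 17*8 + 3*4 = 148 bytes, while 32-bit stays at (GDB_GS + 1) * 4 = 16 * 4 = 64 bytes.
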
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index ff90055c7f0b..4d8dcbdfc120 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h | |||
@@ -22,6 +22,8 @@ | |||
22 | #define __KVM_HAVE_XEN_HVM | 22 | #define __KVM_HAVE_XEN_HVM |
23 | #define __KVM_HAVE_VCPU_EVENTS | 23 | #define __KVM_HAVE_VCPU_EVENTS |
24 | #define __KVM_HAVE_DEBUGREGS | 24 | #define __KVM_HAVE_DEBUGREGS |
25 | #define __KVM_HAVE_XSAVE | ||
26 | #define __KVM_HAVE_XCRS | ||
25 | 27 | ||
26 | /* Architectural interrupt line count. */ | 28 | /* Architectural interrupt line count. */ |
27 | #define KVM_NR_INTERRUPTS 256 | 29 | #define KVM_NR_INTERRUPTS 256 |
@@ -299,4 +301,24 @@ struct kvm_debugregs { | |||
299 | __u64 reserved[9]; | 301 | __u64 reserved[9]; |
300 | }; | 302 | }; |
301 | 303 | ||
304 | /* for KVM_CAP_XSAVE */ | ||
305 | struct kvm_xsave { | ||
306 | __u32 region[1024]; | ||
307 | }; | ||
308 | |||
309 | #define KVM_MAX_XCRS 16 | ||
310 | |||
311 | struct kvm_xcr { | ||
312 | __u32 xcr; | ||
313 | __u32 reserved; | ||
314 | __u64 value; | ||
315 | }; | ||
316 | |||
317 | struct kvm_xcrs { | ||
318 | __u32 nr_xcrs; | ||
319 | __u32 flags; | ||
320 | struct kvm_xcr xcrs[KVM_MAX_XCRS]; | ||
321 | __u64 padding[16]; | ||
322 | }; | ||
323 | |||
302 | #endif /* _ASM_X86_KVM_H */ | 324 | #endif /* _ASM_X86_KVM_H */ |
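
The new kvm_xcr/kvm_xcrs structures describe extended control registers to userspace. A hedged sketch of how a VMM might fill one in; vcpu_fd is assumed to be an already-created KVM vCPU fd, and KVM_SET_XCRS is the ioctl this capability is expected to pair with (not shown in this diff):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_xcr0(int vcpu_fd, unsigned long long xcr0_bits)
{
	struct kvm_xcrs xcrs = {
		.nr_xcrs = 1,
		.xcrs[0].xcr = 0,		/* XCR0 */
		.xcrs[0].value = xcr0_bits,	/* e.g. 0x7 = x87 | SSE | AVX */
	};

	return ioctl(vcpu_fd, KVM_SET_XCRS, &xcrs);
}
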
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0b2729bf2070..51cfd730ac5d 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h | |||
@@ -51,8 +51,10 @@ struct x86_emulate_ctxt; | |||
51 | #define X86EMUL_UNHANDLEABLE 1 | 51 | #define X86EMUL_UNHANDLEABLE 1 |
52 | /* Terminate emulation but return success to the caller. */ | 52 | /* Terminate emulation but return success to the caller. */ |
53 | #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ | 53 | #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ |
54 | #define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ | 54 | #define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ |
55 | #define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ | 55 | #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ |
56 | #define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ | ||
57 | |||
56 | struct x86_emulate_ops { | 58 | struct x86_emulate_ops { |
57 | /* | 59 | /* |
58 | * read_std: Read bytes of standard (non-emulated/special) memory. | 60 | * read_std: Read bytes of standard (non-emulated/special) memory. |
@@ -92,6 +94,7 @@ struct x86_emulate_ops { | |||
92 | int (*read_emulated)(unsigned long addr, | 94 | int (*read_emulated)(unsigned long addr, |
93 | void *val, | 95 | void *val, |
94 | unsigned int bytes, | 96 | unsigned int bytes, |
97 | unsigned int *error, | ||
95 | struct kvm_vcpu *vcpu); | 98 | struct kvm_vcpu *vcpu); |
96 | 99 | ||
97 | /* | 100 | /* |
@@ -104,6 +107,7 @@ struct x86_emulate_ops { | |||
104 | int (*write_emulated)(unsigned long addr, | 107 | int (*write_emulated)(unsigned long addr, |
105 | const void *val, | 108 | const void *val, |
106 | unsigned int bytes, | 109 | unsigned int bytes, |
110 | unsigned int *error, | ||
107 | struct kvm_vcpu *vcpu); | 111 | struct kvm_vcpu *vcpu); |
108 | 112 | ||
109 | /* | 113 | /* |
@@ -118,6 +122,7 @@ struct x86_emulate_ops { | |||
118 | const void *old, | 122 | const void *old, |
119 | const void *new, | 123 | const void *new, |
120 | unsigned int bytes, | 124 | unsigned int bytes, |
125 | unsigned int *error, | ||
121 | struct kvm_vcpu *vcpu); | 126 | struct kvm_vcpu *vcpu); |
122 | 127 | ||
123 | int (*pio_in_emulated)(int size, unsigned short port, void *val, | 128 | int (*pio_in_emulated)(int size, unsigned short port, void *val, |
@@ -132,18 +137,26 @@ struct x86_emulate_ops { | |||
132 | int seg, struct kvm_vcpu *vcpu); | 137 | int seg, struct kvm_vcpu *vcpu); |
133 | u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); | 138 | u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); |
134 | void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); | 139 | void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); |
140 | unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); | ||
135 | void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); | 141 | void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); |
136 | ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); | 142 | ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); |
137 | void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); | 143 | int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); |
138 | int (*cpl)(struct kvm_vcpu *vcpu); | 144 | int (*cpl)(struct kvm_vcpu *vcpu); |
139 | void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); | 145 | int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); |
146 | int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); | ||
147 | int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); | ||
148 | int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); | ||
140 | }; | 149 | }; |
141 | 150 | ||
142 | /* Type, address-of, and value of an instruction's operand. */ | 151 | /* Type, address-of, and value of an instruction's operand. */ |
143 | struct operand { | 152 | struct operand { |
144 | enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; | 153 | enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; |
145 | unsigned int bytes; | 154 | unsigned int bytes; |
146 | unsigned long val, orig_val, *ptr; | 155 | unsigned long orig_val, *ptr; |
156 | union { | ||
157 | unsigned long val; | ||
158 | char valptr[sizeof(unsigned long) + 2]; | ||
159 | }; | ||
147 | }; | 160 | }; |
148 | 161 | ||
149 | struct fetch_cache { | 162 | struct fetch_cache { |
@@ -186,6 +199,7 @@ struct decode_cache { | |||
186 | unsigned long modrm_val; | 199 | unsigned long modrm_val; |
187 | struct fetch_cache fetch; | 200 | struct fetch_cache fetch; |
188 | struct read_cache io_read; | 201 | struct read_cache io_read; |
202 | struct read_cache mem_read; | ||
189 | }; | 203 | }; |
190 | 204 | ||
191 | struct x86_emulate_ctxt { | 205 | struct x86_emulate_ctxt { |
@@ -202,6 +216,12 @@ struct x86_emulate_ctxt { | |||
202 | int interruptibility; | 216 | int interruptibility; |
203 | 217 | ||
204 | bool restart; /* restart string instruction after writeback */ | 218 | bool restart; /* restart string instruction after writeback */ |
219 | |||
220 | int exception; /* exception that happens during emulation or -1 */ | ||
221 | u32 error_code; /* error code for exception */ | ||
222 | bool error_code_valid; | ||
223 | unsigned long cr2; /* faulted address in case of #PF */ | ||
224 | |||
205 | /* decode cache */ | 225 | /* decode cache */ |
206 | struct decode_cache decode; | 226 | struct decode_cache decode; |
207 | }; | 227 | }; |
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 76f5483cffec..502e53f999cf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/mmu_notifier.h> | 16 | #include <linux/mmu_notifier.h> |
17 | #include <linux/tracepoint.h> | 17 | #include <linux/tracepoint.h> |
18 | #include <linux/cpumask.h> | ||
18 | 19 | ||
19 | #include <linux/kvm.h> | 20 | #include <linux/kvm.h> |
20 | #include <linux/kvm_para.h> | 21 | #include <linux/kvm_para.h> |
@@ -39,11 +40,14 @@ | |||
39 | 0xFFFFFF0000000000ULL) | 40 | 0xFFFFFF0000000000ULL) |
40 | 41 | ||
41 | #define INVALID_PAGE (~(hpa_t)0) | 42 | #define INVALID_PAGE (~(hpa_t)0) |
43 | #define VALID_PAGE(x) ((x) != INVALID_PAGE) | ||
44 | |||
42 | #define UNMAPPED_GVA (~(gpa_t)0) | 45 | #define UNMAPPED_GVA (~(gpa_t)0) |
43 | 46 | ||
44 | /* KVM Hugepage definitions for x86 */ | 47 | /* KVM Hugepage definitions for x86 */ |
45 | #define KVM_NR_PAGE_SIZES 3 | 48 | #define KVM_NR_PAGE_SIZES 3 |
46 | #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) | 49 | #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) |
50 | #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) | ||
47 | #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) | 51 | #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) |
48 | #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) | 52 | #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) |
49 | #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) | 53 | #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) |
@@ -69,8 +73,6 @@ | |||
69 | 73 | ||
70 | #define IOPL_SHIFT 12 | 74 | #define IOPL_SHIFT 12 |
71 | 75 | ||
72 | #define KVM_ALIAS_SLOTS 4 | ||
73 | |||
74 | #define KVM_PERMILLE_MMU_PAGES 20 | 76 | #define KVM_PERMILLE_MMU_PAGES 20 |
75 | #define KVM_MIN_ALLOC_MMU_PAGES 64 | 77 | #define KVM_MIN_ALLOC_MMU_PAGES 64 |
76 | #define KVM_MMU_HASH_SHIFT 10 | 78 | #define KVM_MMU_HASH_SHIFT 10 |
@@ -241,7 +243,7 @@ struct kvm_mmu { | |||
241 | void (*prefetch_page)(struct kvm_vcpu *vcpu, | 243 | void (*prefetch_page)(struct kvm_vcpu *vcpu, |
242 | struct kvm_mmu_page *page); | 244 | struct kvm_mmu_page *page); |
243 | int (*sync_page)(struct kvm_vcpu *vcpu, | 245 | int (*sync_page)(struct kvm_vcpu *vcpu, |
244 | struct kvm_mmu_page *sp); | 246 | struct kvm_mmu_page *sp, bool clear_unsync); |
245 | void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); | 247 | void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); |
246 | hpa_t root_hpa; | 248 | hpa_t root_hpa; |
247 | int root_level; | 249 | int root_level; |
@@ -301,8 +303,8 @@ struct kvm_vcpu_arch { | |||
301 | unsigned long mmu_seq; | 303 | unsigned long mmu_seq; |
302 | } update_pte; | 304 | } update_pte; |
303 | 305 | ||
304 | struct i387_fxsave_struct host_fx_image; | 306 | struct fpu guest_fpu; |
305 | struct i387_fxsave_struct guest_fx_image; | 307 | u64 xcr0; |
306 | 308 | ||
307 | gva_t mmio_fault_cr2; | 309 | gva_t mmio_fault_cr2; |
308 | struct kvm_pio_request pio; | 310 | struct kvm_pio_request pio; |
@@ -360,26 +362,11 @@ struct kvm_vcpu_arch { | |||
360 | 362 | ||
361 | /* fields used by HYPER-V emulation */ | 363 | /* fields used by HYPER-V emulation */ |
362 | u64 hv_vapic; | 364 | u64 hv_vapic; |
363 | }; | ||
364 | |||
365 | struct kvm_mem_alias { | ||
366 | gfn_t base_gfn; | ||
367 | unsigned long npages; | ||
368 | gfn_t target_gfn; | ||
369 | #define KVM_ALIAS_INVALID 1UL | ||
370 | unsigned long flags; | ||
371 | }; | ||
372 | 365 | ||
373 | #define KVM_ARCH_HAS_UNALIAS_INSTANTIATION | 366 | cpumask_var_t wbinvd_dirty_mask; |
374 | |||
375 | struct kvm_mem_aliases { | ||
376 | struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; | ||
377 | int naliases; | ||
378 | }; | 367 | }; |
379 | 368 | ||
380 | struct kvm_arch { | 369 | struct kvm_arch { |
381 | struct kvm_mem_aliases *aliases; | ||
382 | |||
383 | unsigned int n_free_mmu_pages; | 370 | unsigned int n_free_mmu_pages; |
384 | unsigned int n_requested_mmu_pages; | 371 | unsigned int n_requested_mmu_pages; |
385 | unsigned int n_alloc_mmu_pages; | 372 | unsigned int n_alloc_mmu_pages; |
@@ -533,6 +520,8 @@ struct kvm_x86_ops { | |||
533 | 520 | ||
534 | void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); | 521 | void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); |
535 | 522 | ||
523 | bool (*has_wbinvd_exit)(void); | ||
524 | |||
536 | const struct trace_print_flags *exit_reasons_str; | 525 | const struct trace_print_flags *exit_reasons_str; |
537 | }; | 526 | }; |
538 | 527 | ||
@@ -576,7 +565,6 @@ enum emulation_result { | |||
576 | #define EMULTYPE_SKIP (1 << 2) | 565 | #define EMULTYPE_SKIP (1 << 2) |
577 | int emulate_instruction(struct kvm_vcpu *vcpu, | 566 | int emulate_instruction(struct kvm_vcpu *vcpu, |
578 | unsigned long cr2, u16 error_code, int emulation_type); | 567 | unsigned long cr2, u16 error_code, int emulation_type); |
579 | void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); | ||
580 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | 568 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); |
581 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | 569 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); |
582 | 570 | ||
@@ -591,10 +579,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); | |||
591 | int kvm_emulate_halt(struct kvm_vcpu *vcpu); | 579 | int kvm_emulate_halt(struct kvm_vcpu *vcpu); |
592 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); | 580 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); |
593 | int emulate_clts(struct kvm_vcpu *vcpu); | 581 | int emulate_clts(struct kvm_vcpu *vcpu); |
594 | int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, | 582 | int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); |
595 | unsigned long *dest); | ||
596 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, | ||
597 | unsigned long value); | ||
598 | 583 | ||
599 | void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); | 584 | void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); |
600 | int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); | 585 | int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); |
@@ -602,15 +587,16 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); | |||
602 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | 587 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, |
603 | bool has_error_code, u32 error_code); | 588 | bool has_error_code, u32 error_code); |
604 | 589 | ||
605 | void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); | 590 | int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); |
606 | void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); | 591 | int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); |
607 | void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); | 592 | int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); |
608 | void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); | 593 | void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); |
609 | int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); | 594 | int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); |
610 | int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); | 595 | int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); |
611 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); | 596 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); |
612 | void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); | 597 | void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); |
613 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); | 598 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); |
599 | int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); | ||
614 | 600 | ||
615 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); | 601 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); |
616 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); | 602 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); |
@@ -630,12 +616,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level); | |||
630 | 616 | ||
631 | void kvm_inject_nmi(struct kvm_vcpu *vcpu); | 617 | void kvm_inject_nmi(struct kvm_vcpu *vcpu); |
632 | 618 | ||
633 | void fx_init(struct kvm_vcpu *vcpu); | 619 | int fx_init(struct kvm_vcpu *vcpu); |
634 | |||
635 | int emulator_write_emulated(unsigned long addr, | ||
636 | const void *val, | ||
637 | unsigned int bytes, | ||
638 | struct kvm_vcpu *vcpu); | ||
639 | 620 | ||
640 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); | 621 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); |
641 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | 622 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, |
@@ -664,8 +645,6 @@ void kvm_disable_tdp(void); | |||
664 | int complete_pio(struct kvm_vcpu *vcpu); | 645 | int complete_pio(struct kvm_vcpu *vcpu); |
665 | bool kvm_check_iopl(struct kvm_vcpu *vcpu); | 646 | bool kvm_check_iopl(struct kvm_vcpu *vcpu); |
666 | 647 | ||
667 | struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); | ||
668 | |||
669 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) | 648 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) |
670 | { | 649 | { |
671 | struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); | 650 | struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); |
@@ -719,21 +698,6 @@ static inline unsigned long read_msr(unsigned long msr) | |||
719 | } | 698 | } |
720 | #endif | 699 | #endif |
721 | 700 | ||
722 | static inline void kvm_fx_save(struct i387_fxsave_struct *image) | ||
723 | { | ||
724 | asm("fxsave (%0)":: "r" (image)); | ||
725 | } | ||
726 | |||
727 | static inline void kvm_fx_restore(struct i387_fxsave_struct *image) | ||
728 | { | ||
729 | asm("fxrstor (%0)":: "r" (image)); | ||
730 | } | ||
731 | |||
732 | static inline void kvm_fx_finit(void) | ||
733 | { | ||
734 | asm("finit"); | ||
735 | } | ||
736 | |||
737 | static inline u32 get_rdx_init_val(void) | 701 | static inline u32 get_rdx_init_val(void) |
738 | { | 702 | { |
739 | return 0x600; /* P6 family */ | 703 | return 0x600; /* P6 family */ |
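
Worked example of the reworked hugepage macros (PAGE_SHIFT = 12 on x86): for level 2, KVM_HPAGE_GFN_SHIFT(2) = 9, so KVM_HPAGE_SHIFT(2) = 21, KVM_HPAGE_SIZE(2) = 2 MiB and KVM_PAGES_PER_HPAGE(2) = 512; for level 3 the GFN shift is 18, giving 1 GiB pages of 262144 base pages each.
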
diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h new file mode 100644 index 000000000000..36c93b5cc239 --- /dev/null +++ b/arch/x86/include/asm/local64.h | |||
@@ -0,0 +1 @@ | |||
#include <asm-generic/local64.h> | |||
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index f32a4301c4d4..c62c13cb9788 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
@@ -38,6 +38,10 @@ | |||
38 | #define MCM_ADDR_MEM 3 /* memory address */ | 38 | #define MCM_ADDR_MEM 3 /* memory address */ |
39 | #define MCM_ADDR_GENERIC 7 /* generic */ | 39 | #define MCM_ADDR_GENERIC 7 /* generic */ |
40 | 40 | ||
41 | /* CTL2 register defines */ | ||
42 | #define MCI_CTL2_CMCI_EN (1ULL << 30) | ||
43 | #define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL | ||
44 | |||
41 | #define MCJ_CTX_MASK 3 | 45 | #define MCJ_CTX_MASK 3 |
42 | #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) | 46 | #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) |
43 | #define MCJ_CTX_RANDOM 0 /* inject context: random */ | 47 | #define MCJ_CTX_RANDOM 0 /* inject context: random */ |
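
The new MCI_CTL2_* defines name the CMCI enable bit and threshold field of the per-bank CTL2 MSR. A hedged sketch of arming CMCI on one bank (bank ownership and locking as done by the real mce code are omitted):

static void cmci_arm_bank(int bank)
{
	u64 val;

	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
	val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
	val |= MCI_CTL2_CMCI_EN | 1;		/* interrupt after one error */
	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
}
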
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 451d30e7f62d..16350740edf6 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h | |||
@@ -13,6 +13,32 @@ | |||
13 | extern int pci_mrst_init(void); | 13 | extern int pci_mrst_init(void); |
14 | int __init sfi_parse_mrtc(struct sfi_table_header *table); | 14 | int __init sfi_parse_mrtc(struct sfi_table_header *table); |
15 | 15 | ||
16 | /* | ||
17 | * Medfield is the follow-up to Moorestown; it combines the two-chip solution into | ||
18 | * one. In addition, it adds always-on, constant-rate TSC and LAPIC | ||
19 | * timers. Medfield is the platform name, and the chip itself is called Penwell; | ||
20 | * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be | ||
21 | * identified via MSRs. | ||
22 | */ | ||
23 | enum mrst_cpu_type { | ||
24 | MRST_CPU_CHIP_LINCROFT = 1, | ||
25 | MRST_CPU_CHIP_PENWELL, | ||
26 | }; | ||
27 | |||
28 | extern enum mrst_cpu_type __mrst_cpu_chip; | ||
29 | static enum mrst_cpu_type mrst_identify_cpu(void) | ||
30 | { | ||
31 | return __mrst_cpu_chip; | ||
32 | } | ||
33 | |||
34 | enum mrst_timer_options { | ||
35 | MRST_TIMER_DEFAULT, | ||
36 | MRST_TIMER_APBT_ONLY, | ||
37 | MRST_TIMER_LAPIC_APBT, | ||
38 | }; | ||
39 | |||
40 | extern enum mrst_timer_options mrst_timer_options; | ||
41 | |||
16 | #define SFI_MTMR_MAX_NUM 8 | 42 | #define SFI_MTMR_MAX_NUM 8 |
17 | #define SFI_MRTC_MAX 8 | 43 | #define SFI_MRTC_MAX 8 |
18 | 44 | ||
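
The new mrst_identify_cpu()/mrst_timer_options pair lets platform code branch on the chip variant. An illustrative (hypothetical) caller; the timer policy shown is only an example, not what the kernel actually does:

if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
	mrst_timer_options = MRST_TIMER_LAPIC_APBT;	/* Penwell keeps a usable LAPIC timer */
else
	mrst_timer_options = MRST_TIMER_DEFAULT;
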
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8c7ae4318629..986f7790fdb2 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h | |||
@@ -20,6 +20,7 @@ | |||
20 | #define _EFER_LMA 10 /* Long mode active (read-only) */ | 20 | #define _EFER_LMA 10 /* Long mode active (read-only) */ |
21 | #define _EFER_NX 11 /* No execute enable */ | 21 | #define _EFER_NX 11 /* No execute enable */ |
22 | #define _EFER_SVME 12 /* Enable virtualization */ | 22 | #define _EFER_SVME 12 /* Enable virtualization */ |
23 | #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ | ||
23 | #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ | 24 | #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ |
24 | 25 | ||
25 | #define EFER_SCE (1<<_EFER_SCE) | 26 | #define EFER_SCE (1<<_EFER_SCE) |
@@ -27,6 +28,7 @@ | |||
27 | #define EFER_LMA (1<<_EFER_LMA) | 28 | #define EFER_LMA (1<<_EFER_LMA) |
28 | #define EFER_NX (1<<_EFER_NX) | 29 | #define EFER_NX (1<<_EFER_NX) |
29 | #define EFER_SVME (1<<_EFER_SVME) | 30 | #define EFER_SVME (1<<_EFER_SVME) |
31 | #define EFER_LMSLE (1<<_EFER_LMSLE) | ||
30 | #define EFER_FFXSR (1<<_EFER_FFXSR) | 32 | #define EFER_FFXSR (1<<_EFER_FFXSR) |
31 | 33 | ||
32 | /* Intel MSRs. Some also available on other CPUs */ | 34 | /* Intel MSRs. Some also available on other CPUs */ |
@@ -94,9 +96,6 @@ | |||
94 | #define MSR_IA32_MC0_CTL2 0x00000280 | 96 | #define MSR_IA32_MC0_CTL2 0x00000280 |
95 | #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) | 97 | #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) |
96 | 98 | ||
97 | #define CMCI_EN (1ULL << 30) | ||
98 | #define CMCI_THRESHOLD_MASK 0xffffULL | ||
99 | |||
100 | #define MSR_P6_PERFCTR0 0x000000c1 | 99 | #define MSR_P6_PERFCTR0 0x000000c1 |
101 | #define MSR_P6_PERFCTR1 0x000000c2 | 100 | #define MSR_P6_PERFCTR1 0x000000c2 |
102 | #define MSR_P6_EVNTSEL0 0x00000186 | 101 | #define MSR_P6_EVNTSEL0 0x00000186 |
@@ -159,8 +158,6 @@ | |||
159 | #define MSR_K7_FID_VID_STATUS 0xc0010042 | 158 | #define MSR_K7_FID_VID_STATUS 0xc0010042 |
160 | 159 | ||
161 | /* K6 MSRs */ | 160 | /* K6 MSRs */ |
162 | #define MSR_K6_EFER 0xc0000080 | ||
163 | #define MSR_K6_STAR 0xc0000081 | ||
164 | #define MSR_K6_WHCR 0xc0000082 | 161 | #define MSR_K6_WHCR 0xc0000082 |
165 | #define MSR_K6_UWCCR 0xc0000085 | 162 | #define MSR_K6_UWCCR 0xc0000085 |
166 | #define MSR_K6_EPMR 0xc0000086 | 163 | #define MSR_K6_EPMR 0xc0000086 |
@@ -224,12 +221,14 @@ | |||
224 | #define MSR_IA32_THERM_CONTROL 0x0000019a | 221 | #define MSR_IA32_THERM_CONTROL 0x0000019a |
225 | #define MSR_IA32_THERM_INTERRUPT 0x0000019b | 222 | #define MSR_IA32_THERM_INTERRUPT 0x0000019b |
226 | 223 | ||
227 | #define THERM_INT_LOW_ENABLE (1 << 0) | 224 | #define THERM_INT_HIGH_ENABLE (1 << 0) |
228 | #define THERM_INT_HIGH_ENABLE (1 << 1) | 225 | #define THERM_INT_LOW_ENABLE (1 << 1) |
226 | #define THERM_INT_PLN_ENABLE (1 << 24) | ||
229 | 227 | ||
230 | #define MSR_IA32_THERM_STATUS 0x0000019c | 228 | #define MSR_IA32_THERM_STATUS 0x0000019c |
231 | 229 | ||
232 | #define THERM_STATUS_PROCHOT (1 << 0) | 230 | #define THERM_STATUS_PROCHOT (1 << 0) |
231 | #define THERM_STATUS_POWER_LIMIT (1 << 10) | ||
233 | 232 | ||
234 | #define MSR_THERM2_CTL 0x0000019d | 233 | #define MSR_THERM2_CTL 0x0000019d |
235 | 234 | ||
@@ -239,6 +238,19 @@ | |||
239 | 238 | ||
240 | #define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 | 239 | #define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 |
241 | 240 | ||
241 | #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 | ||
242 | |||
243 | #define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1 | ||
244 | |||
245 | #define PACKAGE_THERM_STATUS_PROCHOT (1 << 0) | ||
246 | #define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10) | ||
247 | |||
248 | #define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2 | ||
249 | |||
250 | #define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0) | ||
251 | #define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) | ||
252 | #define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) | ||
253 | |||
242 | /* MISC_ENABLE bits: architectural */ | 254 | /* MISC_ENABLE bits: architectural */ |
243 | #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) | 255 | #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) |
244 | #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) | 256 | #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) |
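
The added package-level thermal MSRs mirror the per-core ones, including the new power-limit notification bit. A hedged sketch of enabling both threshold directions plus PLN using the rdmsr()/wrmsr() helpers:

static void package_therm_int_enable(void)
{
	u32 l, h;

	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
	l |= PACKAGE_THERM_INT_HIGH_ENABLE | PACKAGE_THERM_INT_LOW_ENABLE |
	     PACKAGE_THERM_INT_PLN_ENABLE;
	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}
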
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c5bc4c2d33f5..084ef95274cd 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h | |||
@@ -148,8 +148,8 @@ static inline unsigned long long native_read_pmc(int counter) | |||
148 | #define rdmsr(msr, val1, val2) \ | 148 | #define rdmsr(msr, val1, val2) \ |
149 | do { \ | 149 | do { \ |
150 | u64 __val = native_read_msr((msr)); \ | 150 | u64 __val = native_read_msr((msr)); \ |
151 | (val1) = (u32)__val; \ | 151 | (void)((val1) = (u32)__val); \ |
152 | (val2) = (u32)(__val >> 32); \ | 152 | (void)((val2) = (u32)(__val >> 32)); \ |
153 | } while (0) | 153 | } while (0) |
154 | 154 | ||
155 | static inline void wrmsr(unsigned msr, unsigned low, unsigned high) | 155 | static inline void wrmsr(unsigned msr, unsigned low, unsigned high) |
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 93da9c3f3341..932f0f86b4b7 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h | |||
@@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); | |||
17 | 17 | ||
18 | extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); | 18 | extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); |
19 | extern int check_nmi_watchdog(void); | 19 | extern int check_nmi_watchdog(void); |
20 | #if !defined(CONFIG_LOCKUP_DETECTOR) | ||
20 | extern int nmi_watchdog_enabled; | 21 | extern int nmi_watchdog_enabled; |
22 | #endif | ||
21 | extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); | 23 | extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); |
22 | extern int reserve_perfctr_nmi(unsigned int); | 24 | extern int reserve_perfctr_nmi(unsigned int); |
23 | extern void release_perfctr_nmi(unsigned int); | 25 | extern void release_perfctr_nmi(unsigned int); |
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h new file mode 100644 index 000000000000..08fde475cb3b --- /dev/null +++ b/arch/x86/include/asm/olpc_ofw.h | |||
@@ -0,0 +1,31 @@ | |||
1 | #ifndef _ASM_X86_OLPC_OFW_H | ||
2 | #define _ASM_X86_OLPC_OFW_H | ||
3 | |||
4 | /* index into the page table containing the entry OFW occupies */ | ||
5 | #define OLPC_OFW_PDE_NR 1022 | ||
6 | |||
7 | #define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ | ||
8 | |||
9 | #ifdef CONFIG_OLPC_OPENFIRMWARE | ||
10 | |||
11 | /* run an OFW command by calling into the firmware */ | ||
12 | #define olpc_ofw(name, args, res) \ | ||
13 | __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) | ||
14 | |||
15 | extern int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, | ||
16 | void **res); | ||
17 | |||
18 | /* determine whether OFW is available and lives in the proper memory */ | ||
19 | extern void olpc_ofw_detect(void); | ||
20 | |||
21 | /* install OFW's pde permanently into the kernel's pgtable */ | ||
22 | extern void setup_olpc_ofw_pgd(void); | ||
23 | |||
24 | #else /* !CONFIG_OLPC_OPENFIRMWARE */ | ||
25 | |||
26 | static inline void olpc_ofw_detect(void) { } | ||
27 | static inline void setup_olpc_ofw_pgd(void) { } | ||
28 | |||
29 | #endif /* !CONFIG_OLPC_OPENFIRMWARE */ | ||
30 | |||
31 | #endif /* _ASM_X86_OLPC_OFW_H */ | ||
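
The olpc_ofw() wrapper derives the argument and result counts from the arrays it is handed via ARRAY_SIZE(). A sketch of how it is meant to be invoked; the method name and argument here are placeholders, not real Open Firmware calls:

static int example_ofw_call(void)
{
	const void *args[] = { "placeholder-argument" };
	void *res[1];

	return olpc_ofw("placeholder-method", args, res);
}
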
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 625c3f0e741a..8ca82839288a 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h | |||
@@ -37,6 +37,13 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, | |||
37 | #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) | 37 | #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) |
38 | /* __pa_symbol should be used for C visible symbols. | 38 | /* __pa_symbol should be used for C visible symbols. |
39 | This seems to be the official gcc blessed way to do such arithmetic. */ | 39 | This seems to be the official gcc blessed way to do such arithmetic. */ |
40 | /* | ||
41 | * We need __phys_reloc_hide() here because gcc may assume that there is no | ||
42 | * overflow during __pa() calculation and can optimize it unexpectedly. | ||
43 | * Newer versions of gcc provide -fno-strict-overflow switch to handle this | ||
44 | * case properly. Once all supported versions of gcc understand it, we can | ||
45 | * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) | ||
46 | */ | ||
40 | #define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x))) | 47 | #define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x))) |
41 | 48 | ||
42 | #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) | 49 | #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) |
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index cd2a31dc5fb8..49c7219826f9 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h | |||
@@ -30,6 +30,7 @@ | |||
30 | #define PCI_HAS_IO_ECS 0x40000 | 30 | #define PCI_HAS_IO_ECS 0x40000 |
31 | #define PCI_NOASSIGN_ROMS 0x80000 | 31 | #define PCI_NOASSIGN_ROMS 0x80000 |
32 | #define PCI_ROOT_NO_CRS 0x100000 | 32 | #define PCI_ROOT_NO_CRS 0x100000 |
33 | #define PCI_NOASSIGN_BARS 0x200000 | ||
33 | 34 | ||
34 | extern unsigned int pci_probe; | 35 | extern unsigned int pci_probe; |
35 | extern unsigned long pirq_table_addr; | 36 | extern unsigned long pirq_table_addr; |
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 254883d0c7e0..6e742cc4251b 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h | |||
@@ -68,8 +68,9 @@ union cpuid10_eax { | |||
68 | 68 | ||
69 | union cpuid10_edx { | 69 | union cpuid10_edx { |
70 | struct { | 70 | struct { |
71 | unsigned int num_counters_fixed:4; | 71 | unsigned int num_counters_fixed:5; |
72 | unsigned int reserved:28; | 72 | unsigned int bit_width_fixed:8; |
73 | unsigned int reserved:19; | ||
73 | } split; | 74 | } split; |
74 | unsigned int full; | 75 | unsigned int full; |
75 | }; | 76 | }; |
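With num_counters_fixed widened to 5 bits and bit_width_fixed now exposed, the union is still meant to be filled directly from CPUID leaf 0xA. A hedged sketch; cpuid() is the standard x86 helper, and the surrounding variables are illustrative:

	union cpuid10_edx edx;
	unsigned int eax, ebx, ecx;

	/* CPUID 0xA: architectural performance monitoring leaf */
	cpuid(0xa, &eax, &ebx, &ecx, &edx.full);

	pr_info("fixed counters: %u, each %u bits wide\n",
		edx.split.num_counters_fixed,
		edx.split.bit_width_fixed);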
@@ -140,6 +141,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs); | |||
140 | extern unsigned long perf_misc_flags(struct pt_regs *regs); | 141 | extern unsigned long perf_misc_flags(struct pt_regs *regs); |
141 | #define perf_misc_flags(regs) perf_misc_flags(regs) | 142 | #define perf_misc_flags(regs) perf_misc_flags(regs) |
142 | 143 | ||
144 | #include <asm/stacktrace.h> | ||
145 | |||
146 | /* | ||
147 | * We abuse bit 3 from flags to pass exact information, see perf_misc_flags | ||
148 | * and the comment with PERF_EFLAGS_EXACT. | ||
149 | */ | ||
150 | #define perf_arch_fetch_caller_regs(regs, __ip) { \ | ||
151 | (regs)->ip = (__ip); \ | ||
152 | (regs)->bp = caller_frame_pointer(); \ | ||
153 | (regs)->cs = __KERNEL_CS; \ | ||
154 | regs->flags = 0; \ | ||
155 | } | ||
156 | |||
143 | #else | 157 | #else |
144 | static inline void init_hw_perf_events(void) { } | 158 | static inline void init_hw_perf_events(void) { } |
145 | static inline void perf_events_lapic_init(void) { } | 159 | static inline void perf_events_lapic_init(void) { } |
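perf_arch_fetch_caller_regs() fabricates a minimal pt_regs for callchain sampling: the caller-supplied ip, the caller's frame pointer via caller_frame_pointer() (added in the stacktrace.h hunk further down), kernel CS, and cleared flags so PERF_EFLAGS_EXACT is not set. An open-coded equivalent, as a sketch only:

static inline void example_fetch_caller_regs(struct pt_regs *regs,
					     unsigned long ip)
{
	regs->ip    = ip;                      /* sample points at the caller's ip */
	regs->bp    = caller_frame_pointer();  /* frame pointer for the unwinder */
	regs->cs    = __KERNEL_CS;             /* mark as kernel context */
	regs->flags = 0;                       /* exact-IP bit intentionally clear */
}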
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 64a8ebff06fc..def500776b16 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h | |||
@@ -19,7 +19,6 @@ | |||
19 | #define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */ | 19 | #define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */ |
20 | #define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) | 20 | #define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) |
21 | #define ARCH_P4_MAX_CCCR (18) | 21 | #define ARCH_P4_MAX_CCCR (18) |
22 | #define ARCH_P4_MAX_COUNTER (ARCH_P4_MAX_CCCR / 2) | ||
23 | 22 | ||
24 | #define P4_ESCR_EVENT_MASK 0x7e000000U | 23 | #define P4_ESCR_EVENT_MASK 0x7e000000U |
25 | #define P4_ESCR_EVENT_SHIFT 25 | 24 | #define P4_ESCR_EVENT_SHIFT 25 |
@@ -71,10 +70,6 @@ | |||
71 | #define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) | 70 | #define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) |
72 | #define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) | 71 | #define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) |
73 | 72 | ||
74 | /* Custom bits in reerved CCCR area */ | ||
75 | #define P4_CCCR_CACHE_OPS_MASK 0x0000003fU | ||
76 | |||
77 | |||
78 | /* Non HT mask */ | 73 | /* Non HT mask */ |
79 | #define P4_CCCR_MASK \ | 74 | #define P4_CCCR_MASK \ |
80 | (P4_CCCR_OVF | \ | 75 | (P4_CCCR_OVF | \ |
@@ -106,8 +101,7 @@ | |||
106 | * ESCR and CCCR but rather an only packed value should | 101 | * ESCR and CCCR but rather an only packed value should |
107 | * be unpacked and written to a proper addresses | 102 | * be unpacked and written to a proper addresses |
108 | * | 103 | * |
109 | * the base idea is to pack as much info as | 104 | * the base idea is to pack as much info as possible |
110 | * possible | ||
111 | */ | 105 | */ |
112 | #define p4_config_pack_escr(v) (((u64)(v)) << 32) | 106 | #define p4_config_pack_escr(v) (((u64)(v)) << 32) |
113 | #define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL) | 107 | #define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL) |
@@ -130,8 +124,6 @@ | |||
130 | t; \ | 124 | t; \ |
131 | }) | 125 | }) |
132 | 126 | ||
133 | #define p4_config_unpack_cache_event(v) (((u64)(v)) & P4_CCCR_CACHE_OPS_MASK) | ||
134 | |||
135 | #define P4_CONFIG_HT_SHIFT 63 | 127 | #define P4_CONFIG_HT_SHIFT 63 |
136 | #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) | 128 | #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) |
137 | 129 | ||
@@ -214,6 +206,12 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr) | |||
214 | return escr; | 206 | return escr; |
215 | } | 207 | } |
216 | 208 | ||
209 | /* | ||
210 | * These are the events which should be used in the "Event Select" | ||
211 | * field of the ESCR register; they act like unique keys which allow | ||
212 | * the kernel to determine which CCCR and COUNTER should be | ||
213 | * used to track an event | ||
214 | */ | ||
217 | enum P4_EVENTS { | 215 | enum P4_EVENTS { |
218 | P4_EVENT_TC_DELIVER_MODE, | 216 | P4_EVENT_TC_DELIVER_MODE, |
219 | P4_EVENT_BPU_FETCH_REQUEST, | 217 | P4_EVENT_BPU_FETCH_REQUEST, |
@@ -561,7 +559,7 @@ enum P4_EVENT_OPCODES { | |||
561 | * a caller should use P4_ESCR_EMASK_NAME helper to | 559 | * a caller should use P4_ESCR_EMASK_NAME helper to |
562 | * pick the EventMask needed, for example | 560 | * pick the EventMask needed, for example |
563 | * | 561 | * |
564 | * P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD) | 562 | * P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) |
565 | */ | 563 | */ |
566 | enum P4_ESCR_EMASKS { | 564 | enum P4_ESCR_EMASKS { |
567 | P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0), | 565 | P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0), |
@@ -753,43 +751,50 @@ enum P4_ESCR_EMASKS { | |||
753 | P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1), | 751 | P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1), |
754 | }; | 752 | }; |
755 | 753 | ||
756 | /* P4 PEBS: stale for a while */ | 754 | /* |
757 | #define P4_PEBS_METRIC_MASK 0x00001fffU | 755 | * P4 PEBS specifics (Replay Event only) |
758 | #define P4_PEBS_UOB_TAG 0x01000000U | 756 | * |
759 | #define P4_PEBS_ENABLE 0x02000000U | 757 | * Format (bits): |
760 | 758 | * 0-6: metric from P4_PEBS_METRIC enum | |
761 | /* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */ | 759 | * 7 : reserved |
762 | #define P4_PEBS__1stl_cache_load_miss_retired 0x3000001 | 760 | * 8 : reserved |
763 | #define P4_PEBS__2ndl_cache_load_miss_retired 0x3000002 | 761 | * 9-11 : reserved |
764 | #define P4_PEBS__dtlb_load_miss_retired 0x3000004 | 762 | * |
765 | #define P4_PEBS__dtlb_store_miss_retired 0x3000004 | 763 | * Note we have UOP and PEBS bits reserved for now |
766 | #define P4_PEBS__dtlb_all_miss_retired 0x3000004 | 764 | * just in case if we will need them once |
767 | #define P4_PEBS__tagged_mispred_branch 0x3018000 | 765 | */ |
768 | #define P4_PEBS__mob_load_replay_retired 0x3000200 | 766 | #define P4_PEBS_CONFIG_ENABLE (1 << 7) |
769 | #define P4_PEBS__split_load_retired 0x3000400 | 767 | #define P4_PEBS_CONFIG_UOP_TAG (1 << 8) |
770 | #define P4_PEBS__split_store_retired 0x3000400 | 768 | #define P4_PEBS_CONFIG_METRIC_MASK 0x3f |
771 | 769 | #define P4_PEBS_CONFIG_MASK 0xff | |
772 | #define P4_VERT__1stl_cache_load_miss_retired 0x0000001 | 770 | |
773 | #define P4_VERT__2ndl_cache_load_miss_retired 0x0000001 | 771 | /* |
774 | #define P4_VERT__dtlb_load_miss_retired 0x0000001 | 772 | * mem: Only counters MSR_IQ_COUNTER4 (16) and |
775 | #define P4_VERT__dtlb_store_miss_retired 0x0000002 | 773 | * MSR_IQ_COUNTER5 (17) are allowed for PEBS sampling |
776 | #define P4_VERT__dtlb_all_miss_retired 0x0000003 | 774 | */ |
777 | #define P4_VERT__tagged_mispred_branch 0x0000010 | 775 | #define P4_PEBS_ENABLE 0x02000000U |
778 | #define P4_VERT__mob_load_replay_retired 0x0000001 | 776 | #define P4_PEBS_ENABLE_UOP_TAG 0x01000000U |
779 | #define P4_VERT__split_load_retired 0x0000001 | 777 | |
780 | #define P4_VERT__split_store_retired 0x0000002 | 778 | #define p4_config_unpack_metric(v) (((u64)(v)) & P4_PEBS_CONFIG_METRIC_MASK) |
781 | 779 | #define p4_config_unpack_pebs(v) (((u64)(v)) & P4_PEBS_CONFIG_MASK) | |
782 | enum P4_CACHE_EVENTS { | 780 | |
783 | P4_CACHE__NONE, | 781 | #define p4_config_pebs_has(v, mask) (p4_config_unpack_pebs(v) & (mask)) |
784 | 782 | ||
785 | P4_CACHE__1stl_cache_load_miss_retired, | 783 | enum P4_PEBS_METRIC { |
786 | P4_CACHE__2ndl_cache_load_miss_retired, | 784 | P4_PEBS_METRIC__none, |
787 | P4_CACHE__dtlb_load_miss_retired, | 785 | |
788 | P4_CACHE__dtlb_store_miss_retired, | 786 | P4_PEBS_METRIC__1stl_cache_load_miss_retired, |
789 | P4_CACHE__itlb_reference_hit, | 787 | P4_PEBS_METRIC__2ndl_cache_load_miss_retired, |
790 | P4_CACHE__itlb_reference_miss, | 788 | P4_PEBS_METRIC__dtlb_load_miss_retired, |
791 | 789 | P4_PEBS_METRIC__dtlb_store_miss_retired, | |
792 | P4_CACHE__MAX | 790 | P4_PEBS_METRIC__dtlb_all_miss_retired, |
791 | P4_PEBS_METRIC__tagged_mispred_branch, | ||
792 | P4_PEBS_METRIC__mob_load_replay_retired, | ||
793 | P4_PEBS_METRIC__split_load_retired, | ||
794 | P4_PEBS_METRIC__split_store_retired, | ||
795 | |||
796 | P4_PEBS_METRIC__max | ||
793 | }; | 797 | }; |
794 | 798 | ||
795 | #endif /* PERF_EVENT_P4_H */ | 799 | #endif /* PERF_EVENT_P4_H */ |
800 | |||
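The per-event cache constants are replaced by a packed PEBS configuration: a metric index in the low bits plus enable and uop-tag bits, unpacked with the new helpers. A short hedged example of building and inspecting such a config (the event-encoding details around it are omitted):

	u64 config = 0;

	/* pick the DTLB store-miss replay metric, enable PEBS and uop tagging */
	config |= P4_PEBS_METRIC__dtlb_store_miss_retired & P4_PEBS_CONFIG_METRIC_MASK;
	config |= P4_PEBS_CONFIG_ENABLE | P4_PEBS_CONFIG_UOP_TAG;

	if (p4_config_pebs_has(config, P4_PEBS_CONFIG_ENABLE))
		pr_debug("PEBS metric %llu selected\n",
			 (unsigned long long)p4_config_unpack_metric(config));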
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 181be528c612..076052cd62be 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h | |||
@@ -126,8 +126,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; } | |||
126 | /* x86-64 always has all page tables mapped. */ | 126 | /* x86-64 always has all page tables mapped. */ |
127 | #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) | 127 | #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) |
128 | #define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) | 128 | #define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) |
129 | #define pte_unmap(pte) /* NOP */ | 129 | #define pte_unmap(pte) ((void)(pte))/* NOP */ |
130 | #define pte_unmap_nested(pte) /* NOP */ | 130 | #define pte_unmap_nested(pte) ((void)(pte)) /* NOP */ |
131 | 131 | ||
132 | #define update_mmu_cache(vma, address, ptep) do { } while (0) | 132 | #define update_mmu_cache(vma, address, ptep) do { } while (0) |
133 | 133 | ||
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7e5c6a60b8ee..325b7bdbebaa 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h | |||
@@ -762,6 +762,7 @@ extern void init_c1e_mask(void); | |||
762 | extern unsigned long boot_option_idle_override; | 762 | extern unsigned long boot_option_idle_override; |
763 | extern unsigned long idle_halt; | 763 | extern unsigned long idle_halt; |
764 | extern unsigned long idle_nomwait; | 764 | extern unsigned long idle_nomwait; |
765 | extern bool c1e_detected; | ||
765 | 766 | ||
766 | /* | 767 | /* |
767 | * on systems with caches, caches must be flashed as the absolute | 768 | * on systems with caches, caches must be flashed as the absolute |
@@ -1025,4 +1026,24 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, | |||
1025 | return ratio; | 1026 | return ratio; |
1026 | } | 1027 | } |
1027 | 1028 | ||
1029 | /* | ||
1030 | * AMD errata checking | ||
1031 | */ | ||
1032 | #ifdef CONFIG_CPU_SUP_AMD | ||
1033 | extern const int amd_erratum_383[]; | ||
1034 | extern const int amd_erratum_400[]; | ||
1035 | extern bool cpu_has_amd_erratum(const int *); | ||
1036 | |||
1037 | #define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } | ||
1038 | #define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } | ||
1039 | #define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ | ||
1040 | ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) | ||
1041 | #define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) | ||
1042 | #define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) | ||
1043 | #define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) | ||
1044 | |||
1045 | #else | ||
1046 | #define cpu_has_amd_erratum(x) (false) | ||
1047 | #endif /* CONFIG_CPU_SUP_AMD */ | ||
1048 | |||
1028 | #endif /* _ASM_X86_PROCESSOR_H */ | 1049 | #endif /* _ASM_X86_PROCESSOR_H */ |
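An erratum table is either an OSVW id or a legacy marker (-1), followed by one or more packed family/model/stepping ranges and a 0 terminator. A hedged example of declaring and checking one with these macros; the erratum id and range below are invented for illustration, the real tables live in the AMD CPU setup code:

/* Illustrative only: pretend erratum with OSVW id 3, affecting family 0x10,
 * model 0x00 stepping 0x0 through model 0x2f stepping 0xf. */
static const int example_erratum[] =
	AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0x0, 0x0, 0x2f, 0xf));

static void example_apply_workaround(void)
{
	if (cpu_has_amd_erratum(example_erratum))
		pr_info("affected CPU detected, enabling workaround\n");
}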
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 64cf2d24fad1..6c7fc25f2c34 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h | |||
@@ -84,5 +84,7 @@ | |||
84 | #define REQUIRED_MASK5 0 | 84 | #define REQUIRED_MASK5 0 |
85 | #define REQUIRED_MASK6 0 | 85 | #define REQUIRED_MASK6 0 |
86 | #define REQUIRED_MASK7 0 | 86 | #define REQUIRED_MASK7 0 |
87 | #define REQUIRED_MASK8 0 | ||
88 | #define REQUIRED_MASK9 0 | ||
87 | 89 | ||
88 | #endif /* _ASM_X86_REQUIRED_FEATURES_H */ | 90 | #endif /* _ASM_X86_REQUIRED_FEATURES_H */ |
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 606ede126972..d1e41b0f9b60 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h | |||
@@ -118,7 +118,7 @@ static inline void __down_read(struct rw_semaphore *sem) | |||
118 | { | 118 | { |
119 | asm volatile("# beginning down_read\n\t" | 119 | asm volatile("# beginning down_read\n\t" |
120 | LOCK_PREFIX _ASM_INC "(%1)\n\t" | 120 | LOCK_PREFIX _ASM_INC "(%1)\n\t" |
121 | /* adds 0x00000001, returns the old value */ | 121 | /* adds 0x00000001 */ |
122 | " jns 1f\n" | 122 | " jns 1f\n" |
123 | " call call_rwsem_down_read_failed\n" | 123 | " call call_rwsem_down_read_failed\n" |
124 | "1:\n\t" | 124 | "1:\n\t" |
@@ -156,11 +156,9 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) | |||
156 | static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) | 156 | static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) |
157 | { | 157 | { |
158 | rwsem_count_t tmp; | 158 | rwsem_count_t tmp; |
159 | |||
160 | tmp = RWSEM_ACTIVE_WRITE_BIAS; | ||
161 | asm volatile("# beginning down_write\n\t" | 159 | asm volatile("# beginning down_write\n\t" |
162 | LOCK_PREFIX " xadd %1,(%2)\n\t" | 160 | LOCK_PREFIX " xadd %1,(%2)\n\t" |
163 | /* subtract 0x0000ffff, returns the old value */ | 161 | /* adds 0xffff0001, returns the old value */ |
164 | " test %1,%1\n\t" | 162 | " test %1,%1\n\t" |
165 | /* was the count 0 before? */ | 163 | /* was the count 0 before? */ |
166 | " jz 1f\n" | 164 | " jz 1f\n" |
@@ -168,7 +166,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) | |||
168 | "1:\n" | 166 | "1:\n" |
169 | "# ending down_write" | 167 | "# ending down_write" |
170 | : "+m" (sem->count), "=d" (tmp) | 168 | : "+m" (sem->count), "=d" (tmp) |
171 | : "a" (sem), "1" (tmp) | 169 | : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) |
172 | : "memory", "cc"); | 170 | : "memory", "cc"); |
173 | } | 171 | } |
174 | 172 | ||
@@ -195,16 +193,16 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) | |||
195 | */ | 193 | */ |
196 | static inline void __up_read(struct rw_semaphore *sem) | 194 | static inline void __up_read(struct rw_semaphore *sem) |
197 | { | 195 | { |
198 | rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; | 196 | rwsem_count_t tmp; |
199 | asm volatile("# beginning __up_read\n\t" | 197 | asm volatile("# beginning __up_read\n\t" |
200 | LOCK_PREFIX " xadd %1,(%2)\n\t" | 198 | LOCK_PREFIX " xadd %1,(%2)\n\t" |
201 | /* subtracts 1, returns the old value */ | 199 | /* subtracts 1, returns the old value */ |
202 | " jns 1f\n\t" | 200 | " jns 1f\n\t" |
203 | " call call_rwsem_wake\n" | 201 | " call call_rwsem_wake\n" /* expects old value in %edx */ |
204 | "1:\n" | 202 | "1:\n" |
205 | "# ending __up_read\n" | 203 | "# ending __up_read\n" |
206 | : "+m" (sem->count), "=d" (tmp) | 204 | : "+m" (sem->count), "=d" (tmp) |
207 | : "a" (sem), "1" (tmp) | 205 | : "a" (sem), "1" (-RWSEM_ACTIVE_READ_BIAS) |
208 | : "memory", "cc"); | 206 | : "memory", "cc"); |
209 | } | 207 | } |
210 | 208 | ||
@@ -216,10 +214,9 @@ static inline void __up_write(struct rw_semaphore *sem) | |||
216 | rwsem_count_t tmp; | 214 | rwsem_count_t tmp; |
217 | asm volatile("# beginning __up_write\n\t" | 215 | asm volatile("# beginning __up_write\n\t" |
218 | LOCK_PREFIX " xadd %1,(%2)\n\t" | 216 | LOCK_PREFIX " xadd %1,(%2)\n\t" |
219 | /* tries to transition | 217 | /* subtracts 0xffff0001, returns the old value */ |
220 | 0xffff0001 -> 0x00000000 */ | 218 | " jns 1f\n\t" |
221 | " jz 1f\n" | 219 | " call call_rwsem_wake\n" /* expects old value in %edx */ |
222 | " call call_rwsem_wake\n" | ||
223 | "1:\n\t" | 220 | "1:\n\t" |
224 | "# ending __up_write\n" | 221 | "# ending __up_write\n" |
225 | : "+m" (sem->count), "=d" (tmp) | 222 | : "+m" (sem->count), "=d" (tmp) |
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index fb0b1874396f..4240878b9d76 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h | |||
@@ -3,7 +3,6 @@ | |||
3 | 3 | ||
4 | #include <asm-generic/scatterlist.h> | 4 | #include <asm-generic/scatterlist.h> |
5 | 5 | ||
6 | #define ISA_DMA_THRESHOLD (0x00ffffff) | ||
7 | #define ARCH_HAS_SG_CHAIN | 6 | #define ARCH_HAS_SG_CHAIN |
8 | 7 | ||
9 | #endif /* _ASM_X86_SCATTERLIST_H */ | 8 | #endif /* _ASM_X86_SCATTERLIST_H */ |
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 86b1506f4179..ef292c792d74 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h | |||
@@ -82,7 +82,7 @@ void *extend_brk(size_t size, size_t align); | |||
82 | * executable.) | 82 | * executable.) |
83 | */ | 83 | */ |
84 | #define RESERVE_BRK(name,sz) \ | 84 | #define RESERVE_BRK(name,sz) \ |
85 | static void __section(.discard) __used \ | 85 | static void __section(.discard.text) __used \ |
86 | __brk_reservation_fn_##name##__(void) { \ | 86 | __brk_reservation_fn_##name##__(void) { \ |
87 | asm volatile ( \ | 87 | asm volatile ( \ |
88 | ".pushsection .brk_reservation,\"aw\",@nobits;" \ | 88 | ".pushsection .brk_reservation,\"aw\",@nobits;" \ |
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 4dab78edbad9..2b16a2ad23dc 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h | |||
@@ -1,6 +1,13 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
4 | */ | ||
5 | |||
1 | #ifndef _ASM_X86_STACKTRACE_H | 6 | #ifndef _ASM_X86_STACKTRACE_H |
2 | #define _ASM_X86_STACKTRACE_H | 7 | #define _ASM_X86_STACKTRACE_H |
3 | 8 | ||
9 | #include <linux/uaccess.h> | ||
10 | |||
4 | extern int kstack_depth_to_print; | 11 | extern int kstack_depth_to_print; |
5 | 12 | ||
6 | struct thread_info; | 13 | struct thread_info; |
@@ -42,4 +49,46 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, | |||
42 | unsigned long *stack, unsigned long bp, | 49 | unsigned long *stack, unsigned long bp, |
43 | const struct stacktrace_ops *ops, void *data); | 50 | const struct stacktrace_ops *ops, void *data); |
44 | 51 | ||
52 | #ifdef CONFIG_X86_32 | ||
53 | #define STACKSLOTS_PER_LINE 8 | ||
54 | #define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) | ||
55 | #else | ||
56 | #define STACKSLOTS_PER_LINE 4 | ||
57 | #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) | ||
58 | #endif | ||
59 | |||
60 | extern void | ||
61 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
62 | unsigned long *stack, unsigned long bp, char *log_lvl); | ||
63 | |||
64 | extern void | ||
65 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
66 | unsigned long *sp, unsigned long bp, char *log_lvl); | ||
67 | |||
68 | extern unsigned int code_bytes; | ||
69 | |||
70 | /* The form of the top of the frame on the stack */ | ||
71 | struct stack_frame { | ||
72 | struct stack_frame *next_frame; | ||
73 | unsigned long return_address; | ||
74 | }; | ||
75 | |||
76 | struct stack_frame_ia32 { | ||
77 | u32 next_frame; | ||
78 | u32 return_address; | ||
79 | }; | ||
80 | |||
81 | static inline unsigned long caller_frame_pointer(void) | ||
82 | { | ||
83 | struct stack_frame *frame; | ||
84 | |||
85 | get_bp(frame); | ||
86 | |||
87 | #ifdef CONFIG_FRAME_POINTER | ||
88 | frame = frame->next_frame; | ||
89 | #endif | ||
90 | |||
91 | return (unsigned long)frame; | ||
92 | } | ||
93 | |||
45 | #endif /* _ASM_X86_STACKTRACE_H */ | 94 | #endif /* _ASM_X86_STACKTRACE_H */ |
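caller_frame_pointer() reads the current frame pointer and, when CONFIG_FRAME_POINTER is set, steps out one frame so the result refers to the caller rather than the helper itself. Paired with struct stack_frame it enables a simple frame-pointer walk; the sketch below is illustrative only and omits the stack-bound and access checks the real dump_trace() path performs:

static void example_walk_frames(void)
{
	struct stack_frame *frame =
		(struct stack_frame *)caller_frame_pointer();
	int depth = 0;

	while (frame && depth++ < 16) {
		printk(KERN_DEBUG "  return address %pS\n",
		       (void *)frame->return_address);
		frame = frame->next_frame;	/* saved frame pointer of the caller */
	}
}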
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 3ad421784ae7..cb238526a9f1 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h | |||
@@ -18,13 +18,13 @@ | |||
18 | #include <asm/ia32.h> | 18 | #include <asm/ia32.h> |
19 | 19 | ||
20 | /* ia32/sys_ia32.c */ | 20 | /* ia32/sys_ia32.c */ |
21 | asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long); | 21 | asmlinkage long sys32_truncate64(const char __user *, unsigned long, unsigned long); |
22 | asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long); | 22 | asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long); |
23 | 23 | ||
24 | asmlinkage long sys32_stat64(char __user *, struct stat64 __user *); | 24 | asmlinkage long sys32_stat64(const char __user *, struct stat64 __user *); |
25 | asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *); | 25 | asmlinkage long sys32_lstat64(const char __user *, struct stat64 __user *); |
26 | asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); | 26 | asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); |
27 | asmlinkage long sys32_fstatat(unsigned int, char __user *, | 27 | asmlinkage long sys32_fstatat(unsigned int, const char __user *, |
28 | struct stat64 __user *, int); | 28 | struct stat64 __user *, int); |
29 | struct mmap_arg_struct32; | 29 | struct mmap_arg_struct32; |
30 | asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *); | 30 | asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *); |
@@ -49,12 +49,12 @@ asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t); | |||
49 | asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); | 49 | asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); |
50 | 50 | ||
51 | asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); | 51 | asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); |
52 | asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); | 52 | asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32); |
53 | 53 | ||
54 | asmlinkage long sys32_personality(unsigned long); | 54 | asmlinkage long sys32_personality(unsigned long); |
55 | asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); | 55 | asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); |
56 | 56 | ||
57 | asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, | 57 | asmlinkage long sys32_execve(const char __user *, compat_uptr_t __user *, |
58 | compat_uptr_t __user *, struct pt_regs *); | 58 | compat_uptr_t __user *, struct pt_regs *); |
59 | asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); | 59 | asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); |
60 | 60 | ||
@@ -80,4 +80,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *); | |||
80 | 80 | ||
81 | /* ia32/ipc32.c */ | 81 | /* ia32/ipc32.c */ |
82 | asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32); | 82 | asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32); |
83 | |||
84 | asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int, | ||
85 | const char __user *); | ||
83 | #endif /* _ASM_X86_SYS_IA32_H */ | 86 | #endif /* _ASM_X86_SYS_IA32_H */ |
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 5c044b43e9a7..feb2ff9bfc2d 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h | |||
@@ -23,7 +23,7 @@ long sys_iopl(unsigned int, struct pt_regs *); | |||
23 | /* kernel/process.c */ | 23 | /* kernel/process.c */ |
24 | int sys_fork(struct pt_regs *); | 24 | int sys_fork(struct pt_regs *); |
25 | int sys_vfork(struct pt_regs *); | 25 | int sys_vfork(struct pt_regs *); |
26 | long sys_execve(char __user *, char __user * __user *, | 26 | long sys_execve(const char __user *, char __user * __user *, |
27 | char __user * __user *, struct pt_regs *); | 27 | char __user * __user *, struct pt_regs *); |
28 | long sys_clone(unsigned long, unsigned long, void __user *, | 28 | long sys_clone(unsigned long, unsigned long, void __user *, |
29 | void __user *, struct pt_regs *); | 29 | void __user *, struct pt_regs *); |
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index e7f4d33c55ed..33ecc3ea8782 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h | |||
@@ -457,4 +457,11 @@ static __always_inline void rdtsc_barrier(void) | |||
457 | alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); | 457 | alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); |
458 | } | 458 | } |
459 | 459 | ||
460 | /* | ||
461 | * We handle most unaligned accesses in hardware. On the other hand | ||
462 | * unaligned DMA can be quite expensive on some Nehalem processors. | ||
463 | * | ||
464 | * Based on this we disable the IP header alignment in network drivers. | ||
465 | */ | ||
466 | #define NET_IP_ALIGN 0 | ||
460 | #endif /* _ASM_X86_SYSTEM_H */ | 467 | #endif /* _ASM_X86_SYSTEM_H */ |
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index beb9b5f8f8a4..b766a5e8ba0e 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h | |||
@@ -343,10 +343,13 @@ | |||
343 | #define __NR_rt_tgsigqueueinfo 335 | 343 | #define __NR_rt_tgsigqueueinfo 335 |
344 | #define __NR_perf_event_open 336 | 344 | #define __NR_perf_event_open 336 |
345 | #define __NR_recvmmsg 337 | 345 | #define __NR_recvmmsg 337 |
346 | #define __NR_fanotify_init 338 | ||
347 | #define __NR_fanotify_mark 339 | ||
348 | #define __NR_prlimit64 340 | ||
346 | 349 | ||
347 | #ifdef __KERNEL__ | 350 | #ifdef __KERNEL__ |
348 | 351 | ||
349 | #define NR_syscalls 338 | 352 | #define NR_syscalls 341 |
350 | 353 | ||
351 | #define __ARCH_WANT_IPC_PARSE_VERSION | 354 | #define __ARCH_WANT_IPC_PARSE_VERSION |
352 | #define __ARCH_WANT_OLD_READDIR | 355 | #define __ARCH_WANT_OLD_READDIR |
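The three new 32-bit syscall numbers must line up with entries appended to the 32-bit syscall table; from userspace they become reachable through syscall(2). A hedged userspace sketch using the raw number added here; the flag values are the usual fanotify defaults and are assumptions as far as this hunk is concerned, and the call needs CAP_SYS_ADMIN:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	/* __NR_fanotify_init is 338 on 32-bit x86 after this change */
	long fd = syscall(338, 0 /* FAN_CLASS_NOTIF */, 0 /* O_RDONLY */);

	if (fd < 0)
		perror("fanotify_init");
	else
		printf("fanotify fd: %ld\n", fd);
	return 0;
}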
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index ff4307b0e81e..363e9b8a715b 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h | |||
@@ -663,6 +663,12 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) | |||
663 | __SYSCALL(__NR_perf_event_open, sys_perf_event_open) | 663 | __SYSCALL(__NR_perf_event_open, sys_perf_event_open) |
664 | #define __NR_recvmmsg 299 | 664 | #define __NR_recvmmsg 299 |
665 | __SYSCALL(__NR_recvmmsg, sys_recvmmsg) | 665 | __SYSCALL(__NR_recvmmsg, sys_recvmmsg) |
666 | #define __NR_fanotify_init 300 | ||
667 | __SYSCALL(__NR_fanotify_init, sys_fanotify_init) | ||
668 | #define __NR_fanotify_mark 301 | ||
669 | __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) | ||
670 | #define __NR_prlimit64 302 | ||
671 | __SYSCALL(__NR_prlimit64, sys_prlimit64) | ||
666 | 672 | ||
667 | #ifndef __NO_STUBS | 673 | #ifndef __NO_STUBS |
668 | #define __ARCH_WANT_OLD_READDIR | 674 | #define __ARCH_WANT_OLD_READDIR |
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index aa558ac0306e..42d412fd8b02 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h | |||
@@ -34,6 +34,7 @@ | |||
34 | */ | 34 | */ |
35 | 35 | ||
36 | #define UV_ITEMS_PER_DESCRIPTOR 8 | 36 | #define UV_ITEMS_PER_DESCRIPTOR 8 |
37 | /* the 'throttle' to prevent the hardware stay-busy bug */ | ||
37 | #define MAX_BAU_CONCURRENT 3 | 38 | #define MAX_BAU_CONCURRENT 3 |
38 | #define UV_CPUS_PER_ACT_STATUS 32 | 39 | #define UV_CPUS_PER_ACT_STATUS 32 |
39 | #define UV_ACT_STATUS_MASK 0x3 | 40 | #define UV_ACT_STATUS_MASK 0x3 |
@@ -45,10 +46,26 @@ | |||
45 | #define UV_DESC_BASE_PNODE_SHIFT 49 | 46 | #define UV_DESC_BASE_PNODE_SHIFT 49 |
46 | #define UV_PAYLOADQ_PNODE_SHIFT 49 | 47 | #define UV_PAYLOADQ_PNODE_SHIFT 49 |
47 | #define UV_PTC_BASENAME "sgi_uv/ptc_statistics" | 48 | #define UV_PTC_BASENAME "sgi_uv/ptc_statistics" |
49 | #define UV_BAU_BASENAME "sgi_uv/bau_tunables" | ||
50 | #define UV_BAU_TUNABLES_DIR "sgi_uv" | ||
51 | #define UV_BAU_TUNABLES_FILE "bau_tunables" | ||
52 | #define WHITESPACE " \t\n" | ||
48 | #define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) | 53 | #define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) |
49 | #define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15 | 54 | #define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15 |
50 | #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16 | 55 | #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16 |
51 | #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL | 56 | #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL |
57 | /* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */ | ||
58 | #define BAU_MISC_CONTROL_MULT_MASK 3 | ||
59 | |||
60 | #define UVH_AGING_PRESCALE_SEL 0x000000b000UL | ||
61 | /* [30:28] URGENCY_7 an index into a table of times */ | ||
62 | #define BAU_URGENCY_7_SHIFT 28 | ||
63 | #define BAU_URGENCY_7_MASK 7 | ||
64 | |||
65 | #define UVH_TRANSACTION_TIMEOUT 0x000000b200UL | ||
66 | /* [45:40] BAU - BAU transaction timeout select - a multiplier */ | ||
67 | #define BAU_TRANS_SHIFT 40 | ||
68 | #define BAU_TRANS_MASK 0x3f | ||
52 | 69 | ||
53 | /* | 70 | /* |
54 | * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 | 71 | * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 |
@@ -59,24 +76,21 @@ | |||
59 | #define DESC_STATUS_SOURCE_TIMEOUT 3 | 76 | #define DESC_STATUS_SOURCE_TIMEOUT 3 |
60 | 77 | ||
61 | /* | 78 | /* |
62 | * source side threshholds at which message retries print a warning | 79 | * delay for 'plugged' timeout retries, in microseconds |
63 | */ | ||
64 | #define SOURCE_TIMEOUT_LIMIT 20 | ||
65 | #define DESTINATION_TIMEOUT_LIMIT 20 | ||
66 | |||
67 | /* | ||
68 | * misc. delays, in microseconds | ||
69 | */ | 80 | */ |
70 | #define THROTTLE_DELAY 10 | 81 | #define PLUGGED_DELAY 10 |
71 | #define TIMEOUT_DELAY 10 | ||
72 | #define BIOS_TO 1000 | ||
73 | /* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */ | ||
74 | 82 | ||
75 | /* | 83 | /* |
76 | * thresholds at which to use IPI to free resources | 84 | * thresholds at which to use IPI to free resources |
77 | */ | 85 | */ |
86 | /* after this # consecutive 'plugged' timeouts, use IPI to release resources */ | ||
78 | #define PLUGSB4RESET 100 | 87 | #define PLUGSB4RESET 100 |
79 | #define TIMEOUTSB4RESET 100 | 88 | /* after this many consecutive timeouts, use IPI to release resources */ |
89 | #define TIMEOUTSB4RESET 1 | ||
90 | /* at this number uses of IPI to release resources, giveup the request */ | ||
91 | #define IPI_RESET_LIMIT 1 | ||
92 | /* after this # consecutive successes, bump up the throttle if it was lowered */ | ||
93 | #define COMPLETE_THRESHOLD 5 | ||
80 | 94 | ||
81 | /* | 95 | /* |
82 | * number of entries in the destination side payload queue | 96 | * number of entries in the destination side payload queue |
@@ -96,6 +110,13 @@ | |||
96 | #define FLUSH_COMPLETE 4 | 110 | #define FLUSH_COMPLETE 4 |
97 | 111 | ||
98 | /* | 112 | /* |
113 | * tuning the action when the numalink network is extremely delayed | ||
114 | */ | ||
115 | #define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */ | ||
116 | #define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */ | ||
117 | #define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */ | ||
118 | |||
119 | /* | ||
99 | * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) | 120 | * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) |
100 | * If the 'multilevel' flag in the header portion of the descriptor | 121 | * If the 'multilevel' flag in the header portion of the descriptor |
101 | * has been set to 0, then endpoint multi-unicast mode is selected. | 122 | * has been set to 0, then endpoint multi-unicast mode is selected. |
@@ -300,37 +321,16 @@ struct bau_payload_queue_entry { | |||
300 | /* bytes 24-31 */ | 321 | /* bytes 24-31 */ |
301 | }; | 322 | }; |
302 | 323 | ||
303 | /* | 324 | struct msg_desc { |
304 | * one per-cpu; to locate the software tables | 325 | struct bau_payload_queue_entry *msg; |
305 | */ | 326 | int msg_slot; |
306 | struct bau_control { | 327 | int sw_ack_slot; |
307 | struct bau_desc *descriptor_base; | ||
308 | struct bau_payload_queue_entry *va_queue_first; | 328 | struct bau_payload_queue_entry *va_queue_first; |
309 | struct bau_payload_queue_entry *va_queue_last; | 329 | struct bau_payload_queue_entry *va_queue_last; |
310 | struct bau_payload_queue_entry *bau_msg_head; | 330 | }; |
311 | struct bau_control *uvhub_master; | 331 | |
312 | struct bau_control *socket_master; | 332 | struct reset_args { |
313 | unsigned long timeout_interval; | 333 | int sender; |
314 | atomic_t active_descriptor_count; | ||
315 | int max_concurrent; | ||
316 | int max_concurrent_constant; | ||
317 | int retry_message_scans; | ||
318 | int plugged_tries; | ||
319 | int timeout_tries; | ||
320 | int ipi_attempts; | ||
321 | int conseccompletes; | ||
322 | short cpu; | ||
323 | short uvhub_cpu; | ||
324 | short uvhub; | ||
325 | short cpus_in_socket; | ||
326 | short cpus_in_uvhub; | ||
327 | unsigned short message_number; | ||
328 | unsigned short uvhub_quiesce; | ||
329 | short socket_acknowledge_count[DEST_Q_SIZE]; | ||
330 | cycles_t send_message; | ||
331 | spinlock_t masks_lock; | ||
332 | spinlock_t uvhub_lock; | ||
333 | spinlock_t queue_lock; | ||
334 | }; | 334 | }; |
335 | 335 | ||
336 | /* | 336 | /* |
@@ -344,18 +344,25 @@ struct ptc_stats { | |||
344 | unsigned long s_dtimeout; /* destination side timeouts */ | 344 | unsigned long s_dtimeout; /* destination side timeouts */ |
345 | unsigned long s_time; /* time spent in sending side */ | 345 | unsigned long s_time; /* time spent in sending side */ |
346 | unsigned long s_retriesok; /* successful retries */ | 346 | unsigned long s_retriesok; /* successful retries */ |
347 | unsigned long s_ntargcpu; /* number of cpus targeted */ | 347 | unsigned long s_ntargcpu; /* total number of cpu's targeted */ |
348 | unsigned long s_ntarguvhub; /* number of uvhubs targeted */ | 348 | unsigned long s_ntargself; /* times the sending cpu was targeted */ |
349 | unsigned long s_ntarguvhub16; /* number of times >= 16 target hubs */ | 349 | unsigned long s_ntarglocals; /* targets of cpus on the local blade */ |
350 | unsigned long s_ntarguvhub8; /* number of times >= 8 target hubs */ | 350 | unsigned long s_ntargremotes; /* targets of cpus on remote blades */ |
351 | unsigned long s_ntarguvhub4; /* number of times >= 4 target hubs */ | 351 | unsigned long s_ntarglocaluvhub; /* targets of the local hub */ |
352 | unsigned long s_ntarguvhub2; /* number of times >= 2 target hubs */ | 352 | unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */ |
353 | unsigned long s_ntarguvhub1; /* number of times == 1 target hub */ | 353 | unsigned long s_ntarguvhub; /* total number of uvhubs targeted */ |
354 | unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/ | ||
355 | unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */ | ||
356 | unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */ | ||
357 | unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */ | ||
358 | unsigned long s_ntarguvhub1; /* number of times target hubs == 1 */ | ||
354 | unsigned long s_resets_plug; /* ipi-style resets from plug state */ | 359 | unsigned long s_resets_plug; /* ipi-style resets from plug state */ |
355 | unsigned long s_resets_timeout; /* ipi-style resets from timeouts */ | 360 | unsigned long s_resets_timeout; /* ipi-style resets from timeouts */ |
356 | unsigned long s_busy; /* status stayed busy past s/w timer */ | 361 | unsigned long s_busy; /* status stayed busy past s/w timer */ |
357 | unsigned long s_throttles; /* waits in throttle */ | 362 | unsigned long s_throttles; /* waits in throttle */ |
358 | unsigned long s_retry_messages; /* retry broadcasts */ | 363 | unsigned long s_retry_messages; /* retry broadcasts */ |
364 | unsigned long s_bau_reenabled; /* for bau enable/disable */ | ||
365 | unsigned long s_bau_disabled; /* for bau enable/disable */ | ||
359 | /* destination statistics */ | 366 | /* destination statistics */ |
360 | unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ | 367 | unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ |
361 | unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ | 368 | unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ |
@@ -370,6 +377,52 @@ struct ptc_stats { | |||
370 | unsigned long d_rcanceled; /* number of messages canceled by resets */ | 377 | unsigned long d_rcanceled; /* number of messages canceled by resets */ |
371 | }; | 378 | }; |
372 | 379 | ||
380 | /* | ||
381 | * one per-cpu; to locate the software tables | ||
382 | */ | ||
383 | struct bau_control { | ||
384 | struct bau_desc *descriptor_base; | ||
385 | struct bau_payload_queue_entry *va_queue_first; | ||
386 | struct bau_payload_queue_entry *va_queue_last; | ||
387 | struct bau_payload_queue_entry *bau_msg_head; | ||
388 | struct bau_control *uvhub_master; | ||
389 | struct bau_control *socket_master; | ||
390 | struct ptc_stats *statp; | ||
391 | unsigned long timeout_interval; | ||
392 | unsigned long set_bau_on_time; | ||
393 | atomic_t active_descriptor_count; | ||
394 | int plugged_tries; | ||
395 | int timeout_tries; | ||
396 | int ipi_attempts; | ||
397 | int conseccompletes; | ||
398 | int baudisabled; | ||
399 | int set_bau_off; | ||
400 | short cpu; | ||
401 | short uvhub_cpu; | ||
402 | short uvhub; | ||
403 | short cpus_in_socket; | ||
404 | short cpus_in_uvhub; | ||
405 | unsigned short message_number; | ||
406 | unsigned short uvhub_quiesce; | ||
407 | short socket_acknowledge_count[DEST_Q_SIZE]; | ||
408 | cycles_t send_message; | ||
409 | spinlock_t uvhub_lock; | ||
410 | spinlock_t queue_lock; | ||
411 | /* tunables */ | ||
412 | int max_bau_concurrent; | ||
413 | int max_bau_concurrent_constant; | ||
414 | int plugged_delay; | ||
415 | int plugsb4reset; | ||
416 | int timeoutsb4reset; | ||
417 | int ipi_reset_limit; | ||
418 | int complete_threshold; | ||
419 | int congested_response_us; | ||
420 | int congested_reps; | ||
421 | int congested_period; | ||
422 | cycles_t period_time; | ||
423 | long period_requests; | ||
424 | }; | ||
425 | |||
373 | static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) | 426 | static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) |
374 | { | 427 | { |
375 | return constant_test_bit(uvhub, &dstp->bits[0]); | 428 | return constant_test_bit(uvhub, &dstp->bits[0]); |
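The new tunables and the period_time/period_requests counters in struct bau_control support throttling and temporarily disabling the BAU when the numalink fabric is congested. Below is a heavily hedged sketch of the sort of check these fields allow; the helper names and the exact policy are assumptions, not taken from this header:

/* Hypothetical: after 'congested_reps' broadcasts, compare the average
 * response time against 'congested_response_us' and disable the BAU
 * (falling back to IPIs) if it is worse. */
static void example_congestion_check(struct bau_control *bcp, cycles_t elapsed)
{
	bcp->period_requests++;
	bcp->period_time += elapsed;

	if (bcp->period_requests < bcp->congested_reps)
		return;

	/* example_cycles_to_us() is an assumed cycles-to-microseconds helper */
	if (example_cycles_to_us(bcp->period_time / bcp->period_requests) >
						bcp->congested_response_us)
		bcp->baudisabled = 1;
}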
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 9e6779f7cf2d..9f0cbd987d50 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h | |||
@@ -257,6 +257,7 @@ enum vmcs_field { | |||
257 | #define EXIT_REASON_IO_INSTRUCTION 30 | 257 | #define EXIT_REASON_IO_INSTRUCTION 30 |
258 | #define EXIT_REASON_MSR_READ 31 | 258 | #define EXIT_REASON_MSR_READ 31 |
259 | #define EXIT_REASON_MSR_WRITE 32 | 259 | #define EXIT_REASON_MSR_WRITE 32 |
260 | #define EXIT_REASON_INVALID_STATE 33 | ||
260 | #define EXIT_REASON_MWAIT_INSTRUCTION 36 | 261 | #define EXIT_REASON_MWAIT_INSTRUCTION 36 |
261 | #define EXIT_REASON_MONITOR_INSTRUCTION 39 | 262 | #define EXIT_REASON_MONITOR_INSTRUCTION 39 |
262 | #define EXIT_REASON_PAUSE_INSTRUCTION 40 | 263 | #define EXIT_REASON_PAUSE_INSTRUCTION 40 |
@@ -266,6 +267,7 @@ enum vmcs_field { | |||
266 | #define EXIT_REASON_EPT_VIOLATION 48 | 267 | #define EXIT_REASON_EPT_VIOLATION 48 |
267 | #define EXIT_REASON_EPT_MISCONFIG 49 | 268 | #define EXIT_REASON_EPT_MISCONFIG 49 |
268 | #define EXIT_REASON_WBINVD 54 | 269 | #define EXIT_REASON_WBINVD 54 |
270 | #define EXIT_REASON_XSETBV 55 | ||
269 | 271 | ||
270 | /* | 272 | /* |
271 | * Interruption-information format | 273 | * Interruption-information format |
@@ -375,6 +377,9 @@ enum vmcs_field { | |||
375 | #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) | 377 | #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) |
376 | #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) | 378 | #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) |
377 | 379 | ||
380 | #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ | ||
381 | #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ | ||
382 | |||
378 | #define VMX_EPT_DEFAULT_GAW 3 | 383 | #define VMX_EPT_DEFAULT_GAW 3 |
379 | #define VMX_EPT_MAX_GAW 0x4 | 384 | #define VMX_EPT_MAX_GAW 0x4 |
380 | #define VMX_EPT_MT_EPTE_SHIFT 3 | 385 | #define VMX_EPT_MT_EPTE_SHIFT 3 |
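The two new exit reasons feed the usual exit dispatch in the VMX host code: XSETBV exits let the host validate and shadow the guest's XCR0, and INVALID_STATE exits trigger emulation until the guest state becomes sane again. A hedged dispatch fragment; the handler names are illustrative, not from this patch:

static int example_dispatch_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
{
	switch (exit_reason) {
	case EXIT_REASON_XSETBV:
		/* guest executed xsetbv: check and load its XCR0 */
		return example_handle_xsetbv(vcpu);
	case EXIT_REASON_INVALID_STATE:
		/* VM-entry failed on invalid guest state: emulate forward */
		return example_handle_invalid_guest_state(vcpu);
	default:
		return 1;	/* not handled in this sketch */
	}
}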
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 9c371e4a9fa6..7fda040a76cd 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h | |||
@@ -417,6 +417,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsigned long arg) | |||
417 | return _hypercall2(int, nmi_op, op, arg); | 417 | return _hypercall2(int, nmi_op, op, arg); |
418 | } | 418 | } |
419 | 419 | ||
420 | static inline unsigned long __must_check | ||
421 | HYPERVISOR_hvm_op(int op, void *arg) | ||
422 | { | ||
423 | return _hypercall2(unsigned long, hvm_op, op, arg); | ||
424 | } | ||
425 | |||
420 | static inline void | 426 | static inline void |
421 | MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) | 427 | MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) |
422 | { | 428 | { |
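HYPERVISOR_hvm_op() wraps the two-argument hvm_op hypercall, most commonly used to get or set HVM parameters. A hedged usage sketch; HVMOP_set_param, struct xen_hvm_param, DOMID_SELF, and HVM_PARAM_CALLBACK_IRQ come from the Xen public headers rather than this hunk, and 'callback_via' is an assumed value computed elsewhere by the caller:

	struct xen_hvm_param a;

	a.domid = DOMID_SELF;
	a.index = HVM_PARAM_CALLBACK_IRQ;	/* how event-channel upcalls are delivered */
	a.value = callback_via;			/* assumed: encoded vector/GSI from the caller */

	if (HYPERVISOR_hvm_op(HVMOP_set_param, &a))
		BUG();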
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 018a0a400799..bf5f7d32bd08 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h | |||
@@ -112,13 +112,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) | |||
112 | */ | 112 | */ |
113 | static inline unsigned long mfn_to_local_pfn(unsigned long mfn) | 113 | static inline unsigned long mfn_to_local_pfn(unsigned long mfn) |
114 | { | 114 | { |
115 | extern unsigned long max_mapnr; | ||
116 | unsigned long pfn = mfn_to_pfn(mfn); | 115 | unsigned long pfn = mfn_to_pfn(mfn); |
117 | if ((pfn < max_mapnr) | 116 | if (get_phys_to_machine(pfn) != mfn) |
118 | && !xen_feature(XENFEAT_auto_translated_physmap) | 117 | return -1; /* force !pfn_valid() */ |
119 | && (get_phys_to_machine(pfn) != mfn)) | ||
120 | return max_mapnr; /* force !pfn_valid() */ | ||
121 | /* XXX fixme; not true with sparsemem */ | ||
122 | return pfn; | 118 | return pfn; |
123 | } | 119 | } |
124 | 120 | ||
diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h new file mode 100644 index 000000000000..1be1ab7d6a41 --- /dev/null +++ b/arch/x86/include/asm/xen/swiotlb-xen.h | |||
@@ -0,0 +1,14 @@ | |||
1 | #ifndef _ASM_X86_SWIOTLB_XEN_H | ||
2 | #define _ASM_X86_SWIOTLB_XEN_H | ||
3 | |||
4 | #ifdef CONFIG_SWIOTLB_XEN | ||
5 | extern int xen_swiotlb; | ||
6 | extern int __init pci_xen_swiotlb_detect(void); | ||
7 | extern void __init pci_xen_swiotlb_init(void); | ||
8 | #else | ||
9 | #define xen_swiotlb (0) | ||
10 | static inline int __init pci_xen_swiotlb_detect(void) { return 0; } | ||
11 | static inline void __init pci_xen_swiotlb_init(void) { } | ||
12 | #endif | ||
13 | |||
14 | #endif /* _ASM_X86_SWIOTLB_XEN_H */ | ||
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 2c4390cae228..c6ce2452f10c 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h | |||
@@ -3,7 +3,8 @@ | |||
3 | 3 | ||
4 | #include <linux/types.h> | 4 | #include <linux/types.h> |
5 | #include <asm/processor.h> | 5 | #include <asm/processor.h> |
6 | #include <asm/i387.h> | 6 | |
7 | #define XSTATE_CPUID 0x0000000d | ||
7 | 8 | ||
8 | #define XSTATE_FP 0x1 | 9 | #define XSTATE_FP 0x1 |
9 | #define XSTATE_SSE 0x2 | 10 | #define XSTATE_SSE 0x2 |
@@ -13,6 +14,12 @@ | |||
13 | 14 | ||
14 | #define FXSAVE_SIZE 512 | 15 | #define FXSAVE_SIZE 512 |
15 | 16 | ||
17 | #define XSAVE_HDR_SIZE 64 | ||
18 | #define XSAVE_HDR_OFFSET FXSAVE_SIZE | ||
19 | |||
20 | #define XSAVE_YMM_SIZE 256 | ||
21 | #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) | ||
22 | |||
16 | /* | 23 | /* |
17 | * These are the features that the OS can handle currently. | 24 | * These are the features that the OS can handle currently. |
18 | */ | 25 | */ |
@@ -26,10 +33,8 @@ | |||
26 | 33 | ||
27 | extern unsigned int xstate_size; | 34 | extern unsigned int xstate_size; |
28 | extern u64 pcntxt_mask; | 35 | extern u64 pcntxt_mask; |
29 | extern struct xsave_struct *init_xstate_buf; | ||
30 | extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; | 36 | extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; |
31 | 37 | ||
32 | extern void xsave_cntxt_init(void); | ||
33 | extern void xsave_init(void); | 38 | extern void xsave_init(void); |
34 | extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); | 39 | extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); |
35 | extern int init_fpu(struct task_struct *child); | 40 | extern int init_fpu(struct task_struct *child); |
@@ -59,6 +64,16 @@ static inline int fpu_xrstor_checking(struct fpu *fpu) | |||
59 | static inline int xsave_user(struct xsave_struct __user *buf) | 64 | static inline int xsave_user(struct xsave_struct __user *buf) |
60 | { | 65 | { |
61 | int err; | 66 | int err; |
67 | |||
68 | /* | ||
69 | * Clear the xsave header first, so that reserved fields are | ||
70 | * initialized to zero. | ||
71 | */ | ||
72 | err = __clear_user(&buf->xsave_hdr, | ||
73 | sizeof(struct xsave_hdr_struct)); | ||
74 | if (unlikely(err)) | ||
75 | return -EFAULT; | ||
76 | |||
62 | __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" | 77 | __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" |
63 | "2:\n" | 78 | "2:\n" |
64 | ".section .fixup,\"ax\"\n" | 79 | ".section .fixup,\"ax\"\n" |
@@ -111,12 +126,25 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask) | |||
111 | : "memory"); | 126 | : "memory"); |
112 | } | 127 | } |
113 | 128 | ||
129 | static inline void xsave_state(struct xsave_struct *fx, u64 mask) | ||
130 | { | ||
131 | u32 lmask = mask; | ||
132 | u32 hmask = mask >> 32; | ||
133 | |||
134 | asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t" | ||
135 | : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) | ||
136 | : "memory"); | ||
137 | } | ||
138 | |||
114 | static inline void fpu_xsave(struct fpu *fpu) | 139 | static inline void fpu_xsave(struct fpu *fpu) |
115 | { | 140 | { |
116 | /* This, however, we can work around by forcing the compiler to select | 141 | /* This, however, we can work around by forcing the compiler to select |
117 | an addressing mode that doesn't require extended registers. */ | 142 | an addressing mode that doesn't require extended registers. */ |
118 | __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27" | 143 | alternative_input( |
119 | : : "D" (&(fpu->state->xsave)), | 144 | ".byte " REX_PREFIX "0x0f,0xae,0x27", |
120 | "a" (-1), "d"(-1) : "memory"); | 145 | ".byte " REX_PREFIX "0x0f,0xae,0x37", |
146 | X86_FEATURE_XSAVEOPT, | ||
147 | [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) : | ||
148 | "memory"); | ||
121 | } | 149 | } |
122 | #endif | 150 | #endif |
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e77b22083721..0925676266bd 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o | |||
104 | scx200-y += scx200_32.o | 104 | scx200-y += scx200_32.o |
105 | 105 | ||
106 | obj-$(CONFIG_OLPC) += olpc.o | 106 | obj-$(CONFIG_OLPC) += olpc.o |
107 | obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o | ||
107 | obj-$(CONFIG_X86_MRST) += mrst.o | 108 | obj-$(CONFIG_X86_MRST) += mrst.o |
108 | 109 | ||
109 | microcode-y := microcode_core.o | 110 | microcode-y := microcode_core.o |
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 580b4e296010..28595d6df47c 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S | |||
@@ -104,7 +104,7 @@ _start: | |||
104 | movl %eax, %ecx | 104 | movl %eax, %ecx |
105 | orl %edx, %ecx | 105 | orl %edx, %ecx |
106 | jz 1f | 106 | jz 1f |
107 | movl $0xc0000080, %ecx | 107 | movl $MSR_EFER, %ecx |
108 | wrmsr | 108 | wrmsr |
109 | 1: | 109 | 1: |
110 | 110 | ||
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index fcc3c61fdecc..33cec152070d 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c | |||
@@ -2,7 +2,7 @@ | |||
2 | * sleep.c - x86-specific ACPI sleep support. | 2 | * sleep.c - x86-specific ACPI sleep support. |
3 | * | 3 | * |
4 | * Copyright (C) 2001-2003 Patrick Mochel | 4 | * Copyright (C) 2001-2003 Patrick Mochel |
5 | * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> | 5 | * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz> |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/acpi.h> | 8 | #include <linux/acpi.h> |
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 70237732a6c7..f65ab8b014c4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c | |||
@@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, | |||
214 | u8 *instr = a->instr; | 214 | u8 *instr = a->instr; |
215 | BUG_ON(a->replacementlen > a->instrlen); | 215 | BUG_ON(a->replacementlen > a->instrlen); |
216 | BUG_ON(a->instrlen > sizeof(insnbuf)); | 216 | BUG_ON(a->instrlen > sizeof(insnbuf)); |
217 | BUG_ON(a->cpuid >= NCAPINTS*32); | ||
217 | if (!boot_cpu_has(a->cpuid)) | 218 | if (!boot_cpu_has(a->cpuid)) |
218 | continue; | 219 | continue; |
219 | #ifdef CONFIG_X86_64 | 220 | #ifdef CONFIG_X86_64 |
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0d20286d78c6..fa044e1e30a2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c | |||
@@ -2572,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, | |||
2572 | static int amd_iommu_domain_has_cap(struct iommu_domain *domain, | 2572 | static int amd_iommu_domain_has_cap(struct iommu_domain *domain, |
2573 | unsigned long cap) | 2573 | unsigned long cap) |
2574 | { | 2574 | { |
2575 | switch (cap) { | ||
2576 | case IOMMU_CAP_CACHE_COHERENCY: | ||
2577 | return 1; | ||
2578 | } | ||
2579 | |||
2575 | return 0; | 2580 | return 0; |
2576 | } | 2581 | } |
2577 | 2582 | ||
@@ -2609,8 +2614,7 @@ int __init amd_iommu_init_passthrough(void) | |||
2609 | 2614 | ||
2610 | pt_domain->mode |= PAGE_MODE_NONE; | 2615 | pt_domain->mode |= PAGE_MODE_NONE; |
2611 | 2616 | ||
2612 | while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { | 2617 | for_each_pci_dev(dev) { |
2613 | |||
2614 | if (!check_device(&dev->dev)) | 2618 | if (!check_device(&dev->dev)) |
2615 | continue; | 2619 | continue; |
2616 | 2620 | ||
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index a35347501d36..8dd77800ff5d 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c | |||
@@ -43,10 +43,11 @@ | |||
43 | 43 | ||
44 | #include <asm/fixmap.h> | 44 | #include <asm/fixmap.h> |
45 | #include <asm/apb_timer.h> | 45 | #include <asm/apb_timer.h> |
46 | #include <asm/mrst.h> | ||
46 | 47 | ||
47 | #define APBT_MASK CLOCKSOURCE_MASK(32) | 48 | #define APBT_MASK CLOCKSOURCE_MASK(32) |
48 | #define APBT_SHIFT 22 | 49 | #define APBT_SHIFT 22 |
49 | #define APBT_CLOCKEVENT_RATING 150 | 50 | #define APBT_CLOCKEVENT_RATING 110 |
50 | #define APBT_CLOCKSOURCE_RATING 250 | 51 | #define APBT_CLOCKSOURCE_RATING 250 |
51 | #define APBT_MIN_DELTA_USEC 200 | 52 | #define APBT_MIN_DELTA_USEC 200 |
52 | 53 | ||
@@ -83,8 +84,6 @@ struct apbt_dev { | |||
83 | char name[10]; | 84 | char name[10]; |
84 | }; | 85 | }; |
85 | 86 | ||
86 | int disable_apbt_percpu __cpuinitdata; | ||
87 | |||
88 | static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); | 87 | static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); |
89 | 88 | ||
90 | #ifdef CONFIG_SMP | 89 | #ifdef CONFIG_SMP |
@@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = { | |||
195 | }; | 194 | }; |
196 | 195 | ||
197 | /* | 196 | /* |
198 | * if user does not want to use per CPU apb timer, just give it a lower rating | ||
199 | * than local apic timer and skip the late per cpu timer init. | ||
200 | */ | ||
201 | static inline int __init setup_x86_mrst_timer(char *arg) | ||
202 | { | ||
203 | if (!arg) | ||
204 | return -EINVAL; | ||
205 | |||
206 | if (strcmp("apbt_only", arg) == 0) | ||
207 | disable_apbt_percpu = 0; | ||
208 | else if (strcmp("lapic_and_apbt", arg) == 0) | ||
209 | disable_apbt_percpu = 1; | ||
210 | else { | ||
211 | pr_warning("X86 MRST timer option %s not recognised" | ||
212 | " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", | ||
213 | arg); | ||
214 | return -EINVAL; | ||
215 | } | ||
216 | return 0; | ||
217 | } | ||
218 | __setup("x86_mrst_timer=", setup_x86_mrst_timer); | ||
219 | |||
220 | /* | ||
221 | * start count down from 0xffff_ffff. this is done by toggling the enable bit | 197 | * start count down from 0xffff_ffff. this is done by toggling the enable bit |
222 | * then load initial load count to ~0. | 198 | * then load initial load count to ~0. |
223 | */ | 199 | */ |
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void) | |||
335 | adev->num = smp_processor_id(); | 311 | adev->num = smp_processor_id(); |
336 | memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); | 312 | memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); |
337 | 313 | ||
338 | if (disable_apbt_percpu) { | 314 | if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { |
339 | apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; | 315 | apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; |
340 | global_clock_event = &adev->evt; | 316 | global_clock_event = &adev->evt; |
341 | printk(KERN_DEBUG "%s clockevent registered as global\n", | 317 | printk(KERN_DEBUG "%s clockevent registered as global\n", |
@@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n, | |||
429 | 405 | ||
430 | static __init int apbt_late_init(void) | 406 | static __init int apbt_late_init(void) |
431 | { | 407 | { |
432 | if (disable_apbt_percpu || !apb_timer_block_enabled) | 408 | if (mrst_timer_options == MRST_TIMER_LAPIC_APBT || |
409 | !apb_timer_block_enabled) | ||
433 | return 0; | 410 | return 0; |
434 | /* This notifier should be called after workqueue is ready */ | 411 | /* This notifier should be called after workqueue is ready */ |
435 | hotcpu_notifier(apbt_cpuhp_notify, -20); | 412 | hotcpu_notifier(apbt_cpuhp_notify, -20); |
@@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode, | |||
450 | int timer_num; | 427 | int timer_num; |
451 | struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); | 428 | struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); |
452 | 429 | ||
430 | BUG_ON(!apbt_virt_address); | ||
431 | |||
453 | timer_num = adev->num; | 432 | timer_num = adev->num; |
454 | pr_debug("%s CPU %d timer %d mode=%d\n", | 433 | pr_debug("%s CPU %d timer %d mode=%d\n", |
455 | __func__, first_cpu(*evt->cpumask), timer_num, mode); | 434 | __func__, first_cpu(*evt->cpumask), timer_num, mode); |
@@ -676,7 +655,7 @@ void __init apbt_time_init(void) | |||
676 | } | 655 | } |
677 | #ifdef CONFIG_SMP | 656 | #ifdef CONFIG_SMP |
678 | /* kernel cmdline disable apb timer, so we will use lapic timers */ | 657 | /* kernel cmdline disable apb timer, so we will use lapic timers */ |
679 | if (disable_apbt_percpu) { | 658 | if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { |
680 | printk(KERN_INFO "apbt: disabled per cpu timer\n"); | 659 | printk(KERN_INFO "apbt: disabled per cpu timer\n"); |
681 | return; | 660 | return; |
682 | } | 661 | } |
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b5d8b0bcf235..a2e0caf26e17 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c | |||
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void) | |||
280 | * or BIOS forget to put that in reserved. | 280 | * or BIOS forget to put that in reserved. |
281 | * try to update e820 to make that region as reserved. | 281 | * try to update e820 to make that region as reserved. |
282 | */ | 282 | */ |
283 | u32 agp_aper_base = 0, agp_aper_order = 0; | 283 | u32 agp_aper_order = 0; |
284 | int i, fix, slot, valid_agp = 0; | 284 | int i, fix, slot, valid_agp = 0; |
285 | u32 ctl; | 285 | u32 ctl; |
286 | u32 aper_size = 0, aper_order = 0, last_aper_order = 0; | 286 | u32 aper_size = 0, aper_order = 0, last_aper_order = 0; |
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void) | |||
291 | return; | 291 | return; |
292 | 292 | ||
293 | /* This is mostly duplicate of iommu_hole_init */ | 293 | /* This is mostly duplicate of iommu_hole_init */ |
294 | agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); | 294 | search_agp_bridge(&agp_aper_order, &valid_agp); |
295 | 295 | ||
296 | fix = 0; | 296 | fix = 0; |
297 | for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { | 297 | for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { |
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 565c1bfc507d..910f20b457c4 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile | |||
@@ -2,7 +2,12 @@ | |||
2 | # Makefile for local APIC drivers and for the IO-APIC code | 2 | # Makefile for local APIC drivers and for the IO-APIC code |
3 | # | 3 | # |
4 | 4 | ||
5 | obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o | 5 | obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o |
6 | ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) | ||
7 | obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o | ||
8 | endif | ||
9 | obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o | ||
10 | |||
6 | obj-$(CONFIG_X86_IO_APIC) += io_apic.o | 11 | obj-$(CONFIG_X86_IO_APIC) += io_apic.o |
7 | obj-$(CONFIG_SMP) += ipi.o | 12 | obj-$(CONFIG_SMP) += ipi.o |
8 | 13 | ||
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a96489ee6cab..e3b534cda49a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
@@ -460,7 +460,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask) | |||
460 | } | 460 | } |
461 | 461 | ||
462 | /* | 462 | /* |
463 | * Setup the local APIC timer for this CPU. Copy the initilized values | 463 | * Setup the local APIC timer for this CPU. Copy the initialized values |
464 | * of the boot CPU and register the clock event in the framework. | 464 | * of the boot CPU and register the clock event in the framework. |
465 | */ | 465 | */ |
466 | static void __cpuinit setup_APIC_timer(void) | 466 | static void __cpuinit setup_APIC_timer(void) |
@@ -1606,7 +1606,7 @@ void __init init_apic_mappings(void) | |||
1606 | * acpi lapic path already maps that address in | 1606 | * acpi lapic path already maps that address in |
1607 | * acpi_register_lapic_address() | 1607 | * acpi_register_lapic_address() |
1608 | */ | 1608 | */ |
1609 | if (!acpi_lapic) | 1609 | if (!acpi_lapic && !smp_found_config) |
1610 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); | 1610 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); |
1611 | 1611 | ||
1612 | apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", | 1612 | apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", |
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 425e53a87feb..8593582d8022 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c | |||
@@ -129,7 +129,6 @@ int es7000_plat; | |||
129 | * GSI override for ES7000 platforms. | 129 | * GSI override for ES7000 platforms. |
130 | */ | 130 | */ |
131 | 131 | ||
132 | static unsigned int base; | ||
133 | 132 | ||
134 | static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) | 133 | static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) |
135 | { | 134 | { |
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c new file mode 100644 index 000000000000..cefd6942f0e9 --- /dev/null +++ b/arch/x86/kernel/apic/hw_nmi.c | |||
@@ -0,0 +1,107 @@ | |||
1 | /* | ||
2 | * HW NMI watchdog support | ||
3 | * | ||
4 | * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. | ||
5 | * | ||
6 | * Arch specific calls to support NMI watchdog | ||
7 | * | ||
8 | * Bits copied from original nmi.c file | ||
9 | * | ||
10 | */ | ||
11 | #include <asm/apic.h> | ||
12 | |||
13 | #include <linux/cpumask.h> | ||
14 | #include <linux/kdebug.h> | ||
15 | #include <linux/notifier.h> | ||
16 | #include <linux/kprobes.h> | ||
17 | #include <linux/nmi.h> | ||
18 | #include <linux/module.h> | ||
19 | |||
20 | /* For reliability, we're prepared to waste bits here. */ | ||
21 | static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; | ||
22 | |||
23 | u64 hw_nmi_get_sample_period(void) | ||
24 | { | ||
25 | return (u64)(cpu_khz) * 1000 * 60; | ||
26 | } | ||
27 | |||
28 | #ifdef ARCH_HAS_NMI_WATCHDOG | ||
29 | void arch_trigger_all_cpu_backtrace(void) | ||
30 | { | ||
31 | int i; | ||
32 | |||
33 | cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); | ||
34 | |||
35 | printk(KERN_INFO "sending NMI to all CPUs:\n"); | ||
36 | apic->send_IPI_all(NMI_VECTOR); | ||
37 | |||
38 | /* Wait for up to 10 seconds for all CPUs to do the backtrace */ | ||
39 | for (i = 0; i < 10 * 1000; i++) { | ||
40 | if (cpumask_empty(to_cpumask(backtrace_mask))) | ||
41 | break; | ||
42 | mdelay(1); | ||
43 | } | ||
44 | } | ||
45 | |||
46 | static int __kprobes | ||
47 | arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, | ||
48 | unsigned long cmd, void *__args) | ||
49 | { | ||
50 | struct die_args *args = __args; | ||
51 | struct pt_regs *regs; | ||
52 | int cpu = smp_processor_id(); | ||
53 | |||
54 | switch (cmd) { | ||
55 | case DIE_NMI: | ||
56 | case DIE_NMI_IPI: | ||
57 | break; | ||
58 | |||
59 | default: | ||
60 | return NOTIFY_DONE; | ||
61 | } | ||
62 | |||
63 | regs = args->regs; | ||
64 | |||
65 | if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { | ||
66 | static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; | ||
67 | |||
68 | arch_spin_lock(&lock); | ||
69 | printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); | ||
70 | show_regs(regs); | ||
71 | dump_stack(); | ||
72 | arch_spin_unlock(&lock); | ||
73 | cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); | ||
74 | return NOTIFY_STOP; | ||
75 | } | ||
76 | |||
77 | return NOTIFY_DONE; | ||
78 | } | ||
79 | |||
80 | static __read_mostly struct notifier_block backtrace_notifier = { | ||
81 | .notifier_call = arch_trigger_all_cpu_backtrace_handler, | ||
82 | .next = NULL, | ||
83 | .priority = 1 | ||
84 | }; | ||
85 | |||
86 | static int __init register_trigger_all_cpu_backtrace(void) | ||
87 | { | ||
88 | register_die_notifier(&backtrace_notifier); | ||
89 | return 0; | ||
90 | } | ||
91 | early_initcall(register_trigger_all_cpu_backtrace); | ||
92 | #endif | ||
93 | |||
94 | /* STUB calls to mimic old nmi_watchdog behaviour */ | ||
95 | #if defined(CONFIG_X86_LOCAL_APIC) | ||
96 | unsigned int nmi_watchdog = NMI_NONE; | ||
97 | EXPORT_SYMBOL(nmi_watchdog); | ||
98 | void acpi_nmi_enable(void) { return; } | ||
99 | void acpi_nmi_disable(void) { return; } | ||
100 | #endif | ||
101 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ | ||
102 | EXPORT_SYMBOL(nmi_active); | ||
103 | int unknown_nmi_panic; | ||
104 | void cpu_nmi_set_wd_enabled(void) { return; } | ||
105 | void stop_apic_nmi_watchdog(void *unused) { return; } | ||
106 | void setup_apic_nmi_watchdog(void *unused) { return; } | ||
107 | int __init check_nmi_watchdog(void) { return 0; } | ||
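hw_nmi_get_sample_period() above sizes the perf NMI watchdog period in CPU cycles: cpu_khz * 1000 is cycles per second, and the factor of 60 spaces the samples roughly a minute apart. A worked example with an assumed 2 GHz part (cpu_khz == 2000000):

/* Hypothetical 2 GHz CPU, i.e. cpu_khz == 2,000,000: */
u64 cycles_per_sec = (u64)2000000 * 1000;	/*   2,000,000,000 cycles */
u64 sample_period  = cycles_per_sec * 60;	/* 120,000,000,000 cycles,
						   comfortably within a u64 */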
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e41ed24ab26d..4dc0084ec1b1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c | |||
@@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) | |||
3397 | 3397 | ||
3398 | cfg = desc->chip_data; | 3398 | cfg = desc->chip_data; |
3399 | 3399 | ||
3400 | read_msi_msg_desc(desc, &msg); | 3400 | get_cached_msi_msg_desc(desc, &msg); |
3401 | 3401 | ||
3402 | msg.data &= ~MSI_DATA_VECTOR_MASK; | 3402 | msg.data &= ~MSI_DATA_VECTOR_MASK; |
3403 | msg.data |= MSI_DATA_VECTOR(cfg->vector); | 3403 | msg.data |= MSI_DATA_VECTOR(cfg->vector); |
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 1edaf15c0b8e..a43f71cb30f8 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c | |||
@@ -401,13 +401,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) | |||
401 | int cpu = smp_processor_id(); | 401 | int cpu = smp_processor_id(); |
402 | int rc = 0; | 402 | int rc = 0; |
403 | 403 | ||
404 | /* check for other users first */ | ||
405 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | ||
406 | == NOTIFY_STOP) { | ||
407 | rc = 1; | ||
408 | touched = 1; | ||
409 | } | ||
410 | |||
411 | sum = get_timer_irqs(cpu); | 404 | sum = get_timer_irqs(cpu); |
412 | 405 | ||
413 | if (__get_cpu_var(nmi_touch)) { | 406 | if (__get_cpu_var(nmi_touch)) { |
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e46f98f36e31..7b598b84c902 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c | |||
@@ -604,6 +604,10 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) | |||
604 | { | 604 | { |
605 | if (reason != DIE_NMI_IPI) | 605 | if (reason != DIE_NMI_IPI) |
606 | return NOTIFY_OK; | 606 | return NOTIFY_OK; |
607 | |||
608 | if (in_crash_kexec) | ||
609 | /* do nothing if entering the crash kernel */ | ||
610 | return NOTIFY_OK; | ||
607 | /* | 611 | /* |
608 | * Use a lock so only one cpu prints at a time | 612 | * Use a lock so only one cpu prints at a time |
609 | * to prevent intermixed output. | 613 | * to prevent intermixed output. |
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c4f9182ca3ac..4c9c67bf09b7 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c | |||
@@ -140,7 +140,7 @@ | |||
140 | * is now the way life works). | 140 | * is now the way life works). |
141 | * Fix thinko in suspend() (wrong return). | 141 | * Fix thinko in suspend() (wrong return). |
142 | * Notify drivers on critical suspend. | 142 | * Notify drivers on critical suspend. |
143 | * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz> | 143 | * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz> |
144 | * modified by sfr). | 144 | * modified by sfr). |
145 | * Disable interrupts while we are suspended (Andy Henroid | 145 | * Disable interrupts while we are suspended (Andy Henroid |
146 | * <andy_henroid@yahoo.com> fixed by sfr). | 146 | * <andy_henroid@yahoo.com> fixed by sfr). |
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3a785da34b6f..3f0ebe429a01 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
@@ -12,11 +12,11 @@ endif | |||
12 | nostackp := $(call cc-option, -fno-stack-protector) | 12 | nostackp := $(call cc-option, -fno-stack-protector) |
13 | CFLAGS_common.o := $(nostackp) | 13 | CFLAGS_common.o := $(nostackp) |
14 | 14 | ||
15 | obj-y := intel_cacheinfo.o addon_cpuid_features.o | 15 | obj-y := intel_cacheinfo.o scattered.o topology.o |
16 | obj-y += proc.o capflags.o powerflags.o common.o | 16 | obj-y += proc.o capflags.o powerflags.o common.o |
17 | obj-y += vmware.o hypervisor.o sched.o mshyperv.o | 17 | obj-y += vmware.o hypervisor.o sched.o mshyperv.o |
18 | 18 | ||
19 | obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o | 19 | obj-$(CONFIG_X86_32) += bugs.o |
20 | obj-$(CONFIG_X86_64) += bugs_64.o | 20 | obj-$(CONFIG_X86_64) += bugs_64.o |
21 | 21 | ||
22 | obj-$(CONFIG_CPU_SUP_INTEL) += intel.o | 22 | obj-$(CONFIG_CPU_SUP_INTEL) += intel.o |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e485825130d2..60a57b13082d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
466 | } | 466 | } |
467 | 467 | ||
468 | } | 468 | } |
469 | if (c->x86 == 0x10 || c->x86 == 0x11) | 469 | if (c->x86 >= 0x10) |
470 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | 470 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); |
471 | 471 | ||
472 | /* get apicid instead of initial apic id from cpuid */ | 472 | /* get apicid instead of initial apic id from cpuid */ |
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
529 | num_cache_leaves = 3; | 529 | num_cache_leaves = 3; |
530 | } | 530 | } |
531 | 531 | ||
532 | if (c->x86 >= 0xf && c->x86 <= 0x11) | 532 | if (c->x86 >= 0xf) |
533 | set_cpu_cap(c, X86_FEATURE_K8); | 533 | set_cpu_cap(c, X86_FEATURE_K8); |
534 | 534 | ||
535 | if (cpu_has_xmm2) { | 535 | if (cpu_has_xmm2) { |
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
546 | fam10h_check_enable_mmcfg(); | 546 | fam10h_check_enable_mmcfg(); |
547 | } | 547 | } |
548 | 548 | ||
549 | if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { | 549 | if (c == &boot_cpu_data && c->x86 >= 0xf) { |
550 | unsigned long long tseg; | 550 | unsigned long long tseg; |
551 | 551 | ||
552 | /* | 552 | /* |
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { | |||
609 | }; | 609 | }; |
610 | 610 | ||
611 | cpu_dev_register(amd_cpu_dev); | 611 | cpu_dev_register(amd_cpu_dev); |
612 | |||
613 | /* | ||
614 | * AMD errata checking | ||
615 | * | ||
616 | * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or | ||
617 | * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that | ||
618 | * have an OSVW id assigned, which it takes as first argument. Both take a | ||
619 | * variable number of family-specific model-stepping ranges created by | ||
620 | * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const | ||
621 | * int[] in arch/x86/include/asm/processor.h. | ||
622 | * | ||
623 | * Example: | ||
624 | * | ||
625 | * const int amd_erratum_319[] = | ||
626 | * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), | ||
627 | * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), | ||
628 | * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); | ||
629 | */ | ||
630 | |||
631 | const int amd_erratum_400[] = | ||
632 | AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), | ||
633 | AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); | ||
634 | EXPORT_SYMBOL_GPL(amd_erratum_400); | ||
635 | |||
636 | const int amd_erratum_383[] = | ||
637 | AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); | ||
638 | EXPORT_SYMBOL_GPL(amd_erratum_383); | ||
639 | |||
640 | bool cpu_has_amd_erratum(const int *erratum) | ||
641 | { | ||
642 | struct cpuinfo_x86 *cpu = &current_cpu_data; | ||

643 | int osvw_id = *erratum++; | ||
644 | u32 range; | ||
645 | u32 ms; | ||
646 | |||
647 | /* | ||
648 | * If called early enough that current_cpu_data hasn't been initialized | ||
649 | * yet, fall back to boot_cpu_data. | ||
650 | */ | ||
651 | if (cpu->x86 == 0) | ||
652 | cpu = &boot_cpu_data; | ||
653 | |||
654 | if (cpu->x86_vendor != X86_VENDOR_AMD) | ||
655 | return false; | ||
656 | |||
657 | if (osvw_id >= 0 && osvw_id < 65536 && | ||
658 | cpu_has(cpu, X86_FEATURE_OSVW)) { | ||
659 | u64 osvw_len; | ||
660 | |||
661 | rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); | ||
662 | if (osvw_id < osvw_len) { | ||
663 | u64 osvw_bits; | ||
664 | |||
665 | rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), | ||
666 | osvw_bits); | ||
667 | return osvw_bits & (1ULL << (osvw_id & 0x3f)); | ||
668 | } | ||
669 | } | ||
670 | |||
671 | /* OSVW unavailable or ID unknown, match family-model-stepping range */ | ||
672 | ms = (cpu->x86_model << 8) | cpu->x86_mask; | ||
673 | while ((range = *erratum++)) | ||
674 | if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && | ||
675 | (ms >= AMD_MODEL_RANGE_START(range)) && | ||
676 | (ms <= AMD_MODEL_RANGE_END(range))) | ||
677 | return true; | ||
678 | |||
679 | return false; | ||
680 | } | ||
681 | |||
682 | EXPORT_SYMBOL_GPL(cpu_has_amd_erratum); | ||
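The comment block above documents the erratum-table format; a caller only passes one of the exported tables to cpu_has_amd_erratum(), which consults the OSVW MSRs first and falls back to the packed family/model/stepping ranges. A hedged sketch of a caller (the workaround body is hypothetical; amd_erratum_400 is the table defined above):

#include <asm/processor.h>	/* declares extern const int amd_erratum_400[]; */

static void example_check_erratum_400(void)
{
	/*
	 * Ranges above: family 0xf from model 0x41 stepping 0x2 up, and
	 * family 0x10 from model 0x2 stepping 0x1 up; OSVW id 1 is tried
	 * first when the CPU exposes X86_FEATURE_OSVW.
	 */
	if (cpu_has_amd_erratum(amd_erratum_400))
		pr_info("erratum 400 present, applying hypothetical workaround\n");
}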
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 68e4a6f2211e..490dac63c2d2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); | |||
140 | static int __init x86_xsave_setup(char *s) | 140 | static int __init x86_xsave_setup(char *s) |
141 | { | 141 | { |
142 | setup_clear_cpu_cap(X86_FEATURE_XSAVE); | 142 | setup_clear_cpu_cap(X86_FEATURE_XSAVE); |
143 | setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); | ||
143 | return 1; | 144 | return 1; |
144 | } | 145 | } |
145 | __setup("noxsave", x86_xsave_setup); | 146 | __setup("noxsave", x86_xsave_setup); |
146 | 147 | ||
148 | static int __init x86_xsaveopt_setup(char *s) | ||
149 | { | ||
150 | setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); | ||
151 | return 1; | ||
152 | } | ||
153 | __setup("noxsaveopt", x86_xsaveopt_setup); | ||
154 | |||
147 | #ifdef CONFIG_X86_32 | 155 | #ifdef CONFIG_X86_32 |
148 | static int cachesize_override __cpuinitdata = -1; | 156 | static int cachesize_override __cpuinitdata = -1; |
149 | static int disable_x86_serial_nr __cpuinitdata = 1; | 157 | static int disable_x86_serial_nr __cpuinitdata = 1; |
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) | |||
551 | c->x86_capability[4] = excap; | 559 | c->x86_capability[4] = excap; |
552 | } | 560 | } |
553 | 561 | ||
562 | /* Additional Intel-defined flags: level 0x00000007 */ | ||
563 | if (c->cpuid_level >= 0x00000007) { | ||
564 | u32 eax, ebx, ecx, edx; | ||
565 | |||
566 | cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); | ||
567 | |||
568 | if (eax > 0) | ||
569 | c->x86_capability[9] = ebx; | ||
570 | } | ||
571 | |||
554 | /* AMD-defined flags: level 0x80000001 */ | 572 | /* AMD-defined flags: level 0x80000001 */ |
555 | xlvl = cpuid_eax(0x80000000); | 573 | xlvl = cpuid_eax(0x80000000); |
556 | c->extended_cpuid_level = xlvl; | 574 | c->extended_cpuid_level = xlvl; |
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) | |||
576 | if (c->extended_cpuid_level >= 0x80000007) | 594 | if (c->extended_cpuid_level >= 0x80000007) |
577 | c->x86_power = cpuid_edx(0x80000007); | 595 | c->x86_power = cpuid_edx(0x80000007); |
578 | 596 | ||
597 | init_scattered_cpuid_features(c); | ||
579 | } | 598 | } |
580 | 599 | ||
581 | static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) | 600 | static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) |
@@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) | |||
731 | 750 | ||
732 | get_model_name(c); /* Default name */ | 751 | get_model_name(c); /* Default name */ |
733 | 752 | ||
734 | init_scattered_cpuid_features(c); | ||
735 | detect_nopl(c); | 753 | detect_nopl(c); |
736 | } | 754 | } |
737 | 755 | ||
@@ -1192,6 +1210,7 @@ void __cpuinit cpu_init(void) | |||
1192 | dbg_restore_debug_regs(); | 1210 | dbg_restore_debug_regs(); |
1193 | 1211 | ||
1194 | fpu_init(); | 1212 | fpu_init(); |
1213 | xsave_init(); | ||
1195 | 1214 | ||
1196 | raw_local_save_flags(kernel_eflags); | 1215 | raw_local_save_flags(kernel_eflags); |
1197 | 1216 | ||
@@ -1252,12 +1271,7 @@ void __cpuinit cpu_init(void) | |||
1252 | clear_used_math(); | 1271 | clear_used_math(); |
1253 | mxcsr_feature_mask_init(); | 1272 | mxcsr_feature_mask_init(); |
1254 | 1273 | ||
1255 | /* | 1274 | fpu_init(); |
1256 | * Boot processor to setup the FP and extended state context info. | ||
1257 | */ | ||
1258 | if (smp_processor_id() == boot_cpu_id) | ||
1259 | init_thread_xstate(); | ||
1260 | |||
1261 | xsave_init(); | 1275 | xsave_init(); |
1262 | } | 1276 | } |
1263 | #endif | 1277 | #endif |
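The new "noxsaveopt" handler above follows the usual pattern for early boot parameters: __setup() registers a callback that runs during command-line parsing and clears the capability before it is ever consumed. A sketch of the same pattern for a made-up flag (the parameter name and feature bit are illustrative, not real kernel options):

/* Hypothetical example of the __setup() pattern used by "noxsave(opt)". */
static int __init x86_nofoo_setup(char *s)
{
	/* X86_FEATURE_FOO is a placeholder, not a real feature bit. */
	setup_clear_cpu_cap(X86_FEATURE_FOO);
	return 1;			/* 1 == "parameter handled" */
}
__setup("nofoo", x86_nofoo_setup);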
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1d3cddaa40ee..cd8da247dda1 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | |||
@@ -34,7 +34,6 @@ | |||
34 | #include <linux/compiler.h> | 34 | #include <linux/compiler.h> |
35 | #include <linux/dmi.h> | 35 | #include <linux/dmi.h> |
36 | #include <linux/slab.h> | 36 | #include <linux/slab.h> |
37 | #include <trace/events/power.h> | ||
38 | 37 | ||
39 | #include <linux/acpi.h> | 38 | #include <linux/acpi.h> |
40 | #include <linux/io.h> | 39 | #include <linux/io.h> |
@@ -73,7 +72,7 @@ struct acpi_cpufreq_data { | |||
73 | static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); | 72 | static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); |
74 | 73 | ||
75 | /* acpi_perf_data is a pointer to percpu data. */ | 74 | /* acpi_perf_data is a pointer to percpu data. */ |
76 | static struct acpi_processor_performance *acpi_perf_data; | 75 | static struct acpi_processor_performance __percpu *acpi_perf_data; |
77 | 76 | ||
78 | static struct cpufreq_driver acpi_cpufreq_driver; | 77 | static struct cpufreq_driver acpi_cpufreq_driver; |
79 | 78 | ||
@@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, | |||
324 | } | 323 | } |
325 | } | 324 | } |
326 | 325 | ||
327 | trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); | ||
328 | |||
329 | switch (data->cpu_feature) { | 326 | switch (data->cpu_feature) { |
330 | case SYSTEM_INTEL_MSR_CAPABLE: | 327 | case SYSTEM_INTEL_MSR_CAPABLE: |
331 | cmd.type = SYSTEM_INTEL_MSR_CAPABLE; | 328 | cmd.type = SYSTEM_INTEL_MSR_CAPABLE; |
@@ -351,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, | |||
351 | 348 | ||
352 | freqs.old = perf->states[perf->state].core_frequency * 1000; | 349 | freqs.old = perf->states[perf->state].core_frequency * 1000; |
353 | freqs.new = data->freq_table[next_state].frequency; | 350 | freqs.new = data->freq_table[next_state].frequency; |
354 | for_each_cpu(i, cmd.mask) { | 351 | for_each_cpu(i, policy->cpus) { |
355 | freqs.cpu = i; | 352 | freqs.cpu = i; |
356 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); | 353 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); |
357 | } | 354 | } |
@@ -367,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, | |||
367 | } | 364 | } |
368 | } | 365 | } |
369 | 366 | ||
370 | for_each_cpu(i, cmd.mask) { | 367 | for_each_cpu(i, policy->cpus) { |
371 | freqs.cpu = i; | 368 | freqs.cpu = i; |
372 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 369 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
373 | } | 370 | } |
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 16e3483be9e3..32974cf84232 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | |||
@@ -169,12 +169,9 @@ static int gx_freq_mult[16] = { | |||
169 | * Low Level chipset interface * | 169 | * Low Level chipset interface * |
170 | ****************************************************************/ | 170 | ****************************************************************/ |
171 | static struct pci_device_id gx_chipset_tbl[] __initdata = { | 171 | static struct pci_device_id gx_chipset_tbl[] __initdata = { |
172 | { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, | 172 | { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, |
173 | PCI_ANY_ID, PCI_ANY_ID }, | 173 | { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, |
174 | { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, | 174 | { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, |
175 | PCI_ANY_ID, PCI_ANY_ID }, | ||
176 | { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, | ||
177 | PCI_ANY_ID, PCI_ANY_ID }, | ||
178 | { 0, }, | 175 | { 0, }, |
179 | }; | 176 | }; |
180 | 177 | ||
@@ -199,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void) | |||
199 | } | 196 | } |
200 | 197 | ||
201 | /* detect which companion chip is used */ | 198 | /* detect which companion chip is used */ |
202 | while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { | 199 | for_each_pci_dev(gx_pci) { |
203 | if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) | 200 | if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) |
204 | return gx_pci; | 201 | return gx_pci; |
205 | } | 202 | } |
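The gx-suspmod changes are purely notational: PCI_VDEVICE() expands to the same vendor/device/PCI_ANY_ID tuple that was previously written out by hand, and for_each_pci_dev() wraps the pci_get_device() walk. Roughly, as recalled from include/linux/pci.h of this era (treat as approximate):

#define PCI_VDEVICE(vendor, device) \
	PCI_VENDOR_ID_##vendor, (device), PCI_ANY_ID, PCI_ANY_ID, 0, 0

#define for_each_pci_dev(d) \
	while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL)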
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 7e7eea4f8261..03162dac6271 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c | |||
@@ -426,7 +426,7 @@ static int guess_fsb(int mult) | |||
426 | } | 426 | } |
427 | 427 | ||
428 | 428 | ||
429 | static int __init longhaul_get_ranges(void) | 429 | static int __cpuinit longhaul_get_ranges(void) |
430 | { | 430 | { |
431 | unsigned int i, j, k = 0; | 431 | unsigned int i, j, k = 0; |
432 | unsigned int ratio; | 432 | unsigned int ratio; |
@@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void) | |||
530 | } | 530 | } |
531 | 531 | ||
532 | 532 | ||
533 | static void __init longhaul_setup_voltagescaling(void) | 533 | static void __cpuinit longhaul_setup_voltagescaling(void) |
534 | { | 534 | { |
535 | union msr_longhaul longhaul; | 535 | union msr_longhaul longhaul; |
536 | struct mV_pos minvid, maxvid, vid; | 536 | struct mV_pos minvid, maxvid, vid; |
@@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void) | |||
784 | return 0; | 784 | return 0; |
785 | } | 785 | } |
786 | 786 | ||
787 | static int __init longhaul_cpu_init(struct cpufreq_policy *policy) | 787 | static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) |
788 | { | 788 | { |
789 | struct cpuinfo_x86 *c = &cpu_data(0); | 789 | struct cpuinfo_x86 *c = &cpu_data(0); |
790 | char *cpuname = NULL; | 790 | char *cpuname = NULL; |
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h index e2360a469f79..cbf48fbca881 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h | |||
@@ -56,7 +56,7 @@ union msr_longhaul { | |||
56 | /* | 56 | /* |
57 | * VIA C3 Samuel 1 & Samuel 2 (stepping 0) | 57 | * VIA C3 Samuel 1 & Samuel 2 (stepping 0) |
58 | */ | 58 | */ |
59 | static const int __initdata samuel1_mults[16] = { | 59 | static const int __cpuinitdata samuel1_mults[16] = { |
60 | -1, /* 0000 -> RESERVED */ | 60 | -1, /* 0000 -> RESERVED */ |
61 | 30, /* 0001 -> 3.0x */ | 61 | 30, /* 0001 -> 3.0x */ |
62 | 40, /* 0010 -> 4.0x */ | 62 | 40, /* 0010 -> 4.0x */ |
@@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = { | |||
75 | -1, /* 1111 -> RESERVED */ | 75 | -1, /* 1111 -> RESERVED */ |
76 | }; | 76 | }; |
77 | 77 | ||
78 | static const int __initdata samuel1_eblcr[16] = { | 78 | static const int __cpuinitdata samuel1_eblcr[16] = { |
79 | 50, /* 0000 -> RESERVED */ | 79 | 50, /* 0000 -> RESERVED */ |
80 | 30, /* 0001 -> 3.0x */ | 80 | 30, /* 0001 -> 3.0x */ |
81 | 40, /* 0010 -> 4.0x */ | 81 | 40, /* 0010 -> 4.0x */ |
@@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = { | |||
97 | /* | 97 | /* |
98 | * VIA C3 Samuel2 Stepping 1->15 | 98 | * VIA C3 Samuel2 Stepping 1->15 |
99 | */ | 99 | */ |
100 | static const int __initdata samuel2_eblcr[16] = { | 100 | static const int __cpuinitdata samuel2_eblcr[16] = { |
101 | 50, /* 0000 -> 5.0x */ | 101 | 50, /* 0000 -> 5.0x */ |
102 | 30, /* 0001 -> 3.0x */ | 102 | 30, /* 0001 -> 3.0x */ |
103 | 40, /* 0010 -> 4.0x */ | 103 | 40, /* 0010 -> 4.0x */ |
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = { | |||
119 | /* | 119 | /* |
120 | * VIA C3 Ezra | 120 | * VIA C3 Ezra |
121 | */ | 121 | */ |
122 | static const int __initdata ezra_mults[16] = { | 122 | static const int __cpuinitdata ezra_mults[16] = { |
123 | 100, /* 0000 -> 10.0x */ | 123 | 100, /* 0000 -> 10.0x */ |
124 | 30, /* 0001 -> 3.0x */ | 124 | 30, /* 0001 -> 3.0x */ |
125 | 40, /* 0010 -> 4.0x */ | 125 | 40, /* 0010 -> 4.0x */ |
@@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = { | |||
138 | 120, /* 1111 -> 12.0x */ | 138 | 120, /* 1111 -> 12.0x */ |
139 | }; | 139 | }; |
140 | 140 | ||
141 | static const int __initdata ezra_eblcr[16] = { | 141 | static const int __cpuinitdata ezra_eblcr[16] = { |
142 | 50, /* 0000 -> 5.0x */ | 142 | 50, /* 0000 -> 5.0x */ |
143 | 30, /* 0001 -> 3.0x */ | 143 | 30, /* 0001 -> 3.0x */ |
144 | 40, /* 0010 -> 4.0x */ | 144 | 40, /* 0010 -> 4.0x */ |
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = { | |||
160 | /* | 160 | /* |
161 | * VIA C3 (Ezra-T) [C5M]. | 161 | * VIA C3 (Ezra-T) [C5M]. |
162 | */ | 162 | */ |
163 | static const int __initdata ezrat_mults[32] = { | 163 | static const int __cpuinitdata ezrat_mults[32] = { |
164 | 100, /* 0000 -> 10.0x */ | 164 | 100, /* 0000 -> 10.0x */ |
165 | 30, /* 0001 -> 3.0x */ | 165 | 30, /* 0001 -> 3.0x */ |
166 | 40, /* 0010 -> 4.0x */ | 166 | 40, /* 0010 -> 4.0x */ |
@@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = { | |||
196 | -1, /* 1111 -> RESERVED (12.0x) */ | 196 | -1, /* 1111 -> RESERVED (12.0x) */ |
197 | }; | 197 | }; |
198 | 198 | ||
199 | static const int __initdata ezrat_eblcr[32] = { | 199 | static const int __cpuinitdata ezrat_eblcr[32] = { |
200 | 50, /* 0000 -> 5.0x */ | 200 | 50, /* 0000 -> 5.0x */ |
201 | 30, /* 0001 -> 3.0x */ | 201 | 30, /* 0001 -> 3.0x */ |
202 | 40, /* 0010 -> 4.0x */ | 202 | 40, /* 0010 -> 4.0x */ |
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = { | |||
235 | /* | 235 | /* |
236 | * VIA C3 Nehemiah */ | 236 | * VIA C3 Nehemiah */ |
237 | 237 | ||
238 | static const int __initdata nehemiah_mults[32] = { | 238 | static const int __cpuinitdata nehemiah_mults[32] = { |
239 | 100, /* 0000 -> 10.0x */ | 239 | 100, /* 0000 -> 10.0x */ |
240 | -1, /* 0001 -> 16.0x */ | 240 | -1, /* 0001 -> 16.0x */ |
241 | 40, /* 0010 -> 4.0x */ | 241 | 40, /* 0010 -> 4.0x */ |
@@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = { | |||
270 | -1, /* 1111 -> 12.0x */ | 270 | -1, /* 1111 -> 12.0x */ |
271 | }; | 271 | }; |
272 | 272 | ||
273 | static const int __initdata nehemiah_eblcr[32] = { | 273 | static const int __cpuinitdata nehemiah_eblcr[32] = { |
274 | 50, /* 0000 -> 5.0x */ | 274 | 50, /* 0000 -> 5.0x */ |
275 | 160, /* 0001 -> 16.0x */ | 275 | 160, /* 0001 -> 16.0x */ |
276 | 40, /* 0010 -> 4.0x */ | 276 | 40, /* 0010 -> 4.0x */ |
@@ -315,7 +315,7 @@ struct mV_pos { | |||
315 | unsigned short pos; | 315 | unsigned short pos; |
316 | }; | 316 | }; |
317 | 317 | ||
318 | static const struct mV_pos __initdata vrm85_mV[32] = { | 318 | static const struct mV_pos __cpuinitdata vrm85_mV[32] = { |
319 | {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, | 319 | {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, |
320 | {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, | 320 | {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, |
321 | {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, | 321 | {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, |
@@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = { | |||
326 | {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} | 326 | {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} |
327 | }; | 327 | }; |
328 | 328 | ||
329 | static const unsigned char __initdata mV_vrm85[32] = { | 329 | static const unsigned char __cpuinitdata mV_vrm85[32] = { |
330 | 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, | 330 | 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, |
331 | 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, | 331 | 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, |
332 | 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, | 332 | 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, |
333 | 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 | 333 | 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 |
334 | }; | 334 | }; |
335 | 335 | ||
336 | static const struct mV_pos __initdata mobilevrm_mV[32] = { | 336 | static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { |
337 | {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, | 337 | {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, |
338 | {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, | 338 | {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, |
339 | {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, | 339 | {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, |
@@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = { | |||
344 | {675, 3}, {650, 2}, {625, 1}, {600, 0} | 344 | {675, 3}, {650, 2}, {625, 1}, {600, 0} |
345 | }; | 345 | }; |
346 | 346 | ||
347 | static const unsigned char __initdata mV_mobilevrm[32] = { | 347 | static const unsigned char __cpuinitdata mV_mobilevrm[32] = { |
348 | 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, | 348 | 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, |
349 | 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, | 349 | 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, |
350 | 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, | 350 | 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, |
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index e7b559d74c52..fc09f142d94d 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c | |||
@@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu) | |||
165 | * TMTA rules: | 165 | * TMTA rules: |
166 | * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) | 166 | * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) |
167 | */ | 167 | */ |
168 | static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, | 168 | static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, |
169 | unsigned int *high_freq) | 169 | unsigned int *high_freq) |
170 | { | 170 | { |
171 | u32 msr_lo, msr_hi; | 171 | u32 msr_lo, msr_hi; |
172 | u32 save_lo, save_hi; | 172 | u32 save_lo, save_hi; |
@@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, | |||
258 | } | 258 | } |
259 | 259 | ||
260 | 260 | ||
261 | static int __init longrun_cpu_init(struct cpufreq_policy *policy) | 261 | static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) |
262 | { | 262 | { |
263 | int result = 0; | 263 | int result = 0; |
264 | 264 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 7b8a8ba67b07..bd1cac747f67 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | |||
@@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) | |||
178 | } | 178 | } |
179 | } | 179 | } |
180 | 180 | ||
181 | if (c->x86 != 0xF) { | 181 | if (c->x86 != 0xF) |
182 | if (!cpu_has(c, X86_FEATURE_EST)) | ||
183 | printk(KERN_WARNING PFX "Unknown CPU. " | ||
184 | "Please send an e-mail to " | ||
185 | "<cpufreq@vger.kernel.org>\n"); | ||
186 | return 0; | 182 | return 0; |
187 | } | ||
188 | 183 | ||
189 | /* on P-4s, the TSC runs with constant frequency independent whether | 184 | /* on P-4s, the TSC runs with constant frequency independent whether |
190 | * throttling is active or not. */ | 185 | * throttling is active or not. */ |
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index a36de5bbb622..994230d4dc4e 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | |||
@@ -110,7 +110,7 @@ struct pcc_cpu { | |||
110 | u32 output_offset; | 110 | u32 output_offset; |
111 | }; | 111 | }; |
112 | 112 | ||
113 | static struct pcc_cpu *pcc_cpu_info; | 113 | static struct pcc_cpu __percpu *pcc_cpu_info; |
114 | 114 | ||
115 | static int pcc_cpufreq_verify(struct cpufreq_policy *policy) | 115 | static int pcc_cpufreq_verify(struct cpufreq_policy *policy) |
116 | { | 116 | { |
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 9a97116f89e5..4a45fd6e41ba 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c | |||
@@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy) | |||
569 | * We will then get the same kind of behaviour already tested under | 569 | * We will then get the same kind of behaviour already tested under |
570 | * the "well-known" other OS. | 570 | * the "well-known" other OS. |
571 | */ | 571 | */ |
572 | static int __init fixup_sgtc(void) | 572 | static int __cpuinit fixup_sgtc(void) |
573 | { | 573 | { |
574 | unsigned int sgtc; | 574 | unsigned int sgtc; |
575 | unsigned int m; | 575 | unsigned int m; |
@@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu) | |||
603 | } | 603 | } |
604 | 604 | ||
605 | 605 | ||
606 | static int __init acer_cpufreq_pst(const struct dmi_system_id *d) | 606 | static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) |
607 | { | 607 | { |
608 | printk(KERN_WARNING PFX | 608 | printk(KERN_WARNING PFX |
609 | "%s laptop with broken PST tables in BIOS detected.\n", | 609 | "%s laptop with broken PST tables in BIOS detected.\n", |
@@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d) | |||
621 | * A BIOS update is all that can save them. | 621 | * A BIOS update is all that can save them. |
622 | * Mention this, and disable cpufreq. | 622 | * Mention this, and disable cpufreq. |
623 | */ | 623 | */ |
624 | static struct dmi_system_id __initdata powernow_dmi_table[] = { | 624 | static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { |
625 | { | 625 | { |
626 | .callback = acer_cpufreq_pst, | 626 | .callback = acer_cpufreq_pst, |
627 | .ident = "Acer Aspire", | 627 | .ident = "Acer Aspire", |
@@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = { | |||
633 | { } | 633 | { } |
634 | }; | 634 | }; |
635 | 635 | ||
636 | static int __init powernow_cpu_init(struct cpufreq_policy *policy) | 636 | static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) |
637 | { | 637 | { |
638 | union msr_fidvidstatus fidvidstatus; | 638 | union msr_fidvidstatus fidvidstatus; |
639 | int result; | 639 | int result; |
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 3e90cce3dc8b..491977baf6c0 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * Based on the powernow-k7.c module written by Dave Jones. | 9 | * Based on the powernow-k7.c module written by Dave Jones. |
10 | * (C) 2003 Dave Jones on behalf of SuSE Labs | 10 | * (C) 2003 Dave Jones on behalf of SuSE Labs |
11 | * (C) 2004 Dominik Brodowski <linux@brodo.de> | 11 | * (C) 2004 Dominik Brodowski <linux@brodo.de> |
12 | * (C) 2004 Pavel Machek <pavel@suse.cz> | 12 | * (C) 2004 Pavel Machek <pavel@ucw.cz> |
13 | * Licensed under the terms of the GNU GPL License version 2. | 13 | * Licensed under the terms of the GNU GPL License version 2. |
14 | * Based upon datasheets & sample CPUs kindly provided by AMD. | 14 | * Based upon datasheets & sample CPUs kindly provided by AMD. |
15 | * | 15 | * |
@@ -806,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data) | |||
806 | * www.amd.com | 806 | * www.amd.com |
807 | */ | 807 | */ |
808 | printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); | 808 | printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); |
809 | printk(KERN_ERR PFX "Make sure that your BIOS is up to date" | ||
810 | " and Cool'N'Quiet support is enabled in BIOS setup\n"); | ||
809 | return -ENODEV; | 811 | return -ENODEV; |
810 | } | 812 | } |
811 | 813 | ||
@@ -910,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, | |||
910 | { | 912 | { |
911 | int i; | 913 | int i; |
912 | u32 hi = 0, lo = 0; | 914 | u32 hi = 0, lo = 0; |
913 | rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); | 915 | rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); |
914 | data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; | 916 | data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; |
915 | 917 | ||
916 | for (i = 0; i < data->acpi_data.state_count; i++) { | 918 | for (i = 0; i < data->acpi_data.state_count; i++) { |
917 | u32 index; | 919 | u32 index; |
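The fill_powernow_table_pstate() hunk above is a real bug fix rather than churn: the kernel's rdmsr() wrapper takes the low (EAX) half before the high (EDX) half, so the old call put the MSR's low 32 bits into 'hi' and then masked the wrong variable. Sketch of the convention:

u32 lo, hi;

/* rdmsr(msr, low, high): low <- EAX (bits 31..0), high <- EDX (bits 63..32) */
rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);

/* The max-pstate field lives in the low half, so the fix masks 'lo'. */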
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index dd531cc56a8f..8095f8611f8a 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c | |||
@@ -34,6 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = | |||
34 | { | 34 | { |
35 | &x86_hyper_vmware, | 35 | &x86_hyper_vmware, |
36 | &x86_hyper_ms_hyperv, | 36 | &x86_hyper_ms_hyperv, |
37 | #ifdef CONFIG_XEN_PVHVM | ||
38 | &x86_hyper_xen_hvm, | ||
39 | #endif | ||
37 | }; | 40 | }; |
38 | 41 | ||
39 | const struct hypervisor_x86 *x86_hyper; | 42 | const struct hypervisor_x86 *x86_hyper; |
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 33eae2062cf5..898c2f4eab88 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c | |||
@@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) | |||
347 | return l3; | 347 | return l3; |
348 | } | 348 | } |
349 | 349 | ||
350 | static void __cpuinit | 350 | static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, |
351 | amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) | 351 | int index) |
352 | { | 352 | { |
353 | int node; | 353 | int node; |
354 | 354 | ||
@@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) | |||
396 | this_leaf->l3 = l3_caches[node]; | 396 | this_leaf->l3 = l3_caches[node]; |
397 | } | 397 | } |
398 | 398 | ||
399 | /* | ||
400 | * check whether a slot used for disabling an L3 index is occupied. | ||
401 | * @l3: L3 cache descriptor | ||
402 | * @slot: slot number (0..1) | ||
403 | * | ||
404 | * @returns: the disabled index if used or negative value if slot free. | ||
405 | */ | ||
406 | int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) | ||
407 | { | ||
408 | unsigned int reg = 0; | ||
409 | |||
410 | pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg); | ||
411 | |||
412 | /* check whether this slot is activated already */ | ||
413 | if (reg & (3UL << 30)) | ||
414 | return reg & 0xfff; | ||
415 | |||
416 | return -1; | ||
417 | } | ||
418 | |||
399 | static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, | 419 | static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, |
400 | unsigned int slot) | 420 | unsigned int slot) |
401 | { | 421 | { |
402 | struct pci_dev *dev = this_leaf->l3->dev; | 422 | int index; |
403 | unsigned int reg = 0; | ||
404 | 423 | ||
405 | if (!this_leaf->l3 || !this_leaf->l3->can_disable) | 424 | if (!this_leaf->l3 || !this_leaf->l3->can_disable) |
406 | return -EINVAL; | 425 | return -EINVAL; |
407 | 426 | ||
408 | if (!dev) | 427 | index = amd_get_l3_disable_slot(this_leaf->l3, slot); |
409 | return -EINVAL; | 428 | if (index >= 0) |
429 | return sprintf(buf, "%d\n", index); | ||
410 | 430 | ||
411 | pci_read_config_dword(dev, 0x1BC + slot * 4, &reg); | 431 | return sprintf(buf, "FREE\n"); |
412 | return sprintf(buf, "0x%08x\n", reg); | ||
413 | } | 432 | } |
414 | 433 | ||
415 | #define SHOW_CACHE_DISABLE(slot) \ | 434 | #define SHOW_CACHE_DISABLE(slot) \ |
@@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, | |||
451 | } | 470 | } |
452 | } | 471 | } |
453 | 472 | ||
454 | 473 | /* | |
455 | static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, | 474 | * disable a L3 cache index by using a disable-slot |
456 | const char *buf, size_t count, | 475 | * |
457 | unsigned int slot) | 476 | * @l3: L3 cache descriptor |
477 | * @cpu: A CPU on the node containing the L3 cache | ||
478 | * @slot: slot number (0..1) | ||
479 | * @index: index to disable | ||
480 | * | ||
481 | * @return: 0 on success, error status on failure | ||
482 | */ | ||
483 | int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, | ||
484 | unsigned long index) | ||
458 | { | 485 | { |
459 | struct pci_dev *dev = this_leaf->l3->dev; | 486 | int ret = 0; |
460 | int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); | ||
461 | unsigned long val = 0; | ||
462 | 487 | ||
463 | #define SUBCACHE_MASK (3UL << 20) | 488 | #define SUBCACHE_MASK (3UL << 20) |
464 | #define SUBCACHE_INDEX 0xfff | 489 | #define SUBCACHE_INDEX 0xfff |
465 | 490 | ||
466 | if (!this_leaf->l3 || !this_leaf->l3->can_disable) | 491 | /* |
492 | * check whether this slot is already used or | ||
493 | * the index is already disabled | ||
494 | */ | ||
495 | ret = amd_get_l3_disable_slot(l3, slot); | ||
496 | if (ret >= 0) | ||
467 | return -EINVAL; | 497 | return -EINVAL; |
468 | 498 | ||
499 | /* | ||
500 | * check whether the other slot has disabled the | ||
501 | * same index already | ||
502 | */ | ||
503 | if (index == amd_get_l3_disable_slot(l3, !slot)) | ||
504 | return -EINVAL; | ||
505 | |||
506 | /* do not allow writes outside of allowed bits */ | ||
507 | if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || | ||
508 | ((index & SUBCACHE_INDEX) > l3->indices)) | ||
509 | return -EINVAL; | ||
510 | |||
511 | amd_l3_disable_index(l3, cpu, slot, index); | ||
512 | |||
513 | return 0; | ||
514 | } | ||
515 | |||
516 | static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, | ||
517 | const char *buf, size_t count, | ||
518 | unsigned int slot) | ||
519 | { | ||
520 | unsigned long val = 0; | ||
521 | int cpu, err = 0; | ||
522 | |||
469 | if (!capable(CAP_SYS_ADMIN)) | 523 | if (!capable(CAP_SYS_ADMIN)) |
470 | return -EPERM; | 524 | return -EPERM; |
471 | 525 | ||
472 | if (!dev) | 526 | if (!this_leaf->l3 || !this_leaf->l3->can_disable) |
473 | return -EINVAL; | 527 | return -EINVAL; |
474 | 528 | ||
475 | if (strict_strtoul(buf, 10, &val) < 0) | 529 | cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); |
476 | return -EINVAL; | ||
477 | 530 | ||
478 | /* do not allow writes outside of allowed bits */ | 531 | if (strict_strtoul(buf, 10, &val) < 0) |
479 | if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || | ||
480 | ((val & SUBCACHE_INDEX) > this_leaf->l3->indices)) | ||
481 | return -EINVAL; | 532 | return -EINVAL; |
482 | 533 | ||
483 | amd_l3_disable_index(this_leaf->l3, cpu, slot, val); | 534 | err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); |
484 | 535 | if (err) { | |
536 | if (err == -EEXIST) | ||
537 | printk(KERN_WARNING "L3 disable slot %d in use!\n", | ||
538 | slot); | ||
539 | return err; | ||
540 | } | ||
485 | return count; | 541 | return count; |
486 | } | 542 | } |
487 | 543 | ||
@@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, | |||
502 | 558 | ||
503 | #else /* CONFIG_CPU_SUP_AMD */ | 559 | #else /* CONFIG_CPU_SUP_AMD */ |
504 | static void __cpuinit | 560 | static void __cpuinit |
505 | amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) | 561 | amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) |
506 | { | 562 | { |
507 | }; | 563 | }; |
508 | #endif /* CONFIG_CPU_SUP_AMD */ | 564 | #endif /* CONFIG_CPU_SUP_AMD */ |
@@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, | |||
518 | 574 | ||
519 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { | 575 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { |
520 | amd_cpuid4(index, &eax, &ebx, &ecx); | 576 | amd_cpuid4(index, &eax, &ebx, &ecx); |
521 | amd_check_l3_disable(index, this_leaf); | 577 | amd_check_l3_disable(this_leaf, index); |
522 | } else { | 578 | } else { |
523 | cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); | 579 | cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); |
524 | } | 580 | } |
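The intel_cacheinfo.c refactoring above splits the sysfs store path into amd_get_l3_disable_slot()/amd_set_l3_disable_slot() so the same slot checks can be reused outside the sysfs handlers. A hedged sketch of an in-kernel caller of the new helpers (the caller itself is hypothetical; the helper semantics follow the code above, where a negative return from the getter means the slot is free):

/* Hypothetical caller: disable a known-bad L3 index in the first free slot. */
static int example_disable_l3_index(struct amd_l3_cache *l3, int cpu,
				    unsigned long bad_index)
{
	if (amd_get_l3_disable_slot(l3, 0) < 0)
		return amd_set_l3_disable_slot(l3, cpu, 0, bad_index);
	if (amd_get_l3_disable_slot(l3, 1) < 0)
		return amd_set_l3_disable_slot(l3, cpu, 1, bad_index);
	return -EEXIST;		/* both disable slots already occupied */
}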
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc42562250..ed41562909fe 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -51,7 +51,7 @@ | |||
51 | static DEFINE_MUTEX(mce_read_mutex); | 51 | static DEFINE_MUTEX(mce_read_mutex); |
52 | 52 | ||
53 | #define rcu_dereference_check_mce(p) \ | 53 | #define rcu_dereference_check_mce(p) \ |
54 | rcu_dereference_check((p), \ | 54 | rcu_dereference_index_check((p), \ |
55 | rcu_read_lock_sched_held() || \ | 55 | rcu_read_lock_sched_held() || \ |
56 | lockdep_is_held(&mce_read_mutex)) | 56 | lockdep_is_held(&mce_read_mutex)) |
57 | 57 | ||
@@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); | |||
107 | static int default_decode_mce(struct notifier_block *nb, unsigned long val, | 107 | static int default_decode_mce(struct notifier_block *nb, unsigned long val, |
108 | void *data) | 108 | void *data) |
109 | { | 109 | { |
110 | pr_emerg("No human readable MCE decoding support on this CPU type.\n"); | 110 | pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); |
111 | pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); | 111 | pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); |
112 | 112 | ||
113 | return NOTIFY_STOP; | 113 | return NOTIFY_STOP; |
114 | } | 114 | } |
@@ -211,11 +211,11 @@ void mce_log(struct mce *mce) | |||
211 | 211 | ||
212 | static void print_mce(struct mce *m) | 212 | static void print_mce(struct mce *m) |
213 | { | 213 | { |
214 | pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", | 214 | pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", |
215 | m->extcpu, m->mcgstatus, m->bank, m->status); | 215 | m->extcpu, m->mcgstatus, m->bank, m->status); |
216 | 216 | ||
217 | if (m->ip) { | 217 | if (m->ip) { |
218 | pr_emerg("RIP%s %02x:<%016Lx> ", | 218 | pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", |
219 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | 219 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", |
220 | m->cs, m->ip); | 220 | m->cs, m->ip); |
221 | 221 | ||
@@ -224,14 +224,14 @@ static void print_mce(struct mce *m) | |||
224 | pr_cont("\n"); | 224 | pr_cont("\n"); |
225 | } | 225 | } |
226 | 226 | ||
227 | pr_emerg("TSC %llx ", m->tsc); | 227 | pr_emerg(HW_ERR "TSC %llx ", m->tsc); |
228 | if (m->addr) | 228 | if (m->addr) |
229 | pr_cont("ADDR %llx ", m->addr); | 229 | pr_cont("ADDR %llx ", m->addr); |
230 | if (m->misc) | 230 | if (m->misc) |
231 | pr_cont("MISC %llx ", m->misc); | 231 | pr_cont("MISC %llx ", m->misc); |
232 | 232 | ||
233 | pr_cont("\n"); | 233 | pr_cont("\n"); |
234 | pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", | 234 | pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", |
235 | m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); | 235 | m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); |
236 | 236 | ||
237 | /* | 237 | /* |
@@ -241,16 +241,6 @@ static void print_mce(struct mce *m) | |||
241 | atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); | 241 | atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); |
242 | } | 242 | } |
243 | 243 | ||
244 | static void print_mce_head(void) | ||
245 | { | ||
246 | pr_emerg("\nHARDWARE ERROR\n"); | ||
247 | } | ||
248 | |||
249 | static void print_mce_tail(void) | ||
250 | { | ||
251 | pr_emerg("This is not a software problem!\n"); | ||
252 | } | ||
253 | |||
254 | #define PANIC_TIMEOUT 5 /* 5 seconds */ | 244 | #define PANIC_TIMEOUT 5 /* 5 seconds */ |
255 | 245 | ||
256 | static atomic_t mce_paniced; | 246 | static atomic_t mce_paniced; |
@@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp) | |||
291 | if (atomic_inc_return(&mce_fake_paniced) > 1) | 281 | if (atomic_inc_return(&mce_fake_paniced) > 1) |
292 | return; | 282 | return; |
293 | } | 283 | } |
294 | print_mce_head(); | ||
295 | /* First print corrected ones that are still unlogged */ | 284 | /* First print corrected ones that are still unlogged */ |
296 | for (i = 0; i < MCE_LOG_LEN; i++) { | 285 | for (i = 0; i < MCE_LOG_LEN; i++) { |
297 | struct mce *m = &mcelog.entry[i]; | 286 | struct mce *m = &mcelog.entry[i]; |
@@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp) | |||
322 | apei_err = apei_write_mce(final); | 311 | apei_err = apei_write_mce(final); |
323 | } | 312 | } |
324 | if (cpu_missing) | 313 | if (cpu_missing) |
325 | printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); | 314 | pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); |
326 | print_mce_tail(); | ||
327 | if (exp) | 315 | if (exp) |
328 | printk(KERN_EMERG "Machine check: %s\n", exp); | 316 | pr_emerg(HW_ERR "Machine check: %s\n", exp); |
329 | if (!fake_panic) { | 317 | if (!fake_panic) { |
330 | if (panic_timeout == 0) | 318 | if (panic_timeout == 0) |
331 | panic_timeout = mce_panic_timeout; | 319 | panic_timeout = mce_panic_timeout; |
332 | panic(msg); | 320 | panic(msg); |
333 | } else | 321 | } else |
334 | printk(KERN_EMERG "Fake kernel panic: %s\n", msg); | 322 | pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); |
335 | } | 323 | } |
336 | 324 | ||
337 | /* Support code for software error injection */ | 325 | /* Support code for software error injection */ |
@@ -600,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
600 | */ | 588 | */ |
601 | if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { | 589 | if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { |
602 | mce_log(&m); | 590 | mce_log(&m); |
591 | atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); | ||
603 | add_taint(TAINT_MACHINE_CHECK); | 592 | add_taint(TAINT_MACHINE_CHECK); |
604 | } | 593 | } |
605 | 594 | ||
@@ -1220,7 +1209,7 @@ int mce_notify_irq(void) | |||
1220 | schedule_work(&mce_trigger_work); | 1209 | schedule_work(&mce_trigger_work); |
1221 | 1210 | ||
1222 | if (__ratelimit(&ratelimit)) | 1211 | if (__ratelimit(&ratelimit)) |
1223 | printk(KERN_INFO "Machine check events logged\n"); | 1212 | pr_info(HW_ERR "Machine check events logged\n"); |
1224 | 1213 | ||
1225 | return 1; | 1214 | return 1; |
1226 | } | 1215 | } |
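The mce.c changes mostly replace the old print_mce_head()/print_mce_tail() banners with the HW_ERR prefix on every line, so the error lines stay identifiable even when interleaved with other console output. HW_ERR is, if memory serves, a plain string prefix from linux/kernel.h, so the calls reduce to ordinary printks by string concatenation:

/* Assumed: HW_ERR is "[Hardware Error]: " (see linux/kernel.h). */
pr_emerg(HW_ERR "Machine check: %s\n", exp);
/* ... which is roughly equivalent to: */
printk(KERN_EMERG "[Hardware Error]: Machine check: %s\n", exp);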
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 62b48e40920a..6fcd0936194f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c | |||
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot) | |||
95 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); | 95 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
96 | 96 | ||
97 | /* Already owned by someone else? */ | 97 | /* Already owned by someone else? */ |
98 | if (val & CMCI_EN) { | 98 | if (val & MCI_CTL2_CMCI_EN) { |
99 | if (test_and_clear_bit(i, owned) && !boot) | 99 | if (test_and_clear_bit(i, owned) && !boot) |
100 | print_update("SHD", &hdr, i); | 100 | print_update("SHD", &hdr, i); |
101 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | 101 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); |
102 | continue; | 102 | continue; |
103 | } | 103 | } |
104 | 104 | ||
105 | val |= CMCI_EN | CMCI_THRESHOLD; | 105 | val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; |
106 | val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; | ||
106 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); | 107 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); |
107 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); | 108 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
108 | 109 | ||
109 | /* Did the enable bit stick? -- the bank supports CMCI */ | 110 | /* Did the enable bit stick? -- the bank supports CMCI */ |
110 | if (val & CMCI_EN) { | 111 | if (val & MCI_CTL2_CMCI_EN) { |
111 | if (!test_and_set_bit(i, owned) && !boot) | 112 | if (!test_and_set_bit(i, owned) && !boot) |
112 | print_update("CMCI", &hdr, i); | 113 | print_update("CMCI", &hdr, i); |
113 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | 114 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); |
@@ -155,7 +156,7 @@ void cmci_clear(void) | |||
155 | continue; | 156 | continue; |
156 | /* Disable CMCI */ | 157 | /* Disable CMCI */ |
157 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); | 158 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
158 | val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); | 159 | val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); |
159 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); | 160 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); |
160 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); | 161 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); |
161 | } | 162 | } |
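The cmci_discover() hunk above also spells out the ownership handshake: each CPU writes the CMCI enable bit (now MCI_CTL2_CMCI_EN) plus a threshold into MSR_IA32_MCx_CTL2, reads it back, and claims the bank only if the enable bit stuck; if another CPU or package already owns the bank, the bit reads back clear. A condensed sketch of the per-bank probe (locking, bank bookkeeping, and the boot-time messages are omitted):

/* Condensed sketch of the per-bank probe done by cmci_discover(). */
static bool example_claim_cmci_bank(int i)
{
	u64 val;

	rdmsrl(MSR_IA32_MCx_CTL2(i), val);
	if (val & MCI_CTL2_CMCI_EN)
		return false;	/* already owned by someone else */

	val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;	/* reset threshold (new here) */
	val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
	wrmsrl(MSR_IA32_MCx_CTL2(i), val);

	rdmsrl(MSR_IA32_MCx_CTL2(i), val);
	/* If the enable bit stuck, the bank supports CMCI and we own it. */
	return val & MCI_CTL2_CMCI_EN;
}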
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index e1a0a3bf9716..c2a8b26d4fea 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c | |||
@@ -34,15 +34,25 @@ | |||
34 | /* How long to wait between reporting thermal events */ | 34 | /* How long to wait between reporting thermal events */ |
35 | #define CHECK_INTERVAL (300 * HZ) | 35 | #define CHECK_INTERVAL (300 * HZ) |
36 | 36 | ||
37 | #define THERMAL_THROTTLING_EVENT 0 | ||
38 | #define POWER_LIMIT_EVENT 1 | ||
39 | |||
37 | /* | 40 | /* |
38 | * Current thermal throttling state: | 41 | * Current thermal event state: |
39 | */ | 42 | */ |
40 | struct thermal_state { | 43 | struct _thermal_state { |
41 | bool is_throttled; | 44 | bool new_event; |
42 | 45 | int event; | |
43 | u64 next_check; | 46 | u64 next_check; |
44 | unsigned long throttle_count; | 47 | unsigned long count; |
45 | unsigned long last_throttle_count; | 48 | unsigned long last_count; |
49 | }; | ||
50 | |||
51 | struct thermal_state { | ||
52 | struct _thermal_state core_throttle; | ||
53 | struct _thermal_state core_power_limit; | ||
54 | struct _thermal_state package_throttle; | ||
55 | struct _thermal_state package_power_limit; | ||
46 | }; | 56 | }; |
47 | 57 | ||
48 | static DEFINE_PER_CPU(struct thermal_state, thermal_state); | 58 | static DEFINE_PER_CPU(struct thermal_state, thermal_state); |
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly; | |||
53 | 63 | ||
54 | #ifdef CONFIG_SYSFS | 64 | #ifdef CONFIG_SYSFS |
55 | #define define_therm_throt_sysdev_one_ro(_name) \ | 65 | #define define_therm_throt_sysdev_one_ro(_name) \ |
56 | static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) | 66 | static SYSDEV_ATTR(_name, 0444, \ |
67 | therm_throt_sysdev_show_##_name, \ | ||
68 | NULL) \ | ||
57 | 69 | ||
58 | #define define_therm_throt_sysdev_show_func(name) \ | 70 | #define define_therm_throt_sysdev_show_func(event, name) \ |
59 | \ | 71 | \ |
60 | static ssize_t therm_throt_sysdev_show_##name( \ | 72 | static ssize_t therm_throt_sysdev_show_##event##_##name( \ |
61 | struct sys_device *dev, \ | 73 | struct sys_device *dev, \ |
62 | struct sysdev_attribute *attr, \ | 74 | struct sysdev_attribute *attr, \ |
63 | char *buf) \ | 75 | char *buf) \ |
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \ | |||
66 | ssize_t ret; \ | 78 | ssize_t ret; \ |
67 | \ | 79 | \ |
68 | preempt_disable(); /* CPU hotplug */ \ | 80 | preempt_disable(); /* CPU hotplug */ \ |
69 | if (cpu_online(cpu)) \ | 81 | if (cpu_online(cpu)) { \ |
70 | ret = sprintf(buf, "%lu\n", \ | 82 | ret = sprintf(buf, "%lu\n", \ |
71 | per_cpu(thermal_state, cpu).name); \ | 83 | per_cpu(thermal_state, cpu).event.name); \ |
72 | else \ | 84 | } else \ |
73 | ret = 0; \ | 85 | ret = 0; \ |
74 | preempt_enable(); \ | 86 | preempt_enable(); \ |
75 | \ | 87 | \ |
76 | return ret; \ | 88 | return ret; \ |
77 | } | 89 | } |
78 | 90 | ||
79 | define_therm_throt_sysdev_show_func(throttle_count); | 91 | define_therm_throt_sysdev_show_func(core_throttle, count); |
80 | define_therm_throt_sysdev_one_ro(throttle_count); | 92 | define_therm_throt_sysdev_one_ro(core_throttle_count); |
93 | |||
94 | define_therm_throt_sysdev_show_func(core_power_limit, count); | ||
95 | define_therm_throt_sysdev_one_ro(core_power_limit_count); | ||
96 | |||
97 | define_therm_throt_sysdev_show_func(package_throttle, count); | ||
98 | define_therm_throt_sysdev_one_ro(package_throttle_count); | ||
99 | |||
100 | define_therm_throt_sysdev_show_func(package_power_limit, count); | ||
101 | define_therm_throt_sysdev_one_ro(package_power_limit_count); | ||
81 | 102 | ||
82 | static struct attribute *thermal_throttle_attrs[] = { | 103 | static struct attribute *thermal_throttle_attrs[] = { |
83 | &attr_throttle_count.attr, | 104 | &attr_core_throttle_count.attr, |
84 | NULL | 105 | NULL |
85 | }; | 106 | }; |
86 | 107 | ||
87 | static struct attribute_group thermal_throttle_attr_group = { | 108 | static struct attribute_group thermal_attr_group = { |
88 | .attrs = thermal_throttle_attrs, | 109 | .attrs = thermal_throttle_attrs, |
89 | .name = "thermal_throttle" | 110 | .name = "thermal_throttle" |
90 | }; | 111 | }; |
91 | #endif /* CONFIG_SYSFS */ | 112 | #endif /* CONFIG_SYSFS */ |
92 | 113 | ||
114 | #define CORE_LEVEL 0 | ||
115 | #define PACKAGE_LEVEL 1 | ||
116 | |||
93 | /*** | 117 | /*** |
94 | * therm_throt_process - Process thermal throttling event from interrupt | 118 | * therm_throt_process - Process thermal throttling event from interrupt |
95 | * @curr: Whether the condition is current or not (boolean), since the | 119 | * @curr: Whether the condition is current or not (boolean), since the |
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = { | |||
106 | * 1 : Event should be logged further, and a message has been | 130 | * 1 : Event should be logged further, and a message has been |
107 | * printed to the syslog. | 131 | * printed to the syslog. |
108 | */ | 132 | */ |
109 | static int therm_throt_process(bool is_throttled) | 133 | static int therm_throt_process(bool new_event, int event, int level) |
110 | { | 134 | { |
111 | struct thermal_state *state; | 135 | struct _thermal_state *state; |
112 | unsigned int this_cpu; | 136 | unsigned int this_cpu = smp_processor_id(); |
113 | bool was_throttled; | 137 | bool old_event; |
114 | u64 now; | 138 | u64 now; |
139 | struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); | ||
115 | 140 | ||
116 | this_cpu = smp_processor_id(); | ||
117 | now = get_jiffies_64(); | 141 | now = get_jiffies_64(); |
118 | state = &per_cpu(thermal_state, this_cpu); | 142 | if (level == CORE_LEVEL) { |
143 | if (event == THERMAL_THROTTLING_EVENT) | ||
144 | state = &pstate->core_throttle; | ||
145 | else if (event == POWER_LIMIT_EVENT) | ||
146 | state = &pstate->core_power_limit; | ||
147 | else | ||
148 | return 0; | ||
149 | } else if (level == PACKAGE_LEVEL) { | ||
150 | if (event == THERMAL_THROTTLING_EVENT) | ||
151 | state = &pstate->package_throttle; | ||
152 | else if (event == POWER_LIMIT_EVENT) | ||
153 | state = &pstate->package_power_limit; | ||
154 | else | ||
155 | return 0; | ||
156 | } else | ||
157 | return 0; | ||
119 | 158 | ||
120 | was_throttled = state->is_throttled; | 159 | old_event = state->new_event; |
121 | state->is_throttled = is_throttled; | 160 | state->new_event = new_event; |
122 | 161 | ||
123 | if (is_throttled) | 162 | if (new_event) |
124 | state->throttle_count++; | 163 | state->count++; |
125 | 164 | ||
126 | if (time_before64(now, state->next_check) && | 165 | if (time_before64(now, state->next_check) && |
127 | state->throttle_count != state->last_throttle_count) | 166 | state->count != state->last_count) |
128 | return 0; | 167 | return 0; |
129 | 168 | ||
130 | state->next_check = now + CHECK_INTERVAL; | 169 | state->next_check = now + CHECK_INTERVAL; |
131 | state->last_throttle_count = state->throttle_count; | 170 | state->last_count = state->count; |
132 | 171 | ||
133 | /* if we just entered the thermal event */ | 172 | /* if we just entered the thermal event */ |
134 | if (is_throttled) { | 173 | if (new_event) { |
135 | printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); | 174 | if (event == THERMAL_THROTTLING_EVENT) |
175 | printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", | ||
176 | this_cpu, | ||
177 | level == CORE_LEVEL ? "Core" : "Package", | ||
178 | state->count); | ||
179 | else | ||
180 | printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n", | ||
181 | this_cpu, | ||
182 | level == CORE_LEVEL ? "Core" : "Package", | ||
183 | state->count); | ||
136 | 184 | ||
137 | add_taint(TAINT_MACHINE_CHECK); | 185 | add_taint(TAINT_MACHINE_CHECK); |
138 | return 1; | 186 | return 1; |
139 | } | 187 | } |
140 | if (was_throttled) { | 188 | if (old_event) { |
141 | printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); | 189 | if (event == THERMAL_THROTTLING_EVENT) |
190 | printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", | ||
191 | this_cpu, | ||
192 | level == CORE_LEVEL ? "Core" : "Package"); | ||
193 | else | ||
194 | printk(KERN_INFO "CPU%d: %s power limit normal\n", | ||
195 | this_cpu, | ||
196 | level == CORE_LEVEL ? "Core" : "Package"); | ||
142 | return 1; | 197 | return 1; |
143 | } | 198 | } |
144 | 199 | ||
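therm_throt_process() now keys its bookkeeping on an (event, level) pair instead of a single throttle flag. A compact sketch of that fan-out, assuming the same four sub-states as the hunk; the jiffies rate limiting and the per-CPU container are left out, and thermal_substate mirrors struct _thermal_state.

	#include <stdint.h>
	#include <stdbool.h>
	#include <stddef.h>

	enum { THERMAL_THROTTLING_EVENT = 0, POWER_LIMIT_EVENT = 1 };
	enum { CORE_LEVEL = 0, PACKAGE_LEVEL = 1 };

	/* mirrors struct _thermal_state from the hunk */
	struct thermal_substate {
		bool new_event;
		uint64_t next_check;
		unsigned long count, last_count;
	};

	struct thermal_state {
		struct thermal_substate core_throttle, core_power_limit;
		struct thermal_substate package_throttle, package_power_limit;
	};

	/* pick the sub-state that therm_throt_process() should update */
	static struct thermal_substate *select_state(struct thermal_state *p,
						     int event, int level)
	{
		if (level == CORE_LEVEL) {
			if (event == THERMAL_THROTTLING_EVENT)
				return &p->core_throttle;
			if (event == POWER_LIMIT_EVENT)
				return &p->core_power_limit;
		} else if (level == PACKAGE_LEVEL) {
			if (event == THERMAL_THROTTLING_EVENT)
				return &p->package_throttle;
			if (event == POWER_LIMIT_EVENT)
				return &p->package_power_limit;
		}
		return NULL;	/* unknown combination: caller returns 0, as above */
	}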
@@ -149,13 +204,32 @@ static int therm_throt_process(bool is_throttled) | |||
149 | /* Add/Remove thermal_throttle interface for CPU device: */ | 204 | /* Add/Remove thermal_throttle interface for CPU device: */ |
150 | static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) | 205 | static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) |
151 | { | 206 | { |
152 | return sysfs_create_group(&sys_dev->kobj, | 207 | int err; |
153 | &thermal_throttle_attr_group); | 208 | struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); |
209 | |||
210 | err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); | ||
211 | if (err) | ||
212 | return err; | ||
213 | |||
214 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
215 | err = sysfs_add_file_to_group(&sys_dev->kobj, | ||
216 | &attr_core_power_limit_count.attr, | ||
217 | thermal_attr_group.name); | ||
218 | if (cpu_has(c, X86_FEATURE_PTS)) | ||
219 | err = sysfs_add_file_to_group(&sys_dev->kobj, | ||
220 | &attr_package_throttle_count.attr, | ||
221 | thermal_attr_group.name); | ||
222 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
223 | err = sysfs_add_file_to_group(&sys_dev->kobj, | ||
224 | &attr_package_power_limit_count.attr, | ||
225 | thermal_attr_group.name); | ||
226 | |||
227 | return err; | ||
154 | } | 228 | } |
155 | 229 | ||
156 | static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) | 230 | static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) |
157 | { | 231 | { |
158 | sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); | 232 | sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); |
159 | } | 233 | } |
160 | 234 | ||
161 | /* Mutex protecting device creation against CPU hotplug: */ | 235 | /* Mutex protecting device creation against CPU hotplug: */ |
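thermal_throttle_add_dev() above creates the base sysfs group and then adds the optional counters only when the CPU advertises PLN/PTS; note that each later sysfs_add_file_to_group() overwrites err, so an earlier failure can be masked by a later success. A hedged sketch of the same pattern that instead bails out on the first error; add_file_stub() is an assumed helper, not a sysfs API.

	#include <stdbool.h>

	struct attr;					/* opaque stand-in for struct attribute */
	extern int add_file_stub(const struct attr *a);	/* assumed helper */

	static int add_optional_files(bool has_pln, bool has_pts,
				      const struct attr *core_power_limit,
				      const struct attr *package_throttle,
				      const struct attr *package_power_limit)
	{
		int err;

		if (has_pln && (err = add_file_stub(core_power_limit)))
			return err;
		if (has_pts && (err = add_file_stub(package_throttle)))
			return err;
		if (has_pln && (err = add_file_stub(package_power_limit)))
			return err;

		return 0;
	}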
@@ -226,14 +300,50 @@ device_initcall(thermal_throttle_init_device); | |||
226 | 300 | ||
227 | #endif /* CONFIG_SYSFS */ | 301 | #endif /* CONFIG_SYSFS */ |
228 | 302 | ||
303 | /* | ||

304 | * Use the two most significant bits to tell the mce log which thermal | ||
305 | * event type this is. | ||
306 | * This is a temporary solution and may be changed once the mce log | ||
307 | * infrastructure evolves. | ||
308 | */ | ||
309 | #define CORE_THROTTLED (0) | ||
310 | #define CORE_POWER_LIMIT ((__u64)1 << 62) | ||
311 | #define PACKAGE_THROTTLED ((__u64)2 << 62) | ||
312 | #define PACKAGE_POWER_LIMIT ((__u64)3 << 62) | ||
313 | |||
229 | /* Thermal transition interrupt handler */ | 314 | /* Thermal transition interrupt handler */ |
230 | static void intel_thermal_interrupt(void) | 315 | static void intel_thermal_interrupt(void) |
231 | { | 316 | { |
232 | __u64 msr_val; | 317 | __u64 msr_val; |
318 | struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); | ||
233 | 319 | ||
234 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | 320 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); |
235 | if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) | 321 | |
236 | mce_log_therm_throt_event(msr_val); | 322 | if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, |
323 | THERMAL_THROTTLING_EVENT, | ||
324 | CORE_LEVEL) != 0) | ||
325 | mce_log_therm_throt_event(CORE_THROTTLED | msr_val); | ||
326 | |||
327 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
328 | if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, | ||
329 | POWER_LIMIT_EVENT, | ||
330 | CORE_LEVEL) != 0) | ||
331 | mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); | ||
332 | |||
333 | if (cpu_has(c, X86_FEATURE_PTS)) { | ||
334 | rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); | ||
335 | if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, | ||
336 | THERMAL_THROTTLING_EVENT, | ||
337 | PACKAGE_LEVEL) != 0) | ||
338 | mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); | ||
339 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
340 | if (therm_throt_process(msr_val & | ||
341 | PACKAGE_THERM_STATUS_POWER_LIMIT, | ||
342 | POWER_LIMIT_EVENT, | ||
343 | PACKAGE_LEVEL) != 0) | ||
344 | mce_log_therm_throt_event(PACKAGE_POWER_LIMIT | ||
345 | | msr_val); | ||
346 | } | ||
237 | } | 347 | } |
238 | 348 | ||
239 | static void unexpected_thermal_interrupt(void) | 349 | static void unexpected_thermal_interrupt(void) |
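The handler above encodes the event type in the two most significant bits of the value handed to mce_log_therm_throt_event(). A small sketch of that tagging; the tag constants are copied from the hunk, while log_therm_event_stub() is an assumed stand-in for the logging call.

	#include <stdint.h>

	#define CORE_THROTTLED       (0ULL)
	#define CORE_POWER_LIMIT     ((uint64_t)1 << 62)
	#define PACKAGE_THROTTLED    ((uint64_t)2 << 62)
	#define PACKAGE_POWER_LIMIT  ((uint64_t)3 << 62)

	extern void log_therm_event_stub(uint64_t tagged_status);	/* assumed */

	static void report(uint64_t msr_val, int is_package, int is_power_limit)
	{
		uint64_t tag;

		if (is_package)
			tag = is_power_limit ? PACKAGE_POWER_LIMIT : PACKAGE_THROTTLED;
		else
			tag = is_power_limit ? CORE_POWER_LIMIT : CORE_THROTTLED;

		/* a decoder can later recover the type with (val >> 62) & 0x3 */
		log_therm_event_stub(tag | msr_val);
	}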
@@ -335,8 +445,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c) | |||
335 | apic_write(APIC_LVTTHMR, h); | 445 | apic_write(APIC_LVTTHMR, h); |
336 | 446 | ||
337 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | 447 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); |
338 | wrmsr(MSR_IA32_THERM_INTERRUPT, | 448 | if (cpu_has(c, X86_FEATURE_PLN)) |
339 | l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); | 449 | wrmsr(MSR_IA32_THERM_INTERRUPT, |
450 | l | (THERM_INT_LOW_ENABLE | ||
451 | | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); | ||
452 | else | ||
453 | wrmsr(MSR_IA32_THERM_INTERRUPT, | ||
454 | l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); | ||
455 | |||
456 | if (cpu_has(c, X86_FEATURE_PTS)) { | ||
457 | rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); | ||
458 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
459 | wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, | ||
460 | l | (PACKAGE_THERM_INT_LOW_ENABLE | ||
461 | | PACKAGE_THERM_INT_HIGH_ENABLE | ||
462 | | PACKAGE_THERM_INT_PLN_ENABLE), h); | ||
463 | else | ||
464 | wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, | ||
465 | l | (PACKAGE_THERM_INT_LOW_ENABLE | ||
466 | | PACKAGE_THERM_INT_HIGH_ENABLE), h); | ||
467 | } | ||
340 | 468 | ||
341 | smp_thermal_vector = intel_thermal_interrupt; | 469 | smp_thermal_vector = intel_thermal_interrupt; |
342 | 470 | ||
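intel_init_thermal() now ORs THERM_INT_PLN_ENABLE into the interrupt-enable word only when the PLN feature is present. An equivalent way to express that, building the mask first; the THERM_INT_* values here are illustrative placeholders, not the kernel's definitions.

	#include <stdint.h>

	#define THERM_INT_LOW_ENABLE   0x01u	/* placeholder value */
	#define THERM_INT_HIGH_ENABLE  0x02u	/* placeholder value */
	#define THERM_INT_PLN_ENABLE   0x04u	/* placeholder value */

	static uint32_t thermal_int_bits(int has_pln)
	{
		uint32_t bits = THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE;

		if (has_pln)
			bits |= THERM_INT_PLN_ENABLE;
		return bits;
	}

The same shape applies to the package-level MSR, only with the PACKAGE_THERM_INT_* bits and gated on PTS as in the hunk.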
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 16f41bbe46b6..d944bf6c50e9 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <asm/mshyperv.h> | 18 | #include <asm/mshyperv.h> |
19 | 19 | ||
20 | struct ms_hyperv_info ms_hyperv; | 20 | struct ms_hyperv_info ms_hyperv; |
21 | EXPORT_SYMBOL_GPL(ms_hyperv); | ||
21 | 22 | ||
22 | static bool __init ms_hyperv_platform(void) | 23 | static bool __init ms_hyperv_platform(void) |
23 | { | 24 | { |
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 06130b52f012..c5f59d071425 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c | |||
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i) | |||
632 | unsigned long gran_base, chunk_base, lose_base; | 632 | unsigned long gran_base, chunk_base, lose_base; |
633 | char gran_factor, chunk_factor, lose_factor; | 633 | char gran_factor, chunk_factor, lose_factor; |
634 | 634 | ||
635 | gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), | 635 | gran_base = to_size_factor(result[i].gran_sizek, &gran_factor); |
636 | chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), | 636 | chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor); |
637 | lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), | 637 | lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor); |
638 | 638 | ||
639 | pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", | 639 | pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", |
640 | result[i].bad ? "*BAD*" : " ", | 640 | result[i].bad ? "*BAD*" : " ", |
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fd31a441c61c..7d28d7d03885 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c | |||
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, | |||
433 | { | 433 | { |
434 | unsigned int mask_lo, mask_hi, base_lo, base_hi; | 434 | unsigned int mask_lo, mask_hi, base_lo, base_hi; |
435 | unsigned int tmp, hi; | 435 | unsigned int tmp, hi; |
436 | int cpu; | ||
437 | 436 | ||
438 | /* | 437 | /* |
439 | * get_mtrr doesn't need to update mtrr_state, also it could be called | 438 | * get_mtrr doesn't need to update mtrr_state, also it could be called |
440 | * from any cpu, so try to print it out directly. | 439 | * from any cpu, so try to print it out directly. |
441 | */ | 440 | */ |
442 | cpu = get_cpu(); | 441 | get_cpu(); |
443 | 442 | ||
444 | rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); | 443 | rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); |
445 | 444 | ||
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 79556bd9b602..01c0f3ee6cc3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c | |||
@@ -35,6 +35,7 @@ | |||
35 | 35 | ||
36 | #include <linux/types.h> /* FIXME: kvm_para.h needs this */ | 36 | #include <linux/types.h> /* FIXME: kvm_para.h needs this */ |
37 | 37 | ||
38 | #include <linux/stop_machine.h> | ||
38 | #include <linux/kvm_para.h> | 39 | #include <linux/kvm_para.h> |
39 | #include <linux/uaccess.h> | 40 | #include <linux/uaccess.h> |
40 | #include <linux/module.h> | 41 | #include <linux/module.h> |
@@ -143,22 +144,28 @@ struct set_mtrr_data { | |||
143 | mtrr_type smp_type; | 144 | mtrr_type smp_type; |
144 | }; | 145 | }; |
145 | 146 | ||
147 | static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work); | ||
148 | |||
146 | /** | 149 | /** |
147 | * ipi_handler - Synchronisation handler. Executed by "other" CPUs. | 150 | * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. |
148 | * @info: pointer to mtrr configuration data | 151 | * @info: pointer to mtrr configuration data |
149 | * | 152 | * |
150 | * Returns nothing. | 153 | * Returns nothing. |
151 | */ | 154 | */ |
152 | static void ipi_handler(void *info) | 155 | static int mtrr_work_handler(void *info) |
153 | { | 156 | { |
154 | #ifdef CONFIG_SMP | 157 | #ifdef CONFIG_SMP |
155 | struct set_mtrr_data *data = info; | 158 | struct set_mtrr_data *data = info; |
156 | unsigned long flags; | 159 | unsigned long flags; |
157 | 160 | ||
161 | atomic_dec(&data->count); | ||
162 | while (!atomic_read(&data->gate)) | ||
163 | cpu_relax(); | ||
164 | |||
158 | local_irq_save(flags); | 165 | local_irq_save(flags); |
159 | 166 | ||
160 | atomic_dec(&data->count); | 167 | atomic_dec(&data->count); |
161 | while (!atomic_read(&data->gate)) | 168 | while (atomic_read(&data->gate)) |
162 | cpu_relax(); | 169 | cpu_relax(); |
163 | 170 | ||
164 | /* The master has cleared me to execute */ | 171 | /* The master has cleared me to execute */ |
@@ -173,12 +180,13 @@ static void ipi_handler(void *info) | |||
173 | } | 180 | } |
174 | 181 | ||
175 | atomic_dec(&data->count); | 182 | atomic_dec(&data->count); |
176 | while (atomic_read(&data->gate)) | 183 | while (!atomic_read(&data->gate)) |
177 | cpu_relax(); | 184 | cpu_relax(); |
178 | 185 | ||
179 | atomic_dec(&data->count); | 186 | atomic_dec(&data->count); |
180 | local_irq_restore(flags); | 187 | local_irq_restore(flags); |
181 | #endif | 188 | #endif |
189 | return 0; | ||
182 | } | 190 | } |
183 | 191 | ||
184 | static inline int types_compatible(mtrr_type type1, mtrr_type type2) | 192 | static inline int types_compatible(mtrr_type type1, mtrr_type type2) |
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) | |||
198 | * | 206 | * |
199 | * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: | 207 | * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: |
200 | * | 208 | * |
201 | * 1. Send IPI to do the following: | 209 | * 1. Queue work to do the following on all processors: |
202 | * 2. Disable Interrupts | 210 | * 2. Disable Interrupts |
203 | * 3. Wait for all procs to do so | 211 | * 3. Wait for all procs to do so |
204 | * 4. Enter no-fill cache mode | 212 | * 4. Enter no-fill cache mode |
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) | |||
215 | * 15. Enable interrupts. | 223 | * 15. Enable interrupts. |
216 | * | 224 | * |
217 | * What does that mean for us? Well, first we set data.count to the number | 225 | * What does that mean for us? Well, first we set data.count to the number |
218 | * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait | 226 | * of CPUs. As each CPU announces that it started the rendezvous handler by |
219 | * until it hits 0 and proceed. We set the data.gate flag and reset data.count. | 227 | * decrementing the count, we reset data.count and set the data.gate flag |
220 | * Meanwhile, they are waiting for that flag to be set. Once it's set, each | 228 | * allowing all the CPUs to proceed with the work. As each CPU disables |
229 | * interrupts, it'll decrement data.count once. We wait until it hits 0 and | ||
230 | * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they | ||
231 | * are waiting for that flag to be cleared. Once it's cleared, each | ||
221 | * CPU goes through the transition of updating MTRRs. | 232 | * CPU goes through the transition of updating MTRRs. |
222 | * The CPU vendors may each do it differently, | 233 | * The CPU vendors may each do it differently, |
223 | * so we call mtrr_if->set() callback and let them take care of it. | 234 | * so we call mtrr_if->set() callback and let them take care of it. |
224 | * When they're done, they again decrement data->count and wait for data.gate | 235 | * When they're done, they again decrement data->count and wait for data.gate |
225 | * to be reset. | 236 | * to be set. |
226 | * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag | 237 | * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag |
227 | * Everyone then enables interrupts and we all continue on. | 238 | * Everyone then enables interrupts and we all continue on. |
228 | * | 239 | * |
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
234 | { | 245 | { |
235 | struct set_mtrr_data data; | 246 | struct set_mtrr_data data; |
236 | unsigned long flags; | 247 | unsigned long flags; |
248 | int cpu; | ||
249 | |||
250 | preempt_disable(); | ||
237 | 251 | ||
238 | data.smp_reg = reg; | 252 | data.smp_reg = reg; |
239 | data.smp_base = base; | 253 | data.smp_base = base; |
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
246 | atomic_set(&data.gate, 0); | 260 | atomic_set(&data.gate, 0); |
247 | 261 | ||
248 | /* Start the ball rolling on other CPUs */ | 262 | /* Start the ball rolling on other CPUs */ |
249 | if (smp_call_function(ipi_handler, &data, 0) != 0) | 263 | for_each_online_cpu(cpu) { |
250 | panic("mtrr: timed out waiting for other CPUs\n"); | 264 | struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu); |
265 | |||
266 | if (cpu == smp_processor_id()) | ||
267 | continue; | ||
268 | |||
269 | stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work); | ||
270 | } | ||
251 | 271 | ||
252 | local_irq_save(flags); | ||
253 | 272 | ||
254 | while (atomic_read(&data.count)) | 273 | while (atomic_read(&data.count)) |
255 | cpu_relax(); | 274 | cpu_relax(); |
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
259 | smp_wmb(); | 278 | smp_wmb(); |
260 | atomic_set(&data.gate, 1); | 279 | atomic_set(&data.gate, 1); |
261 | 280 | ||
281 | local_irq_save(flags); | ||
282 | |||
283 | while (atomic_read(&data.count)) | ||
284 | cpu_relax(); | ||
285 | |||
286 | /* Ok, reset count and toggle gate */ | ||
287 | atomic_set(&data.count, num_booting_cpus() - 1); | ||
288 | smp_wmb(); | ||
289 | atomic_set(&data.gate, 0); | ||
290 | |||
262 | /* Do our MTRR business */ | 291 | /* Do our MTRR business */ |
263 | 292 | ||
264 | /* | 293 | /* |
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
279 | 308 | ||
280 | atomic_set(&data.count, num_booting_cpus() - 1); | 309 | atomic_set(&data.count, num_booting_cpus() - 1); |
281 | smp_wmb(); | 310 | smp_wmb(); |
282 | atomic_set(&data.gate, 0); | 311 | atomic_set(&data.gate, 1); |
283 | 312 | ||
284 | /* | 313 | /* |
285 | * Wait here for everyone to have seen the gate change | 314 | * Wait here for everyone to have seen the gate change |
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
289 | cpu_relax(); | 318 | cpu_relax(); |
290 | 319 | ||
291 | local_irq_restore(flags); | 320 | local_irq_restore(flags); |
321 | preempt_enable(); | ||
292 | } | 322 | } |
293 | 323 | ||
294 | /** | 324 | /** |
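The comment block above describes the count/gate rendezvous that set_mtrr() now drives through stop_one_cpu_nowait() workers instead of an IPI, with the gate polarity flipped at each step. A self-contained sketch of that handshake using C11 atomics; it models only the synchronisation, not the MTRR programming or interrupt masking.

	#include <stdatomic.h>

	struct rendezvous {
		atomic_int count;	/* CPUs still to arrive at the current step */
		atomic_int gate;	/* toggled by the master to release them */
	};

	/* master side: wait for everyone, re-arm the count, then flip the gate */
	static void master_step(struct rendezvous *r, int nr_others, int open)
	{
		while (atomic_load(&r->count))
			;				/* cpu_relax() in the kernel */
		atomic_store(&r->count, nr_others);
		atomic_store(&r->gate, open);
	}

	/* worker side: announce arrival, then spin until the gate matches */
	static void worker_step(struct rendezvous *r, int open)
	{
		atomic_fetch_sub(&r->count, 1);
		while (atomic_load(&r->gate) != open)
			;				/* cpu_relax() in the kernel */
	}

Each phase of set_mtrr() (disable interrupts, program MTRRs, re-enable) is one master_step()/worker_step() pair, alternating the open value, which is why the gate tests in the hunks above change polarity.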
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5db5b7d65a18..f2da20fda02d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c | |||
@@ -220,6 +220,7 @@ struct x86_pmu { | |||
220 | struct perf_event *event); | 220 | struct perf_event *event); |
221 | struct event_constraint *event_constraints; | 221 | struct event_constraint *event_constraints; |
222 | void (*quirks)(void); | 222 | void (*quirks)(void); |
223 | int perfctr_second_write; | ||
223 | 224 | ||
224 | int (*cpu_prepare)(int cpu); | 225 | int (*cpu_prepare)(int cpu); |
225 | void (*cpu_starting)(int cpu); | 226 | void (*cpu_starting)(int cpu); |
@@ -295,10 +296,10 @@ x86_perf_event_update(struct perf_event *event) | |||
295 | * count to the generic event atomically: | 296 | * count to the generic event atomically: |
296 | */ | 297 | */ |
297 | again: | 298 | again: |
298 | prev_raw_count = atomic64_read(&hwc->prev_count); | 299 | prev_raw_count = local64_read(&hwc->prev_count); |
299 | rdmsrl(hwc->event_base + idx, new_raw_count); | 300 | rdmsrl(hwc->event_base + idx, new_raw_count); |
300 | 301 | ||
301 | if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, | 302 | if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, |
302 | new_raw_count) != prev_raw_count) | 303 | new_raw_count) != prev_raw_count) |
303 | goto again; | 304 | goto again; |
304 | 305 | ||
@@ -313,8 +314,8 @@ again: | |||
313 | delta = (new_raw_count << shift) - (prev_raw_count << shift); | 314 | delta = (new_raw_count << shift) - (prev_raw_count << shift); |
314 | delta >>= shift; | 315 | delta >>= shift; |
315 | 316 | ||
316 | atomic64_add(delta, &event->count); | 317 | local64_add(delta, &event->count); |
317 | atomic64_sub(delta, &hwc->period_left); | 318 | local64_sub(delta, &hwc->period_left); |
318 | 319 | ||
319 | return new_raw_count; | 320 | return new_raw_count; |
320 | } | 321 | } |
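x86_perf_event_update() keeps the same read/compare-exchange loop after the atomic64_t to local64_t conversion. A userspace sketch of that loop, with a plain C11 atomic standing in for local64_t and the counter MSR read stubbed out (read_counter_msr() is an assumption).

	#include <stdatomic.h>
	#include <stdint.h>

	extern uint64_t read_counter_msr(void);	/* stand-in for rdmsrl() */

	static void update_count(_Atomic uint64_t *prev_count,
				 _Atomic int64_t *total, int cntval_bits)
	{
		uint64_t prev, new_raw;
		int shift = 64 - cntval_bits;
		int64_t delta;

		/* retry if another update slipped in between the read and the swap */
		do {
			prev = atomic_load(prev_count);
			new_raw = read_counter_msr();
		} while (!atomic_compare_exchange_strong(prev_count, &prev, new_raw));

		/* sign-extend the hardware counter width before accumulating */
		delta = ((int64_t)(new_raw << shift) - (int64_t)(prev << shift)) >> shift;

		atomic_fetch_add(total, delta);
	}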
@@ -438,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event) | |||
438 | if (!hwc->sample_period) { | 439 | if (!hwc->sample_period) { |
439 | hwc->sample_period = x86_pmu.max_period; | 440 | hwc->sample_period = x86_pmu.max_period; |
440 | hwc->last_period = hwc->sample_period; | 441 | hwc->last_period = hwc->sample_period; |
441 | atomic64_set(&hwc->period_left, hwc->sample_period); | 442 | local64_set(&hwc->period_left, hwc->sample_period); |
442 | } else { | 443 | } else { |
443 | /* | 444 | /* |
444 | * If we have a PMU initialized but no APIC | 445 | * If we have a PMU initialized but no APIC |
@@ -885,7 +886,7 @@ static int | |||
885 | x86_perf_event_set_period(struct perf_event *event) | 886 | x86_perf_event_set_period(struct perf_event *event) |
886 | { | 887 | { |
887 | struct hw_perf_event *hwc = &event->hw; | 888 | struct hw_perf_event *hwc = &event->hw; |
888 | s64 left = atomic64_read(&hwc->period_left); | 889 | s64 left = local64_read(&hwc->period_left); |
889 | s64 period = hwc->sample_period; | 890 | s64 period = hwc->sample_period; |
890 | int ret = 0, idx = hwc->idx; | 891 | int ret = 0, idx = hwc->idx; |
891 | 892 | ||
@@ -897,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event) | |||
897 | */ | 898 | */ |
898 | if (unlikely(left <= -period)) { | 899 | if (unlikely(left <= -period)) { |
899 | left = period; | 900 | left = period; |
900 | atomic64_set(&hwc->period_left, left); | 901 | local64_set(&hwc->period_left, left); |
901 | hwc->last_period = period; | 902 | hwc->last_period = period; |
902 | ret = 1; | 903 | ret = 1; |
903 | } | 904 | } |
904 | 905 | ||
905 | if (unlikely(left <= 0)) { | 906 | if (unlikely(left <= 0)) { |
906 | left += period; | 907 | left += period; |
907 | atomic64_set(&hwc->period_left, left); | 908 | local64_set(&hwc->period_left, left); |
908 | hwc->last_period = period; | 909 | hwc->last_period = period; |
909 | ret = 1; | 910 | ret = 1; |
910 | } | 911 | } |
@@ -923,10 +924,19 @@ x86_perf_event_set_period(struct perf_event *event) | |||
923 | * The hw event starts counting from this event offset, | 924 | * The hw event starts counting from this event offset, |
924 | * mark it to be able to extra future deltas: | 925 | * mark it to be able to extra future deltas: |
925 | */ | 926 | */ |
926 | atomic64_set(&hwc->prev_count, (u64)-left); | 927 | local64_set(&hwc->prev_count, (u64)-left); |
927 | 928 | ||
928 | wrmsrl(hwc->event_base + idx, | 929 | wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); |
930 | |||
931 | /* | ||
932 | * Due to an erratum on certain CPUs we need | ||
933 | * a second write to be sure the register | ||
934 | * is updated properly | ||
935 | */ | ||
936 | if (x86_pmu.perfctr_second_write) { | ||
937 | wrmsrl(hwc->event_base + idx, | ||
929 | (u64)(-left) & x86_pmu.cntval_mask); | 938 | (u64)(-left) & x86_pmu.cntval_mask); |
939 | } | ||
930 | 940 | ||
931 | perf_event_update_userpage(event); | 941 | perf_event_update_userpage(event); |
932 | 942 | ||
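The perfctr_second_write quirk above simply programs the (-left) value a second time on affected PMUs. A minimal sketch, assuming a stubbed MSR write.

	#include <stdint.h>

	extern void write_counter_msr(uint64_t val);	/* stand-in for wrmsrl() */

	static void program_period(int64_t left, uint64_t cntval_mask,
				   int perfctr_second_write)
	{
		uint64_t val = (uint64_t)(-left) & cntval_mask;

		write_counter_msr(val);
		if (perfctr_second_write)
			write_counter_msr(val);	/* erratum workaround: repeat the write */
	}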
@@ -969,7 +979,7 @@ static int x86_pmu_enable(struct perf_event *event) | |||
969 | * skip the schedulability test here, it will be performed | 979 | * skip the schedulability test here, it will be performed |
970 | * at commit time(->commit_txn) as a whole | 980 | * at commit time(->commit_txn) as a whole |
971 | */ | 981 | */ |
972 | if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) | 982 | if (cpuc->group_flag & PERF_EVENT_TXN) |
973 | goto out; | 983 | goto out; |
974 | 984 | ||
975 | ret = x86_pmu.schedule_events(cpuc, n, assign); | 985 | ret = x86_pmu.schedule_events(cpuc, n, assign); |
@@ -1096,7 +1106,7 @@ static void x86_pmu_disable(struct perf_event *event) | |||
1096 | * The events never got scheduled and ->cancel_txn will truncate | 1106 | * The events never got scheduled and ->cancel_txn will truncate |
1097 | * the event_list. | 1107 | * the event_list. |
1098 | */ | 1108 | */ |
1099 | if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) | 1109 | if (cpuc->group_flag & PERF_EVENT_TXN) |
1100 | return; | 1110 | return; |
1101 | 1111 | ||
1102 | x86_pmu_stop(event); | 1112 | x86_pmu_stop(event); |
@@ -1388,7 +1398,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) | |||
1388 | { | 1398 | { |
1389 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1399 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1390 | 1400 | ||
1391 | cpuc->group_flag |= PERF_EVENT_TXN_STARTED; | 1401 | cpuc->group_flag |= PERF_EVENT_TXN; |
1392 | cpuc->n_txn = 0; | 1402 | cpuc->n_txn = 0; |
1393 | } | 1403 | } |
1394 | 1404 | ||
@@ -1401,7 +1411,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) | |||
1401 | { | 1411 | { |
1402 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1412 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1403 | 1413 | ||
1404 | cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; | 1414 | cpuc->group_flag &= ~PERF_EVENT_TXN; |
1405 | /* | 1415 | /* |
1406 | * Truncate the collected events. | 1416 | * Truncate the collected events. |
1407 | */ | 1417 | */ |
@@ -1435,11 +1445,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) | |||
1435 | */ | 1445 | */ |
1436 | memcpy(cpuc->assign, assign, n*sizeof(int)); | 1446 | memcpy(cpuc->assign, assign, n*sizeof(int)); |
1437 | 1447 | ||
1438 | /* | 1448 | cpuc->group_flag &= ~PERF_EVENT_TXN; |
1439 | * Clear out the txn count so that ->cancel_txn() which gets | ||
1440 | * run after ->commit_txn() doesn't undo things. | ||
1441 | */ | ||
1442 | cpuc->n_txn = 0; | ||
1443 | 1449 | ||
1444 | return 0; | 1450 | return 0; |
1445 | } | 1451 | } |
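The transaction rework above renames the flag to PERF_EVENT_TXN and lets commit clear it, so a cancel_txn() that runs afterwards no longer needs the old "n_txn = 0" trick to avoid undoing a committed group. A reduced sketch of that protocol, with the structures trimmed to the fields the protocol touches.

	#include <stdbool.h>

	#define PERF_EVENT_TXN 0x1

	struct cpu_events {
		unsigned int group_flag;
		int n_events, n_txn;
	};

	static void start_txn(struct cpu_events *c)
	{
		c->group_flag |= PERF_EVENT_TXN;
		c->n_txn = 0;
	}

	static void cancel_txn(struct cpu_events *c)
	{
		c->group_flag &= ~PERF_EVENT_TXN;
		c->n_events -= c->n_txn;	/* truncate the collected events */
	}

	static int commit_txn(struct cpu_events *c, bool schedulable)
	{
		if (!schedulable)
			return -1;		/* caller will cancel_txn() */
		c->group_flag &= ~PERF_EVENT_TXN;	/* nothing left to undo */
		return 0;
	}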
@@ -1607,8 +1613,6 @@ static const struct stacktrace_ops backtrace_ops = { | |||
1607 | .walk_stack = print_context_stack_bp, | 1613 | .walk_stack = print_context_stack_bp, |
1608 | }; | 1614 | }; |
1609 | 1615 | ||
1610 | #include "../dumpstack.h" | ||
1611 | |||
1612 | static void | 1616 | static void |
1613 | perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) | 1617 | perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) |
1614 | { | 1618 | { |
@@ -1730,22 +1734,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | |||
1730 | return entry; | 1734 | return entry; |
1731 | } | 1735 | } |
1732 | 1736 | ||
1733 | void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) | ||
1734 | { | ||
1735 | regs->ip = ip; | ||
1736 | /* | ||
1737 | * perf_arch_fetch_caller_regs adds another call, we need to increment | ||
1738 | * the skip level | ||
1739 | */ | ||
1740 | regs->bp = rewind_frame_pointer(skip + 1); | ||
1741 | regs->cs = __KERNEL_CS; | ||
1742 | /* | ||
1743 | * We abuse bit 3 to pass exact information, see perf_misc_flags | ||
1744 | * and the comment with PERF_EFLAGS_EXACT. | ||
1745 | */ | ||
1746 | regs->flags = 0; | ||
1747 | } | ||
1748 | |||
1749 | unsigned long perf_instruction_pointer(struct pt_regs *regs) | 1737 | unsigned long perf_instruction_pointer(struct pt_regs *regs) |
1750 | { | 1738 | { |
1751 | unsigned long ip; | 1739 | unsigned long ip; |
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ae85d69644d1..febb12cea795 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c | |||
@@ -21,22 +21,36 @@ struct p4_event_bind { | |||
21 | char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ | 21 | char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ |
22 | }; | 22 | }; |
23 | 23 | ||
24 | struct p4_cache_event_bind { | 24 | struct p4_pebs_bind { |
25 | unsigned int metric_pebs; | 25 | unsigned int metric_pebs; |
26 | unsigned int metric_vert; | 26 | unsigned int metric_vert; |
27 | }; | 27 | }; |
28 | 28 | ||
29 | #define P4_GEN_CACHE_EVENT_BIND(name) \ | 29 | /* it sets P4_PEBS_ENABLE_UOP_TAG as well */ |
30 | [P4_CACHE__##name] = { \ | 30 | #define P4_GEN_PEBS_BIND(name, pebs, vert) \ |
31 | .metric_pebs = P4_PEBS__##name, \ | 31 | [P4_PEBS_METRIC__##name] = { \ |
32 | .metric_vert = P4_VERT__##name, \ | 32 | .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ |
33 | .metric_vert = vert, \ | ||
33 | } | 34 | } |
34 | 35 | ||
35 | static struct p4_cache_event_bind p4_cache_event_bind_map[] = { | 36 | /* |
36 | P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired), | 37 | * note we have P4_PEBS_ENABLE_UOP_TAG always set here |
37 | P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired), | 38 | * |
38 | P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired), | 39 | * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of |
39 | P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired), | 40 | * event configuration to find out which values are to be |
41 | * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT | ||
42 | * registers | ||
43 | */ | ||
44 | static struct p4_pebs_bind p4_pebs_bind_map[] = { | ||
45 | P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), | ||
46 | P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), | ||
47 | P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), | ||
48 | P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), | ||
49 | P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), | ||
50 | P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), | ||
51 | P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), | ||
52 | P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), | ||
53 | P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), | ||
40 | }; | 54 | }; |
41 | 55 | ||
42 | /* | 56 | /* |
@@ -281,10 +295,10 @@ static struct p4_event_bind p4_event_bind_map[] = { | |||
281 | }, | 295 | }, |
282 | }; | 296 | }; |
283 | 297 | ||
284 | #define P4_GEN_CACHE_EVENT(event, bit, cache_event) \ | 298 | #define P4_GEN_CACHE_EVENT(event, bit, metric) \ |
285 | p4_config_pack_escr(P4_ESCR_EVENT(event) | \ | 299 | p4_config_pack_escr(P4_ESCR_EVENT(event) | \ |
286 | P4_ESCR_EMASK_BIT(event, bit)) | \ | 300 | P4_ESCR_EMASK_BIT(event, bit)) | \ |
287 | p4_config_pack_cccr(cache_event | \ | 301 | p4_config_pack_cccr(metric | \ |
288 | P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) | 302 | P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) |
289 | 303 | ||
290 | static __initconst const u64 p4_hw_cache_event_ids | 304 | static __initconst const u64 p4_hw_cache_event_ids |
@@ -296,34 +310,34 @@ static __initconst const u64 p4_hw_cache_event_ids | |||
296 | [ C(OP_READ) ] = { | 310 | [ C(OP_READ) ] = { |
297 | [ C(RESULT_ACCESS) ] = 0x0, | 311 | [ C(RESULT_ACCESS) ] = 0x0, |
298 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, | 312 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, |
299 | P4_CACHE__1stl_cache_load_miss_retired), | 313 | P4_PEBS_METRIC__1stl_cache_load_miss_retired), |
300 | }, | 314 | }, |
301 | }, | 315 | }, |
302 | [ C(LL ) ] = { | 316 | [ C(LL ) ] = { |
303 | [ C(OP_READ) ] = { | 317 | [ C(OP_READ) ] = { |
304 | [ C(RESULT_ACCESS) ] = 0x0, | 318 | [ C(RESULT_ACCESS) ] = 0x0, |
305 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, | 319 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, |
306 | P4_CACHE__2ndl_cache_load_miss_retired), | 320 | P4_PEBS_METRIC__2ndl_cache_load_miss_retired), |
307 | }, | 321 | }, |
308 | }, | 322 | }, |
309 | [ C(DTLB) ] = { | 323 | [ C(DTLB) ] = { |
310 | [ C(OP_READ) ] = { | 324 | [ C(OP_READ) ] = { |
311 | [ C(RESULT_ACCESS) ] = 0x0, | 325 | [ C(RESULT_ACCESS) ] = 0x0, |
312 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, | 326 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, |
313 | P4_CACHE__dtlb_load_miss_retired), | 327 | P4_PEBS_METRIC__dtlb_load_miss_retired), |
314 | }, | 328 | }, |
315 | [ C(OP_WRITE) ] = { | 329 | [ C(OP_WRITE) ] = { |
316 | [ C(RESULT_ACCESS) ] = 0x0, | 330 | [ C(RESULT_ACCESS) ] = 0x0, |
317 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, | 331 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, |
318 | P4_CACHE__dtlb_store_miss_retired), | 332 | P4_PEBS_METRIC__dtlb_store_miss_retired), |
319 | }, | 333 | }, |
320 | }, | 334 | }, |
321 | [ C(ITLB) ] = { | 335 | [ C(ITLB) ] = { |
322 | [ C(OP_READ) ] = { | 336 | [ C(OP_READ) ] = { |
323 | [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, | 337 | [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, |
324 | P4_CACHE__itlb_reference_hit), | 338 | P4_PEBS_METRIC__none), |
325 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, | 339 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, |
326 | P4_CACHE__itlb_reference_miss), | 340 | P4_PEBS_METRIC__none), |
327 | }, | 341 | }, |
328 | [ C(OP_WRITE) ] = { | 342 | [ C(OP_WRITE) ] = { |
329 | [ C(RESULT_ACCESS) ] = -1, | 343 | [ C(RESULT_ACCESS) ] = -1, |
@@ -414,11 +428,37 @@ static u64 p4_pmu_event_map(int hw_event) | |||
414 | return config; | 428 | return config; |
415 | } | 429 | } |
416 | 430 | ||
431 | static int p4_validate_raw_event(struct perf_event *event) | ||
432 | { | ||
433 | unsigned int v; | ||
434 | |||
435 | /* user data may have out-of-bound event index */ | ||
436 | v = p4_config_unpack_event(event->attr.config); | ||
437 | if (v >= ARRAY_SIZE(p4_event_bind_map)) { | ||
438 | pr_warning("P4 PMU: Unknown event code: %d\n", v); | ||
439 | return -EINVAL; | ||
440 | } | ||
441 | |||
442 | /* | ||
443 | * it may have some screwed PEBS bits | ||
444 | */ | ||
445 | if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { | ||
446 | pr_warning("P4 PMU: PEBS are not supported yet\n"); | ||
447 | return -EINVAL; | ||
448 | } | ||
449 | v = p4_config_unpack_metric(event->attr.config); | ||
450 | if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { | ||
451 | pr_warning("P4 PMU: Unknown metric code: %d\n", v); | ||
452 | return -EINVAL; | ||
453 | } | ||
454 | |||
455 | return 0; | ||
456 | } | ||
457 | |||
417 | static int p4_hw_config(struct perf_event *event) | 458 | static int p4_hw_config(struct perf_event *event) |
418 | { | 459 | { |
419 | int cpu = get_cpu(); | 460 | int cpu = get_cpu(); |
420 | int rc = 0; | 461 | int rc = 0; |
421 | unsigned int evnt; | ||
422 | u32 escr, cccr; | 462 | u32 escr, cccr; |
423 | 463 | ||
424 | /* | 464 | /* |
@@ -438,12 +478,9 @@ static int p4_hw_config(struct perf_event *event) | |||
438 | 478 | ||
439 | if (event->attr.type == PERF_TYPE_RAW) { | 479 | if (event->attr.type == PERF_TYPE_RAW) { |
440 | 480 | ||
441 | /* user data may have out-of-bound event index */ | 481 | rc = p4_validate_raw_event(event); |
442 | evnt = p4_config_unpack_event(event->attr.config); | 482 | if (rc) |
443 | if (evnt >= ARRAY_SIZE(p4_event_bind_map)) { | ||
444 | rc = -EINVAL; | ||
445 | goto out; | 483 | goto out; |
446 | } | ||
447 | 484 | ||
448 | /* | 485 | /* |
449 | * We don't control raw events so it's up to the caller | 486 | * We don't control raw events so it's up to the caller |
@@ -451,12 +488,15 @@ static int p4_hw_config(struct perf_event *event) | |||
451 | * on HT machine but allow HT-compatible specifics to be | 488 | * on HT machine but allow HT-compatible specifics to be |
452 | * passed on) | 489 | * passed on) |
453 | * | 490 | * |
491 | * Note that for RAW events we allow the user to use P4_CCCR_RESERVED | ||
492 | * bits since we keep additional info here (for cache events etc.) | ||
493 | * | ||
454 | * XXX: HT wide things should check perf_paranoid_cpu() && | 494 | * XXX: HT wide things should check perf_paranoid_cpu() && |
455 | * CAP_SYS_ADMIN | 495 | * CAP_SYS_ADMIN |
456 | */ | 496 | */ |
457 | event->hw.config |= event->attr.config & | 497 | event->hw.config |= event->attr.config & |
458 | (p4_config_pack_escr(P4_ESCR_MASK_HT) | | 498 | (p4_config_pack_escr(P4_ESCR_MASK_HT) | |
459 | p4_config_pack_cccr(P4_CCCR_MASK_HT)); | 499 | p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); |
460 | } | 500 | } |
461 | 501 | ||
462 | rc = x86_setup_perfctr(event); | 502 | rc = x86_setup_perfctr(event); |
@@ -482,6 +522,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) | |||
482 | return overflow; | 522 | return overflow; |
483 | } | 523 | } |
484 | 524 | ||
525 | static void p4_pmu_disable_pebs(void) | ||
526 | { | ||
527 | /* | ||
528 | * FIXME | ||
529 | * | ||
530 | * Two threads are still allowed to set up the same cache | ||
531 | * event, so we cannot simply clear the metrics until we know | ||
532 | * no one depends on us; that would need a usage counter | ||
533 | * for "ReplayEvent" users. | ||
534 | * | ||
535 | * RAW events are more complex: if a user (for some | ||
536 | * reason) passes a cache event metric with an improper | ||
537 | * event opcode, it is fine from the hardware's point of view | ||
538 | * but complete nonsense as far as the meaning of the action goes. | ||
539 | * | ||
540 | * So for the moment leave the metrics turned on forever -- it's | ||
541 | * OK for now but needs to be revisited! | ||
542 | * | ||
543 | * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); | ||
544 | * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); | ||
545 | */ | ||
546 | } | ||
547 | |||
485 | static inline void p4_pmu_disable_event(struct perf_event *event) | 548 | static inline void p4_pmu_disable_event(struct perf_event *event) |
486 | { | 549 | { |
487 | struct hw_perf_event *hwc = &event->hw; | 550 | struct hw_perf_event *hwc = &event->hw; |
@@ -507,6 +570,26 @@ static void p4_pmu_disable_all(void) | |||
507 | continue; | 570 | continue; |
508 | p4_pmu_disable_event(event); | 571 | p4_pmu_disable_event(event); |
509 | } | 572 | } |
573 | |||
574 | p4_pmu_disable_pebs(); | ||
575 | } | ||
576 | |||
577 | /* configuration must be valid */ | ||
578 | static void p4_pmu_enable_pebs(u64 config) | ||
579 | { | ||
580 | struct p4_pebs_bind *bind; | ||
581 | unsigned int idx; | ||
582 | |||
583 | BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); | ||
584 | |||
585 | idx = p4_config_unpack_metric(config); | ||
586 | if (idx == P4_PEBS_METRIC__none) | ||
587 | return; | ||
588 | |||
589 | bind = &p4_pebs_bind_map[idx]; | ||
590 | |||
591 | (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); | ||
592 | (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); | ||
510 | } | 593 | } |
511 | 594 | ||
512 | static void p4_pmu_enable_event(struct perf_event *event) | 595 | static void p4_pmu_enable_event(struct perf_event *event) |
@@ -515,9 +598,7 @@ static void p4_pmu_enable_event(struct perf_event *event) | |||
515 | int thread = p4_ht_config_thread(hwc->config); | 598 | int thread = p4_ht_config_thread(hwc->config); |
516 | u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); | 599 | u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); |
517 | unsigned int idx = p4_config_unpack_event(hwc->config); | 600 | unsigned int idx = p4_config_unpack_event(hwc->config); |
518 | unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config); | ||
519 | struct p4_event_bind *bind; | 601 | struct p4_event_bind *bind; |
520 | struct p4_cache_event_bind *bind_cache; | ||
521 | u64 escr_addr, cccr; | 602 | u64 escr_addr, cccr; |
522 | 603 | ||
523 | bind = &p4_event_bind_map[idx]; | 604 | bind = &p4_event_bind_map[idx]; |
@@ -537,16 +618,10 @@ static void p4_pmu_enable_event(struct perf_event *event) | |||
537 | cccr = p4_config_unpack_cccr(hwc->config); | 618 | cccr = p4_config_unpack_cccr(hwc->config); |
538 | 619 | ||
539 | /* | 620 | /* |
540 | * it could be Cache event so that we need to | 621 | * it could be Cache event so we need to write metrics |
541 | * set metrics into additional MSRs | 622 | * into additional MSRs |
542 | */ | 623 | */ |
543 | BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK); | 624 | p4_pmu_enable_pebs(hwc->config); |
544 | if (idx_cache > P4_CACHE__NONE && | ||
545 | idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) { | ||
546 | bind_cache = &p4_cache_event_bind_map[idx_cache]; | ||
547 | (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs); | ||
548 | (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert); | ||
549 | } | ||
550 | 625 | ||
551 | (void)checking_wrmsrl(escr_addr, escr_conf); | 626 | (void)checking_wrmsrl(escr_addr, escr_conf); |
552 | (void)checking_wrmsrl(hwc->config_base + hwc->idx, | 627 | (void)checking_wrmsrl(hwc->config_base + hwc->idx, |
@@ -581,6 +656,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) | |||
581 | cpuc = &__get_cpu_var(cpu_hw_events); | 656 | cpuc = &__get_cpu_var(cpu_hw_events); |
582 | 657 | ||
583 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 658 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
659 | int overflow; | ||
584 | 660 | ||
585 | if (!test_bit(idx, cpuc->active_mask)) | 661 | if (!test_bit(idx, cpuc->active_mask)) |
586 | continue; | 662 | continue; |
@@ -591,12 +667,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) | |||
591 | WARN_ON_ONCE(hwc->idx != idx); | 667 | WARN_ON_ONCE(hwc->idx != idx); |
592 | 668 | ||
593 | /* it might be unflagged overflow */ | 669 | /* it might be unflagged overflow */ |
594 | handled = p4_pmu_clear_cccr_ovf(hwc); | 670 | overflow = p4_pmu_clear_cccr_ovf(hwc); |
595 | 671 | ||
596 | val = x86_perf_event_update(event); | 672 | val = x86_perf_event_update(event); |
597 | if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) | 673 | if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) |
598 | continue; | 674 | continue; |
599 | 675 | ||
676 | handled += overflow; | ||
677 | |||
600 | /* event overflow for sure */ | 678 | /* event overflow for sure */ |
601 | data.period = event->hw.last_period; | 679 | data.period = event->hw.last_period; |
602 | 680 | ||
@@ -612,7 +690,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) | |||
612 | inc_irq_stat(apic_perf_irqs); | 690 | inc_irq_stat(apic_perf_irqs); |
613 | } | 691 | } |
614 | 692 | ||
615 | return handled; | 693 | return handled > 0; |
616 | } | 694 | } |
617 | 695 | ||
618 | /* | 696 | /* |
@@ -829,6 +907,15 @@ static __initconst const struct x86_pmu p4_pmu = { | |||
829 | .max_period = (1ULL << 39) - 1, | 907 | .max_period = (1ULL << 39) - 1, |
830 | .hw_config = p4_hw_config, | 908 | .hw_config = p4_hw_config, |
831 | .schedule_events = p4_pmu_schedule_events, | 909 | .schedule_events = p4_pmu_schedule_events, |
910 | /* | ||
911 | * This handles erratum N15 in intel doc 249199-029, | ||
912 | * the counter may not be updated correctly on write | ||
913 | * so we need a second write operation to do the trick | ||
914 | * (the official workaround didn't work) | ||
915 | * | ||
916 | * this idea is taken from the OProfile code | ||
917 | */ | ||
918 | .perfctr_second_write = 1, | ||
832 | }; | 919 | }; |
833 | 920 | ||
834 | static __init int p4_pmu_init(void) | 921 | static __init int p4_pmu_init(void) |
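p4_pmu_enable_pebs() above looks up the metric index packed into the raw config and writes the two PEBS metric MSRs from p4_pebs_bind_map. A sketch of that lookup with the table abbreviated to one entry; the UOP_TAG bit value and the MSR writers are illustrative stubs, not the kernel's definitions.

	#include <stdint.h>

	struct pebs_bind { uint32_t metric_pebs, metric_vert; };

	enum { PEBS_METRIC_NONE = 0, PEBS_METRIC_1STL_LOAD_MISS, PEBS_METRIC_MAX };

	/* abbreviated p4_pebs_bind_map; the UOP_TAG bit is a placeholder value */
	#define PEBS_UOP_TAG_STUB (1u << 24)

	static const struct pebs_bind pebs_bind_map[PEBS_METRIC_MAX] = {
		[PEBS_METRIC_1STL_LOAD_MISS] = { 0x0000001 | PEBS_UOP_TAG_STUB, 0x0000001 },
	};

	/* stand-ins for checking_wrmsrl() on the two metric MSRs */
	extern void write_pebs_enable(uint64_t val);
	extern void write_pebs_matrix_vert(uint64_t val);

	static void enable_pebs_metric(unsigned int idx)
	{
		if (idx == PEBS_METRIC_NONE || idx >= PEBS_METRIC_MAX)
			return;		/* __none means no metric MSRs to program */

		write_pebs_enable(pebs_bind_map[idx].metric_pebs);
		write_pebs_matrix_vert(pebs_bind_map[idx].metric_vert);
	}

p4_validate_raw_event() rejects out-of-range metric indices up front, which is why the enable path can trust the index here.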
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c new file mode 100644 index 000000000000..34b4dad6f0b8 --- /dev/null +++ b/arch/x86/kernel/cpu/scattered.c | |||
@@ -0,0 +1,63 @@ | |||
1 | /* | ||
2 | * Routines to identify additional cpu features that are scattered in | ||
3 | * cpuid space. | ||
4 | */ | ||
5 | #include <linux/cpu.h> | ||
6 | |||
7 | #include <asm/pat.h> | ||
8 | #include <asm/processor.h> | ||
9 | |||
10 | #include <asm/apic.h> | ||
11 | |||
12 | struct cpuid_bit { | ||
13 | u16 feature; | ||
14 | u8 reg; | ||
15 | u8 bit; | ||
16 | u32 level; | ||
17 | u32 sub_leaf; | ||
18 | }; | ||
19 | |||
20 | enum cpuid_regs { | ||
21 | CR_EAX = 0, | ||
22 | CR_ECX, | ||
23 | CR_EDX, | ||
24 | CR_EBX | ||
25 | }; | ||
26 | |||
27 | void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) | ||
28 | { | ||
29 | u32 max_level; | ||
30 | u32 regs[4]; | ||
31 | const struct cpuid_bit *cb; | ||
32 | |||
33 | static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { | ||
34 | { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, | ||
35 | { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, | ||
36 | { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, | ||
37 | { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, | ||
38 | { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, | ||
39 | { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, | ||
40 | { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, | ||
41 | { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, | ||
42 | { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, | ||
43 | { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, | ||
44 | { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, | ||
45 | { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, | ||
46 | { 0, 0, 0, 0, 0 } | ||
47 | }; | ||
48 | |||
49 | for (cb = cpuid_bits; cb->feature; cb++) { | ||
50 | |||
51 | /* Verify that the level is valid */ | ||
52 | max_level = cpuid_eax(cb->level & 0xffff0000); | ||
53 | if (max_level < cb->level || | ||
54 | max_level > (cb->level | 0xffff)) | ||
55 | continue; | ||
56 | |||
57 | cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], | ||
58 | ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); | ||
59 | |||
60 | if (regs[cb->reg] & (1 << cb->bit)) | ||
61 | set_cpu_cap(c, cb->feature); | ||
62 | } | ||
63 | } | ||
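scattered.c now carries a sub_leaf per table entry and probes with cpuid_count(), which is what lets it detect features such as XSAVEOPT in leaf 0xd sub-leaf 1. A userspace sketch of the same probe built on the <cpuid.h> shipped with reasonably recent GCC/Clang (the availability of __get_cpuid_count is an assumption about the toolchain).

	#include <cpuid.h>
	#include <stdbool.h>
	#include <stdio.h>

	enum cpuid_reg { REG_EAX, REG_EBX, REG_ECX, REG_EDX };

	static bool cpuid_bit(unsigned leaf, unsigned subleaf,
			      enum cpuid_reg reg, unsigned bit)
	{
		unsigned r[4] = { 0, 0, 0, 0 };

		/* returns 0 if the leaf is above the CPU's maximum supported level */
		if (!__get_cpuid_count(leaf, subleaf,
				       &r[REG_EAX], &r[REG_EBX],
				       &r[REG_ECX], &r[REG_EDX]))
			return false;

		return (r[reg] >> bit) & 1;
	}

	int main(void)
	{
		/* leaf 0xd, sub-leaf 1, EAX bit 0 is XSAVEOPT in the table above */
		printf("XSAVEOPT: %s\n",
		       cpuid_bit(0x0000000d, 1, REG_EAX, 0) ? "yes" : "no");
		return 0;
	}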
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c index 10fa5684a662..4397e987a1cf 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/topology.c | |||
@@ -1,62 +1,14 @@ | |||
1 | /* | 1 | /* |
2 | * Routines to indentify additional cpu features that are scattered in | 2 | * Check for extended topology enumeration cpuid leaf 0xb and if it |
3 | * cpuid space. | 3 | * exists, use it for populating initial_apicid and cpu topology |
4 | * detection. | ||
4 | */ | 5 | */ |
5 | #include <linux/cpu.h> | ||
6 | 6 | ||
7 | #include <linux/cpu.h> | ||
8 | #include <asm/apic.h> | ||
7 | #include <asm/pat.h> | 9 | #include <asm/pat.h> |
8 | #include <asm/processor.h> | 10 | #include <asm/processor.h> |
9 | 11 | ||
10 | #include <asm/apic.h> | ||
11 | |||
12 | struct cpuid_bit { | ||
13 | u16 feature; | ||
14 | u8 reg; | ||
15 | u8 bit; | ||
16 | u32 level; | ||
17 | }; | ||
18 | |||
19 | enum cpuid_regs { | ||
20 | CR_EAX = 0, | ||
21 | CR_ECX, | ||
22 | CR_EDX, | ||
23 | CR_EBX | ||
24 | }; | ||
25 | |||
26 | void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) | ||
27 | { | ||
28 | u32 max_level; | ||
29 | u32 regs[4]; | ||
30 | const struct cpuid_bit *cb; | ||
31 | |||
32 | static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { | ||
33 | { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, | ||
34 | { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, | ||
35 | { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 }, | ||
36 | { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 }, | ||
37 | { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, | ||
38 | { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, | ||
39 | { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, | ||
40 | { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, | ||
41 | { 0, 0, 0, 0 } | ||
42 | }; | ||
43 | |||
44 | for (cb = cpuid_bits; cb->feature; cb++) { | ||
45 | |||
46 | /* Verify that the level is valid */ | ||
47 | max_level = cpuid_eax(cb->level & 0xffff0000); | ||
48 | if (max_level < cb->level || | ||
49 | max_level > (cb->level | 0xffff)) | ||
50 | continue; | ||
51 | |||
52 | cpuid(cb->level, ®s[CR_EAX], ®s[CR_EBX], | ||
53 | ®s[CR_ECX], ®s[CR_EDX]); | ||
54 | |||
55 | if (regs[cb->reg] & (1 << cb->bit)) | ||
56 | set_cpu_cap(c, cb->feature); | ||
57 | } | ||
58 | } | ||
59 | |||
60 | /* leaf 0xb SMT level */ | 12 | /* leaf 0xb SMT level */ |
61 | #define SMT_LEVEL 0 | 13 | #define SMT_LEVEL 0 |
62 | 14 | ||
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index b9d1ff588445..227b0448960d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c | |||
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void) | |||
51 | 51 | ||
52 | static unsigned long vmware_get_tsc_khz(void) | 52 | static unsigned long vmware_get_tsc_khz(void) |
53 | { | 53 | { |
54 | uint64_t tsc_hz; | 54 | uint64_t tsc_hz, lpj; |
55 | uint32_t eax, ebx, ecx, edx; | 55 | uint32_t eax, ebx, ecx, edx; |
56 | 56 | ||
57 | VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); | 57 | VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); |
@@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void) | |||
62 | printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", | 62 | printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", |
63 | (unsigned long) tsc_hz / 1000, | 63 | (unsigned long) tsc_hz / 1000, |
64 | (unsigned long) tsc_hz % 1000); | 64 | (unsigned long) tsc_hz % 1000); |
65 | |||
66 | if (!preset_lpj) { | ||
67 | lpj = ((u64)tsc_hz * 1000); | ||
68 | do_div(lpj, HZ); | ||
69 | preset_lpj = lpj; | ||
70 | } | ||
71 | |||
65 | return tsc_hz; | 72 | return tsc_hz; |
66 | } | 73 | } |
67 | 74 | ||
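The VMware hunk seeds preset_lpj from the hypervisor-reported TSC rate; since vmware_get_tsc_khz() works in kHz, lpj is tsc_khz * 1000 / HZ, i.e. TSC cycles per jiffy. A worked example; HZ = 250 and the 2.4 GHz rate are assumptions for illustration only.

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t lpj_from_tsc_khz(uint64_t tsc_khz, unsigned int hz)
	{
		return tsc_khz * 1000u / hz;	/* the kernel uses do_div() here */
	}

	int main(void)
	{
		/* 2,400,000 kHz at HZ=250 gives 9,600,000 loops per jiffy */
		printf("%llu\n", (unsigned long long)lpj_from_tsc_khz(2400000, 250));
		return 0;
	}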
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ebd4c51d096a..764c7c2b1811 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c | |||
@@ -28,6 +28,8 @@ | |||
28 | #include <asm/reboot.h> | 28 | #include <asm/reboot.h> |
29 | #include <asm/virtext.h> | 29 | #include <asm/virtext.h> |
30 | 30 | ||
31 | int in_crash_kexec; | ||
32 | |||
31 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) | 33 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) |
32 | 34 | ||
33 | static void kdump_nmi_callback(int cpu, struct die_args *args) | 35 | static void kdump_nmi_callback(int cpu, struct die_args *args) |
@@ -61,6 +63,7 @@ static void kdump_nmi_callback(int cpu, struct die_args *args) | |||
61 | 63 | ||
62 | static void kdump_nmi_shootdown_cpus(void) | 64 | static void kdump_nmi_shootdown_cpus(void) |
63 | { | 65 | { |
66 | in_crash_kexec = 1; | ||
64 | nmi_shootdown_cpus(kdump_nmi_callback); | 67 | nmi_shootdown_cpus(kdump_nmi_callback); |
65 | 68 | ||
66 | disable_local_APIC(); | 69 | disable_local_APIC(); |
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c89a386930b7..6e8752c1bd52 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c | |||
@@ -18,7 +18,6 @@ | |||
18 | 18 | ||
19 | #include <asm/stacktrace.h> | 19 | #include <asm/stacktrace.h> |
20 | 20 | ||
21 | #include "dumpstack.h" | ||
22 | 21 | ||
23 | int panic_on_unrecovered_nmi; | 22 | int panic_on_unrecovered_nmi; |
24 | int panic_on_io_nmi; | 23 | int panic_on_io_nmi; |
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h deleted file mode 100644 index e1a93be4fd44..000000000000 --- a/arch/x86/kernel/dumpstack.h +++ /dev/null | |||
@@ -1,56 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
4 | */ | ||
5 | |||
6 | #ifndef DUMPSTACK_H | ||
7 | #define DUMPSTACK_H | ||
8 | |||
9 | #ifdef CONFIG_X86_32 | ||
10 | #define STACKSLOTS_PER_LINE 8 | ||
11 | #define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) | ||
12 | #else | ||
13 | #define STACKSLOTS_PER_LINE 4 | ||
14 | #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) | ||
15 | #endif | ||
16 | |||
17 | #include <linux/uaccess.h> | ||
18 | |||
19 | extern void | ||
20 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
21 | unsigned long *stack, unsigned long bp, char *log_lvl); | ||
22 | |||
23 | extern void | ||
24 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
25 | unsigned long *sp, unsigned long bp, char *log_lvl); | ||
26 | |||
27 | extern unsigned int code_bytes; | ||
28 | |||
29 | /* The form of the top of the frame on the stack */ | ||
30 | struct stack_frame { | ||
31 | struct stack_frame *next_frame; | ||
32 | unsigned long return_address; | ||
33 | }; | ||
34 | |||
35 | struct stack_frame_ia32 { | ||
36 | u32 next_frame; | ||
37 | u32 return_address; | ||
38 | }; | ||
39 | |||
40 | static inline unsigned long rewind_frame_pointer(int n) | ||
41 | { | ||
42 | struct stack_frame *frame; | ||
43 | |||
44 | get_bp(frame); | ||
45 | |||
46 | #ifdef CONFIG_FRAME_POINTER | ||
47 | while (n--) { | ||
48 | if (probe_kernel_address(&frame->next_frame, frame)) | ||
49 | break; | ||
50 | } | ||
51 | #endif | ||
52 | |||
53 | return (unsigned long)frame; | ||
54 | } | ||
55 | |||
56 | #endif /* DUMPSTACK_H */ | ||
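The removed header's rewind_frame_pointer() relies on the standard x86 frame layout: each frame begins with the saved caller frame pointer, immediately followed by the return address. A hedged userspace sketch of the same walk; it assumes the binary keeps frame pointers (e.g. -fno-omit-frame-pointer) and caps the depth instead of using probe_kernel_address().

#include <stdio.h>

/* Same shape as the struct stack_frame removed above. */
struct stack_frame {
    struct stack_frame *next_frame;
    unsigned long return_address;
};

static void __attribute__((noinline)) show_callers(void)
{
    struct stack_frame *frame = __builtin_frame_address(0);
    int depth;

    /* Walk up at most four frames, printing each return address. */
    for (depth = 0; frame && depth < 4; depth++) {
        printf("return address %d: %p\n", depth,
               (void *)frame->return_address);
        frame = frame->next_frame;
    }
}

static void __attribute__((noinline)) b(void) { show_callers(); }
static void __attribute__((noinline)) a(void) { b(); }

int main(void)
{
    a();
    return 0;
}
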
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 11540a189d93..0f6376ffa2d9 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -16,8 +16,6 @@ | |||
16 | 16 | ||
17 | #include <asm/stacktrace.h> | 17 | #include <asm/stacktrace.h> |
18 | 18 | ||
19 | #include "dumpstack.h" | ||
20 | |||
21 | 19 | ||
22 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | 20 | void dump_trace(struct task_struct *task, struct pt_regs *regs, |
23 | unsigned long *stack, unsigned long bp, | 21 | unsigned long *stack, unsigned long bp, |
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 272c9f1f05f3..57a21f11c791 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -16,7 +16,6 @@ | |||
16 | 16 | ||
17 | #include <asm/stacktrace.h> | 17 | #include <asm/stacktrace.h> |
18 | 18 | ||
19 | #include "dumpstack.h" | ||
20 | 19 | ||
21 | #define N_EXCEPTION_STACKS_END \ | 20 | #define N_EXCEPTION_STACKS_END \ |
22 | (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) | 21 | (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf153..227d00920d2f 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -611,14 +611,14 @@ ldt_ss: | |||
611 | * compensating for the offset by changing to the ESPFIX segment with | 611 | * compensating for the offset by changing to the ESPFIX segment with |
612 | * a base address that matches for the difference. | 612 | * a base address that matches for the difference. |
613 | */ | 613 | */ |
614 | #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) | ||
614 | mov %esp, %edx /* load kernel esp */ | 615 | mov %esp, %edx /* load kernel esp */ |
615 | mov PT_OLDESP(%esp), %eax /* load userspace esp */ | 616 | mov PT_OLDESP(%esp), %eax /* load userspace esp */ |
616 | mov %dx, %ax /* eax: new kernel esp */ | 617 | mov %dx, %ax /* eax: new kernel esp */ |
617 | sub %eax, %edx /* offset (low word is 0) */ | 618 | sub %eax, %edx /* offset (low word is 0) */ |
618 | PER_CPU(gdt_page, %ebx) | ||
619 | shr $16, %edx | 619 | shr $16, %edx |
620 | mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ | 620 | mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ |
621 | mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ | 621 | mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ |
622 | pushl $__ESPFIX_SS | 622 | pushl $__ESPFIX_SS |
623 | CFI_ADJUST_CFA_OFFSET 4 | 623 | CFI_ADJUST_CFA_OFFSET 4 |
624 | push %eax /* new kernel esp */ | 624 | push %eax /* new kernel esp */ |
@@ -791,9 +791,8 @@ ptregs_clone: | |||
791 | * normal stack and adjusts ESP with the matching offset. | 791 | * normal stack and adjusts ESP with the matching offset. |
792 | */ | 792 | */ |
793 | /* fixup the stack */ | 793 | /* fixup the stack */ |
794 | PER_CPU(gdt_page, %ebx) | 794 | mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ |
795 | mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ | 795 | mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ |
796 | mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ | ||
797 | shl $16, %eax | 796 | shl $16, %eax |
798 | addl %esp, %eax /* the adjusted stack pointer */ | 797 | addl %esp, %eax /* the adjusted stack pointer */ |
799 | pushl $__KERNEL_DS | 798 | pushl $__KERNEL_DS |
@@ -914,7 +913,7 @@ ENTRY(simd_coprocessor_error) | |||
914 | .balign 4 | 913 | .balign 4 |
915 | .long 661b | 914 | .long 661b |
916 | .long 663f | 915 | .long 663f |
917 | .byte X86_FEATURE_XMM | 916 | .word X86_FEATURE_XMM |
918 | .byte 662b-661b | 917 | .byte 662b-661b |
919 | .byte 664f-663f | 918 | .byte 664f-663f |
920 | .previous | 919 | .previous |
@@ -1166,6 +1165,9 @@ ENTRY(xen_failsafe_callback) | |||
1166 | .previous | 1165 | .previous |
1167 | ENDPROC(xen_failsafe_callback) | 1166 | ENDPROC(xen_failsafe_callback) |
1168 | 1167 | ||
1168 | BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, | ||
1169 | xen_evtchn_do_upcall) | ||
1170 | |||
1169 | #endif /* CONFIG_XEN */ | 1171 | #endif /* CONFIG_XEN */ |
1170 | 1172 | ||
1171 | #ifdef CONFIG_FUNCTION_TRACER | 1173 | #ifdef CONFIG_FUNCTION_TRACER |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 4db7c4d12ffa..17be5ec7cbba 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -1065,6 +1065,7 @@ ENTRY(\sym) | |||
1065 | END(\sym) | 1065 | END(\sym) |
1066 | .endm | 1066 | .endm |
1067 | 1067 | ||
1068 | #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) | ||
1068 | .macro paranoidzeroentry_ist sym do_sym ist | 1069 | .macro paranoidzeroentry_ist sym do_sym ist |
1069 | ENTRY(\sym) | 1070 | ENTRY(\sym) |
1070 | INTR_FRAME | 1071 | INTR_FRAME |
@@ -1076,10 +1077,9 @@ ENTRY(\sym) | |||
1076 | TRACE_IRQS_OFF | 1077 | TRACE_IRQS_OFF |
1077 | movq %rsp,%rdi /* pt_regs pointer */ | 1078 | movq %rsp,%rdi /* pt_regs pointer */ |
1078 | xorl %esi,%esi /* no error code */ | 1079 | xorl %esi,%esi /* no error code */ |
1079 | PER_CPU(init_tss, %r12) | 1080 | subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) |
1080 | subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) | ||
1081 | call \do_sym | 1081 | call \do_sym |
1082 | addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) | 1082 | addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) |
1083 | jmp paranoid_exit /* %ebx: no swapgs flag */ | 1083 | jmp paranoid_exit /* %ebx: no swapgs flag */ |
1084 | CFI_ENDPROC | 1084 | CFI_ENDPROC |
1085 | END(\sym) | 1085 | END(\sym) |
@@ -1185,13 +1185,13 @@ END(kernel_thread_helper) | |||
1185 | * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. | 1185 | * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. |
1186 | * | 1186 | * |
1187 | * C extern interface: | 1187 | * C extern interface: |
1188 | * extern long execve(char *name, char **argv, char **envp) | 1188 | * extern long execve(const char *name, char **argv, char **envp) |
1189 | * | 1189 | * |
1190 | * asm input arguments: | 1190 | * asm input arguments: |
1191 | * rdi: name, rsi: argv, rdx: envp | 1191 | * rdi: name, rsi: argv, rdx: envp |
1192 | * | 1192 | * |
1193 | * We want to fallback into: | 1193 | * We want to fallback into: |
1194 | * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) | 1194 | * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs) |
1195 | * | 1195 | * |
1196 | * do_sys_execve asm fallback arguments: | 1196 | * do_sys_execve asm fallback arguments: |
1197 | * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack | 1197 | * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack |
@@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback) | |||
1329 | CFI_ENDPROC | 1329 | CFI_ENDPROC |
1330 | END(xen_failsafe_callback) | 1330 | END(xen_failsafe_callback) |
1331 | 1331 | ||
1332 | apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ | ||
1333 | xen_hvm_callback_vector xen_evtchn_do_upcall | ||
1334 | |||
1332 | #endif /* CONFIG_XEN */ | 1335 | #endif /* CONFIG_XEN */ |
1333 | 1336 | ||
1334 | /* | 1337 | /* |
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index b2e246037392..784360c0625c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c | |||
@@ -20,7 +20,7 @@ | |||
20 | 20 | ||
21 | static void __init i386_default_early_setup(void) | 21 | static void __init i386_default_early_setup(void) |
22 | { | 22 | { |
23 | /* Initilize 32bit specific setup functions */ | 23 | /* Initialize 32bit specific setup functions */ |
24 | x86_init.resources.probe_roms = probe_roms; | 24 | x86_init.resources.probe_roms = probe_roms; |
25 | x86_init.resources.reserve_resources = i386_reserve_resources; | 25 | x86_init.resources.reserve_resources = i386_reserve_resources; |
26 | x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; | 26 | x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; |
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 37c3d4b17d85..ff4c453e13f3 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -131,6 +131,12 @@ ENTRY(startup_32) | |||
131 | movsl | 131 | movsl |
132 | 1: | 132 | 1: |
133 | 133 | ||
134 | #ifdef CONFIG_OLPC_OPENFIRMWARE | ||
135 | /* save OFW's pgdir table for later use when calling into OFW */ | ||
136 | movl %cr3, %eax | ||
137 | movl %eax, pa(olpc_ofw_pgd) | ||
138 | #endif | ||
139 | |||
134 | #ifdef CONFIG_PARAVIRT | 140 | #ifdef CONFIG_PARAVIRT |
135 | /* This is can only trip for a broken bootloader... */ | 141 | /* This is can only trip for a broken bootloader... */ |
136 | cmpw $0x207, pa(boot_params + BP_version) | 142 | cmpw $0x207, pa(boot_params + BP_version) |
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3d1e6f16b7a6..239046bd447f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64) | |||
234 | * init data section till per cpu areas are set up. | 234 | * init data section till per cpu areas are set up. |
235 | */ | 235 | */ |
236 | movl $MSR_GS_BASE,%ecx | 236 | movl $MSR_GS_BASE,%ecx |
237 | movq initial_gs(%rip),%rax | 237 | movl initial_gs(%rip),%eax |
238 | movq %rax,%rdx | 238 | movl initial_gs+4(%rip),%edx |
239 | shrq $32,%rdx | ||
240 | wrmsr | 239 | wrmsr |
241 | 240 | ||
242 | /* esi is pointer to real mode structure with interesting info. | 241 | /* esi is pointer to real mode structure with interesting info. |
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba390d731175..351f9c0fea1f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <asm/hpet.h> | 16 | #include <asm/hpet.h> |
17 | 17 | ||
18 | #define HPET_MASK CLOCKSOURCE_MASK(32) | 18 | #define HPET_MASK CLOCKSOURCE_MASK(32) |
19 | #define HPET_SHIFT 22 | ||
20 | 19 | ||
21 | /* FSEC = 10^-15 | 20 | /* FSEC = 10^-15 |
22 | NSEC = 10^-9 */ | 21 | NSEC = 10^-9 */ |
@@ -583,7 +582,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) | |||
583 | * scaled math multiplication factor for nanosecond to hpet tick | 582 | * scaled math multiplication factor for nanosecond to hpet tick |
584 | * conversion. | 583 | * conversion. |
585 | */ | 584 | */ |
586 | hpet_freq = 1000000000000000ULL; | 585 | hpet_freq = FSEC_PER_SEC; |
587 | do_div(hpet_freq, hpet_period); | 586 | do_div(hpet_freq, hpet_period); |
588 | evt->mult = div_sc((unsigned long) hpet_freq, | 587 | evt->mult = div_sc((unsigned long) hpet_freq, |
589 | NSEC_PER_SEC, evt->shift); | 588 | NSEC_PER_SEC, evt->shift); |
@@ -787,7 +786,6 @@ static struct clocksource clocksource_hpet = { | |||
787 | .rating = 250, | 786 | .rating = 250, |
788 | .read = read_hpet, | 787 | .read = read_hpet, |
789 | .mask = HPET_MASK, | 788 | .mask = HPET_MASK, |
790 | .shift = HPET_SHIFT, | ||
791 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 789 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
792 | .resume = hpet_resume_counter, | 790 | .resume = hpet_resume_counter, |
793 | #ifdef CONFIG_X86_64 | 791 | #ifdef CONFIG_X86_64 |
@@ -798,6 +796,7 @@ static struct clocksource clocksource_hpet = { | |||
798 | static int hpet_clocksource_register(void) | 796 | static int hpet_clocksource_register(void) |
799 | { | 797 | { |
800 | u64 start, now; | 798 | u64 start, now; |
799 | u64 hpet_freq; | ||
801 | cycle_t t1; | 800 | cycle_t t1; |
802 | 801 | ||
803 | /* Start the counter */ | 802 | /* Start the counter */ |
@@ -832,9 +831,15 @@ static int hpet_clocksource_register(void) | |||
832 | * mult = (hpet_period * 2^shift)/10^6 | 831 | * mult = (hpet_period * 2^shift)/10^6 |
833 | * mult = (hpet_period << shift)/FSEC_PER_NSEC | 832 | * mult = (hpet_period << shift)/FSEC_PER_NSEC |
834 | */ | 833 | */ |
835 | clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); | ||
836 | 834 | ||
837 | clocksource_register(&clocksource_hpet); | 835 | /* Need to convert hpet_period (fsec/cyc) to cyc/sec: |
836 | * | ||
837 | * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) | ||
838 | * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period | ||
839 | */ | ||
840 | hpet_freq = FSEC_PER_SEC; | ||
841 | do_div(hpet_freq, hpet_period); | ||
842 | clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); | ||
838 | 843 | ||
839 | return 0; | 844 | return 0; |
840 | } | 845 | } |
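The HPET change drops the fixed HPET_SHIFT/mult pair and instead hands clocksource_register_hz() a frequency derived from the hardware period: hpet_period is femtoseconds per counter tick, so Hz = FSEC_PER_SEC / hpet_period. A small standalone sketch of that conversion; the 69841279 fs value is only an assumed example corresponding to a typical ~14.318 MHz HPET.

#include <stdint.h>
#include <stdio.h>

#define FSEC_PER_SEC 1000000000000000ULL   /* 10^15 femtoseconds */

/* Convert an HPET period (femtoseconds per tick, as read from the
 * hardware) into a frequency in Hz, the value the patch now feeds to
 * clocksource_register_hz(). */
static uint64_t hpet_hz_from_period(uint64_t hpet_period_fs)
{
    return FSEC_PER_SEC / hpet_period_fs;
}

int main(void)
{
    printf("%llu Hz\n", (unsigned long long)hpet_hz_from_period(69841279));
    return 0;
}
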
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index a8f1b803d2fd..a474ec37c32f 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c | |||
@@ -208,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type, | |||
208 | { | 208 | { |
209 | /* Len */ | 209 | /* Len */ |
210 | switch (x86_len) { | 210 | switch (x86_len) { |
211 | case X86_BREAKPOINT_LEN_X: | ||
212 | *gen_len = sizeof(long); | ||
213 | break; | ||
211 | case X86_BREAKPOINT_LEN_1: | 214 | case X86_BREAKPOINT_LEN_1: |
212 | *gen_len = HW_BREAKPOINT_LEN_1; | 215 | *gen_len = HW_BREAKPOINT_LEN_1; |
213 | break; | 216 | break; |
@@ -251,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp) | |||
251 | 254 | ||
252 | info->address = bp->attr.bp_addr; | 255 | info->address = bp->attr.bp_addr; |
253 | 256 | ||
257 | /* Type */ | ||
258 | switch (bp->attr.bp_type) { | ||
259 | case HW_BREAKPOINT_W: | ||
260 | info->type = X86_BREAKPOINT_WRITE; | ||
261 | break; | ||
262 | case HW_BREAKPOINT_W | HW_BREAKPOINT_R: | ||
263 | info->type = X86_BREAKPOINT_RW; | ||
264 | break; | ||
265 | case HW_BREAKPOINT_X: | ||
266 | info->type = X86_BREAKPOINT_EXECUTE; | ||
267 | /* | ||
268 | * x86 inst breakpoints need to have a specific undefined len. | ||
269 | * But we still need to check userspace is not trying to setup | ||
270 | * an unsupported length, to get a range breakpoint for example. | ||
271 | */ | ||
272 | if (bp->attr.bp_len == sizeof(long)) { | ||
273 | info->len = X86_BREAKPOINT_LEN_X; | ||
274 | return 0; | ||
275 | } | ||
276 | default: | ||
277 | return -EINVAL; | ||
278 | } | ||
279 | |||
254 | /* Len */ | 280 | /* Len */ |
255 | switch (bp->attr.bp_len) { | 281 | switch (bp->attr.bp_len) { |
256 | case HW_BREAKPOINT_LEN_1: | 282 | case HW_BREAKPOINT_LEN_1: |
@@ -271,21 +297,6 @@ static int arch_build_bp_info(struct perf_event *bp) | |||
271 | return -EINVAL; | 297 | return -EINVAL; |
272 | } | 298 | } |
273 | 299 | ||
274 | /* Type */ | ||
275 | switch (bp->attr.bp_type) { | ||
276 | case HW_BREAKPOINT_W: | ||
277 | info->type = X86_BREAKPOINT_WRITE; | ||
278 | break; | ||
279 | case HW_BREAKPOINT_W | HW_BREAKPOINT_R: | ||
280 | info->type = X86_BREAKPOINT_RW; | ||
281 | break; | ||
282 | case HW_BREAKPOINT_X: | ||
283 | info->type = X86_BREAKPOINT_EXECUTE; | ||
284 | break; | ||
285 | default: | ||
286 | return -EINVAL; | ||
287 | } | ||
288 | |||
289 | return 0; | 300 | return 0; |
290 | } | 301 | } |
291 | /* | 302 | /* |
@@ -305,6 +316,9 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) | |||
305 | ret = -EINVAL; | 316 | ret = -EINVAL; |
306 | 317 | ||
307 | switch (info->len) { | 318 | switch (info->len) { |
319 | case X86_BREAKPOINT_LEN_X: | ||
320 | align = sizeof(long) -1; | ||
321 | break; | ||
308 | case X86_BREAKPOINT_LEN_1: | 322 | case X86_BREAKPOINT_LEN_1: |
309 | align = 0; | 323 | align = 0; |
310 | break; | 324 | break; |
@@ -466,6 +480,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) | |||
466 | 480 | ||
467 | perf_bp_event(bp, args->regs); | 481 | perf_bp_event(bp, args->regs); |
468 | 482 | ||
483 | /* | ||
484 | * Set up resume flag to avoid breakpoint recursion when | ||
485 | * returning back to origin. | ||
486 | */ | ||
487 | if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) | ||
488 | args->regs->flags |= X86_EFLAGS_RF; | ||
489 | |||
469 | rcu_read_unlock(); | 490 | rcu_read_unlock(); |
470 | } | 491 | } |
471 | /* | 492 | /* |
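With the hw_breakpoint change above, execute (instruction) breakpoints must be requested with bp_len equal to sizeof(long), which the arch code maps to X86_BREAKPOINT_LEN_X; any other length now fails with -EINVAL, and the exception handler raises the resume flag (RF) so the breakpoint does not immediately re-trigger on return. A hedged userspace sketch of requesting such a breakpoint through perf_event_open(); error handling is omitted and monitoring main is just an example.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

/* Request an execute hardware breakpoint on 'addr' for the calling
 * process.  After this patch, bp_len for HW_BREAKPOINT_X must be
 * sizeof(long) or the x86 backend rejects the request. */
static int set_exec_breakpoint(void *addr)
{
    struct perf_event_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_BREAKPOINT;
    attr.size = sizeof(attr);
    attr.bp_type = HW_BREAKPOINT_X;
    attr.bp_addr = (unsigned long)addr;
    attr.bp_len = sizeof(long);        /* mandatory for execute breakpoints */

    return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}

int main(void)
{
    int fd = set_exec_breakpoint((void *)main);

    printf("perf_event_open returned %d\n", fd);
    return 0;
}
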
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 86cef6b32253..1f11f5ce668f 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
@@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void) | |||
59 | stts(); | 59 | stts(); |
60 | } | 60 | } |
61 | 61 | ||
62 | void __cpuinit init_thread_xstate(void) | 62 | static void __cpuinit init_thread_xstate(void) |
63 | { | 63 | { |
64 | /* | ||
65 | * Note that xstate_size might be overwriten later during | ||
66 | * xsave_init(). | ||
67 | */ | ||
68 | |||
64 | if (!HAVE_HWFP) { | 69 | if (!HAVE_HWFP) { |
65 | xstate_size = sizeof(struct i387_soft_struct); | 70 | xstate_size = sizeof(struct i387_soft_struct); |
66 | return; | 71 | return; |
67 | } | 72 | } |
68 | 73 | ||
69 | if (cpu_has_xsave) { | ||
70 | xsave_cntxt_init(); | ||
71 | return; | ||
72 | } | ||
73 | |||
74 | if (cpu_has_fxsr) | 74 | if (cpu_has_fxsr) |
75 | xstate_size = sizeof(struct i387_fxsave_struct); | 75 | xstate_size = sizeof(struct i387_fxsave_struct); |
76 | #ifdef CONFIG_X86_32 | 76 | #ifdef CONFIG_X86_32 |
@@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void) | |||
84 | * Called at bootup to set up the initial FPU state that is later cloned | 84 | * Called at bootup to set up the initial FPU state that is later cloned |
85 | * into all processes. | 85 | * into all processes. |
86 | */ | 86 | */ |
87 | |||
87 | void __cpuinit fpu_init(void) | 88 | void __cpuinit fpu_init(void) |
88 | { | 89 | { |
89 | unsigned long oldcr0 = read_cr0(); | 90 | unsigned long oldcr0 = read_cr0(); |
@@ -93,21 +94,26 @@ void __cpuinit fpu_init(void) | |||
93 | 94 | ||
94 | write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ | 95 | write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ |
95 | 96 | ||
96 | /* | ||
97 | * Boot processor to setup the FP and extended state context info. | ||
98 | */ | ||
99 | if (!smp_processor_id()) | 97 | if (!smp_processor_id()) |
100 | init_thread_xstate(); | 98 | init_thread_xstate(); |
101 | xsave_init(); | ||
102 | 99 | ||
103 | mxcsr_feature_mask_init(); | 100 | mxcsr_feature_mask_init(); |
104 | /* clean state in init */ | 101 | /* clean state in init */ |
105 | current_thread_info()->status = 0; | 102 | current_thread_info()->status = 0; |
106 | clear_used_math(); | 103 | clear_used_math(); |
107 | } | 104 | } |
108 | #endif /* CONFIG_X86_64 */ | ||
109 | 105 | ||
110 | static void fpu_finit(struct fpu *fpu) | 106 | #else /* CONFIG_X86_64 */ |
107 | |||
108 | void __cpuinit fpu_init(void) | ||
109 | { | ||
110 | if (!smp_processor_id()) | ||
111 | init_thread_xstate(); | ||
112 | } | ||
113 | |||
114 | #endif /* CONFIG_X86_32 */ | ||
115 | |||
116 | void fpu_finit(struct fpu *fpu) | ||
111 | { | 117 | { |
112 | #ifdef CONFIG_X86_32 | 118 | #ifdef CONFIG_X86_32 |
113 | if (!HAVE_HWFP) { | 119 | if (!HAVE_HWFP) { |
@@ -132,6 +138,7 @@ static void fpu_finit(struct fpu *fpu) | |||
132 | fp->fos = 0xffff0000u; | 138 | fp->fos = 0xffff0000u; |
133 | } | 139 | } |
134 | } | 140 | } |
141 | EXPORT_SYMBOL_GPL(fpu_finit); | ||
135 | 142 | ||
136 | /* | 143 | /* |
137 | * The _current_ task is using the FPU for the first time | 144 | * The _current_ task is using the FPU for the first time |
@@ -190,6 +197,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, | |||
190 | if (ret) | 197 | if (ret) |
191 | return ret; | 198 | return ret; |
192 | 199 | ||
200 | sanitize_i387_state(target); | ||
201 | |||
193 | return user_regset_copyout(&pos, &count, &kbuf, &ubuf, | 202 | return user_regset_copyout(&pos, &count, &kbuf, &ubuf, |
194 | &target->thread.fpu.state->fxsave, 0, -1); | 203 | &target->thread.fpu.state->fxsave, 0, -1); |
195 | } | 204 | } |
@@ -207,6 +216,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, | |||
207 | if (ret) | 216 | if (ret) |
208 | return ret; | 217 | return ret; |
209 | 218 | ||
219 | sanitize_i387_state(target); | ||
220 | |||
210 | ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, | 221 | ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, |
211 | &target->thread.fpu.state->fxsave, 0, -1); | 222 | &target->thread.fpu.state->fxsave, 0, -1); |
212 | 223 | ||
@@ -446,6 +457,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, | |||
446 | -1); | 457 | -1); |
447 | } | 458 | } |
448 | 459 | ||
460 | sanitize_i387_state(target); | ||
461 | |||
449 | if (kbuf && pos == 0 && count == sizeof(env)) { | 462 | if (kbuf && pos == 0 && count == sizeof(env)) { |
450 | convert_from_fxsr(kbuf, target); | 463 | convert_from_fxsr(kbuf, target); |
451 | return 0; | 464 | return 0; |
@@ -467,6 +480,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, | |||
467 | if (ret) | 480 | if (ret) |
468 | return ret; | 481 | return ret; |
469 | 482 | ||
483 | sanitize_i387_state(target); | ||
484 | |||
470 | if (!HAVE_HWFP) | 485 | if (!HAVE_HWFP) |
471 | return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); | 486 | return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); |
472 | 487 | ||
@@ -533,6 +548,9 @@ static int save_i387_xsave(void __user *buf) | |||
533 | struct _fpstate_ia32 __user *fx = buf; | 548 | struct _fpstate_ia32 __user *fx = buf; |
534 | int err = 0; | 549 | int err = 0; |
535 | 550 | ||
551 | |||
552 | sanitize_i387_state(tsk); | ||
553 | |||
536 | /* | 554 | /* |
537 | * For legacy compatible, we always set FP/SSE bits in the bit | 555 | * For legacy compatible, we always set FP/SSE bits in the bit |
538 | * vector while saving the state to the user context. | 556 | * vector while saving the state to the user context. |
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 01ab17ae2ae7..ef10940e1af0 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c | |||
@@ -49,55 +49,94 @@ | |||
49 | #include <asm/system.h> | 49 | #include <asm/system.h> |
50 | #include <asm/apic.h> | 50 | #include <asm/apic.h> |
51 | 51 | ||
52 | /** | 52 | struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = |
53 | * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs | ||
54 | * @gdb_regs: A pointer to hold the registers in the order GDB wants. | ||
55 | * @regs: The &struct pt_regs of the current process. | ||
56 | * | ||
57 | * Convert the pt_regs in @regs into the format for registers that | ||
58 | * GDB expects, stored in @gdb_regs. | ||
59 | */ | ||
60 | void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) | ||
61 | { | 53 | { |
62 | #ifndef CONFIG_X86_32 | 54 | #ifdef CONFIG_X86_32 |
63 | u32 *gdb_regs32 = (u32 *)gdb_regs; | 55 | { "ax", 4, offsetof(struct pt_regs, ax) }, |
56 | { "cx", 4, offsetof(struct pt_regs, cx) }, | ||
57 | { "dx", 4, offsetof(struct pt_regs, dx) }, | ||
58 | { "bx", 4, offsetof(struct pt_regs, bx) }, | ||
59 | { "sp", 4, offsetof(struct pt_regs, sp) }, | ||
60 | { "bp", 4, offsetof(struct pt_regs, bp) }, | ||
61 | { "si", 4, offsetof(struct pt_regs, si) }, | ||
62 | { "di", 4, offsetof(struct pt_regs, di) }, | ||
63 | { "ip", 4, offsetof(struct pt_regs, ip) }, | ||
64 | { "flags", 4, offsetof(struct pt_regs, flags) }, | ||
65 | { "cs", 4, offsetof(struct pt_regs, cs) }, | ||
66 | { "ss", 4, offsetof(struct pt_regs, ss) }, | ||
67 | { "ds", 4, offsetof(struct pt_regs, ds) }, | ||
68 | { "es", 4, offsetof(struct pt_regs, es) }, | ||
69 | { "fs", 4, -1 }, | ||
70 | { "gs", 4, -1 }, | ||
71 | #else | ||
72 | { "ax", 8, offsetof(struct pt_regs, ax) }, | ||
73 | { "bx", 8, offsetof(struct pt_regs, bx) }, | ||
74 | { "cx", 8, offsetof(struct pt_regs, cx) }, | ||
75 | { "dx", 8, offsetof(struct pt_regs, dx) }, | ||
76 | { "si", 8, offsetof(struct pt_regs, dx) }, | ||
77 | { "di", 8, offsetof(struct pt_regs, di) }, | ||
78 | { "bp", 8, offsetof(struct pt_regs, bp) }, | ||
79 | { "sp", 8, offsetof(struct pt_regs, sp) }, | ||
80 | { "r8", 8, offsetof(struct pt_regs, r8) }, | ||
81 | { "r9", 8, offsetof(struct pt_regs, r9) }, | ||
82 | { "r10", 8, offsetof(struct pt_regs, r10) }, | ||
83 | { "r11", 8, offsetof(struct pt_regs, r11) }, | ||
84 | { "r12", 8, offsetof(struct pt_regs, r12) }, | ||
85 | { "r13", 8, offsetof(struct pt_regs, r13) }, | ||
86 | { "r14", 8, offsetof(struct pt_regs, r14) }, | ||
87 | { "r15", 8, offsetof(struct pt_regs, r15) }, | ||
88 | { "ip", 8, offsetof(struct pt_regs, ip) }, | ||
89 | { "flags", 4, offsetof(struct pt_regs, flags) }, | ||
90 | { "cs", 4, offsetof(struct pt_regs, cs) }, | ||
91 | { "ss", 4, offsetof(struct pt_regs, ss) }, | ||
64 | #endif | 92 | #endif |
65 | gdb_regs[GDB_AX] = regs->ax; | 93 | }; |
66 | gdb_regs[GDB_BX] = regs->bx; | 94 | |
67 | gdb_regs[GDB_CX] = regs->cx; | 95 | int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) |
68 | gdb_regs[GDB_DX] = regs->dx; | 96 | { |
69 | gdb_regs[GDB_SI] = regs->si; | 97 | if ( |
70 | gdb_regs[GDB_DI] = regs->di; | ||
71 | gdb_regs[GDB_BP] = regs->bp; | ||
72 | gdb_regs[GDB_PC] = regs->ip; | ||
73 | #ifdef CONFIG_X86_32 | 98 | #ifdef CONFIG_X86_32 |
74 | gdb_regs[GDB_PS] = regs->flags; | 99 | regno == GDB_SS || regno == GDB_FS || regno == GDB_GS || |
75 | gdb_regs[GDB_DS] = regs->ds; | 100 | #endif |
76 | gdb_regs[GDB_ES] = regs->es; | 101 | regno == GDB_SP || regno == GDB_ORIG_AX) |
77 | gdb_regs[GDB_CS] = regs->cs; | 102 | return 0; |
78 | gdb_regs[GDB_FS] = 0xFFFF; | 103 | |
79 | gdb_regs[GDB_GS] = 0xFFFF; | 104 | if (dbg_reg_def[regno].offset != -1) |
80 | if (user_mode_vm(regs)) { | 105 | memcpy((void *)regs + dbg_reg_def[regno].offset, mem, |
81 | gdb_regs[GDB_SS] = regs->ss; | 106 | dbg_reg_def[regno].size); |
82 | gdb_regs[GDB_SP] = regs->sp; | 107 | return 0; |
83 | } else { | 108 | } |
84 | gdb_regs[GDB_SS] = __KERNEL_DS; | 109 | |
85 | gdb_regs[GDB_SP] = kernel_stack_pointer(regs); | 110 | char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) |
111 | { | ||
112 | if (regno == GDB_ORIG_AX) { | ||
113 | memcpy(mem, ®s->orig_ax, sizeof(regs->orig_ax)); | ||
114 | return "orig_ax"; | ||
86 | } | 115 | } |
87 | #else | 116 | if (regno >= DBG_MAX_REG_NUM || regno < 0) |
88 | gdb_regs[GDB_R8] = regs->r8; | 117 | return NULL; |
89 | gdb_regs[GDB_R9] = regs->r9; | 118 | |
90 | gdb_regs[GDB_R10] = regs->r10; | 119 | if (dbg_reg_def[regno].offset != -1) |
91 | gdb_regs[GDB_R11] = regs->r11; | 120 | memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, |
92 | gdb_regs[GDB_R12] = regs->r12; | 121 | dbg_reg_def[regno].size); |
93 | gdb_regs[GDB_R13] = regs->r13; | 122 | |
94 | gdb_regs[GDB_R14] = regs->r14; | 123 | switch (regno) { |
95 | gdb_regs[GDB_R15] = regs->r15; | 124 | #ifdef CONFIG_X86_32 |
96 | gdb_regs32[GDB_PS] = regs->flags; | 125 | case GDB_SS: |
97 | gdb_regs32[GDB_CS] = regs->cs; | 126 | if (!user_mode_vm(regs)) |
98 | gdb_regs32[GDB_SS] = regs->ss; | 127 | *(unsigned long *)mem = __KERNEL_DS; |
99 | gdb_regs[GDB_SP] = kernel_stack_pointer(regs); | 128 | break; |
129 | case GDB_SP: | ||
130 | if (!user_mode_vm(regs)) | ||
131 | *(unsigned long *)mem = kernel_stack_pointer(regs); | ||
132 | break; | ||
133 | case GDB_GS: | ||
134 | case GDB_FS: | ||
135 | *(unsigned long *)mem = 0xFFFF; | ||
136 | break; | ||
100 | #endif | 137 | #endif |
138 | } | ||
139 | return dbg_reg_def[regno].name; | ||
101 | } | 140 | } |
102 | 141 | ||
103 | /** | 142 | /** |
@@ -150,54 +189,13 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) | |||
150 | gdb_regs[GDB_SP] = p->thread.sp; | 189 | gdb_regs[GDB_SP] = p->thread.sp; |
151 | } | 190 | } |
152 | 191 | ||
153 | /** | ||
154 | * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs. | ||
155 | * @gdb_regs: A pointer to hold the registers we've received from GDB. | ||
156 | * @regs: A pointer to a &struct pt_regs to hold these values in. | ||
157 | * | ||
158 | * Convert the GDB regs in @gdb_regs into the pt_regs, and store them | ||
159 | * in @regs. | ||
160 | */ | ||
161 | void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) | ||
162 | { | ||
163 | #ifndef CONFIG_X86_32 | ||
164 | u32 *gdb_regs32 = (u32 *)gdb_regs; | ||
165 | #endif | ||
166 | regs->ax = gdb_regs[GDB_AX]; | ||
167 | regs->bx = gdb_regs[GDB_BX]; | ||
168 | regs->cx = gdb_regs[GDB_CX]; | ||
169 | regs->dx = gdb_regs[GDB_DX]; | ||
170 | regs->si = gdb_regs[GDB_SI]; | ||
171 | regs->di = gdb_regs[GDB_DI]; | ||
172 | regs->bp = gdb_regs[GDB_BP]; | ||
173 | regs->ip = gdb_regs[GDB_PC]; | ||
174 | #ifdef CONFIG_X86_32 | ||
175 | regs->flags = gdb_regs[GDB_PS]; | ||
176 | regs->ds = gdb_regs[GDB_DS]; | ||
177 | regs->es = gdb_regs[GDB_ES]; | ||
178 | regs->cs = gdb_regs[GDB_CS]; | ||
179 | #else | ||
180 | regs->r8 = gdb_regs[GDB_R8]; | ||
181 | regs->r9 = gdb_regs[GDB_R9]; | ||
182 | regs->r10 = gdb_regs[GDB_R10]; | ||
183 | regs->r11 = gdb_regs[GDB_R11]; | ||
184 | regs->r12 = gdb_regs[GDB_R12]; | ||
185 | regs->r13 = gdb_regs[GDB_R13]; | ||
186 | regs->r14 = gdb_regs[GDB_R14]; | ||
187 | regs->r15 = gdb_regs[GDB_R15]; | ||
188 | regs->flags = gdb_regs32[GDB_PS]; | ||
189 | regs->cs = gdb_regs32[GDB_CS]; | ||
190 | regs->ss = gdb_regs32[GDB_SS]; | ||
191 | #endif | ||
192 | } | ||
193 | |||
194 | static struct hw_breakpoint { | 192 | static struct hw_breakpoint { |
195 | unsigned enabled; | 193 | unsigned enabled; |
196 | unsigned long addr; | 194 | unsigned long addr; |
197 | int len; | 195 | int len; |
198 | int type; | 196 | int type; |
199 | struct perf_event **pev; | 197 | struct perf_event **pev; |
200 | } breakinfo[4]; | 198 | } breakinfo[HBP_NUM]; |
201 | 199 | ||
202 | static unsigned long early_dr7; | 200 | static unsigned long early_dr7; |
203 | 201 | ||
@@ -205,7 +203,7 @@ static void kgdb_correct_hw_break(void) | |||
205 | { | 203 | { |
206 | int breakno; | 204 | int breakno; |
207 | 205 | ||
208 | for (breakno = 0; breakno < 4; breakno++) { | 206 | for (breakno = 0; breakno < HBP_NUM; breakno++) { |
209 | struct perf_event *bp; | 207 | struct perf_event *bp; |
210 | struct arch_hw_breakpoint *info; | 208 | struct arch_hw_breakpoint *info; |
211 | int val; | 209 | int val; |
@@ -292,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) | |||
292 | { | 290 | { |
293 | int i; | 291 | int i; |
294 | 292 | ||
295 | for (i = 0; i < 4; i++) | 293 | for (i = 0; i < HBP_NUM; i++) |
296 | if (breakinfo[i].addr == addr && breakinfo[i].enabled) | 294 | if (breakinfo[i].addr == addr && breakinfo[i].enabled) |
297 | break; | 295 | break; |
298 | if (i == 4) | 296 | if (i == HBP_NUM) |
299 | return -1; | 297 | return -1; |
300 | 298 | ||
301 | if (hw_break_release_slot(i)) { | 299 | if (hw_break_release_slot(i)) { |
@@ -313,7 +311,7 @@ static void kgdb_remove_all_hw_break(void) | |||
313 | int cpu = raw_smp_processor_id(); | 311 | int cpu = raw_smp_processor_id(); |
314 | struct perf_event *bp; | 312 | struct perf_event *bp; |
315 | 313 | ||
316 | for (i = 0; i < 4; i++) { | 314 | for (i = 0; i < HBP_NUM; i++) { |
317 | if (!breakinfo[i].enabled) | 315 | if (!breakinfo[i].enabled) |
318 | continue; | 316 | continue; |
319 | bp = *per_cpu_ptr(breakinfo[i].pev, cpu); | 317 | bp = *per_cpu_ptr(breakinfo[i].pev, cpu); |
@@ -333,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) | |||
333 | { | 331 | { |
334 | int i; | 332 | int i; |
335 | 333 | ||
336 | for (i = 0; i < 4; i++) | 334 | for (i = 0; i < HBP_NUM; i++) |
337 | if (!breakinfo[i].enabled) | 335 | if (!breakinfo[i].enabled) |
338 | break; | 336 | break; |
339 | if (i == 4) | 337 | if (i == HBP_NUM) |
340 | return -1; | 338 | return -1; |
341 | 339 | ||
342 | switch (bptype) { | 340 | switch (bptype) { |
@@ -397,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) | |||
397 | 395 | ||
398 | /* Disable hardware debugging while we are in kgdb: */ | 396 | /* Disable hardware debugging while we are in kgdb: */ |
399 | set_debugreg(0UL, 7); | 397 | set_debugreg(0UL, 7); |
400 | for (i = 0; i < 4; i++) { | 398 | for (i = 0; i < HBP_NUM; i++) { |
401 | if (!breakinfo[i].enabled) | 399 | if (!breakinfo[i].enabled) |
402 | continue; | 400 | continue; |
403 | if (dbg_is_early) { | 401 | if (dbg_is_early) { |
@@ -458,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, | |||
458 | { | 456 | { |
459 | unsigned long addr; | 457 | unsigned long addr; |
460 | char *ptr; | 458 | char *ptr; |
461 | int newPC; | ||
462 | 459 | ||
463 | switch (remcomInBuffer[0]) { | 460 | switch (remcomInBuffer[0]) { |
464 | case 'c': | 461 | case 'c': |
@@ -469,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, | |||
469 | linux_regs->ip = addr; | 466 | linux_regs->ip = addr; |
470 | case 'D': | 467 | case 'D': |
471 | case 'k': | 468 | case 'k': |
472 | newPC = linux_regs->ip; | ||
473 | |||
474 | /* clear the trace bit */ | 469 | /* clear the trace bit */ |
475 | linux_regs->flags &= ~X86_EFLAGS_TF; | 470 | linux_regs->flags &= ~X86_EFLAGS_TF; |
476 | atomic_set(&kgdb_cpu_doing_single_step, -1); | 471 | atomic_set(&kgdb_cpu_doing_single_step, -1); |
@@ -645,7 +640,7 @@ void kgdb_arch_late(void) | |||
645 | attr.bp_len = HW_BREAKPOINT_LEN_1; | 640 | attr.bp_len = HW_BREAKPOINT_LEN_1; |
646 | attr.bp_type = HW_BREAKPOINT_W; | 641 | attr.bp_type = HW_BREAKPOINT_W; |
647 | attr.disabled = 1; | 642 | attr.disabled = 1; |
648 | for (i = 0; i < 4; i++) { | 643 | for (i = 0; i < HBP_NUM; i++) { |
649 | if (breakinfo[i].pev) | 644 | if (breakinfo[i].pev) |
650 | continue; | 645 | continue; |
651 | breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); | 646 | breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); |
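The kgdb rework replaces the hand-written pt_regs/GDB register shuffling with a dbg_reg_def[] descriptor table (name, size, offset into struct pt_regs) that dbg_get_reg() and dbg_set_reg() consume generically. A minimal standalone sketch of that table-driven pattern, using a toy register structure rather than the real pt_regs.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Toy register file standing in for struct pt_regs. */
struct regs {
    unsigned long ax, bx, ip;
};

struct reg_def {
    const char *name;
    size_t size;
    int offset;          /* -1 would mean "not stored in struct regs" */
};

static const struct reg_def reg_def[] = {
    { "ax", sizeof(unsigned long), offsetof(struct regs, ax) },
    { "bx", sizeof(unsigned long), offsetof(struct regs, bx) },
    { "ip", sizeof(unsigned long), offsetof(struct regs, ip) },
};

/* Table-driven fetch, same pattern as dbg_get_reg() in the hunk:
 * index the descriptor, then memcpy from the offset it names. */
static const char *get_reg(int regno, void *mem, struct regs *regs)
{
    if (regno < 0 || regno >= (int)(sizeof(reg_def) / sizeof(reg_def[0])))
        return NULL;
    if (reg_def[regno].offset != -1)
        memcpy(mem, (char *)regs + reg_def[regno].offset,
               reg_def[regno].size);
    return reg_def[regno].name;
}

int main(void)
{
    struct regs r = { .ax = 1, .bx = 2, .ip = 0x1000 };
    unsigned long val;
    const char *name = get_reg(2, &val, &r);

    printf("%s = %#lx\n", name, val);
    return 0;
}
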
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 675879b65ce6..1bfb6cf4dd55 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c | |||
@@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to) | |||
126 | } | 126 | } |
127 | 127 | ||
128 | /* | 128 | /* |
129 | * Check for the REX prefix which can only exist on X86_64 | 129 | * Skip the prefixes of the instruction. |
130 | * X86_32 always returns 0 | ||
131 | */ | 130 | */ |
132 | static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) | 131 | static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) |
133 | { | 132 | { |
133 | insn_attr_t attr; | ||
134 | |||
135 | attr = inat_get_opcode_attribute((insn_byte_t)*insn); | ||
136 | while (inat_is_legacy_prefix(attr)) { | ||
137 | insn++; | ||
138 | attr = inat_get_opcode_attribute((insn_byte_t)*insn); | ||
139 | } | ||
134 | #ifdef CONFIG_X86_64 | 140 | #ifdef CONFIG_X86_64 |
135 | if ((*insn & 0xf0) == 0x40) | 141 | if (inat_is_rex_prefix(attr)) |
136 | return 1; | 142 | insn++; |
137 | #endif | 143 | #endif |
138 | return 0; | 144 | return insn; |
139 | } | 145 | } |
140 | 146 | ||
141 | /* | 147 | /* |
@@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr) | |||
272 | */ | 278 | */ |
273 | static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) | 279 | static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) |
274 | { | 280 | { |
281 | /* Skip prefixes */ | ||
282 | insn = skip_prefixes(insn); | ||
283 | |||
275 | switch (*insn) { | 284 | switch (*insn) { |
276 | case 0xfa: /* cli */ | 285 | case 0xfa: /* cli */ |
277 | case 0xfb: /* sti */ | 286 | case 0xfb: /* sti */ |
@@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) | |||
280 | return 1; | 289 | return 1; |
281 | } | 290 | } |
282 | 291 | ||
283 | /* | ||
284 | * on X86_64, 0x40-0x4f are REX prefixes so we need to look | ||
285 | * at the next byte instead.. but of course not recurse infinitely | ||
286 | */ | ||
287 | if (is_REX_prefix(insn)) | ||
288 | return is_IF_modifier(++insn); | ||
289 | |||
290 | return 0; | 292 | return 0; |
291 | } | 293 | } |
292 | 294 | ||
@@ -803,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p, | |||
803 | unsigned long orig_ip = (unsigned long)p->addr; | 805 | unsigned long orig_ip = (unsigned long)p->addr; |
804 | kprobe_opcode_t *insn = p->ainsn.insn; | 806 | kprobe_opcode_t *insn = p->ainsn.insn; |
805 | 807 | ||
806 | /*skip the REX prefix*/ | 808 | /* Skip prefixes */ |
807 | if (is_REX_prefix(insn)) | 809 | insn = skip_prefixes(insn); |
808 | insn++; | ||
809 | 810 | ||
810 | regs->flags &= ~X86_EFLAGS_TF; | 811 | regs->flags &= ~X86_EFLAGS_TF; |
811 | switch (*insn) { | 812 | switch (*insn) { |
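The kprobes change stops special-casing only the REX prefix and instead walks past every instruction prefix using the in-kernel instruction attribute tables (inat_get_opcode_attribute() and friends). A hedged standalone sketch of the same idea with a hard-coded legacy-prefix list; the kernel relies on its decoder tables, not on a list like this.

#include <stdint.h>
#include <stdio.h>

/* Recognize the common x86 legacy prefixes by opcode value. */
static int is_legacy_prefix(uint8_t b)
{
    switch (b) {
    case 0xf0: case 0xf2: case 0xf3:             /* lock / repne / rep */
    case 0x2e: case 0x36: case 0x3e: case 0x26:  /* segment overrides */
    case 0x64: case 0x65:
    case 0x66: case 0x67:                        /* operand/address size */
        return 1;
    }
    return 0;
}

/* Walk past legacy prefixes and, on 64-bit, one REX prefix, to reach the
 * opcode byte - the same job skip_prefixes() does in the hunk above. */
static const uint8_t *skip_prefixes(const uint8_t *insn)
{
    while (is_legacy_prefix(*insn))
        insn++;
#ifdef __x86_64__
    if ((*insn & 0xf0) == 0x40)                  /* REX prefix */
        insn++;
#endif
    return insn;
}

int main(void)
{
    /* f3 48 a5 = rep movsq: prefixes f3 and REX.W (48), opcode a5 */
    static const uint8_t code[] = { 0xf3, 0x48, 0xa5 };

    printf("opcode byte: 0x%02x\n", *skip_prefixes(code));
    return 0;
}
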
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d86dbf7e54be..d7b6f7fb4fec 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c | |||
@@ -274,6 +274,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) | |||
274 | 274 | ||
275 | void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } | 275 | void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } |
276 | 276 | ||
277 | static void __init smp_register_lapic_address(unsigned long address) | ||
278 | { | ||
279 | mp_lapic_addr = address; | ||
280 | |||
281 | set_fixmap_nocache(FIX_APIC_BASE, address); | ||
282 | if (boot_cpu_physical_apicid == -1U) { | ||
283 | boot_cpu_physical_apicid = read_apic_id(); | ||
284 | apic_version[boot_cpu_physical_apicid] = | ||
285 | GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
286 | } | ||
287 | } | ||
288 | |||
277 | static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) | 289 | static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) |
278 | { | 290 | { |
279 | char str[16]; | 291 | char str[16]; |
@@ -295,6 +307,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) | |||
295 | if (early) | 307 | if (early) |
296 | return 1; | 308 | return 1; |
297 | 309 | ||
310 | /* Initialize the lapic mapping */ | ||
311 | if (!acpi_lapic) | ||
312 | smp_register_lapic_address(mpc->lapic); | ||
313 | |||
298 | if (mpc->oemptr) | 314 | if (mpc->oemptr) |
299 | x86_init.mpparse.smp_read_mpc_oem(mpc); | 315 | x86_init.mpparse.smp_read_mpc_oem(mpc); |
300 | 316 | ||
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 5915e0b33303..79ae68154e87 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c | |||
@@ -25,8 +25,34 @@ | |||
25 | #include <asm/i8259.h> | 25 | #include <asm/i8259.h> |
26 | #include <asm/apb_timer.h> | 26 | #include <asm/apb_timer.h> |
27 | 27 | ||
28 | /* | ||
29 | * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, | ||
30 | * cmdline option x86_mrst_timer can be used to override the configuration | ||
31 | * to prefer one or the other. | ||
32 | * at runtime, there are basically three timer configurations: | ||
33 | * 1. per cpu apbt clock only | ||
34 | * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only | ||
35 | * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. | ||
36 | * | ||
37 | * by default (without cmdline option), platform code first detects cpu type | ||
38 | * to see if we are on lincroft or penwell, then set up both lapic or apbt | ||
39 | * clocks accordingly. | ||
40 | * i.e. by default, medfield uses configuration #2, moorestown uses #1. | ||
41 | * config #3 is supported but not recommended on medfield. | ||
42 | * | ||
43 | * rating and feature summary: | ||
44 | * lapic (with C3STOP) --------- 100 | ||
45 | * apbt (always-on) ------------ 110 | ||
46 | * lapic (always-on,ARAT) ------ 150 | ||
47 | */ | ||
48 | |||
49 | __cpuinitdata enum mrst_timer_options mrst_timer_options; | ||
50 | |||
28 | static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; | 51 | static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; |
29 | static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; | 52 | static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; |
53 | enum mrst_cpu_type __mrst_cpu_chip; | ||
54 | EXPORT_SYMBOL_GPL(__mrst_cpu_chip); | ||
55 | |||
30 | int sfi_mtimer_num; | 56 | int sfi_mtimer_num; |
31 | 57 | ||
32 | struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; | 58 | struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; |
@@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) | |||
167 | return 0; | 193 | return 0; |
168 | } | 194 | } |
169 | 195 | ||
170 | /* | ||
171 | * the secondary clock in Moorestown can be APBT or LAPIC clock, default to | ||
172 | * APBT but cmdline option can also override it. | ||
173 | */ | ||
174 | static void __cpuinit mrst_setup_secondary_clock(void) | ||
175 | { | ||
176 | /* restore default lapic clock if disabled by cmdline */ | ||
177 | if (disable_apbt_percpu) | ||
178 | return setup_secondary_APIC_clock(); | ||
179 | apbt_setup_secondary_clock(); | ||
180 | } | ||
181 | |||
182 | static unsigned long __init mrst_calibrate_tsc(void) | 196 | static unsigned long __init mrst_calibrate_tsc(void) |
183 | { | 197 | { |
184 | unsigned long flags, fast_calibrate; | 198 | unsigned long flags, fast_calibrate; |
@@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void) | |||
195 | 209 | ||
196 | void __init mrst_time_init(void) | 210 | void __init mrst_time_init(void) |
197 | { | 211 | { |
212 | switch (mrst_timer_options) { | ||
213 | case MRST_TIMER_APBT_ONLY: | ||
214 | break; | ||
215 | case MRST_TIMER_LAPIC_APBT: | ||
216 | x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; | ||
217 | x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; | ||
218 | break; | ||
219 | default: | ||
220 | if (!boot_cpu_has(X86_FEATURE_ARAT)) | ||
221 | break; | ||
222 | x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; | ||
223 | x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; | ||
224 | return; | ||
225 | } | ||
226 | /* we need at least one APB timer */ | ||
198 | sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); | 227 | sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); |
199 | pre_init_apic_IRQ0(); | 228 | pre_init_apic_IRQ0(); |
200 | apbt_time_init(); | 229 | apbt_time_init(); |
@@ -205,16 +234,21 @@ void __init mrst_rtc_init(void) | |||
205 | sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); | 234 | sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); |
206 | } | 235 | } |
207 | 236 | ||
208 | /* | 237 | void __cpuinit mrst_arch_setup(void) |
209 | * if we use per cpu apb timer, the bootclock already setup. if we use lapic | ||
210 | * timer and one apbt timer for broadcast, we need to set up lapic boot clock. | ||
211 | */ | ||
212 | static void __init mrst_setup_boot_clock(void) | ||
213 | { | 238 | { |
214 | pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); | 239 | if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) |
215 | if (disable_apbt_percpu) | 240 | __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; |
216 | setup_boot_APIC_clock(); | 241 | else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) |
217 | }; | 242 | __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; |
243 | else { | ||
244 | pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", | ||
245 | boot_cpu_data.x86, boot_cpu_data.x86_model); | ||
246 | __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; | ||
247 | } | ||
248 | pr_debug("Moorestown CPU %s identified\n", | ||
249 | (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? | ||
250 | "Lincroft" : "Penwell"); | ||
251 | } | ||
218 | 252 | ||
219 | /* MID systems don't have i8042 controller */ | 253 | /* MID systems don't have i8042 controller */ |
220 | static int mrst_i8042_detect(void) | 254 | static int mrst_i8042_detect(void) |
@@ -232,11 +266,13 @@ void __init x86_mrst_early_setup(void) | |||
232 | x86_init.resources.reserve_resources = x86_init_noop; | 266 | x86_init.resources.reserve_resources = x86_init_noop; |
233 | 267 | ||
234 | x86_init.timers.timer_init = mrst_time_init; | 268 | x86_init.timers.timer_init = mrst_time_init; |
235 | x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; | 269 | x86_init.timers.setup_percpu_clockev = x86_init_noop; |
236 | 270 | ||
237 | x86_init.irqs.pre_vector_init = x86_init_noop; | 271 | x86_init.irqs.pre_vector_init = x86_init_noop; |
238 | 272 | ||
239 | x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; | 273 | x86_init.oem.arch_setup = mrst_arch_setup; |
274 | |||
275 | x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; | ||
240 | 276 | ||
241 | x86_platform.calibrate_tsc = mrst_calibrate_tsc; | 277 | x86_platform.calibrate_tsc = mrst_calibrate_tsc; |
242 | x86_platform.i8042_detect = mrst_i8042_detect; | 278 | x86_platform.i8042_detect = mrst_i8042_detect; |
@@ -250,3 +286,26 @@ void __init x86_mrst_early_setup(void) | |||
250 | x86_init.mpparse.get_smp_config = x86_init_uint_noop; | 286 | x86_init.mpparse.get_smp_config = x86_init_uint_noop; |
251 | 287 | ||
252 | } | 288 | } |
289 | |||
290 | /* | ||
291 | * if user does not want to use per CPU apb timer, just give it a lower rating | ||
292 | * than local apic timer and skip the late per cpu timer init. | ||
293 | */ | ||
294 | static inline int __init setup_x86_mrst_timer(char *arg) | ||
295 | { | ||
296 | if (!arg) | ||
297 | return -EINVAL; | ||
298 | |||
299 | if (strcmp("apbt_only", arg) == 0) | ||
300 | mrst_timer_options = MRST_TIMER_APBT_ONLY; | ||
301 | else if (strcmp("lapic_and_apbt", arg) == 0) | ||
302 | mrst_timer_options = MRST_TIMER_LAPIC_APBT; | ||
303 | else { | ||
304 | pr_warning("X86 MRST timer option %s not recognised" | ||
305 | " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", | ||
306 | arg); | ||
307 | return -EINVAL; | ||
308 | } | ||
309 | return 0; | ||
310 | } | ||
311 | __setup("x86_mrst_timer=", setup_x86_mrst_timer); | ||
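The new x86_mrst_timer= boot parameter accepts exactly two values, apbt_only and lapic_and_apbt; anything else is rejected and the platform default (chosen from the CPU type, per the comment block earlier in this file) stays in effect. A small standalone mirror of that parsing logic; the enum values are assumed to correspond to the kernel's mrst_timer_options.

#include <stdio.h>
#include <string.h>

enum mrst_timer_options {
    MRST_TIMER_DEFAULT,
    MRST_TIMER_APBT_ONLY,
    MRST_TIMER_LAPIC_APBT,
};

/* Standalone mirror of the __setup("x86_mrst_timer=") handler above:
 * only the two strings below are accepted. */
static int parse_mrst_timer(const char *arg, enum mrst_timer_options *opt)
{
    if (!arg)
        return -1;
    if (strcmp(arg, "apbt_only") == 0)
        *opt = MRST_TIMER_APBT_ONLY;
    else if (strcmp(arg, "lapic_and_apbt") == 0)
        *opt = MRST_TIMER_LAPIC_APBT;
    else
        return -1;
    return 0;
}

int main(void)
{
    enum mrst_timer_options opt = MRST_TIMER_DEFAULT;

    if (parse_mrst_timer("apbt_only", &opt) == 0)
        printf("selected timer option %d\n", opt);
    return 0;
}
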
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 8297160c41b3..0e0cdde519be 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c | |||
@@ -21,10 +21,7 @@ | |||
21 | #include <asm/geode.h> | 21 | #include <asm/geode.h> |
22 | #include <asm/setup.h> | 22 | #include <asm/setup.h> |
23 | #include <asm/olpc.h> | 23 | #include <asm/olpc.h> |
24 | 24 | #include <asm/olpc_ofw.h> | |
25 | #ifdef CONFIG_OPEN_FIRMWARE | ||
26 | #include <asm/ofw.h> | ||
27 | #endif | ||
28 | 25 | ||
29 | struct olpc_platform_t olpc_platform_info; | 26 | struct olpc_platform_t olpc_platform_info; |
30 | EXPORT_SYMBOL_GPL(olpc_platform_info); | 27 | EXPORT_SYMBOL_GPL(olpc_platform_info); |
@@ -145,7 +142,7 @@ restart: | |||
145 | * The OBF flag will sometimes misbehave due to what we believe | 142 | * The OBF flag will sometimes misbehave due to what we believe |
146 | * is a hardware quirk.. | 143 | * is a hardware quirk.. |
147 | */ | 144 | */ |
148 | printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); | 145 | pr_devel("olpc-ec: running cmd 0x%x\n", cmd); |
149 | outb(cmd, 0x6c); | 146 | outb(cmd, 0x6c); |
150 | 147 | ||
151 | if (wait_on_ibf(0x6c, 0)) { | 148 | if (wait_on_ibf(0x6c, 0)) { |
@@ -162,8 +159,7 @@ restart: | |||
162 | " EC accept data!\n"); | 159 | " EC accept data!\n"); |
163 | goto err; | 160 | goto err; |
164 | } | 161 | } |
165 | printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", | 162 | pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); |
166 | inbuf[i]); | ||
167 | outb(inbuf[i], 0x68); | 163 | outb(inbuf[i], 0x68); |
168 | } | 164 | } |
169 | } | 165 | } |
@@ -176,8 +172,7 @@ restart: | |||
176 | goto restart; | 172 | goto restart; |
177 | } | 173 | } |
178 | outbuf[i] = inb(0x68); | 174 | outbuf[i] = inb(0x68); |
179 | printk(KERN_DEBUG "olpc-ec: received 0x%x\n", | 175 | pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); |
180 | outbuf[i]); | ||
181 | } | 176 | } |
182 | } | 177 | } |
183 | 178 | ||
@@ -188,14 +183,15 @@ err: | |||
188 | } | 183 | } |
189 | EXPORT_SYMBOL_GPL(olpc_ec_cmd); | 184 | EXPORT_SYMBOL_GPL(olpc_ec_cmd); |
190 | 185 | ||
191 | #ifdef CONFIG_OPEN_FIRMWARE | 186 | #ifdef CONFIG_OLPC_OPENFIRMWARE |
192 | static void __init platform_detect(void) | 187 | static void __init platform_detect(void) |
193 | { | 188 | { |
194 | size_t propsize; | 189 | size_t propsize; |
195 | __be32 rev; | 190 | __be32 rev; |
191 | const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; | ||
192 | void *res[] = { &propsize }; | ||
196 | 193 | ||
197 | if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, | 194 | if (olpc_ofw("getprop", args, res) || propsize != 4) { |
198 | &propsize) || propsize != 4) { | ||
199 | printk(KERN_ERR "ofw: getprop call failed!\n"); | 195 | printk(KERN_ERR "ofw: getprop call failed!\n"); |
200 | rev = cpu_to_be32(0); | 196 | rev = cpu_to_be32(0); |
201 | } | 197 | } |
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c new file mode 100644 index 000000000000..3218aa71ab5e --- /dev/null +++ b/arch/x86/kernel/olpc_ofw.c | |||
@@ -0,0 +1,106 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <asm/page.h> | ||
5 | #include <asm/setup.h> | ||
6 | #include <asm/io.h> | ||
7 | #include <asm/pgtable.h> | ||
8 | #include <asm/olpc_ofw.h> | ||
9 | |||
10 | /* address of OFW callback interface; will be NULL if OFW isn't found */ | ||
11 | static int (*olpc_ofw_cif)(int *); | ||
12 | |||
13 | /* page dir entry containing OFW's pgdir table; filled in by head_32.S */ | ||
14 | u32 olpc_ofw_pgd __initdata; | ||
15 | |||
16 | static DEFINE_SPINLOCK(ofw_lock); | ||
17 | |||
18 | #define MAXARGS 10 | ||
19 | |||
20 | void __init setup_olpc_ofw_pgd(void) | ||
21 | { | ||
22 | pgd_t *base, *ofw_pde; | ||
23 | |||
24 | if (!olpc_ofw_cif) | ||
25 | return; | ||
26 | |||
27 | /* fetch OFW's PDE */ | ||
28 | base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); | ||
29 | if (!base) { | ||
30 | printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); | ||
31 | olpc_ofw_cif = NULL; | ||
32 | return; | ||
33 | } | ||
34 | ofw_pde = &base[OLPC_OFW_PDE_NR]; | ||
35 | |||
36 | /* install OFW's PDE permanently into the kernel's pgtable */ | ||
37 | set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); | ||
38 | /* implicit optimization barrier here due to uninline function return */ | ||
39 | |||
40 | early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); | ||
41 | } | ||
42 | |||
43 | int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, | ||
44 | void **res) | ||
45 | { | ||
46 | int ofw_args[MAXARGS + 3]; | ||
47 | unsigned long flags; | ||
48 | int ret, i, *p; | ||
49 | |||
50 | BUG_ON(nr_args + nr_res > MAXARGS); | ||
51 | |||
52 | if (!olpc_ofw_cif) | ||
53 | return -EIO; | ||
54 | |||
55 | ofw_args[0] = (int)name; | ||
56 | ofw_args[1] = nr_args; | ||
57 | ofw_args[2] = nr_res; | ||
58 | |||
59 | p = &ofw_args[3]; | ||
60 | for (i = 0; i < nr_args; i++, p++) | ||
61 | *p = (int)args[i]; | ||
62 | |||
63 | /* call into ofw */ | ||
64 | spin_lock_irqsave(&ofw_lock, flags); | ||
65 | ret = olpc_ofw_cif(ofw_args); | ||
66 | spin_unlock_irqrestore(&ofw_lock, flags); | ||
67 | |||
68 | if (!ret) { | ||
69 | for (i = 0; i < nr_res; i++, p++) | ||
70 | *((int *)res[i]) = *p; | ||
71 | } | ||
72 | |||
73 | return ret; | ||
74 | } | ||
75 | EXPORT_SYMBOL_GPL(__olpc_ofw); | ||
76 | |||
77 | /* OFW cif _should_ be above this address */ | ||
78 | #define OFW_MIN 0xff000000 | ||
79 | |||
80 | /* OFW starts on a 1MB boundary */ | ||
81 | #define OFW_BOUND (1<<20) | ||
82 | |||
83 | void __init olpc_ofw_detect(void) | ||
84 | { | ||
85 | struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; | ||
86 | unsigned long start; | ||
87 | |||
88 | /* ensure OFW booted us by checking for "OFW " string */ | ||
89 | if (hdr->ofw_magic != OLPC_OFW_SIG) | ||
90 | return; | ||
91 | |||
92 | olpc_ofw_cif = (int (*)(int *))hdr->cif_handler; | ||
93 | |||
94 | if ((unsigned long)olpc_ofw_cif < OFW_MIN) { | ||
95 | printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", | ||
96 | (unsigned long)olpc_ofw_cif); | ||
97 | olpc_ofw_cif = NULL; | ||
98 | return; | ||
99 | } | ||
100 | |||
101 | /* determine where OFW starts in memory */ | ||
102 | start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); | ||
103 | printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", | ||
104 | (unsigned long)olpc_ofw_cif, (-start) >> 20); | ||
105 | reserve_top_address(-start); | ||
106 | } | ||
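The new olpc_ofw.c marshals calls into OFW's client interface as a single flat cell array: the service name, the number of in and out arguments, the in arguments, and finally slots for the results, which are copied back on success (see __olpc_ofw() above and its use in olpc.c's platform_detect()). A hedged standalone simulation of that convention with a fake CIF; the real interface uses 32-bit int cells since the OLPC port is 32-bit only, while this sketch uses unsigned long so it also runs on 64-bit hosts.

#include <stdio.h>
#include <string.h>

#define MAXARGS 10

/* Fake Open Firmware client interface for illustration; the real one is
 * the entry point OFW leaves in memory.  It receives one flat array:
 * [ service-name, #in, #out, in-args..., out-slots... ]. */
static int fake_cif(unsigned long *cell)
{
    const char *service = (const char *)cell[0];
    unsigned long nr_in = cell[1];

    if (strcmp(service, "getprop") == 0)
        cell[3 + nr_in] = 4;            /* pretend the property is 4 bytes */
    return 0;
}

/* Pack the cell array, call the CIF, and copy results out - the same
 * marshalling __olpc_ofw() performs. */
static int ofw_call(int (*cif)(unsigned long *), const char *name,
                    int nr_in, const void **in, int nr_out, void **out)
{
    unsigned long cells[MAXARGS + 3];
    int i, ret;

    cells[0] = (unsigned long)name;
    cells[1] = nr_in;
    cells[2] = nr_out;
    for (i = 0; i < nr_in; i++)
        cells[3 + i] = (unsigned long)in[i];

    ret = cif(cells);
    if (ret == 0)
        for (i = 0; i < nr_out; i++)
            *(unsigned long *)out[i] = cells[3 + nr_in + i];
    return ret;
}

int main(void)
{
    unsigned long propsize = 0;
    const void *in[] = { NULL, "board-revision-int", NULL, (void *)4 };
    void *out[] = { &propsize };

    if (ofw_call(fake_cif, "getprop", 4, in, 1, out) == 0)
        printf("propsize = %lu\n", propsize);
    return 0;
}
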
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 4b7e3d8b01dd..9f07cfcbd3a5 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <asm/calgary.h> | 13 | #include <asm/calgary.h> |
14 | #include <asm/amd_iommu.h> | 14 | #include <asm/amd_iommu.h> |
15 | #include <asm/x86_init.h> | 15 | #include <asm/x86_init.h> |
16 | #include <asm/xen/swiotlb-xen.h> | ||
16 | 17 | ||
17 | static int forbid_dac __read_mostly; | 18 | static int forbid_dac __read_mostly; |
18 | 19 | ||
@@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void) | |||
132 | /* free the range so iommu could get some range less than 4G */ | 133 | /* free the range so iommu could get some range less than 4G */ |
133 | dma32_free_bootmem(); | 134 | dma32_free_bootmem(); |
134 | 135 | ||
135 | if (pci_swiotlb_detect()) | 136 | if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) |
136 | goto out; | 137 | goto out; |
137 | 138 | ||
138 | gart_iommu_hole_init(); | 139 | gart_iommu_hole_init(); |
@@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void) | |||
144 | /* needs to be called after gart_iommu_hole_init */ | 145 | /* needs to be called after gart_iommu_hole_init */ |
145 | amd_iommu_detect(); | 146 | amd_iommu_detect(); |
146 | out: | 147 | out: |
148 | pci_xen_swiotlb_init(); | ||
149 | |||
147 | pci_swiotlb_init(); | 150 | pci_swiotlb_init(); |
148 | } | 151 | } |
149 | 152 | ||
@@ -296,7 +299,7 @@ static int __init pci_iommu_init(void) | |||
296 | #endif | 299 | #endif |
297 | x86_init.iommu.iommu_init(); | 300 | x86_init.iommu.iommu_init(); |
298 | 301 | ||
299 | if (swiotlb) { | 302 | if (swiotlb || xen_swiotlb) { |
300 | printk(KERN_INFO "PCI-DMA: " | 303 | printk(KERN_INFO "PCI-DMA: " |
301 | "Using software bounce buffering for IO (SWIOTLB)\n"); | 304 | "Using software bounce buffering for IO (SWIOTLB)\n"); |
302 | swiotlb_print_info(); | 305 | swiotlb_print_info(); |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32f..64ecaf0af9af 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -28,6 +28,7 @@ unsigned long idle_nomwait; | |||
28 | EXPORT_SYMBOL(idle_nomwait); | 28 | EXPORT_SYMBOL(idle_nomwait); |
29 | 29 | ||
30 | struct kmem_cache *task_xstate_cachep; | 30 | struct kmem_cache *task_xstate_cachep; |
31 | EXPORT_SYMBOL_GPL(task_xstate_cachep); | ||
31 | 32 | ||
32 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) | 33 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) |
33 | { | 34 | { |
@@ -300,7 +301,7 @@ EXPORT_SYMBOL(kernel_thread); | |||
300 | /* | 301 | /* |
301 | * sys_execve() executes a new program. | 302 | * sys_execve() executes a new program. |
302 | */ | 303 | */ |
303 | long sys_execve(char __user *name, char __user * __user *argv, | 304 | long sys_execve(const char __user *name, char __user * __user *argv, |
304 | char __user * __user *envp, struct pt_regs *regs) | 305 | char __user * __user *envp, struct pt_regs *regs) |
305 | { | 306 | { |
306 | long error; | 307 | long error; |
@@ -371,7 +372,7 @@ static inline int hlt_use_halt(void) | |||
371 | void default_idle(void) | 372 | void default_idle(void) |
372 | { | 373 | { |
373 | if (hlt_use_halt()) { | 374 | if (hlt_use_halt()) { |
374 | trace_power_start(POWER_CSTATE, 1); | 375 | trace_power_start(POWER_CSTATE, 1, smp_processor_id()); |
375 | current_thread_info()->status &= ~TS_POLLING; | 376 | current_thread_info()->status &= ~TS_POLLING; |
376 | /* | 377 | /* |
377 | * TS_POLLING-cleared state must be visible before we | 378 | * TS_POLLING-cleared state must be visible before we |
@@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); | |||
441 | */ | 442 | */ |
442 | void mwait_idle_with_hints(unsigned long ax, unsigned long cx) | 443 | void mwait_idle_with_hints(unsigned long ax, unsigned long cx) |
443 | { | 444 | { |
444 | trace_power_start(POWER_CSTATE, (ax>>4)+1); | 445 | trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); |
445 | if (!need_resched()) { | 446 | if (!need_resched()) { |
446 | if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) | 447 | if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) |
447 | clflush((void *)¤t_thread_info()->flags); | 448 | clflush((void *)¤t_thread_info()->flags); |
@@ -457,7 +458,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) | |||
457 | static void mwait_idle(void) | 458 | static void mwait_idle(void) |
458 | { | 459 | { |
459 | if (!need_resched()) { | 460 | if (!need_resched()) { |
460 | trace_power_start(POWER_CSTATE, 1); | 461 | trace_power_start(POWER_CSTATE, 1, smp_processor_id()); |
461 | if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) | 462 | if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) |
462 | clflush((void *)¤t_thread_info()->flags); | 463 | clflush((void *)¤t_thread_info()->flags); |
463 | 464 | ||
@@ -478,7 +479,7 @@ static void mwait_idle(void) | |||
478 | */ | 479 | */ |
479 | static void poll_idle(void) | 480 | static void poll_idle(void) |
480 | { | 481 | { |
481 | trace_power_start(POWER_CSTATE, 0); | 482 | trace_power_start(POWER_CSTATE, 0, smp_processor_id()); |
482 | local_irq_enable(); | 483 | local_irq_enable(); |
483 | while (!need_resched()) | 484 | while (!need_resched()) |
484 | cpu_relax(); | 485 | cpu_relax(); |
@@ -525,44 +526,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) | |||
525 | return (edx & MWAIT_EDX_C1); | 526 | return (edx & MWAIT_EDX_C1); |
526 | } | 527 | } |
527 | 528 | ||
528 | /* | 529 | bool c1e_detected; |
529 | * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. | 530 | EXPORT_SYMBOL(c1e_detected); |
530 | * For more information see | ||
531 | * - Erratum #400 for NPT family 0xf and family 0x10 CPUs | ||
532 | * - Erratum #365 for family 0x11 (not affected because C1e not in use) | ||
533 | */ | ||
534 | static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) | ||
535 | { | ||
536 | u64 val; | ||
537 | if (c->x86_vendor != X86_VENDOR_AMD) | ||
538 | goto no_c1e_idle; | ||
539 | |||
540 | /* Family 0x0f models < rev F do not have C1E */ | ||
541 | if (c->x86 == 0x0F && c->x86_model >= 0x40) | ||
542 | return 1; | ||
543 | |||
544 | if (c->x86 == 0x10) { | ||
545 | /* | ||
546 | * check OSVW bit for CPUs that are not affected | ||
547 | * by erratum #400 | ||
548 | */ | ||
549 | if (cpu_has(c, X86_FEATURE_OSVW)) { | ||
550 | rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); | ||
551 | if (val >= 2) { | ||
552 | rdmsrl(MSR_AMD64_OSVW_STATUS, val); | ||
553 | if (!(val & BIT(1))) | ||
554 | goto no_c1e_idle; | ||
555 | } | ||
556 | } | ||
557 | return 1; | ||
558 | } | ||
559 | |||
560 | no_c1e_idle: | ||
561 | return 0; | ||
562 | } | ||
563 | 531 | ||
564 | static cpumask_var_t c1e_mask; | 532 | static cpumask_var_t c1e_mask; |
565 | static int c1e_detected; | ||
566 | 533 | ||
567 | void c1e_remove_cpu(int cpu) | 534 | void c1e_remove_cpu(int cpu) |
568 | { | 535 | { |
@@ -584,12 +551,12 @@ static void c1e_idle(void) | |||
584 | u32 lo, hi; | 551 | u32 lo, hi; |
585 | 552 | ||
586 | rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); | 553 | rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); |
554 | |||
587 | if (lo & K8_INTP_C1E_ACTIVE_MASK) { | 555 | if (lo & K8_INTP_C1E_ACTIVE_MASK) { |
588 | c1e_detected = 1; | 556 | c1e_detected = true; |
589 | if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) | 557 | if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) |
590 | mark_tsc_unstable("TSC halt in AMD C1E"); | 558 | mark_tsc_unstable("TSC halt in AMD C1E"); |
591 | printk(KERN_INFO "System has AMD C1E enabled\n"); | 559 | printk(KERN_INFO "System has AMD C1E enabled\n"); |
592 | set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); | ||
593 | } | 560 | } |
594 | } | 561 | } |
595 | 562 | ||
@@ -638,7 +605,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) | |||
638 | */ | 605 | */ |
639 | printk(KERN_INFO "using mwait in idle threads.\n"); | 606 | printk(KERN_INFO "using mwait in idle threads.\n"); |
640 | pm_idle = mwait_idle; | 607 | pm_idle = mwait_idle; |
641 | } else if (check_c1e_idle(c)) { | 608 | } else if (cpu_has_amd_erratum(amd_erratum_400)) { |
609 | /* E400: APIC timer interrupt does not wake up CPU from C1e */ | ||
642 | printk(KERN_INFO "using C1E aware idle routine\n"); | 610 | printk(KERN_INFO "using C1E aware idle routine\n"); |
643 | pm_idle = c1e_idle; | 611 | pm_idle = c1e_idle; |
644 | } else | 612 | } else |
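The deleted check_c1e_idle() is what the new cpu_has_amd_erratum(amd_erratum_400) test replaces: AMD family 0xf is affected from revision F on, and family 0x10 is affected unless the OSVW registers explicitly clear bit 1. A standalone restatement of that decision as a pure function, with the rdmsrl() reads turned into plain parameters (this mirrors the removed code, not the new erratum-table helper):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool affected_by_erratum_400(bool is_amd, unsigned int family,
				    unsigned int model, bool has_osvw,
				    uint64_t osvw_len, uint64_t osvw_status)
{
	if (!is_amd)
		return false;
	if (family == 0x0f && model >= 0x40)	/* rev F and later */
		return true;
	if (family == 0x10) {
		/* OSVW bit 1, when advertised, marks unaffected parts */
		if (has_osvw && osvw_len >= 2 && !(osvw_status & (1ULL << 1)))
			return false;
		return true;
	}
	return false;
}

int main(void)
{
	/* a family 0x10 part with no OSVW override is affected */
	printf("%d\n", affected_by_erratum_400(true, 0x10, 0x02, false, 0, 0));
	return 0;
}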
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8d128783af47..96586c3cbbbf 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -57,6 +57,8 @@ | |||
57 | #include <asm/syscalls.h> | 57 | #include <asm/syscalls.h> |
58 | #include <asm/debugreg.h> | 58 | #include <asm/debugreg.h> |
59 | 59 | ||
60 | #include <trace/events/power.h> | ||
61 | |||
60 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); | 62 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); |
61 | 63 | ||
62 | /* | 64 | /* |
@@ -111,6 +113,8 @@ void cpu_idle(void) | |||
111 | stop_critical_timings(); | 113 | stop_critical_timings(); |
112 | pm_idle(); | 114 | pm_idle(); |
113 | start_critical_timings(); | 115 | start_critical_timings(); |
116 | |||
117 | trace_power_end(smp_processor_id()); | ||
114 | } | 118 | } |
115 | tick_nohz_restart_sched_tick(); | 119 | tick_nohz_restart_sched_tick(); |
116 | preempt_enable_no_resched(); | 120 | preempt_enable_no_resched(); |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3c2422a99f1f..3d9ea531ddd1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -51,6 +51,8 @@ | |||
51 | #include <asm/syscalls.h> | 51 | #include <asm/syscalls.h> |
52 | #include <asm/debugreg.h> | 52 | #include <asm/debugreg.h> |
53 | 53 | ||
54 | #include <trace/events/power.h> | ||
55 | |||
54 | asmlinkage extern void ret_from_fork(void); | 56 | asmlinkage extern void ret_from_fork(void); |
55 | 57 | ||
56 | DEFINE_PER_CPU(unsigned long, old_rsp); | 58 | DEFINE_PER_CPU(unsigned long, old_rsp); |
@@ -138,6 +140,9 @@ void cpu_idle(void) | |||
138 | stop_critical_timings(); | 140 | stop_critical_timings(); |
139 | pm_idle(); | 141 | pm_idle(); |
140 | start_critical_timings(); | 142 | start_critical_timings(); |
143 | |||
144 | trace_power_end(smp_processor_id()); | ||
145 | |||
141 | /* In many cases the interrupt that ended idle | 146 | /* In many cases the interrupt that ended idle |
142 | has already called exit_idle. But some idle | 147 | has already called exit_idle. But some idle |
143 | loops can be woken up without interrupt. */ | 148 | loops can be woken up without interrupt. */ |
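With these hunks, trace_power_start() fires inside the idle routine (the process.c changes above) and trace_power_end() fires once pm_idle() returns, in both the 32-bit and 64-bit cpu_idle() loops. A minimal standalone model of that bracketing, with the tracepoints reduced to printf stubs:

#include <stdio.h>

/* printf stand-ins for the trace_power_start/trace_power_end tracepoints */
static void trace_power_start(int cstate, int cpu)
{
	printf("power_start: cstate=%d cpu=%d\n", cstate, cpu);
}

static void trace_power_end(int cpu)
{
	printf("power_end: cpu=%d\n", cpu);
}

/* the idle routine itself announces the C-state it is about to enter,
 * as default_idle()/mwait_idle()/poll_idle() now do */
static void fake_idle_routine(int cpu)
{
	trace_power_start(1, cpu);
	/* halt until the next wakeup would happen here */
}

int main(void)
{
	int cpu = 0;

	fake_idle_routine(cpu);		/* pm_idle() in cpu_idle() */
	trace_power_end(cpu);		/* emitted once pm_idle() returns */
	return 0;
}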
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4ae4acbd031..b008e7883207 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -102,6 +102,7 @@ | |||
102 | 102 | ||
103 | #include <asm/paravirt.h> | 103 | #include <asm/paravirt.h> |
104 | #include <asm/hypervisor.h> | 104 | #include <asm/hypervisor.h> |
105 | #include <asm/olpc_ofw.h> | ||
105 | 106 | ||
106 | #include <asm/percpu.h> | 107 | #include <asm/percpu.h> |
107 | #include <asm/topology.h> | 108 | #include <asm/topology.h> |
@@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p) | |||
736 | /* VMI may relocate the fixmap; do this before touching ioremap area */ | 737 | /* VMI may relocate the fixmap; do this before touching ioremap area */ |
737 | vmi_init(); | 738 | vmi_init(); |
738 | 739 | ||
740 | /* OFW also may relocate the fixmap */ | ||
741 | olpc_ofw_detect(); | ||
742 | |||
739 | early_trap_init(); | 743 | early_trap_init(); |
740 | early_cpu_init(); | 744 | early_cpu_init(); |
741 | early_ioremap_init(); | 745 | early_ioremap_init(); |
742 | 746 | ||
747 | setup_olpc_ofw_pgd(); | ||
748 | |||
743 | ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); | 749 | ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); |
744 | screen_info = boot_params.screen_info; | 750 | screen_info = boot_params.screen_info; |
745 | edid_info = boot_params.edid_info; | 751 | edid_info = boot_params.edid_info; |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c4f33b2e77d6..a5e928b0cb5f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) | |||
735 | goto do_rest; | 735 | goto do_rest; |
736 | } | 736 | } |
737 | 737 | ||
738 | if (!keventd_up() || current_is_keventd()) | 738 | schedule_work(&c_idle.work); |
739 | c_idle.work.func(&c_idle.work); | 739 | wait_for_completion(&c_idle.done); |
740 | else { | ||
741 | schedule_work(&c_idle.work); | ||
742 | wait_for_completion(&c_idle.done); | ||
743 | } | ||
744 | 740 | ||
745 | if (IS_ERR(c_idle.idle)) { | 741 | if (IS_ERR(c_idle.idle)) { |
746 | printk("failed fork for CPU %d\n", cpu); | 742 | printk("failed fork for CPU %d\n", cpu); |
@@ -816,6 +812,13 @@ do_rest: | |||
816 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) | 812 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) |
817 | break; /* It has booted */ | 813 | break; /* It has booted */ |
818 | udelay(100); | 814 | udelay(100); |
815 | /* | ||
816 | * Allow other tasks to run while we wait for the | ||
817 | * AP to come online. This also gives a chance | ||
818 | * for the MTRR work (triggered by the AP coming online) | ||
819 | * to be completed in the stop machine context. | ||
820 | */ | ||
821 | schedule(); | ||
819 | } | 822 | } |
820 | 823 | ||
821 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) | 824 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) |
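The new schedule() call turns the callin poll into a cooperative wait, so the MTRR sync work queued when the AP appears can actually run rather than being starved by a busy loop. A userspace model of the same pattern, with sched_yield() standing in for schedule() and a stub standing in for the cpu_callin_mask test (the kernel loop also udelay()s 100us per pass):

#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

static bool ap_has_called_in(int iteration)
{
	return iteration > 3;		/* pretend the AP shows up eventually */
}

int main(void)
{
	int i;

	for (i = 0; i < 50000; i++) {
		if (ap_has_called_in(i))
			break;
		sched_yield();		/* schedule() in the kernel loop */
	}
	printf("AP online after %d polls\n", i);
	return 0;
}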
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 922eefbb3f6c..b53c525368a7 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c | |||
@@ -23,11 +23,16 @@ static int save_stack_stack(void *data, char *name) | |||
23 | return 0; | 23 | return 0; |
24 | } | 24 | } |
25 | 25 | ||
26 | static void save_stack_address(void *data, unsigned long addr, int reliable) | 26 | static void |
27 | __save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) | ||
27 | { | 28 | { |
28 | struct stack_trace *trace = data; | 29 | struct stack_trace *trace = data; |
30 | #ifdef CONFIG_FRAME_POINTER | ||
29 | if (!reliable) | 31 | if (!reliable) |
30 | return; | 32 | return; |
33 | #endif | ||
34 | if (nosched && in_sched_functions(addr)) | ||
35 | return; | ||
31 | if (trace->skip > 0) { | 36 | if (trace->skip > 0) { |
32 | trace->skip--; | 37 | trace->skip--; |
33 | return; | 38 | return; |
@@ -36,20 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable) | |||
36 | trace->entries[trace->nr_entries++] = addr; | 41 | trace->entries[trace->nr_entries++] = addr; |
37 | } | 42 | } |
38 | 43 | ||
44 | static void save_stack_address(void *data, unsigned long addr, int reliable) | ||
45 | { | ||
46 | return __save_stack_address(data, addr, reliable, false); | ||
47 | } | ||
48 | |||
39 | static void | 49 | static void |
40 | save_stack_address_nosched(void *data, unsigned long addr, int reliable) | 50 | save_stack_address_nosched(void *data, unsigned long addr, int reliable) |
41 | { | 51 | { |
42 | struct stack_trace *trace = (struct stack_trace *)data; | 52 | return __save_stack_address(data, addr, reliable, true); |
43 | if (!reliable) | ||
44 | return; | ||
45 | if (in_sched_functions(addr)) | ||
46 | return; | ||
47 | if (trace->skip > 0) { | ||
48 | trace->skip--; | ||
49 | return; | ||
50 | } | ||
51 | if (trace->nr_entries < trace->max_entries) | ||
52 | trace->entries[trace->nr_entries++] = addr; | ||
53 | } | 53 | } |
54 | 54 | ||
55 | static const struct stacktrace_ops save_stack_ops = { | 55 | static const struct stacktrace_ops save_stack_ops = { |
@@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); | |||
96 | 96 | ||
97 | /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ | 97 | /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ |
98 | 98 | ||
99 | struct stack_frame { | 99 | struct stack_frame_user { |
100 | const void __user *next_fp; | 100 | const void __user *next_fp; |
101 | unsigned long ret_addr; | 101 | unsigned long ret_addr; |
102 | }; | 102 | }; |
103 | 103 | ||
104 | static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) | 104 | static int |
105 | copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) | ||
105 | { | 106 | { |
106 | int ret; | 107 | int ret; |
107 | 108 | ||
@@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) | |||
126 | trace->entries[trace->nr_entries++] = regs->ip; | 127 | trace->entries[trace->nr_entries++] = regs->ip; |
127 | 128 | ||
128 | while (trace->nr_entries < trace->max_entries) { | 129 | while (trace->nr_entries < trace->max_entries) { |
129 | struct stack_frame frame; | 130 | struct stack_frame_user frame; |
130 | 131 | ||
131 | frame.next_fp = NULL; | 132 | frame.next_fp = NULL; |
132 | frame.ret_addr = 0; | 133 | frame.ret_addr = 0; |
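Renaming struct stack_frame to stack_frame_user makes explicit that the walker follows a userspace frame-pointer chain of {next_fp, ret_addr} records. A standalone model of that walk over an in-process array, with plain dereferences in place of copy_stack_frame()'s user-space copy and a simple monotonicity check as the termination safeguard (the kernel's exact checks are not all visible in this hunk):

#include <stddef.h>
#include <stdio.h>

/* mirror of struct stack_frame_user from the hunk above */
struct stack_frame_user {
	const struct stack_frame_user *next_fp;
	unsigned long ret_addr;
};

/* follow the frame-pointer chain, recording return addresses; frames are
 * required to move toward higher addresses so the walk always terminates */
static size_t walk_frames(const struct stack_frame_user *fp,
			  unsigned long *out, size_t max)
{
	size_t n = 0;

	while (fp && n < max && fp->ret_addr) {
		out[n++] = fp->ret_addr;
		if (!fp->next_fp || fp->next_fp <= fp)
			break;
		fp = fp->next_fp;
	}
	return n;
}

int main(void)
{
	struct stack_frame_user frames[3] = {
		{ &frames[1], 0x1000 },
		{ &frames[2], 0x2000 },
		{ NULL,       0x3000 },
	};
	unsigned long trace[8];
	size_t i, n = walk_frames(&frames[0], trace, 8);

	for (i = 0; i < n; i++)
		printf("0x%lx\n", trace[i]);
	return 0;
}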
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b3729341216..b35786dc9b8f 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S | |||
@@ -337,3 +337,6 @@ ENTRY(sys_call_table) | |||
337 | .long sys_rt_tgsigqueueinfo /* 335 */ | 337 | .long sys_rt_tgsigqueueinfo /* 335 */ |
338 | .long sys_perf_event_open | 338 | .long sys_perf_event_open |
339 | .long sys_recvmmsg | 339 | .long sys_recvmmsg |
340 | .long sys_fanotify_init | ||
341 | .long sys_fanotify_mark | ||
342 | .long sys_prlimit64 /* 340 */ | ||
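Per the comment in the hunk, prlimit64 becomes 32-bit syscall number 340. A userspace sketch that invokes it by number to read RLIMIT_NOFILE for the current process; the fallback define applies to 32-bit x86 only, and the local rlimit64 mirror is an assumption about the kernel layout rather than something pulled from a header:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* 340 is the 32-bit table slot added above; prefer the header value when
 * the toolchain already knows the number for the target architecture */
#ifndef __NR_prlimit64
#define __NR_prlimit64 340
#endif

/* local mirror of the kernel's struct rlimit64 layout */
struct krlimit64 {
	uint64_t rlim_cur;
	uint64_t rlim_max;
};

#define RLIMIT_NOFILE_RES 7	/* RLIMIT_NOFILE */

int main(void)
{
	struct krlimit64 old;

	/* pid 0 = calling process, NULL new limit = read-only query */
	if (syscall(__NR_prlimit64, 0, RLIMIT_NOFILE_RES, NULL, &old) == 0)
		printf("RLIMIT_NOFILE: cur=%llu max=%llu\n",
		       (unsigned long long)old.rlim_cur,
		       (unsigned long long)old.rlim_max);
	return 0;
}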
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 7fea555929e2..312ef0292815 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c | |||
@@ -8,6 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | #include <linux/seq_file.h> | 9 | #include <linux/seq_file.h> |
10 | #include <linux/proc_fs.h> | 10 | #include <linux/proc_fs.h> |
11 | #include <linux/debugfs.h> | ||
11 | #include <linux/kernel.h> | 12 | #include <linux/kernel.h> |
12 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
13 | 14 | ||
@@ -22,19 +23,37 @@ | |||
22 | #include <asm/irq_vectors.h> | 23 | #include <asm/irq_vectors.h> |
23 | #include <asm/timer.h> | 24 | #include <asm/timer.h> |
24 | 25 | ||
25 | struct msg_desc { | 26 | /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ |
26 | struct bau_payload_queue_entry *msg; | 27 | static int timeout_base_ns[] = { |
27 | int msg_slot; | 28 | 20, |
28 | int sw_ack_slot; | 29 | 160, |
29 | struct bau_payload_queue_entry *va_queue_first; | 30 | 1280, |
30 | struct bau_payload_queue_entry *va_queue_last; | 31 | 10240, |
32 | 81920, | ||
33 | 655360, | ||
34 | 5242880, | ||
35 | 167772160 | ||
31 | }; | 36 | }; |
32 | 37 | static int timeout_us; | |
33 | #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL | ||
34 | |||
35 | static int uv_bau_max_concurrent __read_mostly; | ||
36 | |||
37 | static int nobau; | 38 | static int nobau; |
39 | static int baudisabled; | ||
40 | static spinlock_t disable_lock; | ||
41 | static cycles_t congested_cycles; | ||
42 | |||
43 | /* tunables: */ | ||
44 | static int max_bau_concurrent = MAX_BAU_CONCURRENT; | ||
45 | static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; | ||
46 | static int plugged_delay = PLUGGED_DELAY; | ||
47 | static int plugsb4reset = PLUGSB4RESET; | ||
48 | static int timeoutsb4reset = TIMEOUTSB4RESET; | ||
49 | static int ipi_reset_limit = IPI_RESET_LIMIT; | ||
50 | static int complete_threshold = COMPLETE_THRESHOLD; | ||
51 | static int congested_response_us = CONGESTED_RESPONSE_US; | ||
52 | static int congested_reps = CONGESTED_REPS; | ||
53 | static int congested_period = CONGESTED_PERIOD; | ||
54 | static struct dentry *tunables_dir; | ||
55 | static struct dentry *tunables_file; | ||
56 | |||
38 | static int __init setup_nobau(char *arg) | 57 | static int __init setup_nobau(char *arg) |
39 | { | 58 | { |
40 | nobau = 1; | 59 | nobau = 1; |
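timeout_base_ns[] is selected by the urgency7 field, bits 30:28 of UVH_AGING_PRESCALE_SEL, and the calculate_destination_timeout() function added near the end of this patch multiplies the chosen base by two BIOS-programmed multipliers. A standalone model of that selection; the literal shift 28 and mask 0x7 stand in for BAU_URGENCY_7_SHIFT and BAU_URGENCY_7_MASK, whose definitions are not shown here:

#include <stdio.h>

/* base timeouts in nanoseconds, copied from the table in this hunk */
static const long timeout_base_ns[] = {
	20, 160, 1280, 10240, 81920, 655360, 5242880, 167772160
};

/* pick the base via bits 30:28 of the aging-prescale register and scale
 * it by the two BIOS multipliers, as calculate_destination_timeout() does */
static long destination_timeout_ns(unsigned long aging_prescale_sel,
				   int mult1, int mult2)
{
	int index = (aging_prescale_sel >> 28) & 0x7;

	return timeout_base_ns[index] * mult1 * mult2;
}

int main(void)
{
	/* urgency7 = 2 selects 1280ns as the base */
	printf("%ld ns\n", destination_timeout_ns(2UL << 28, 10, 4));
	return 0;
}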
@@ -52,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats); | |||
52 | static DEFINE_PER_CPU(struct bau_control, bau_control); | 71 | static DEFINE_PER_CPU(struct bau_control, bau_control); |
53 | static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); | 72 | static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); |
54 | 73 | ||
55 | struct reset_args { | ||
56 | int sender; | ||
57 | }; | ||
58 | |||
59 | /* | 74 | /* |
60 | * Determine the first node on a uvhub. 'Nodes' are used for kernel | 75 | * Determine the first node on a uvhub. 'Nodes' are used for kernel |
61 | * memory allocation. | 76 | * memory allocation. |
@@ -126,7 +141,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, | |||
126 | struct ptc_stats *stat; | 141 | struct ptc_stats *stat; |
127 | 142 | ||
128 | msg = mdp->msg; | 143 | msg = mdp->msg; |
129 | stat = &per_cpu(ptcstats, bcp->cpu); | 144 | stat = bcp->statp; |
130 | stat->d_retries++; | 145 | stat->d_retries++; |
131 | /* | 146 | /* |
132 | * cancel any message from msg+1 to the retry itself | 147 | * cancel any message from msg+1 to the retry itself |
@@ -146,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, | |||
146 | slot2 = msg2 - mdp->va_queue_first; | 161 | slot2 = msg2 - mdp->va_queue_first; |
147 | mmr = uv_read_local_mmr | 162 | mmr = uv_read_local_mmr |
148 | (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); | 163 | (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); |
149 | msg_res = ((msg2->sw_ack_vector << 8) | | 164 | msg_res = msg2->sw_ack_vector; |
150 | msg2->sw_ack_vector); | ||
151 | /* | 165 | /* |
152 | * This is a message retry; clear the resources held | 166 | * This is a message retry; clear the resources held |
153 | * by the previous message only if they timed out. | 167 | * by the previous message only if they timed out. |
154 | * If it has not timed out we have an unexpected | 168 | * If it has not timed out we have an unexpected |
155 | * situation to report. | 169 | * situation to report. |
156 | */ | 170 | */ |
157 | if (mmr & (msg_res << 8)) { | 171 | if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { |
158 | /* | 172 | /* |
159 | * is the resource timed out? | 173 | * is the resource timed out? |
160 | * make everyone ignore the cancelled message. | 174 | * make everyone ignore the cancelled message. |
@@ -164,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, | |||
164 | cancel_count++; | 178 | cancel_count++; |
165 | uv_write_local_mmr( | 179 | uv_write_local_mmr( |
166 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, | 180 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, |
167 | (msg_res << 8) | msg_res); | 181 | (msg_res << UV_SW_ACK_NPENDING) | |
168 | } else | 182 | msg_res); |
169 | printk(KERN_INFO "note bau retry: no effect\n"); | 183 | } |
170 | } | 184 | } |
171 | } | 185 | } |
172 | if (!cancel_count) | 186 | if (!cancel_count) |
@@ -190,7 +204,7 @@ static void uv_bau_process_message(struct msg_desc *mdp, | |||
190 | * This must be a normal message, or retry of a normal message | 204 | * This must be a normal message, or retry of a normal message |
191 | */ | 205 | */ |
192 | msg = mdp->msg; | 206 | msg = mdp->msg; |
193 | stat = &per_cpu(ptcstats, bcp->cpu); | 207 | stat = bcp->statp; |
194 | if (msg->address == TLB_FLUSH_ALL) { | 208 | if (msg->address == TLB_FLUSH_ALL) { |
195 | local_flush_tlb(); | 209 | local_flush_tlb(); |
196 | stat->d_alltlb++; | 210 | stat->d_alltlb++; |
@@ -274,7 +288,7 @@ uv_do_reset(void *ptr) | |||
274 | 288 | ||
275 | bcp = &per_cpu(bau_control, smp_processor_id()); | 289 | bcp = &per_cpu(bau_control, smp_processor_id()); |
276 | rap = (struct reset_args *)ptr; | 290 | rap = (struct reset_args *)ptr; |
277 | stat = &per_cpu(ptcstats, bcp->cpu); | 291 | stat = bcp->statp; |
278 | stat->d_resets++; | 292 | stat->d_resets++; |
279 | 293 | ||
280 | /* | 294 | /* |
@@ -302,13 +316,13 @@ uv_do_reset(void *ptr) | |||
302 | */ | 316 | */ |
303 | mmr = uv_read_local_mmr | 317 | mmr = uv_read_local_mmr |
304 | (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); | 318 | (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); |
305 | msg_res = ((msg->sw_ack_vector << 8) | | 319 | msg_res = msg->sw_ack_vector; |
306 | msg->sw_ack_vector); | ||
307 | if (mmr & msg_res) { | 320 | if (mmr & msg_res) { |
308 | stat->d_rcanceled++; | 321 | stat->d_rcanceled++; |
309 | uv_write_local_mmr( | 322 | uv_write_local_mmr( |
310 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, | 323 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, |
311 | msg_res); | 324 | (msg_res << UV_SW_ACK_NPENDING) | |
325 | msg_res); | ||
312 | } | 326 | } |
313 | } | 327 | } |
314 | } | 328 | } |
@@ -386,17 +400,12 @@ static int uv_wait_completion(struct bau_desc *bau_desc, | |||
386 | unsigned long mmr_offset, int right_shift, int this_cpu, | 400 | unsigned long mmr_offset, int right_shift, int this_cpu, |
387 | struct bau_control *bcp, struct bau_control *smaster, long try) | 401 | struct bau_control *bcp, struct bau_control *smaster, long try) |
388 | { | 402 | { |
389 | int relaxes = 0; | ||
390 | unsigned long descriptor_status; | 403 | unsigned long descriptor_status; |
391 | unsigned long mmr; | ||
392 | unsigned long mask; | ||
393 | cycles_t ttime; | 404 | cycles_t ttime; |
394 | cycles_t timeout_time; | 405 | struct ptc_stats *stat = bcp->statp; |
395 | struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu); | ||
396 | struct bau_control *hmaster; | 406 | struct bau_control *hmaster; |
397 | 407 | ||
398 | hmaster = bcp->uvhub_master; | 408 | hmaster = bcp->uvhub_master; |
399 | timeout_time = get_cycles() + bcp->timeout_interval; | ||
400 | 409 | ||
401 | /* spin on the status MMR, waiting for it to go idle */ | 410 | /* spin on the status MMR, waiting for it to go idle */ |
402 | while ((descriptor_status = (((unsigned long) | 411 | while ((descriptor_status = (((unsigned long) |
@@ -423,7 +432,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc, | |||
423 | * pending. In that case hardware returns the | 432 | * pending. In that case hardware returns the |
424 | * ERROR that looks like a destination timeout. | 433 | * ERROR that looks like a destination timeout. |
425 | */ | 434 | */ |
426 | if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { | 435 | if (cycles_2_us(ttime - bcp->send_message) < |
436 | timeout_us) { | ||
427 | bcp->conseccompletes = 0; | 437 | bcp->conseccompletes = 0; |
428 | return FLUSH_RETRY_PLUGGED; | 438 | return FLUSH_RETRY_PLUGGED; |
429 | } | 439 | } |
@@ -435,26 +445,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc, | |||
435 | * descriptor_status is still BUSY | 445 | * descriptor_status is still BUSY |
436 | */ | 446 | */ |
437 | cpu_relax(); | 447 | cpu_relax(); |
438 | relaxes++; | ||
439 | if (relaxes >= 10000) { | ||
440 | relaxes = 0; | ||
441 | if (get_cycles() > timeout_time) { | ||
442 | quiesce_local_uvhub(hmaster); | ||
443 | |||
444 | /* single-thread the register change */ | ||
445 | spin_lock(&hmaster->masks_lock); | ||
446 | mmr = uv_read_local_mmr(mmr_offset); | ||
447 | mask = 0UL; | ||
448 | mask |= (3UL < right_shift); | ||
449 | mask = ~mask; | ||
450 | mmr &= mask; | ||
451 | uv_write_local_mmr(mmr_offset, mmr); | ||
452 | spin_unlock(&hmaster->masks_lock); | ||
453 | end_uvhub_quiesce(hmaster); | ||
454 | stat->s_busy++; | ||
455 | return FLUSH_GIVEUP; | ||
456 | } | ||
457 | } | ||
458 | } | 448 | } |
459 | } | 449 | } |
460 | bcp->conseccompletes++; | 450 | bcp->conseccompletes++; |
@@ -494,56 +484,116 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) | |||
494 | return 1; | 484 | return 1; |
495 | } | 485 | } |
496 | 486 | ||
487 | /* | ||
488 | * Our retries are blocked by all destination swack resources being | ||
489 | * in use, and a timeout is pending. In that case hardware immediately | ||
490 | * returns the ERROR that looks like a destination timeout. | ||
491 | */ | ||
492 | static void | ||
493 | destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, | ||
494 | struct bau_control *hmaster, struct ptc_stats *stat) | ||
495 | { | ||
496 | udelay(bcp->plugged_delay); | ||
497 | bcp->plugged_tries++; | ||
498 | if (bcp->plugged_tries >= bcp->plugsb4reset) { | ||
499 | bcp->plugged_tries = 0; | ||
500 | quiesce_local_uvhub(hmaster); | ||
501 | spin_lock(&hmaster->queue_lock); | ||
502 | uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); | ||
503 | spin_unlock(&hmaster->queue_lock); | ||
504 | end_uvhub_quiesce(hmaster); | ||
505 | bcp->ipi_attempts++; | ||
506 | stat->s_resets_plug++; | ||
507 | } | ||
508 | } | ||
509 | |||
510 | static void | ||
511 | destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, | ||
512 | struct bau_control *hmaster, struct ptc_stats *stat) | ||
513 | { | ||
514 | hmaster->max_bau_concurrent = 1; | ||
515 | bcp->timeout_tries++; | ||
516 | if (bcp->timeout_tries >= bcp->timeoutsb4reset) { | ||
517 | bcp->timeout_tries = 0; | ||
518 | quiesce_local_uvhub(hmaster); | ||
519 | spin_lock(&hmaster->queue_lock); | ||
520 | uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); | ||
521 | spin_unlock(&hmaster->queue_lock); | ||
522 | end_uvhub_quiesce(hmaster); | ||
523 | bcp->ipi_attempts++; | ||
524 | stat->s_resets_timeout++; | ||
525 | } | ||
526 | } | ||
527 | |||
528 | /* | ||
529 | * Completions are taking a very long time due to a congested numalink | ||
530 | * network. | ||
531 | */ | ||
532 | static void | ||
533 | disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) | ||
534 | { | ||
535 | int tcpu; | ||
536 | struct bau_control *tbcp; | ||
537 | |||
538 | /* let only one cpu do this disabling */ | ||
539 | spin_lock(&disable_lock); | ||
540 | if (!baudisabled && bcp->period_requests && | ||
541 | ((bcp->period_time / bcp->period_requests) > congested_cycles)) { | ||
542 | /* it becomes this cpu's job to turn on the use of the | ||
543 | BAU again */ | ||
544 | baudisabled = 1; | ||
545 | bcp->set_bau_off = 1; | ||
546 | bcp->set_bau_on_time = get_cycles() + | ||
547 | sec_2_cycles(bcp->congested_period); | ||
548 | stat->s_bau_disabled++; | ||
549 | for_each_present_cpu(tcpu) { | ||
550 | tbcp = &per_cpu(bau_control, tcpu); | ||
551 | tbcp->baudisabled = 1; | ||
552 | } | ||
553 | } | ||
554 | spin_unlock(&disable_lock); | ||
555 | } | ||
556 | |||
497 | /** | 557 | /** |
498 | * uv_flush_send_and_wait | 558 | * uv_flush_send_and_wait |
499 | * | 559 | * |
500 | * Send a broadcast and wait for it to complete. | 560 | * Send a broadcast and wait for it to complete. |
501 | * | 561 | * |
502 | * The flush_mask contains the cpus the broadcast is to be sent to, plus | 562 | * The flush_mask contains the cpus the broadcast is to be sent to, including |
503 | * cpus that are on the local uvhub. | 563 | * cpus that are on the local uvhub. |
504 | * | 564 | * |
505 | * Returns NULL if all flushing represented in the mask was done. The mask | 565 | * Returns 0 if all flushing represented in the mask was done. |
506 | * is zeroed. | 566 | * Returns 1 if it gives up entirely and the original cpu mask is to be |
507 | * Returns @flush_mask if some remote flushing remains to be done. The | 567 | * returned to the kernel. |
508 | * mask will have some bits still set, representing any cpus on the local | ||
509 | * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed. | ||
510 | */ | 568 | */ |
511 | const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, | 569 | int uv_flush_send_and_wait(struct bau_desc *bau_desc, |
512 | struct cpumask *flush_mask, | 570 | struct cpumask *flush_mask, struct bau_control *bcp) |
513 | struct bau_control *bcp) | ||
514 | { | 571 | { |
515 | int right_shift; | 572 | int right_shift; |
516 | int uvhub; | ||
517 | int bit; | ||
518 | int completion_status = 0; | 573 | int completion_status = 0; |
519 | int seq_number = 0; | 574 | int seq_number = 0; |
520 | long try = 0; | 575 | long try = 0; |
521 | int cpu = bcp->uvhub_cpu; | 576 | int cpu = bcp->uvhub_cpu; |
522 | int this_cpu = bcp->cpu; | 577 | int this_cpu = bcp->cpu; |
523 | int this_uvhub = bcp->uvhub; | ||
524 | unsigned long mmr_offset; | 578 | unsigned long mmr_offset; |
525 | unsigned long index; | 579 | unsigned long index; |
526 | cycles_t time1; | 580 | cycles_t time1; |
527 | cycles_t time2; | 581 | cycles_t time2; |
528 | struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); | 582 | cycles_t elapsed; |
583 | struct ptc_stats *stat = bcp->statp; | ||
529 | struct bau_control *smaster = bcp->socket_master; | 584 | struct bau_control *smaster = bcp->socket_master; |
530 | struct bau_control *hmaster = bcp->uvhub_master; | 585 | struct bau_control *hmaster = bcp->uvhub_master; |
531 | 586 | ||
532 | /* | ||
533 | * Spin here while there are hmaster->max_concurrent or more active | ||
534 | * descriptors. This is the per-uvhub 'throttle'. | ||
535 | */ | ||
536 | if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, | 587 | if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, |
537 | &hmaster->active_descriptor_count, | 588 | &hmaster->active_descriptor_count, |
538 | hmaster->max_concurrent)) { | 589 | hmaster->max_bau_concurrent)) { |
539 | stat->s_throttles++; | 590 | stat->s_throttles++; |
540 | do { | 591 | do { |
541 | cpu_relax(); | 592 | cpu_relax(); |
542 | } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, | 593 | } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, |
543 | &hmaster->active_descriptor_count, | 594 | &hmaster->active_descriptor_count, |
544 | hmaster->max_concurrent)); | 595 | hmaster->max_bau_concurrent)); |
545 | } | 596 | } |
546 | |||
547 | while (hmaster->uvhub_quiesce) | 597 | while (hmaster->uvhub_quiesce) |
548 | cpu_relax(); | 598 | cpu_relax(); |
549 | 599 | ||
@@ -557,23 +607,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, | |||
557 | } | 607 | } |
558 | time1 = get_cycles(); | 608 | time1 = get_cycles(); |
559 | do { | 609 | do { |
560 | /* | ||
561 | * Every message from any given cpu gets a unique message | ||
562 | * sequence number. But retries use that same number. | ||
563 | * Our message may have timed out at the destination because | ||
564 | * all sw-ack resources are in use and there is a timeout | ||
565 | * pending there. In that case, our last send never got | ||
566 | * placed into the queue and we need to persist until it | ||
567 | * does. | ||
568 | * | ||
569 | * Make any retry a type MSG_RETRY so that the destination will | ||
570 | * free any resource held by a previous message from this cpu. | ||
571 | */ | ||
572 | if (try == 0) { | 610 | if (try == 0) { |
573 | /* use message type set by the caller the first time */ | 611 | bau_desc->header.msg_type = MSG_REGULAR; |
574 | seq_number = bcp->message_number++; | 612 | seq_number = bcp->message_number++; |
575 | } else { | 613 | } else { |
576 | /* use RETRY type on all the rest; same sequence */ | ||
577 | bau_desc->header.msg_type = MSG_RETRY; | 614 | bau_desc->header.msg_type = MSG_RETRY; |
578 | stat->s_retry_messages++; | 615 | stat->s_retry_messages++; |
579 | } | 616 | } |
@@ -581,50 +618,17 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, | |||
581 | index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | | 618 | index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | |
582 | bcp->uvhub_cpu; | 619 | bcp->uvhub_cpu; |
583 | bcp->send_message = get_cycles(); | 620 | bcp->send_message = get_cycles(); |
584 | |||
585 | uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); | 621 | uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); |
586 | |||
587 | try++; | 622 | try++; |
588 | completion_status = uv_wait_completion(bau_desc, mmr_offset, | 623 | completion_status = uv_wait_completion(bau_desc, mmr_offset, |
589 | right_shift, this_cpu, bcp, smaster, try); | 624 | right_shift, this_cpu, bcp, smaster, try); |
590 | 625 | ||
591 | if (completion_status == FLUSH_RETRY_PLUGGED) { | 626 | if (completion_status == FLUSH_RETRY_PLUGGED) { |
592 | /* | 627 | destination_plugged(bau_desc, bcp, hmaster, stat); |
593 | * Our retries may be blocked by all destination swack | ||
594 | * resources being consumed, and a timeout pending. In | ||
595 | * that case hardware immediately returns the ERROR | ||
596 | * that looks like a destination timeout. | ||
597 | */ | ||
598 | udelay(TIMEOUT_DELAY); | ||
599 | bcp->plugged_tries++; | ||
600 | if (bcp->plugged_tries >= PLUGSB4RESET) { | ||
601 | bcp->plugged_tries = 0; | ||
602 | quiesce_local_uvhub(hmaster); | ||
603 | spin_lock(&hmaster->queue_lock); | ||
604 | uv_reset_with_ipi(&bau_desc->distribution, | ||
605 | this_cpu); | ||
606 | spin_unlock(&hmaster->queue_lock); | ||
607 | end_uvhub_quiesce(hmaster); | ||
608 | bcp->ipi_attempts++; | ||
609 | stat->s_resets_plug++; | ||
610 | } | ||
611 | } else if (completion_status == FLUSH_RETRY_TIMEOUT) { | 628 | } else if (completion_status == FLUSH_RETRY_TIMEOUT) { |
612 | hmaster->max_concurrent = 1; | 629 | destination_timeout(bau_desc, bcp, hmaster, stat); |
613 | bcp->timeout_tries++; | ||
614 | udelay(TIMEOUT_DELAY); | ||
615 | if (bcp->timeout_tries >= TIMEOUTSB4RESET) { | ||
616 | bcp->timeout_tries = 0; | ||
617 | quiesce_local_uvhub(hmaster); | ||
618 | spin_lock(&hmaster->queue_lock); | ||
619 | uv_reset_with_ipi(&bau_desc->distribution, | ||
620 | this_cpu); | ||
621 | spin_unlock(&hmaster->queue_lock); | ||
622 | end_uvhub_quiesce(hmaster); | ||
623 | bcp->ipi_attempts++; | ||
624 | stat->s_resets_timeout++; | ||
625 | } | ||
626 | } | 630 | } |
627 | if (bcp->ipi_attempts >= 3) { | 631 | if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { |
628 | bcp->ipi_attempts = 0; | 632 | bcp->ipi_attempts = 0; |
629 | completion_status = FLUSH_GIVEUP; | 633 | completion_status = FLUSH_GIVEUP; |
630 | break; | 634 | break; |
@@ -633,49 +637,36 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, | |||
633 | } while ((completion_status == FLUSH_RETRY_PLUGGED) || | 637 | } while ((completion_status == FLUSH_RETRY_PLUGGED) || |
634 | (completion_status == FLUSH_RETRY_TIMEOUT)); | 638 | (completion_status == FLUSH_RETRY_TIMEOUT)); |
635 | time2 = get_cycles(); | 639 | time2 = get_cycles(); |
636 | 640 | bcp->plugged_tries = 0; | |
637 | if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) | 641 | bcp->timeout_tries = 0; |
638 | && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) | 642 | if ((completion_status == FLUSH_COMPLETE) && |
639 | hmaster->max_concurrent++; | 643 | (bcp->conseccompletes > bcp->complete_threshold) && |
640 | 644 | (hmaster->max_bau_concurrent < | |
641 | /* | 645 | hmaster->max_bau_concurrent_constant)) |
642 | * hold any cpu not timing out here; no other cpu currently held by | 646 | hmaster->max_bau_concurrent++; |
643 | * the 'throttle' should enter the activation code | ||
644 | */ | ||
645 | while (hmaster->uvhub_quiesce) | 647 | while (hmaster->uvhub_quiesce) |
646 | cpu_relax(); | 648 | cpu_relax(); |
647 | atomic_dec(&hmaster->active_descriptor_count); | 649 | atomic_dec(&hmaster->active_descriptor_count); |
648 | 650 | if (time2 > time1) { | |
649 | /* guard against cycles wrap */ | 651 | elapsed = time2 - time1; |
650 | if (time2 > time1) | 652 | stat->s_time += elapsed; |
651 | stat->s_time += (time2 - time1); | 653 | if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { |
652 | else | 654 | bcp->period_requests++; |
653 | stat->s_requestor--; /* don't count this one */ | 655 | bcp->period_time += elapsed; |
656 | if ((elapsed > congested_cycles) && | ||
657 | (bcp->period_requests > bcp->congested_reps)) { | ||
658 | disable_for_congestion(bcp, stat); | ||
659 | } | ||
660 | } | ||
661 | } else | ||
662 | stat->s_requestor--; | ||
654 | if (completion_status == FLUSH_COMPLETE && try > 1) | 663 | if (completion_status == FLUSH_COMPLETE && try > 1) |
655 | stat->s_retriesok++; | 664 | stat->s_retriesok++; |
656 | else if (completion_status == FLUSH_GIVEUP) { | 665 | else if (completion_status == FLUSH_GIVEUP) { |
657 | /* | ||
658 | * Cause the caller to do an IPI-style TLB shootdown on | ||
659 | * the target cpu's, all of which are still in the mask. | ||
660 | */ | ||
661 | stat->s_giveup++; | 666 | stat->s_giveup++; |
662 | return flush_mask; | 667 | return 1; |
663 | } | ||
664 | |||
665 | /* | ||
666 | * Success, so clear the remote cpu's from the mask so we don't | ||
667 | * use the IPI method of shootdown on them. | ||
668 | */ | ||
669 | for_each_cpu(bit, flush_mask) { | ||
670 | uvhub = uv_cpu_to_blade_id(bit); | ||
671 | if (uvhub == this_uvhub) | ||
672 | continue; | ||
673 | cpumask_clear_cpu(bit, flush_mask); | ||
674 | } | 668 | } |
675 | if (!cpumask_empty(flush_mask)) | 669 | return 0; |
676 | return flush_mask; | ||
677 | |||
678 | return NULL; | ||
679 | } | 670 | } |
680 | 671 | ||
681 | /** | 672 | /** |
@@ -707,70 +698,89 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, | |||
707 | struct mm_struct *mm, | 698 | struct mm_struct *mm, |
708 | unsigned long va, unsigned int cpu) | 699 | unsigned long va, unsigned int cpu) |
709 | { | 700 | { |
710 | int remotes; | ||
711 | int tcpu; | 701 | int tcpu; |
712 | int uvhub; | 702 | int uvhub; |
713 | int locals = 0; | 703 | int locals = 0; |
704 | int remotes = 0; | ||
705 | int hubs = 0; | ||
714 | struct bau_desc *bau_desc; | 706 | struct bau_desc *bau_desc; |
715 | struct cpumask *flush_mask; | 707 | struct cpumask *flush_mask; |
716 | struct ptc_stats *stat; | 708 | struct ptc_stats *stat; |
717 | struct bau_control *bcp; | 709 | struct bau_control *bcp; |
710 | struct bau_control *tbcp; | ||
718 | 711 | ||
712 | /* kernel was booted 'nobau' */ | ||
719 | if (nobau) | 713 | if (nobau) |
720 | return cpumask; | 714 | return cpumask; |
721 | 715 | ||
722 | bcp = &per_cpu(bau_control, cpu); | 716 | bcp = &per_cpu(bau_control, cpu); |
717 | stat = bcp->statp; | ||
718 | |||
719 | /* bau was disabled due to slow response */ | ||
720 | if (bcp->baudisabled) { | ||
721 | /* the cpu that disabled it must re-enable it */ | ||
722 | if (bcp->set_bau_off) { | ||
723 | if (get_cycles() >= bcp->set_bau_on_time) { | ||
724 | stat->s_bau_reenabled++; | ||
725 | baudisabled = 0; | ||
726 | for_each_present_cpu(tcpu) { | ||
727 | tbcp = &per_cpu(bau_control, tcpu); | ||
728 | tbcp->baudisabled = 0; | ||
729 | tbcp->period_requests = 0; | ||
730 | tbcp->period_time = 0; | ||
731 | } | ||
732 | } | ||
733 | } | ||
734 | return cpumask; | ||
735 | } | ||
736 | |||
723 | /* | 737 | /* |
724 | * Each sending cpu has a per-cpu mask which it fills from the caller's | 738 | * Each sending cpu has a per-cpu mask which it fills from the caller's |
725 | * cpu mask. Only remote cpus are converted to uvhubs and copied. | 739 | * cpu mask. All cpus are converted to uvhubs and copied to the |
740 | * activation descriptor. | ||
726 | */ | 741 | */ |
727 | flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); | 742 | flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); |
728 | /* | 743 | /* don't actually do a shootdown of the local cpu */ |
729 | * copy cpumask to flush_mask, removing current cpu | ||
730 | * (current cpu should already have been flushed by the caller and | ||
731 | * should never be returned if we return flush_mask) | ||
732 | */ | ||
733 | cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); | 744 | cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); |
734 | if (cpu_isset(cpu, *cpumask)) | 745 | if (cpu_isset(cpu, *cpumask)) |
735 | locals++; /* current cpu was targeted */ | 746 | stat->s_ntargself++; |
736 | 747 | ||
737 | bau_desc = bcp->descriptor_base; | 748 | bau_desc = bcp->descriptor_base; |
738 | bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; | 749 | bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; |
739 | |||
740 | bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); | 750 | bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); |
741 | remotes = 0; | 751 | |
752 | /* cpu statistics */ | ||
742 | for_each_cpu(tcpu, flush_mask) { | 753 | for_each_cpu(tcpu, flush_mask) { |
743 | uvhub = uv_cpu_to_blade_id(tcpu); | 754 | uvhub = uv_cpu_to_blade_id(tcpu); |
744 | if (uvhub == bcp->uvhub) { | ||
745 | locals++; | ||
746 | continue; | ||
747 | } | ||
748 | bau_uvhub_set(uvhub, &bau_desc->distribution); | 755 | bau_uvhub_set(uvhub, &bau_desc->distribution); |
749 | remotes++; | 756 | if (uvhub == bcp->uvhub) |
750 | } | 757 | locals++; |
751 | if (remotes == 0) { | ||
752 | /* | ||
753 | * No off_hub flushing; return status for local hub. | ||
754 | * Return the caller's mask if all were local (the current | ||
755 | * cpu may be in that mask). | ||
756 | */ | ||
757 | if (locals) | ||
758 | return cpumask; | ||
759 | else | 758 | else |
760 | return NULL; | 759 | remotes++; |
761 | } | 760 | } |
762 | stat = &per_cpu(ptcstats, cpu); | 761 | if ((locals + remotes) == 0) |
762 | return NULL; | ||
763 | stat->s_requestor++; | 763 | stat->s_requestor++; |
764 | stat->s_ntargcpu += remotes; | 764 | stat->s_ntargcpu += remotes + locals; |
765 | stat->s_ntargremotes += remotes; | ||
766 | stat->s_ntarglocals += locals; | ||
765 | remotes = bau_uvhub_weight(&bau_desc->distribution); | 767 | remotes = bau_uvhub_weight(&bau_desc->distribution); |
766 | stat->s_ntarguvhub += remotes; | 768 | |
767 | if (remotes >= 16) | 769 | /* uvhub statistics */ |
770 | hubs = bau_uvhub_weight(&bau_desc->distribution); | ||
771 | if (locals) { | ||
772 | stat->s_ntarglocaluvhub++; | ||
773 | stat->s_ntargremoteuvhub += (hubs - 1); | ||
774 | } else | ||
775 | stat->s_ntargremoteuvhub += hubs; | ||
776 | stat->s_ntarguvhub += hubs; | ||
777 | if (hubs >= 16) | ||
768 | stat->s_ntarguvhub16++; | 778 | stat->s_ntarguvhub16++; |
769 | else if (remotes >= 8) | 779 | else if (hubs >= 8) |
770 | stat->s_ntarguvhub8++; | 780 | stat->s_ntarguvhub8++; |
771 | else if (remotes >= 4) | 781 | else if (hubs >= 4) |
772 | stat->s_ntarguvhub4++; | 782 | stat->s_ntarguvhub4++; |
773 | else if (remotes >= 2) | 783 | else if (hubs >= 2) |
774 | stat->s_ntarguvhub2++; | 784 | stat->s_ntarguvhub2++; |
775 | else | 785 | else |
776 | stat->s_ntarguvhub1++; | 786 | stat->s_ntarguvhub1++; |
@@ -779,10 +789,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, | |||
779 | bau_desc->payload.sending_cpu = cpu; | 789 | bau_desc->payload.sending_cpu = cpu; |
780 | 790 | ||
781 | /* | 791 | /* |
782 | * uv_flush_send_and_wait returns null if all cpu's were messaged, or | 792 | * uv_flush_send_and_wait returns 0 if all cpu's were messaged, |
783 | * the adjusted flush_mask if any cpu's were not messaged. | 793 | * or 1 if it gave up and the original cpumask should be returned. |
784 | */ | 794 | */ |
785 | return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); | 795 | if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) |
796 | return NULL; | ||
797 | else | ||
798 | return cpumask; | ||
786 | } | 799 | } |
787 | 800 | ||
788 | /* | 801 | /* |
@@ -810,7 +823,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) | |||
810 | 823 | ||
811 | time_start = get_cycles(); | 824 | time_start = get_cycles(); |
812 | bcp = &per_cpu(bau_control, smp_processor_id()); | 825 | bcp = &per_cpu(bau_control, smp_processor_id()); |
813 | stat = &per_cpu(ptcstats, smp_processor_id()); | 826 | stat = bcp->statp; |
814 | msgdesc.va_queue_first = bcp->va_queue_first; | 827 | msgdesc.va_queue_first = bcp->va_queue_first; |
815 | msgdesc.va_queue_last = bcp->va_queue_last; | 828 | msgdesc.va_queue_last = bcp->va_queue_last; |
816 | msg = bcp->bau_msg_head; | 829 | msg = bcp->bau_msg_head; |
@@ -908,12 +921,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data) | |||
908 | } | 921 | } |
909 | 922 | ||
910 | static inline unsigned long long | 923 | static inline unsigned long long |
911 | millisec_2_cycles(unsigned long millisec) | 924 | microsec_2_cycles(unsigned long microsec) |
912 | { | 925 | { |
913 | unsigned long ns; | 926 | unsigned long ns; |
914 | unsigned long long cyc; | 927 | unsigned long long cyc; |
915 | 928 | ||
916 | ns = millisec * 1000; | 929 | ns = microsec * 1000; |
917 | cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); | 930 | cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); |
918 | return cyc; | 931 | return cyc; |
919 | } | 932 | } |
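The renamed helper now takes microseconds; it scales to nanoseconds and divides by the per-cpu cyc2ns value, which the kernel keeps pre-shifted left by CYC2NS_SCALE_FACTOR. A standalone model with the shift assumed to be 10, the usual x86 value, since the macro's definition is not visible in this hunk:

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10	/* assumed; not shown in this patch */

/* microseconds -> nanoseconds, then divide by the pre-shifted
 * nanoseconds-per-cycle value, as microsec_2_cycles() does */
static unsigned long long microsec_2_cycles(unsigned long microsec,
					    unsigned long cyc2ns)
{
	unsigned long ns = microsec * 1000;

	return ((unsigned long long)ns << CYC2NS_SCALE_FACTOR) / cyc2ns;
}

int main(void)
{
	/* a 1 GHz cpu: 1 ns per cycle, so cyc2ns == 1 << CYC2NS_SCALE_FACTOR */
	printf("%llu cycles\n",
	       microsec_2_cycles(1000, 1UL << CYC2NS_SCALE_FACTOR));
	return 0;
}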
@@ -931,15 +944,19 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) | |||
931 | 944 | ||
932 | if (!cpu) { | 945 | if (!cpu) { |
933 | seq_printf(file, | 946 | seq_printf(file, |
934 | "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); | 947 | "# cpu sent stime self locals remotes ncpus localhub "); |
948 | seq_printf(file, | ||
949 | "remotehub numuvhubs numuvhubs16 numuvhubs8 "); | ||
935 | seq_printf(file, | 950 | seq_printf(file, |
936 | "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); | 951 | "numuvhubs4 numuvhubs2 numuvhubs1 dto "); |
937 | seq_printf(file, | 952 | seq_printf(file, |
938 | "retries rok resetp resett giveup sto bz throt "); | 953 | "retries rok resetp resett giveup sto bz throt "); |
939 | seq_printf(file, | 954 | seq_printf(file, |
940 | "sw_ack recv rtime all "); | 955 | "sw_ack recv rtime all "); |
941 | seq_printf(file, | 956 | seq_printf(file, |
942 | "one mult none retry canc nocan reset rcan\n"); | 957 | "one mult none retry canc nocan reset rcan "); |
958 | seq_printf(file, | ||
959 | "disable enable\n"); | ||
943 | } | 960 | } |
944 | if (cpu < num_possible_cpus() && cpu_online(cpu)) { | 961 | if (cpu < num_possible_cpus() && cpu_online(cpu)) { |
945 | stat = &per_cpu(ptcstats, cpu); | 962 | stat = &per_cpu(ptcstats, cpu); |
@@ -947,18 +964,23 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) | |||
947 | seq_printf(file, | 964 | seq_printf(file, |
948 | "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", | 965 | "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", |
949 | cpu, stat->s_requestor, cycles_2_us(stat->s_time), | 966 | cpu, stat->s_requestor, cycles_2_us(stat->s_time), |
950 | stat->s_ntarguvhub, stat->s_ntarguvhub16, | 967 | stat->s_ntargself, stat->s_ntarglocals, |
968 | stat->s_ntargremotes, stat->s_ntargcpu, | ||
969 | stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, | ||
970 | stat->s_ntarguvhub, stat->s_ntarguvhub16); | ||
971 | seq_printf(file, "%ld %ld %ld %ld %ld ", | ||
951 | stat->s_ntarguvhub8, stat->s_ntarguvhub4, | 972 | stat->s_ntarguvhub8, stat->s_ntarguvhub4, |
952 | stat->s_ntarguvhub2, stat->s_ntarguvhub1, | 973 | stat->s_ntarguvhub2, stat->s_ntarguvhub1, |
953 | stat->s_ntargcpu, stat->s_dtimeout); | 974 | stat->s_dtimeout); |
954 | seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", | 975 | seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", |
955 | stat->s_retry_messages, stat->s_retriesok, | 976 | stat->s_retry_messages, stat->s_retriesok, |
956 | stat->s_resets_plug, stat->s_resets_timeout, | 977 | stat->s_resets_plug, stat->s_resets_timeout, |
957 | stat->s_giveup, stat->s_stimeout, | 978 | stat->s_giveup, stat->s_stimeout, |
958 | stat->s_busy, stat->s_throttles); | 979 | stat->s_busy, stat->s_throttles); |
980 | |||
959 | /* destination side statistics */ | 981 | /* destination side statistics */ |
960 | seq_printf(file, | 982 | seq_printf(file, |
961 | "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", | 983 | "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", |
962 | uv_read_global_mmr64(uv_cpu_to_pnode(cpu), | 984 | uv_read_global_mmr64(uv_cpu_to_pnode(cpu), |
963 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), | 985 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), |
964 | stat->d_requestee, cycles_2_us(stat->d_time), | 986 | stat->d_requestee, cycles_2_us(stat->d_time), |
@@ -966,15 +988,36 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) | |||
966 | stat->d_nomsg, stat->d_retries, stat->d_canceled, | 988 | stat->d_nomsg, stat->d_retries, stat->d_canceled, |
967 | stat->d_nocanceled, stat->d_resets, | 989 | stat->d_nocanceled, stat->d_resets, |
968 | stat->d_rcanceled); | 990 | stat->d_rcanceled); |
991 | seq_printf(file, "%ld %ld\n", | ||
992 | stat->s_bau_disabled, stat->s_bau_reenabled); | ||
969 | } | 993 | } |
970 | 994 | ||
971 | return 0; | 995 | return 0; |
972 | } | 996 | } |
973 | 997 | ||
974 | /* | 998 | /* |
999 | * Display the tunables through debugfs | ||
1000 | */ | ||
1001 | static ssize_t tunables_read(struct file *file, char __user *userbuf, | ||
1002 | size_t count, loff_t *ppos) | ||
1003 | { | ||
1004 | char buf[300]; | ||
1005 | int ret; | ||
1006 | |||
1007 | ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", | ||
1008 | "max_bau_concurrent plugged_delay plugsb4reset", | ||
1009 | "timeoutsb4reset ipi_reset_limit complete_threshold", | ||
1010 | "congested_response_us congested_reps congested_period", | ||
1011 | max_bau_concurrent, plugged_delay, plugsb4reset, | ||
1012 | timeoutsb4reset, ipi_reset_limit, complete_threshold, | ||
1013 | congested_response_us, congested_reps, congested_period); | ||
1014 | |||
1015 | return simple_read_from_buffer(userbuf, count, ppos, buf, ret); | ||
1016 | } | ||
1017 | |||
1018 | /* | ||
975 | * -1: reset the statistics | 1019 | ||
976 | * 0: display meaning of the statistics | 1020 | * 0: display meaning of the statistics |
977 | * >0: maximum concurrent active descriptors per uvhub (throttle) | ||
978 | */ | 1021 | */ |
979 | static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, | 1022 | static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, |
980 | size_t count, loff_t *data) | 1023 | size_t count, loff_t *data) |
@@ -983,7 +1026,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, | |||
983 | long input_arg; | 1026 | long input_arg; |
984 | char optstr[64]; | 1027 | char optstr[64]; |
985 | struct ptc_stats *stat; | 1028 | struct ptc_stats *stat; |
986 | struct bau_control *bcp; | ||
987 | 1029 | ||
988 | if (count == 0 || count > sizeof(optstr)) | 1030 | if (count == 0 || count > sizeof(optstr)) |
989 | return -EINVAL; | 1031 | return -EINVAL; |
@@ -1059,29 +1101,158 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, | |||
1059 | "reset: number of ipi-style reset requests processed\n"); | 1101 | "reset: number of ipi-style reset requests processed\n"); |
1060 | printk(KERN_DEBUG | 1102 | printk(KERN_DEBUG |
1061 | "rcan: number of messages canceled by reset requests\n"); | 1103 | "rcan: number of messages canceled by reset requests\n"); |
1104 | printk(KERN_DEBUG | ||
1105 | "disable: number of times use of the BAU was disabled\n"); | ||
1106 | printk(KERN_DEBUG | ||
1107 | "enable: number of times use of the BAU was re-enabled\n"); | ||
1062 | } else if (input_arg == -1) { | 1108 | } else if (input_arg == -1) { |
1063 | for_each_present_cpu(cpu) { | 1109 | for_each_present_cpu(cpu) { |
1064 | stat = &per_cpu(ptcstats, cpu); | 1110 | stat = &per_cpu(ptcstats, cpu); |
1065 | memset(stat, 0, sizeof(struct ptc_stats)); | 1111 | memset(stat, 0, sizeof(struct ptc_stats)); |
1066 | } | 1112 | } |
1067 | } else { | 1113 | } |
1068 | uv_bau_max_concurrent = input_arg; | 1114 | |
1069 | bcp = &per_cpu(bau_control, smp_processor_id()); | 1115 | return count; |
1070 | if (uv_bau_max_concurrent < 1 || | 1116 | } |
1071 | uv_bau_max_concurrent > bcp->cpus_in_uvhub) { | 1117 | |
1072 | printk(KERN_DEBUG | 1118 | static int local_atoi(const char *name) |
1073 | "Error: BAU max concurrent %d; %d is invalid\n", | 1119 | { |
1074 | bcp->max_concurrent, uv_bau_max_concurrent); | 1120 | int val = 0; |
1075 | return -EINVAL; | 1121 | |
1076 | } | 1122 | for (;; name++) { |
1077 | printk(KERN_DEBUG "Set BAU max concurrent:%d\n", | 1123 | switch (*name) { |
1078 | uv_bau_max_concurrent); | 1124 | case '0' ... '9': |
1079 | for_each_present_cpu(cpu) { | 1125 | val = 10*val+(*name-'0'); |
1080 | bcp = &per_cpu(bau_control, cpu); | 1126 | break; |
1081 | bcp->max_concurrent = uv_bau_max_concurrent; | 1127 | default: |
1128 | return val; | ||
1082 | } | 1129 | } |
1083 | } | 1130 | } |
1131 | } | ||
1132 | |||
1133 | /* | ||
1134 | * set the tunables | ||
1135 | * 0 values reset them to defaults | ||
1136 | */ | ||
1137 | static ssize_t tunables_write(struct file *file, const char __user *user, | ||
1138 | size_t count, loff_t *data) | ||
1139 | { | ||
1140 | int cpu; | ||
1141 | int cnt = 0; | ||
1142 | int val; | ||
1143 | char *p; | ||
1144 | char *q; | ||
1145 | char instr[64]; | ||
1146 | struct bau_control *bcp; | ||
1147 | |||
1148 | if (count == 0 || count > sizeof(instr)-1) | ||
1149 | return -EINVAL; | ||
1150 | if (copy_from_user(instr, user, count)) | ||
1151 | return -EFAULT; | ||
1084 | 1152 | ||
1153 | instr[count] = '\0'; | ||
1154 | /* count the fields */ | ||
1155 | p = instr + strspn(instr, WHITESPACE); | ||
1156 | q = p; | ||
1157 | for (; *p; p = q + strspn(q, WHITESPACE)) { | ||
1158 | q = p + strcspn(p, WHITESPACE); | ||
1159 | cnt++; | ||
1160 | if (q == p) | ||
1161 | break; | ||
1162 | } | ||
1163 | if (cnt != 9) { | ||
1164 | printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); | ||
1165 | return -EINVAL; | ||
1166 | } | ||
1167 | |||
1168 | p = instr + strspn(instr, WHITESPACE); | ||
1169 | q = p; | ||
1170 | for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { | ||
1171 | q = p + strcspn(p, WHITESPACE); | ||
1172 | val = local_atoi(p); | ||
1173 | switch (cnt) { | ||
1174 | case 0: | ||
1175 | if (val == 0) { | ||
1176 | max_bau_concurrent = MAX_BAU_CONCURRENT; | ||
1177 | max_bau_concurrent_constant = | ||
1178 | MAX_BAU_CONCURRENT; | ||
1179 | continue; | ||
1180 | } | ||
1181 | bcp = &per_cpu(bau_control, smp_processor_id()); | ||
1182 | if (val < 1 || val > bcp->cpus_in_uvhub) { | ||
1183 | printk(KERN_DEBUG | ||
1184 | "Error: BAU max concurrent %d is invalid\n", | ||
1185 | val); | ||
1186 | return -EINVAL; | ||
1187 | } | ||
1188 | max_bau_concurrent = val; | ||
1189 | max_bau_concurrent_constant = val; | ||
1190 | continue; | ||
1191 | case 1: | ||
1192 | if (val == 0) | ||
1193 | plugged_delay = PLUGGED_DELAY; | ||
1194 | else | ||
1195 | plugged_delay = val; | ||
1196 | continue; | ||
1197 | case 2: | ||
1198 | if (val == 0) | ||
1199 | plugsb4reset = PLUGSB4RESET; | ||
1200 | else | ||
1201 | plugsb4reset = val; | ||
1202 | continue; | ||
1203 | case 3: | ||
1204 | if (val == 0) | ||
1205 | timeoutsb4reset = TIMEOUTSB4RESET; | ||
1206 | else | ||
1207 | timeoutsb4reset = val; | ||
1208 | continue; | ||
1209 | case 4: | ||
1210 | if (val == 0) | ||
1211 | ipi_reset_limit = IPI_RESET_LIMIT; | ||
1212 | else | ||
1213 | ipi_reset_limit = val; | ||
1214 | continue; | ||
1215 | case 5: | ||
1216 | if (val == 0) | ||
1217 | complete_threshold = COMPLETE_THRESHOLD; | ||
1218 | else | ||
1219 | complete_threshold = val; | ||
1220 | continue; | ||
1221 | case 6: | ||
1222 | if (val == 0) | ||
1223 | congested_response_us = CONGESTED_RESPONSE_US; | ||
1224 | else | ||
1225 | congested_response_us = val; | ||
1226 | continue; | ||
1227 | case 7: | ||
1228 | if (val == 0) | ||
1229 | congested_reps = CONGESTED_REPS; | ||
1230 | else | ||
1231 | congested_reps = val; | ||
1232 | continue; | ||
1233 | case 8: | ||
1234 | if (val == 0) | ||
1235 | congested_period = CONGESTED_PERIOD; | ||
1236 | else | ||
1237 | congested_period = val; | ||
1238 | continue; | ||
1239 | } | ||
1240 | if (q == p) | ||
1241 | break; | ||
1242 | } | ||
1243 | for_each_present_cpu(cpu) { | ||
1244 | bcp = &per_cpu(bau_control, cpu); | ||
1245 | bcp->max_bau_concurrent = max_bau_concurrent; | ||
1246 | bcp->max_bau_concurrent_constant = max_bau_concurrent; | ||
1247 | bcp->plugged_delay = plugged_delay; | ||
1248 | bcp->plugsb4reset = plugsb4reset; | ||
1249 | bcp->timeoutsb4reset = timeoutsb4reset; | ||
1250 | bcp->ipi_reset_limit = ipi_reset_limit; | ||
1251 | bcp->complete_threshold = complete_threshold; | ||
1252 | bcp->congested_response_us = congested_response_us; | ||
1253 | bcp->congested_reps = congested_reps; | ||
1254 | bcp->congested_period = congested_period; | ||
1255 | } | ||
1085 | return count; | 1256 | return count; |
1086 | } | 1257 | } |
1087 | 1258 | ||
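The tunables_write() parser above expects exactly nine whitespace-separated integers, applied in the order of the switch cases (max_bau_concurrent, plugged_delay, plugsb4reset, timeoutsb4reset, ipi_reset_limit, complete_threshold, congested_response_us, congested_reps, congested_period); a value of 0 restores the compiled-in default for that field. As a rough user-space sketch (not part of this patch; the debugfs path and the nine values below are illustrative assumptions only):

	/* Illustrative only: write nine tunables in the order tunables_write() expects.
	 * The path assumes debugfs is mounted at /sys/kernel/debug and that
	 * UV_BAU_TUNABLES_DIR/UV_BAU_TUNABLES_FILE expand to "sgi_uv"/"bau_tunables". */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* 0 means "use the compiled-in default" for that field. */
		const char *vals = "2 10 100 0 0 5 1000 10 1000\n";
		int fd = open("/sys/kernel/debug/sgi_uv/bau_tunables", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, vals, strlen(vals)) < 0)
			perror("write");
		close(fd);
		return 0;
	}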
@@ -1097,6 +1268,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file) | |||
1097 | return seq_open(file, &uv_ptc_seq_ops); | 1268 | return seq_open(file, &uv_ptc_seq_ops); |
1098 | } | 1269 | } |
1099 | 1270 | ||
1271 | static int tunables_open(struct inode *inode, struct file *file) | ||
1272 | { | ||
1273 | return 0; | ||
1274 | } | ||
1275 | |||
1100 | static const struct file_operations proc_uv_ptc_operations = { | 1276 | static const struct file_operations proc_uv_ptc_operations = { |
1101 | .open = uv_ptc_proc_open, | 1277 | .open = uv_ptc_proc_open, |
1102 | .read = seq_read, | 1278 | .read = seq_read, |
@@ -1105,6 +1281,12 @@ static const struct file_operations proc_uv_ptc_operations = { | |||
1105 | .release = seq_release, | 1281 | .release = seq_release, |
1106 | }; | 1282 | }; |
1107 | 1283 | ||
1284 | static const struct file_operations tunables_fops = { | ||
1285 | .open = tunables_open, | ||
1286 | .read = tunables_read, | ||
1287 | .write = tunables_write, | ||
1288 | }; | ||
1289 | |||
1108 | static int __init uv_ptc_init(void) | 1290 | static int __init uv_ptc_init(void) |
1109 | { | 1291 | { |
1110 | struct proc_dir_entry *proc_uv_ptc; | 1292 | struct proc_dir_entry *proc_uv_ptc; |
@@ -1119,6 +1301,20 @@ static int __init uv_ptc_init(void) | |||
1119 | UV_PTC_BASENAME); | 1301 | UV_PTC_BASENAME); |
1120 | return -EINVAL; | 1302 | return -EINVAL; |
1121 | } | 1303 | } |
1304 | |||
1305 | tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); | ||
1306 | if (!tunables_dir) { | ||
1307 | printk(KERN_ERR "unable to create debugfs directory %s\n", | ||
1308 | UV_BAU_TUNABLES_DIR); | ||
1309 | return -EINVAL; | ||
1310 | } | ||
1311 | tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, | ||
1312 | tunables_dir, NULL, &tunables_fops); | ||
1313 | if (!tunables_file) { | ||
1314 | printk(KERN_ERR "unable to create debugfs file %s\n", | ||
1315 | UV_BAU_TUNABLES_FILE); | ||
1316 | return -EINVAL; | ||
1317 | } | ||
1122 | return 0; | 1318 | return 0; |
1123 | } | 1319 | } |
1124 | 1320 | ||
@@ -1259,15 +1455,45 @@ static void __init uv_init_uvhub(int uvhub, int vector) | |||
1259 | } | 1455 | } |
1260 | 1456 | ||
1261 | /* | 1457 | /* |
1458 | * We will set BAU_MISC_CONTROL with a timeout period. | ||
1459 | * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. | ||
1460 | * So the destination timeout period has to be calculated from them. | ||
1461 | */ | ||
1462 | static int | ||
1463 | calculate_destination_timeout(void) | ||
1464 | { | ||
1465 | unsigned long mmr_image; | ||
1466 | int mult1; | ||
1467 | int mult2; | ||
1468 | int index; | ||
1469 | int base; | ||
1470 | int ret; | ||
1471 | unsigned long ts_ns; | ||
1472 | |||
1473 | mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; | ||
1474 | mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); | ||
1475 | index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; | ||
1476 | mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); | ||
1477 | mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; | ||
1478 | base = timeout_base_ns[index]; | ||
1479 | ts_ns = base * mult1 * mult2; | ||
1480 | ret = ts_ns / 1000; | ||
1481 | return ret; | ||
1482 | } | ||
1483 | |||
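As a worked illustration of the arithmetic above (hypothetical numbers, not values read from real UV hardware): with timeout_base_ns[index] = 2560, mult1 = 3 and mult2 = 10, ts_ns = 2560 * 3 * 10 = 76800 ns, and the function returns 76 microseconds after the integer division by 1000.

	/* Stand-alone sketch of the same calculation with made-up register fields. */
	#include <stdio.h>

	int main(void)
	{
		int base = 2560;	/* hypothetical timeout_base_ns[index] */
		int mult1 = 3;		/* hypothetical soft-ack timeout multiplier */
		int mult2 = 10;		/* hypothetical transaction-timeout multiplier */
		unsigned long ts_ns = base * mult1 * mult2;

		printf("%lu ns -> %lu us\n", ts_ns, ts_ns / 1000);
		return 0;
	}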
1484 | /* | ||
1262 | * initialize the bau_control structure for each cpu | 1485 | * initialize the bau_control structure for each cpu |
1263 | */ | 1486 | */ |
1264 | static void uv_init_per_cpu(int nuvhubs) | 1487 | static void __init uv_init_per_cpu(int nuvhubs) |
1265 | { | 1488 | { |
1266 | int i, j, k; | 1489 | int i; |
1267 | int cpu; | 1490 | int cpu; |
1268 | int pnode; | 1491 | int pnode; |
1269 | int uvhub; | 1492 | int uvhub; |
1493 | int have_hmaster; | ||
1270 | short socket = 0; | 1494 | short socket = 0; |
1495 | unsigned short socket_mask; | ||
1496 | unsigned char *uvhub_mask; | ||
1271 | struct bau_control *bcp; | 1497 | struct bau_control *bcp; |
1272 | struct uvhub_desc *bdp; | 1498 | struct uvhub_desc *bdp; |
1273 | struct socket_desc *sdp; | 1499 | struct socket_desc *sdp; |
@@ -1278,7 +1504,7 @@ static void uv_init_per_cpu(int nuvhubs) | |||
1278 | short cpu_number[16]; | 1504 | short cpu_number[16]; |
1279 | }; | 1505 | }; |
1280 | struct uvhub_desc { | 1506 | struct uvhub_desc { |
1281 | short num_sockets; | 1507 | unsigned short socket_mask; |
1282 | short num_cpus; | 1508 | short num_cpus; |
1283 | short uvhub; | 1509 | short uvhub; |
1284 | short pnode; | 1510 | short pnode; |
@@ -1286,57 +1512,84 @@ static void uv_init_per_cpu(int nuvhubs) | |||
1286 | }; | 1512 | }; |
1287 | struct uvhub_desc *uvhub_descs; | 1513 | struct uvhub_desc *uvhub_descs; |
1288 | 1514 | ||
1515 | timeout_us = calculate_destination_timeout(); | ||
1516 | |||
1289 | uvhub_descs = (struct uvhub_desc *) | 1517 | uvhub_descs = (struct uvhub_desc *) |
1290 | kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); | 1518 | kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); |
1291 | memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); | 1519 | memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); |
1520 | uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); | ||
1292 | for_each_present_cpu(cpu) { | 1521 | for_each_present_cpu(cpu) { |
1293 | bcp = &per_cpu(bau_control, cpu); | 1522 | bcp = &per_cpu(bau_control, cpu); |
1294 | memset(bcp, 0, sizeof(struct bau_control)); | 1523 | memset(bcp, 0, sizeof(struct bau_control)); |
1295 | spin_lock_init(&bcp->masks_lock); | ||
1296 | bcp->max_concurrent = uv_bau_max_concurrent; | ||
1297 | pnode = uv_cpu_hub_info(cpu)->pnode; | 1524 | pnode = uv_cpu_hub_info(cpu)->pnode; |
1298 | uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; | 1525 | uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; |
1526 | *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); | ||
1299 | bdp = &uvhub_descs[uvhub]; | 1527 | bdp = &uvhub_descs[uvhub]; |
1300 | bdp->num_cpus++; | 1528 | bdp->num_cpus++; |
1301 | bdp->uvhub = uvhub; | 1529 | bdp->uvhub = uvhub; |
1302 | bdp->pnode = pnode; | 1530 | bdp->pnode = pnode; |
1303 | /* time interval to catch a hardware stay-busy bug */ | 1531 | /* kludge: 'assuming' one node per socket, and assuming that |
1304 | bcp->timeout_interval = millisec_2_cycles(3); | 1532 | disabling a socket just leaves a gap in node numbers */ |
1305 | /* kludge: assume uv_hub.h is constant */ | 1533 | socket = (cpu_to_node(cpu) & 1); |
1306 | socket = (cpu_physical_id(cpu)>>5)&1; | 1534 | bdp->socket_mask |= (1 << socket); |
1307 | if (socket >= bdp->num_sockets) | ||
1308 | bdp->num_sockets = socket+1; | ||
1309 | sdp = &bdp->socket[socket]; | 1535 | sdp = &bdp->socket[socket]; |
1310 | sdp->cpu_number[sdp->num_cpus] = cpu; | 1536 | sdp->cpu_number[sdp->num_cpus] = cpu; |
1311 | sdp->num_cpus++; | 1537 | sdp->num_cpus++; |
1312 | } | 1538 | } |
1313 | socket = 0; | 1539 | for (uvhub = 0; uvhub < nuvhubs; uvhub++) { |
1314 | for_each_possible_blade(uvhub) { | 1540 | if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) |
1541 | continue; | ||
1542 | have_hmaster = 0; | ||
1315 | bdp = &uvhub_descs[uvhub]; | 1543 | bdp = &uvhub_descs[uvhub]; |
1316 | for (i = 0; i < bdp->num_sockets; i++) { | 1544 | socket_mask = bdp->socket_mask; |
1317 | sdp = &bdp->socket[i]; | 1545 | socket = 0; |
1318 | for (j = 0; j < sdp->num_cpus; j++) { | 1546 | while (socket_mask) { |
1319 | cpu = sdp->cpu_number[j]; | 1547 | if (!(socket_mask & 1)) |
1548 | goto nextsocket; | ||
1549 | sdp = &bdp->socket[socket]; | ||
1550 | for (i = 0; i < sdp->num_cpus; i++) { | ||
1551 | cpu = sdp->cpu_number[i]; | ||
1320 | bcp = &per_cpu(bau_control, cpu); | 1552 | bcp = &per_cpu(bau_control, cpu); |
1321 | bcp->cpu = cpu; | 1553 | bcp->cpu = cpu; |
1322 | if (j == 0) { | 1554 | if (i == 0) { |
1323 | smaster = bcp; | 1555 | smaster = bcp; |
1324 | if (i == 0) | 1556 | if (!have_hmaster) { |
1557 | have_hmaster++; | ||
1325 | hmaster = bcp; | 1558 | hmaster = bcp; |
1559 | } | ||
1326 | } | 1560 | } |
1327 | bcp->cpus_in_uvhub = bdp->num_cpus; | 1561 | bcp->cpus_in_uvhub = bdp->num_cpus; |
1328 | bcp->cpus_in_socket = sdp->num_cpus; | 1562 | bcp->cpus_in_socket = sdp->num_cpus; |
1329 | bcp->socket_master = smaster; | 1563 | bcp->socket_master = smaster; |
1564 | bcp->uvhub = bdp->uvhub; | ||
1330 | bcp->uvhub_master = hmaster; | 1565 | bcp->uvhub_master = hmaster; |
1331 | for (k = 0; k < DEST_Q_SIZE; k++) | 1566 | bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> |
1332 | bcp->socket_acknowledge_count[k] = 0; | 1567 | blade_processor_id; |
1333 | bcp->uvhub_cpu = | ||
1334 | uv_cpu_hub_info(cpu)->blade_processor_id; | ||
1335 | } | 1568 | } |
1569 | nextsocket: | ||
1336 | socket++; | 1570 | socket++; |
1571 | socket_mask = (socket_mask >> 1); | ||
1337 | } | 1572 | } |
1338 | } | 1573 | } |
1339 | kfree(uvhub_descs); | 1574 | kfree(uvhub_descs); |
1575 | kfree(uvhub_mask); | ||
1576 | for_each_present_cpu(cpu) { | ||
1577 | bcp = &per_cpu(bau_control, cpu); | ||
1578 | bcp->baudisabled = 0; | ||
1579 | bcp->statp = &per_cpu(ptcstats, cpu); | ||
1580 | /* time interval to catch a hardware stay-busy bug */ | ||
1581 | bcp->timeout_interval = microsec_2_cycles(2*timeout_us); | ||
1582 | bcp->max_bau_concurrent = max_bau_concurrent; | ||
1583 | bcp->max_bau_concurrent_constant = max_bau_concurrent; | ||
1584 | bcp->plugged_delay = plugged_delay; | ||
1585 | bcp->plugsb4reset = plugsb4reset; | ||
1586 | bcp->timeoutsb4reset = timeoutsb4reset; | ||
1587 | bcp->ipi_reset_limit = ipi_reset_limit; | ||
1588 | bcp->complete_threshold = complete_threshold; | ||
1589 | bcp->congested_response_us = congested_response_us; | ||
1590 | bcp->congested_reps = congested_reps; | ||
1591 | bcp->congested_period = congested_period; | ||
1592 | } | ||
1340 | } | 1593 | } |
1341 | 1594 | ||
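The rewritten uv_init_per_cpu() above records populated hubs in a byte array (one bit per uvhub) and populated sockets in a per-hub bitmask, then walks only the set bits so that gaps left by disabled sockets or empty hubs are skipped. A minimal stand-alone sketch of that bookkeeping pattern (names and sizes are illustrative, not taken from this file):

	#include <stdio.h>

	#define MAX_HUBS 64

	int main(void)
	{
		unsigned char hub_mask[(MAX_HUBS + 7) / 8] = { 0 };
		unsigned short socket_mask = 0;
		int hub, socket;

		/* mark hub 3 as present and sockets 0 and 1 as populated */
		hub = 3;
		hub_mask[hub / 8] |= 1 << (hub % 8);
		socket_mask |= (1 << 0) | (1 << 1);

		for (hub = 0; hub < MAX_HUBS; hub++) {
			unsigned short mask = socket_mask;

			if (!(hub_mask[hub / 8] & (1 << (hub % 8))))
				continue;	/* hub has no CPUs */
			for (socket = 0; mask; socket++, mask >>= 1) {
				if (!(mask & 1))
					continue;	/* gap left by a disabled socket */
				printf("hub %d socket %d populated\n", hub, socket);
			}
		}
		return 0;
	}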
1342 | /* | 1595 | /* |
@@ -1361,10 +1614,11 @@ static int __init uv_bau_init(void) | |||
1361 | zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), | 1614 | zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), |
1362 | GFP_KERNEL, cpu_to_node(cur_cpu)); | 1615 | GFP_KERNEL, cpu_to_node(cur_cpu)); |
1363 | 1616 | ||
1364 | uv_bau_max_concurrent = MAX_BAU_CONCURRENT; | ||
1365 | uv_nshift = uv_hub_info->m_val; | 1617 | uv_nshift = uv_hub_info->m_val; |
1366 | uv_mmask = (1UL << uv_hub_info->m_val) - 1; | 1618 | uv_mmask = (1UL << uv_hub_info->m_val) - 1; |
1367 | nuvhubs = uv_num_possible_blades(); | 1619 | nuvhubs = uv_num_possible_blades(); |
1620 | spin_lock_init(&disable_lock); | ||
1621 | congested_cycles = microsec_2_cycles(congested_response_us); | ||
1368 | 1622 | ||
1369 | uv_init_per_cpu(nuvhubs); | 1623 | uv_init_per_cpu(nuvhubs); |
1370 | 1624 | ||
@@ -1383,15 +1637,19 @@ static int __init uv_bau_init(void) | |||
1383 | alloc_intr_gate(vector, uv_bau_message_intr1); | 1637 | alloc_intr_gate(vector, uv_bau_message_intr1); |
1384 | 1638 | ||
1385 | for_each_possible_blade(uvhub) { | 1639 | for_each_possible_blade(uvhub) { |
1386 | pnode = uv_blade_to_pnode(uvhub); | 1640 | if (uv_blade_nr_possible_cpus(uvhub)) { |
1387 | /* INIT the bau */ | 1641 | pnode = uv_blade_to_pnode(uvhub); |
1388 | uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, | 1642 | /* INIT the bau */ |
1389 | ((unsigned long)1 << 63)); | 1643 | uv_write_global_mmr64(pnode, |
1390 | mmr = 1; /* should be 1 to broadcast to both sockets */ | 1644 | UVH_LB_BAU_SB_ACTIVATION_CONTROL, |
1391 | uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr); | 1645 | ((unsigned long)1 << 63)); |
1646 | mmr = 1; /* should be 1 to broadcast to both sockets */ | ||
1647 | uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, | ||
1648 | mmr); | ||
1649 | } | ||
1392 | } | 1650 | } |
1393 | 1651 | ||
1394 | return 0; | 1652 | return 0; |
1395 | } | 1653 | } |
1396 | core_initcall(uv_bau_init); | 1654 | core_initcall(uv_bau_init); |
1397 | core_initcall(uv_ptc_init); | 1655 | fs_initcall(uv_ptc_init); |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 725ef4d17cd5..60788dee0f8a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -392,7 +392,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
392 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) | 392 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) |
393 | == NOTIFY_STOP) | 393 | == NOTIFY_STOP) |
394 | return; | 394 | return; |
395 | |||
395 | #ifdef CONFIG_X86_LOCAL_APIC | 396 | #ifdef CONFIG_X86_LOCAL_APIC |
397 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | ||
398 | == NOTIFY_STOP) | ||
399 | return; | ||
400 | |||
401 | #ifndef CONFIG_LOCKUP_DETECTOR | ||
396 | /* | 402 | /* |
397 | * Ok, so this is none of the documented NMI sources, | 403 | * Ok, so this is none of the documented NMI sources, |
398 | * so it must be the NMI watchdog. | 404 | * so it must be the NMI watchdog. |
@@ -400,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
400 | if (nmi_watchdog_tick(regs, reason)) | 406 | if (nmi_watchdog_tick(regs, reason)) |
401 | return; | 407 | return; |
402 | if (!do_nmi_callback(regs, cpu)) | 408 | if (!do_nmi_callback(regs, cpu)) |
409 | #endif /* !CONFIG_LOCKUP_DETECTOR */ | ||
403 | unknown_nmi_error(reason, regs); | 410 | unknown_nmi_error(reason, regs); |
404 | #else | 411 | #else |
405 | unknown_nmi_error(reason, regs); | 412 | unknown_nmi_error(reason, regs); |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9faf91ae1841..ce8e50239332 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = { | |||
751 | .read = read_tsc, | 751 | .read = read_tsc, |
752 | .resume = resume_tsc, | 752 | .resume = resume_tsc, |
753 | .mask = CLOCKSOURCE_MASK(64), | 753 | .mask = CLOCKSOURCE_MASK(64), |
754 | .shift = 22, | ||
755 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | | 754 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | |
756 | CLOCK_SOURCE_MUST_VERIFY, | 755 | CLOCK_SOURCE_MUST_VERIFY, |
757 | #ifdef CONFIG_X86_64 | 756 | #ifdef CONFIG_X86_64 |
@@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void) | |||
845 | 844 | ||
846 | static void __init init_tsc_clocksource(void) | 845 | static void __init init_tsc_clocksource(void) |
847 | { | 846 | { |
848 | clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, | ||
849 | clocksource_tsc.shift); | ||
850 | if (tsc_clocksource_reliable) | 847 | if (tsc_clocksource_reliable) |
851 | clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; | 848 | clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; |
852 | /* lower the rating if we already know its unstable: */ | 849 | /* lower the rating if we already know its unstable: */ |
@@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void) | |||
854 | clocksource_tsc.rating = 0; | 851 | clocksource_tsc.rating = 0; |
855 | clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; | 852 | clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; |
856 | } | 853 | } |
857 | clocksource_register(&clocksource_tsc); | 854 | clocksource_register_khz(&clocksource_tsc, tsc_khz); |
858 | } | 855 | } |
859 | 856 | ||
860 | #ifdef CONFIG_X86_64 | 857 | #ifdef CONFIG_X86_64 |
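The tsc.c hunk above drops the hard-coded shift and the manual clocksource_khz2mult() call in favor of clocksource_register_khz(), which derives the mult/shift pair from the TSC frequency at registration time. For background, those factors implement the scaling ns = (cycles * mult) >> shift; a stand-alone sketch of that conversion follows (the frequency, shift and helper below are illustrative assumptions, not what the kernel computes):

	#include <stdint.h>
	#include <stdio.h>

	/* Roughly what a khz-to-mult helper does for a chosen shift:
	 * mult = (1000000 ns per ms << shift) / khz, so cycles * mult >> shift ~= ns. */
	static uint32_t khz_to_mult(uint32_t khz, uint32_t shift)
	{
		uint64_t tmp = ((uint64_t)1000000 << shift) + khz / 2;
		return (uint32_t)(tmp / khz);
	}

	int main(void)
	{
		uint32_t khz = 2400000;			/* hypothetical 2.4 GHz TSC */
		uint32_t shift = 22;
		uint32_t mult = khz_to_mult(khz, shift);
		uint64_t cycles = 2400000000ULL;	/* one second worth of cycles */

		printf("mult=%u, %llu cycles ~= %llu ns\n", mult,
		       (unsigned long long)cycles,
		       (unsigned long long)((cycles * mult) >> shift));
		return 0;
	}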
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S index 45b6f8a975a1..56a8c2a867d9 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu_64.S | |||
@@ -31,6 +31,7 @@ | |||
31 | */ | 31 | */ |
32 | 32 | ||
33 | #include <asm/cpufeature.h> | 33 | #include <asm/cpufeature.h> |
34 | #include <asm/msr-index.h> | ||
34 | 35 | ||
35 | verify_cpu: | 36 | verify_cpu: |
36 | pushfl # Save caller passed flags | 37 | pushfl # Save caller passed flags |
@@ -88,7 +89,7 @@ verify_cpu_sse_test: | |||
88 | je verify_cpu_sse_ok | 89 | je verify_cpu_sse_ok |
89 | test %di,%di | 90 | test %di,%di |
90 | jz verify_cpu_no_longmode # only try to force SSE on AMD | 91 | jz verify_cpu_no_longmode # only try to force SSE on AMD |
91 | movl $0xc0010015,%ecx # HWCR | 92 | movl $MSR_K7_HWCR,%ecx |
92 | rdmsr | 93 | rdmsr |
93 | btr $15,%eax # enable SSE | 94 | btr $15,%eax # enable SSE |
94 | wrmsr | 95 | wrmsr |
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1c0c6ab9c60f..dcbb28c4b694 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c | |||
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void) | |||
73 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); | 73 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); |
74 | } | 74 | } |
75 | 75 | ||
76 | void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, | 76 | void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, |
77 | u32 mult) | 77 | struct clocksource *clock, u32 mult) |
78 | { | 78 | { |
79 | unsigned long flags; | 79 | unsigned long flags; |
80 | 80 | ||
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, | |||
87 | vsyscall_gtod_data.clock.shift = clock->shift; | 87 | vsyscall_gtod_data.clock.shift = clock->shift; |
88 | vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; | 88 | vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; |
89 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; | 89 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; |
90 | vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; | 90 | vsyscall_gtod_data.wall_to_monotonic = *wtm; |
91 | vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); | 91 | vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); |
92 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); | 92 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); |
93 | } | 93 | } |
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) | |||
169 | * unlikely */ | 169 | * unlikely */ |
170 | time_t __vsyscall(1) vtime(time_t *t) | 170 | time_t __vsyscall(1) vtime(time_t *t) |
171 | { | 171 | { |
172 | struct timeval tv; | 172 | unsigned seq; |
173 | time_t result; | 173 | time_t result; |
174 | if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) | 174 | if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) |
175 | return time_syscall(t); | 175 | return time_syscall(t); |
176 | 176 | ||
177 | vgettimeofday(&tv, NULL); | 177 | do { |
178 | result = tv.tv_sec; | 178 | seq = read_seqbegin(&__vsyscall_gtod_data.lock); |
179 | |||
180 | result = __vsyscall_gtod_data.wall_time_sec; | ||
181 | |||
182 | } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); | ||
183 | |||
179 | if (t) | 184 | if (t) |
180 | *t = result; | 185 | *t = result; |
181 | return result; | 186 | return result; |
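The reworked vtime() above reads wall_time_sec directly under the seqlock instead of going through vgettimeofday(), retrying if an update raced with the read. The generic lockless-reader pattern it relies on looks like the following kernel-side sketch (assuming the usual <linux/seqlock.h> API; this is not code from the patch):

	#include <linux/seqlock.h>
	#include <linux/types.h>

	static DEFINE_SEQLOCK(sample_lock);
	static time_t sample_sec;	/* updated under write_seqlock(&sample_lock) */

	static time_t read_sample_sec(void)
	{
		unsigned seq;
		time_t val;

		do {
			seq = read_seqbegin(&sample_lock);	/* snapshot sequence */
			val = sample_sec;			/* read shared data */
		} while (read_seqretry(&sample_lock, seq));	/* retry if a writer ran */

		return val;
	}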
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 37e68fc5e24a..9c253bd65e24 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c | |||
@@ -16,11 +16,88 @@ | |||
16 | */ | 16 | */ |
17 | u64 pcntxt_mask; | 17 | u64 pcntxt_mask; |
18 | 18 | ||
19 | /* | ||
20 | * Represents init state for the supported extended state. | ||
21 | */ | ||
22 | static struct xsave_struct *init_xstate_buf; | ||
23 | |||
19 | struct _fpx_sw_bytes fx_sw_reserved; | 24 | struct _fpx_sw_bytes fx_sw_reserved; |
20 | #ifdef CONFIG_IA32_EMULATION | 25 | #ifdef CONFIG_IA32_EMULATION |
21 | struct _fpx_sw_bytes fx_sw_reserved_ia32; | 26 | struct _fpx_sw_bytes fx_sw_reserved_ia32; |
22 | #endif | 27 | #endif |
23 | 28 | ||
29 | static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; | ||
30 | |||
31 | /* | ||
32 | * If a processor implementation discerns that a processor state component is | ||
33 | * in its initialized state, it may set the corresponding bit in | ||
34 | * xsave_hdr.xstate_bv to '0', without modifying the corresponding memory | ||
35 | * layout in the case of xsaveopt. While presenting the xstate information to | ||
36 | * the user, we always ensure that the memory layout of a feature will be in | ||
37 | * the init state if the corresponding header bit is zero. This is to ensure | ||
38 | * that the user doesn't see some stale state in the memory layout during | ||
39 | * signal handling, debugging etc. | ||
40 | */ | ||
41 | void __sanitize_i387_state(struct task_struct *tsk) | ||
42 | { | ||
43 | u64 xstate_bv; | ||
44 | int feature_bit = 0x2; | ||
45 | struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; | ||
46 | |||
47 | if (!fx) | ||
48 | return; | ||
49 | |||
50 | BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); | ||
51 | |||
52 | xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; | ||
53 | |||
54 | /* | ||
55 | * None of the feature bits are in init state. So nothing else | ||
56 | * to do for us, as the memory layout is up to date. | ||
57 | */ | ||
58 | if ((xstate_bv & pcntxt_mask) == pcntxt_mask) | ||
59 | return; | ||
60 | |||
61 | /* | ||
62 | * FP is in init state | ||
63 | */ | ||
64 | if (!(xstate_bv & XSTATE_FP)) { | ||
65 | fx->cwd = 0x37f; | ||
66 | fx->swd = 0; | ||
67 | fx->twd = 0; | ||
68 | fx->fop = 0; | ||
69 | fx->rip = 0; | ||
70 | fx->rdp = 0; | ||
71 | memset(&fx->st_space[0], 0, 128); | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * SSE is in init state | ||
76 | */ | ||
77 | if (!(xstate_bv & XSTATE_SSE)) | ||
78 | memset(&fx->xmm_space[0], 0, 256); | ||
79 | |||
80 | xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; | ||
81 | |||
82 | /* | ||
83 | * Update all the other memory layouts for which the corresponding | ||
84 | * header bit is in the init state. | ||
85 | */ | ||
86 | while (xstate_bv) { | ||
87 | if (xstate_bv & 0x1) { | ||
88 | int offset = xstate_offsets[feature_bit]; | ||
89 | int size = xstate_sizes[feature_bit]; | ||
90 | |||
91 | memcpy(((void *) fx) + offset, | ||
92 | ((void *) init_xstate_buf) + offset, | ||
93 | size); | ||
94 | } | ||
95 | |||
96 | xstate_bv >>= 1; | ||
97 | feature_bit++; | ||
98 | } | ||
99 | } | ||
100 | |||
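To make the bit arithmetic at the end of __sanitize_i387_state() concrete (hypothetical masks, not from any particular CPU): with pcntxt_mask = 0x7 (FP, SSE and one extended feature at bit 2) and a saved xsave_hdr.xstate_bv = 0x3, (pcntxt_mask & ~xstate_bv) >> 2 = 0x1, so the loop starts at feature_bit = 2 and copies xstate_sizes[2] bytes at xstate_offsets[2] from init_xstate_buf; only the feature left in its init state is rewritten.

	/* Stand-alone illustration of the same bit walk with hypothetical masks. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long pcntxt_mask = 0x7;	/* FP | SSE | feature bit 2 */
		unsigned long long xstate_bv = 0x3;	/* only FP and SSE were saved */
		unsigned long long pending = (pcntxt_mask & ~xstate_bv) >> 2;
		int feature_bit = 2;

		for (; pending; pending >>= 1, feature_bit++)
			if (pending & 1)
				printf("feature %d is in init state, restore its area\n",
				       feature_bit);
		return 0;
	}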
24 | /* | 101 | /* |
25 | * Check for the presence of extended state information in the | 102 | * Check for the presence of extended state information in the |
26 | * user fpstate pointer in the sigcontext. | 103 | * user fpstate pointer in the sigcontext. |
@@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, | |||
36 | 113 | ||
37 | err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], | 114 | err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], |
38 | sizeof(struct _fpx_sw_bytes)); | 115 | sizeof(struct _fpx_sw_bytes)); |
39 | |||
40 | if (err) | 116 | if (err) |
41 | return err; | 117 | return -EFAULT; |
42 | 118 | ||
43 | /* | 119 | /* |
44 | * First Magic check failed. | 120 | * First Magic check failed. |
45 | */ | 121 | */ |
46 | if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) | 122 | if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) |
47 | return -1; | 123 | return -EINVAL; |
48 | 124 | ||
49 | /* | 125 | /* |
50 | * Check for error scenarios. | 126 | * Check for error scenarios. |
@@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, | |||
52 | if (fx_sw_user->xstate_size < min_xstate_size || | 128 | if (fx_sw_user->xstate_size < min_xstate_size || |
53 | fx_sw_user->xstate_size > xstate_size || | 129 | fx_sw_user->xstate_size > xstate_size || |
54 | fx_sw_user->xstate_size > fx_sw_user->extended_size) | 130 | fx_sw_user->xstate_size > fx_sw_user->extended_size) |
55 | return -1; | 131 | return -EINVAL; |
56 | 132 | ||
57 | err = __get_user(magic2, (__u32 *) (((void *)fpstate) + | 133 | err = __get_user(magic2, (__u32 *) (((void *)fpstate) + |
58 | fx_sw_user->extended_size - | 134 | fx_sw_user->extended_size - |
59 | FP_XSTATE_MAGIC2_SIZE)); | 135 | FP_XSTATE_MAGIC2_SIZE)); |
136 | if (err) | ||
137 | return err; | ||
60 | /* | 138 | /* |
61 | * Check for the presence of second magic word at the end of memory | 139 | * Check for the presence of second magic word at the end of memory |
62 | * layout. This detects the case where the user just copied the legacy | 140 | * layout. This detects the case where the user just copied the legacy |
63 | * fpstate layout without copying the extended state information | 141 | * fpstate layout without copying the extended state information |
64 | * in the memory layout. | 142 | * in the memory layout. |
65 | */ | 143 | */ |
66 | if (err || magic2 != FP_XSTATE_MAGIC2) | 144 | if (magic2 != FP_XSTATE_MAGIC2) |
67 | return -1; | 145 | return -EFAULT; |
68 | 146 | ||
69 | return 0; | 147 | return 0; |
70 | } | 148 | } |
@@ -91,14 +169,6 @@ int save_i387_xstate(void __user *buf) | |||
91 | return 0; | 169 | return 0; |
92 | 170 | ||
93 | if (task_thread_info(tsk)->status & TS_USEDFPU) { | 171 | if (task_thread_info(tsk)->status & TS_USEDFPU) { |
94 | /* | ||
95 | * Start with clearing the user buffer. This will present a | ||
96 | * clean context for the bytes not touched by the fxsave/xsave. | ||
97 | */ | ||
98 | err = __clear_user(buf, sig_xstate_size); | ||
99 | if (err) | ||
100 | return err; | ||
101 | |||
102 | if (use_xsave()) | 172 | if (use_xsave()) |
103 | err = xsave_user(buf); | 173 | err = xsave_user(buf); |
104 | else | 174 | else |
@@ -109,6 +179,7 @@ int save_i387_xstate(void __user *buf) | |||
109 | task_thread_info(tsk)->status &= ~TS_USEDFPU; | 179 | task_thread_info(tsk)->status &= ~TS_USEDFPU; |
110 | stts(); | 180 | stts(); |
111 | } else { | 181 | } else { |
182 | sanitize_i387_state(tsk); | ||
112 | if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, | 183 | if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, |
113 | xstate_size)) | 184 | xstate_size)) |
114 | return -1; | 185 | return -1; |
@@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf) | |||
184 | * init the state skipped by the user. | 255 | * init the state skipped by the user. |
185 | */ | 256 | */ |
186 | mask = pcntxt_mask & ~mask; | 257 | mask = pcntxt_mask & ~mask; |
187 | 258 | if (unlikely(mask)) | |
188 | xrstor_state(init_xstate_buf, mask); | 259 | xrstor_state(init_xstate_buf, mask); |
189 | 260 | ||
190 | return 0; | 261 | return 0; |
191 | 262 | ||
@@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void) | |||
274 | #endif | 345 | #endif |
275 | } | 346 | } |
276 | 347 | ||
277 | /* | ||
278 | * Represents init state for the supported extended state. | ||
279 | */ | ||
280 | struct xsave_struct *init_xstate_buf; | ||
281 | |||
282 | #ifdef CONFIG_X86_64 | 348 | #ifdef CONFIG_X86_64 |
283 | unsigned int sig_xstate_size = sizeof(struct _fpstate); | 349 | unsigned int sig_xstate_size = sizeof(struct _fpstate); |
284 | #endif | 350 | #endif |
@@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); | |||
286 | /* | 352 | /* |
287 | * Enable the extended processor state save/restore feature | 353 | * Enable the extended processor state save/restore feature |
288 | */ | 354 | */ |
289 | void __cpuinit xsave_init(void) | 355 | static inline void xstate_enable(void) |
290 | { | 356 | { |
291 | if (!cpu_has_xsave) | ||
292 | return; | ||
293 | |||
294 | set_in_cr4(X86_CR4_OSXSAVE); | 357 | set_in_cr4(X86_CR4_OSXSAVE); |
295 | |||
296 | /* | ||
297 | * Enable all the features that the HW is capable of | ||
298 | * and the Linux kernel is aware of. | ||
299 | */ | ||
300 | xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); | 358 | xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); |
301 | } | 359 | } |
302 | 360 | ||
303 | /* | 361 | /* |
362 | * Record the offsets and sizes of different state managed by the xsave | ||
363 | * memory layout. | ||
364 | */ | ||
365 | static void __init setup_xstate_features(void) | ||
366 | { | ||
367 | int eax, ebx, ecx, edx, leaf = 0x2; | ||
368 | |||
369 | xstate_features = fls64(pcntxt_mask); | ||
370 | xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); | ||
371 | xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); | ||
372 | |||
373 | do { | ||
374 | cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); | ||
375 | |||
376 | if (eax == 0) | ||
377 | break; | ||
378 | |||
379 | xstate_offsets[leaf] = ebx; | ||
380 | xstate_sizes[leaf] = eax; | ||
381 | |||
382 | leaf++; | ||
383 | } while (1); | ||
384 | } | ||
385 | |||
386 | /* | ||
304 | * setup the xstate image representing the init state | 387 | * setup the xstate image representing the init state |
305 | */ | 388 | */ |
306 | static void __init setup_xstate_init(void) | 389 | static void __init setup_xstate_init(void) |
307 | { | 390 | { |
391 | setup_xstate_features(); | ||
392 | |||
393 | /* | ||
394 | * Setup init_xstate_buf to represent the init state of | ||
395 | * all the features managed by xsave. | ||
396 | */ | ||
308 | init_xstate_buf = alloc_bootmem(xstate_size); | 397 | init_xstate_buf = alloc_bootmem(xstate_size); |
309 | init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; | 398 | init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; |
399 | |||
400 | clts(); | ||
401 | /* | ||
402 | * Init all the feature states with the header's xstate_bv being 0x0 | ||
403 | */ | ||
404 | xrstor_state(init_xstate_buf, -1); | ||
405 | /* | ||
406 | * Dump the init state again. This is to identify the init state | ||
407 | * of any feature which is not represented by all zero's. | ||
408 | */ | ||
409 | xsave_state(init_xstate_buf, -1); | ||
410 | stts(); | ||
310 | } | 411 | } |
311 | 412 | ||
312 | /* | 413 | /* |
313 | * Enable and initialize the xsave feature. | 414 | * Enable and initialize the xsave feature. |
314 | */ | 415 | */ |
315 | void __ref xsave_cntxt_init(void) | 416 | static void __init xstate_enable_boot_cpu(void) |
316 | { | 417 | { |
317 | unsigned int eax, ebx, ecx, edx; | 418 | unsigned int eax, ebx, ecx, edx; |
318 | 419 | ||
319 | cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | 420 | if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { |
421 | WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); | ||
422 | return; | ||
423 | } | ||
424 | |||
425 | cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); | ||
320 | pcntxt_mask = eax + ((u64)edx << 32); | 426 | pcntxt_mask = eax + ((u64)edx << 32); |
321 | 427 | ||
322 | if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { | 428 | if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { |
@@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void) | |||
329 | * Support only the state known to OS. | 435 | * Support only the state known to OS. |
330 | */ | 436 | */ |
331 | pcntxt_mask = pcntxt_mask & XCNTXT_MASK; | 437 | pcntxt_mask = pcntxt_mask & XCNTXT_MASK; |
332 | xsave_init(); | 438 | |
439 | xstate_enable(); | ||
333 | 440 | ||
334 | /* | 441 | /* |
335 | * Recompute the context size for enabled features | 442 | * Recompute the context size for enabled features |
336 | */ | 443 | */ |
337 | cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | 444 | cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); |
338 | xstate_size = ebx; | 445 | xstate_size = ebx; |
339 | 446 | ||
340 | update_regset_xstate_info(xstate_size, pcntxt_mask); | 447 | update_regset_xstate_info(xstate_size, pcntxt_mask); |
@@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void) | |||
346 | "cntxt size 0x%x\n", | 453 | "cntxt size 0x%x\n", |
347 | pcntxt_mask, xstate_size); | 454 | pcntxt_mask, xstate_size); |
348 | } | 455 | } |
456 | |||
457 | /* | ||
458 | * For the very first instance, this calls xstate_enable_boot_cpu(); | ||
459 | * for all subsequent instances, this calls xstate_enable(). | ||
460 | * | ||
461 | * This is somewhat obfuscated due to the lack of powerful enough | ||
462 | * overrides for the section checks. | ||
463 | */ | ||
464 | void __cpuinit xsave_init(void) | ||
465 | { | ||
466 | static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; | ||
467 | void (*this_func)(void); | ||
468 | |||
469 | if (!cpu_has_xsave) | ||
470 | return; | ||
471 | |||
472 | this_func = next_func; | ||
473 | next_func = xstate_enable; | ||
474 | this_func(); | ||
475 | } | ||
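The xsave_init() wrapper above uses a static function pointer so that the first caller runs the boot-time path and every later caller runs only the cheap per-CPU enable, without a separate "already initialized" flag. A stand-alone sketch of that idiom (names are illustrative, not taken from this file):

	#include <stdio.h>

	static void enable_feature(void)
	{
		puts("per-cpu enable only");
	}

	static void enable_feature_boot(void)
	{
		puts("one-time boot setup");
		enable_feature();
	}

	/* First call does the boot-time work; subsequent calls only enable. */
	static void feature_init(void)
	{
		static void (*next)(void) = enable_feature_boot;
		void (*this)(void) = next;

		next = enable_feature;
		this();
	}

	int main(void)
	{
		feature_init();		/* boot CPU */
		feature_init();		/* secondary CPU */
		return 0;
	}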
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5ac0bb465ed6..b38bd8b92aa6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c | |||
@@ -9,6 +9,7 @@ | |||
9 | * privileged instructions: | 9 | * privileged instructions: |
10 | * | 10 | * |
12 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. | ||
12 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | ||
12 | * | 13 | * |
13 | * Avi Kivity <avi@qumranet.com> | 14 | * Avi Kivity <avi@qumranet.com> |
14 | * Yaniv Kamay <yaniv@qumranet.com> | 15 | * Yaniv Kamay <yaniv@qumranet.com> |
@@ -67,6 +68,9 @@ | |||
67 | #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ | 68 | #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ |
68 | #define SrcImmU (9<<4) /* Immediate operand, unsigned */ | 69 | #define SrcImmU (9<<4) /* Immediate operand, unsigned */ |
69 | #define SrcSI (0xa<<4) /* Source is in the DS:RSI */ | 70 | #define SrcSI (0xa<<4) /* Source is in the DS:RSI */ |
71 | #define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ | ||
72 | #define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ | ||
73 | #define SrcAcc (0xd<<4) /* Source Accumulator */ | ||
70 | #define SrcMask (0xf<<4) | 74 | #define SrcMask (0xf<<4) |
71 | /* Generic ModRM decode. */ | 75 | /* Generic ModRM decode. */ |
72 | #define ModRM (1<<8) | 76 | #define ModRM (1<<8) |
@@ -88,10 +92,6 @@ | |||
88 | #define Src2CL (1<<29) | 92 | #define Src2CL (1<<29) |
89 | #define Src2ImmByte (2<<29) | 93 | #define Src2ImmByte (2<<29) |
90 | #define Src2One (3<<29) | 94 | #define Src2One (3<<29) |
91 | #define Src2Imm16 (4<<29) | ||
92 | #define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be | ||
93 | in memory and second argument is located | ||
94 | immediately after the first one in memory. */ | ||
95 | #define Src2Mask (7<<29) | 95 | #define Src2Mask (7<<29) |
96 | 96 | ||
97 | enum { | 97 | enum { |
@@ -124,15 +124,15 @@ static u32 opcode_table[256] = { | |||
124 | /* 0x20 - 0x27 */ | 124 | /* 0x20 - 0x27 */ |
125 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | 125 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, |
126 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 126 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
127 | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, | 127 | ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, |
128 | /* 0x28 - 0x2F */ | 128 | /* 0x28 - 0x2F */ |
129 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | 129 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, |
130 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 130 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
131 | 0, 0, 0, 0, | 131 | ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, |
132 | /* 0x30 - 0x37 */ | 132 | /* 0x30 - 0x37 */ |
133 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | 133 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, |
134 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 134 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
135 | 0, 0, 0, 0, | 135 | ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, |
136 | /* 0x38 - 0x3F */ | 136 | /* 0x38 - 0x3F */ |
137 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 137 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
138 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 138 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
@@ -170,20 +170,20 @@ static u32 opcode_table[256] = { | |||
170 | /* 0x88 - 0x8F */ | 170 | /* 0x88 - 0x8F */ |
171 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, | 171 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, |
172 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | 172 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, |
173 | DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, | 173 | DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, |
174 | DstReg | SrcMem | ModRM | Mov, Group | Group1A, | 174 | ImplicitOps | SrcMem16 | ModRM, Group | Group1A, |
175 | /* 0x90 - 0x97 */ | 175 | /* 0x90 - 0x97 */ |
176 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | 176 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, |
177 | /* 0x98 - 0x9F */ | 177 | /* 0x98 - 0x9F */ |
178 | 0, 0, SrcImm | Src2Imm16 | No64, 0, | 178 | 0, 0, SrcImmFAddr | No64, 0, |
179 | ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, | 179 | ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, |
180 | /* 0xA0 - 0xA7 */ | 180 | /* 0xA0 - 0xA7 */ |
181 | ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, | 181 | ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs, |
182 | ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, | 182 | ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs, |
183 | ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, | 183 | ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, |
184 | ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, | 184 | ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, |
185 | /* 0xA8 - 0xAF */ | 185 | /* 0xA8 - 0xAF */ |
186 | 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, | 186 | DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String, |
187 | ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, | 187 | ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, |
188 | ByteOp | DstDI | String, DstDI | String, | 188 | ByteOp | DstDI | String, DstDI | String, |
189 | /* 0xB0 - 0xB7 */ | 189 | /* 0xB0 - 0xB7 */ |
@@ -215,7 +215,7 @@ static u32 opcode_table[256] = { | |||
215 | ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, | 215 | ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, |
216 | /* 0xE8 - 0xEF */ | 216 | /* 0xE8 - 0xEF */ |
217 | SrcImm | Stack, SrcImm | ImplicitOps, | 217 | SrcImm | Stack, SrcImm | ImplicitOps, |
218 | SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, | 218 | SrcImmFAddr | No64, SrcImmByte | ImplicitOps, |
219 | SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, | 219 | SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, |
220 | SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, | 220 | SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, |
221 | /* 0xF0 - 0xF7 */ | 221 | /* 0xF0 - 0xF7 */ |
@@ -337,20 +337,20 @@ static u32 group_table[] = { | |||
337 | [Group1A*8] = | 337 | [Group1A*8] = |
338 | DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, | 338 | DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, |
339 | [Group3_Byte*8] = | 339 | [Group3_Byte*8] = |
340 | ByteOp | SrcImm | DstMem | ModRM, 0, | 340 | ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, |
341 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | 341 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, |
342 | 0, 0, 0, 0, | 342 | 0, 0, 0, 0, |
343 | [Group3*8] = | 343 | [Group3*8] = |
344 | DstMem | SrcImm | ModRM, 0, | 344 | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, |
345 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | 345 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, |
346 | 0, 0, 0, 0, | 346 | 0, 0, 0, 0, |
347 | [Group4*8] = | 347 | [Group4*8] = |
348 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | 348 | ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, |
349 | 0, 0, 0, 0, 0, 0, | 349 | 0, 0, 0, 0, 0, 0, |
350 | [Group5*8] = | 350 | [Group5*8] = |
351 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | 351 | DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, |
352 | SrcMem | ModRM | Stack, 0, | 352 | SrcMem | ModRM | Stack, 0, |
353 | SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, | 353 | SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, |
354 | SrcMem | ModRM | Stack, 0, | 354 | SrcMem | ModRM | Stack, 0, |
355 | [Group7*8] = | 355 | [Group7*8] = |
356 | 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, | 356 | 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, |
@@ -576,6 +576,13 @@ static u32 group2_table[] = { | |||
576 | (_type)_x; \ | 576 | (_type)_x; \ |
577 | }) | 577 | }) |
578 | 578 | ||
579 | #define insn_fetch_arr(_arr, _size, _eip) \ | ||
580 | ({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ | ||
581 | if (rc != X86EMUL_CONTINUE) \ | ||
582 | goto done; \ | ||
583 | (_eip) += (_size); \ | ||
584 | }) | ||
585 | |||
579 | static inline unsigned long ad_mask(struct decode_cache *c) | 586 | static inline unsigned long ad_mask(struct decode_cache *c) |
580 | { | 587 | { |
581 | return (1UL << (c->ad_bytes << 3)) - 1; | 588 | return (1UL << (c->ad_bytes << 3)) - 1; |
@@ -617,31 +624,66 @@ static void set_seg_override(struct decode_cache *c, int seg) | |||
617 | c->seg_override = seg; | 624 | c->seg_override = seg; |
618 | } | 625 | } |
619 | 626 | ||
620 | static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) | 627 | static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, |
628 | struct x86_emulate_ops *ops, int seg) | ||
621 | { | 629 | { |
622 | if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) | 630 | if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) |
623 | return 0; | 631 | return 0; |
624 | 632 | ||
625 | return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); | 633 | return ops->get_cached_segment_base(seg, ctxt->vcpu); |
626 | } | 634 | } |
627 | 635 | ||
628 | static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, | 636 | static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, |
637 | struct x86_emulate_ops *ops, | ||
629 | struct decode_cache *c) | 638 | struct decode_cache *c) |
630 | { | 639 | { |
631 | if (!c->has_seg_override) | 640 | if (!c->has_seg_override) |
632 | return 0; | 641 | return 0; |
633 | 642 | ||
634 | return seg_base(ctxt, c->seg_override); | 643 | return seg_base(ctxt, ops, c->seg_override); |
644 | } | ||
645 | |||
646 | static unsigned long es_base(struct x86_emulate_ctxt *ctxt, | ||
647 | struct x86_emulate_ops *ops) | ||
648 | { | ||
649 | return seg_base(ctxt, ops, VCPU_SREG_ES); | ||
650 | } | ||
651 | |||
652 | static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, | ||
653 | struct x86_emulate_ops *ops) | ||
654 | { | ||
655 | return seg_base(ctxt, ops, VCPU_SREG_SS); | ||
656 | } | ||
657 | |||
658 | static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, | ||
659 | u32 error, bool valid) | ||
660 | { | ||
661 | ctxt->exception = vec; | ||
662 | ctxt->error_code = error; | ||
663 | ctxt->error_code_valid = valid; | ||
664 | ctxt->restart = false; | ||
665 | } | ||
666 | |||
667 | static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) | ||
668 | { | ||
669 | emulate_exception(ctxt, GP_VECTOR, err, true); | ||
635 | } | 670 | } |
636 | 671 | ||
637 | static unsigned long es_base(struct x86_emulate_ctxt *ctxt) | 672 | static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, |
673 | int err) | ||
638 | { | 674 | { |
639 | return seg_base(ctxt, VCPU_SREG_ES); | 675 | ctxt->cr2 = addr; |
676 | emulate_exception(ctxt, PF_VECTOR, err, true); | ||
640 | } | 677 | } |
641 | 678 | ||
642 | static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) | 679 | static void emulate_ud(struct x86_emulate_ctxt *ctxt) |
643 | { | 680 | { |
644 | return seg_base(ctxt, VCPU_SREG_SS); | 681 | emulate_exception(ctxt, UD_VECTOR, 0, false); |
682 | } | ||
683 | |||
684 | static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) | ||
685 | { | ||
686 | emulate_exception(ctxt, TS_VECTOR, err, true); | ||
645 | } | 687 | } |
646 | 688 | ||
647 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | 689 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, |
@@ -932,12 +974,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
932 | /* we cannot decode insn before we complete previous rep insn */ | 974 | /* we cannot decode insn before we complete previous rep insn */ |
933 | WARN_ON(ctxt->restart); | 975 | WARN_ON(ctxt->restart); |
934 | 976 | ||
935 | /* Shadow copy of register state. Committed on successful emulation. */ | ||
936 | memset(c, 0, sizeof(struct decode_cache)); | ||
937 | c->eip = ctxt->eip; | 977 | c->eip = ctxt->eip; |
938 | c->fetch.start = c->fetch.end = c->eip; | 978 | c->fetch.start = c->fetch.end = c->eip; |
939 | ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); | 979 | ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); |
940 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | ||
941 | 980 | ||
942 | switch (mode) { | 981 | switch (mode) { |
943 | case X86EMUL_MODE_REAL: | 982 | case X86EMUL_MODE_REAL: |
@@ -1060,7 +1099,7 @@ done_prefixes: | |||
1060 | set_seg_override(c, VCPU_SREG_DS); | 1099 | set_seg_override(c, VCPU_SREG_DS); |
1061 | 1100 | ||
1062 | if (!(!c->twobyte && c->b == 0x8d)) | 1101 | if (!(!c->twobyte && c->b == 0x8d)) |
1063 | c->modrm_ea += seg_override_base(ctxt, c); | 1102 | c->modrm_ea += seg_override_base(ctxt, ops, c); |
1064 | 1103 | ||
1065 | if (c->ad_bytes != 8) | 1104 | if (c->ad_bytes != 8) |
1066 | c->modrm_ea = (u32)c->modrm_ea; | 1105 | c->modrm_ea = (u32)c->modrm_ea; |
@@ -1148,6 +1187,25 @@ done_prefixes: | |||
1148 | else | 1187 | else |
1149 | c->src.val = insn_fetch(u8, 1, c->eip); | 1188 | c->src.val = insn_fetch(u8, 1, c->eip); |
1150 | break; | 1189 | break; |
1190 | case SrcAcc: | ||
1191 | c->src.type = OP_REG; | ||
1192 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1193 | c->src.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1194 | switch (c->src.bytes) { | ||
1195 | case 1: | ||
1196 | c->src.val = *(u8 *)c->src.ptr; | ||
1197 | break; | ||
1198 | case 2: | ||
1199 | c->src.val = *(u16 *)c->src.ptr; | ||
1200 | break; | ||
1201 | case 4: | ||
1202 | c->src.val = *(u32 *)c->src.ptr; | ||
1203 | break; | ||
1204 | case 8: | ||
1205 | c->src.val = *(u64 *)c->src.ptr; | ||
1206 | break; | ||
1207 | } | ||
1208 | break; | ||
1151 | case SrcOne: | 1209 | case SrcOne: |
1152 | c->src.bytes = 1; | 1210 | c->src.bytes = 1; |
1153 | c->src.val = 1; | 1211 | c->src.val = 1; |
@@ -1156,10 +1214,21 @@ done_prefixes: | |||
1156 | c->src.type = OP_MEM; | 1214 | c->src.type = OP_MEM; |
1157 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 1215 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
1158 | c->src.ptr = (unsigned long *) | 1216 | c->src.ptr = (unsigned long *) |
1159 | register_address(c, seg_override_base(ctxt, c), | 1217 | register_address(c, seg_override_base(ctxt, ops, c), |
1160 | c->regs[VCPU_REGS_RSI]); | 1218 | c->regs[VCPU_REGS_RSI]); |
1161 | c->src.val = 0; | 1219 | c->src.val = 0; |
1162 | break; | 1220 | break; |
1221 | case SrcImmFAddr: | ||
1222 | c->src.type = OP_IMM; | ||
1223 | c->src.ptr = (unsigned long *)c->eip; | ||
1224 | c->src.bytes = c->op_bytes + 2; | ||
1225 | insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); | ||
1226 | break; | ||
1227 | case SrcMemFAddr: | ||
1228 | c->src.type = OP_MEM; | ||
1229 | c->src.ptr = (unsigned long *)c->modrm_ea; | ||
1230 | c->src.bytes = c->op_bytes + 2; | ||
1231 | break; | ||
1163 | } | 1232 | } |
1164 | 1233 | ||
1165 | /* | 1234 | /* |
@@ -1179,22 +1248,10 @@ done_prefixes: | |||
1179 | c->src2.bytes = 1; | 1248 | c->src2.bytes = 1; |
1180 | c->src2.val = insn_fetch(u8, 1, c->eip); | 1249 | c->src2.val = insn_fetch(u8, 1, c->eip); |
1181 | break; | 1250 | break; |
1182 | case Src2Imm16: | ||
1183 | c->src2.type = OP_IMM; | ||
1184 | c->src2.ptr = (unsigned long *)c->eip; | ||
1185 | c->src2.bytes = 2; | ||
1186 | c->src2.val = insn_fetch(u16, 2, c->eip); | ||
1187 | break; | ||
1188 | case Src2One: | 1251 | case Src2One: |
1189 | c->src2.bytes = 1; | 1252 | c->src2.bytes = 1; |
1190 | c->src2.val = 1; | 1253 | c->src2.val = 1; |
1191 | break; | 1254 | break; |
1192 | case Src2Mem16: | ||
1193 | c->src2.type = OP_MEM; | ||
1194 | c->src2.bytes = 2; | ||
1195 | c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes); | ||
1196 | c->src2.val = 0; | ||
1197 | break; | ||
1198 | } | 1255 | } |
1199 | 1256 | ||
1200 | /* Decode and fetch the destination operand: register or memory. */ | 1257 | /* Decode and fetch the destination operand: register or memory. */ |
@@ -1253,7 +1310,7 @@ done_prefixes: | |||
1253 | c->dst.type = OP_MEM; | 1310 | c->dst.type = OP_MEM; |
1254 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 1311 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
1255 | c->dst.ptr = (unsigned long *) | 1312 | c->dst.ptr = (unsigned long *) |
1256 | register_address(c, es_base(ctxt), | 1313 | register_address(c, es_base(ctxt, ops), |
1257 | c->regs[VCPU_REGS_RDI]); | 1314 | c->regs[VCPU_REGS_RDI]); |
1258 | c->dst.val = 0; | 1315 | c->dst.val = 0; |
1259 | break; | 1316 | break; |
@@ -1263,6 +1320,37 @@ done: | |||
1263 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | 1320 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; |
1264 | } | 1321 | } |
1265 | 1322 | ||
1323 | static int read_emulated(struct x86_emulate_ctxt *ctxt, | ||
1324 | struct x86_emulate_ops *ops, | ||
1325 | unsigned long addr, void *dest, unsigned size) | ||
1326 | { | ||
1327 | int rc; | ||
1328 | struct read_cache *mc = &ctxt->decode.mem_read; | ||
1329 | u32 err; | ||
1330 | |||
1331 | while (size) { | ||
1332 | int n = min(size, 8u); | ||
1333 | size -= n; | ||
1334 | if (mc->pos < mc->end) | ||
1335 | goto read_cached; | ||
1336 | |||
1337 | rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, | ||
1338 | ctxt->vcpu); | ||
1339 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
1340 | emulate_pf(ctxt, addr, err); | ||
1341 | if (rc != X86EMUL_CONTINUE) | ||
1342 | return rc; | ||
1343 | mc->end += n; | ||
1344 | |||
1345 | read_cached: | ||
1346 | memcpy(dest, mc->data + mc->pos, n); | ||
1347 | mc->pos += n; | ||
1348 | dest += n; | ||
1349 | addr += n; | ||
1350 | } | ||
1351 | return X86EMUL_CONTINUE; | ||
1352 | } | ||
1353 | |||
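read_emulated() above accumulates guest-memory reads into the per-instruction mem_read cache in chunks of at most 8 bytes, so repeated fetches of the same range (for instance when emulation of an instruction has to be restarted) can be served from the cache instead of being re-read. A reduced stand-alone sketch of that caching scheme (the "slow" backing read is simulated; nothing here is KVM API):

	#include <stdio.h>
	#include <string.h>

	struct read_cache {
		unsigned char data[1024];
		unsigned pos, end;
	};

	/* Simulated expensive read that we only want to perform once per range. */
	static void slow_read(unsigned long addr, void *dst, unsigned n)
	{
		memset(dst, (int)(addr & 0xff), n);
		printf("slow_read %u bytes at %#lx\n", n, addr);
	}

	static void cached_read(struct read_cache *mc, unsigned long addr,
				void *dest, unsigned size)
	{
		while (size) {
			unsigned n = size < 8 ? size : 8;

			size -= n;
			if (mc->pos >= mc->end) {		/* not cached yet */
				slow_read(addr, mc->data + mc->end, n);
				mc->end += n;
			}
			memcpy(dest, mc->data + mc->pos, n);	/* serve from cache */
			mc->pos += n;
			dest = (unsigned char *)dest + n;
			addr += n;
		}
	}

	int main(void)
	{
		struct read_cache mc = { .pos = 0, .end = 0 };
		unsigned char buf[16];

		cached_read(&mc, 0x1000, buf, 16);	/* fills the cache */
		mc.pos = 0;				/* simulate a restarted instruction */
		cached_read(&mc, 0x1000, buf, 16);	/* replayed, no slow reads */
		return 0;
	}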
1266 | static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, | 1354 | static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, |
1267 | struct x86_emulate_ops *ops, | 1355 | struct x86_emulate_ops *ops, |
1268 | unsigned int size, unsigned short port, | 1356 | unsigned int size, unsigned short port, |
@@ -1330,13 +1418,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1330 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); | 1418 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); |
1331 | 1419 | ||
1332 | if (dt.size < index * 8 + 7) { | 1420 | if (dt.size < index * 8 + 7) { |
1333 | kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); | 1421 | emulate_gp(ctxt, selector & 0xfffc); |
1334 | return X86EMUL_PROPAGATE_FAULT; | 1422 | return X86EMUL_PROPAGATE_FAULT; |
1335 | } | 1423 | } |
1336 | addr = dt.address + index * 8; | 1424 | addr = dt.address + index * 8; |
1337 | ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); | 1425 | ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); |
1338 | if (ret == X86EMUL_PROPAGATE_FAULT) | 1426 | if (ret == X86EMUL_PROPAGATE_FAULT) |
1339 | kvm_inject_page_fault(ctxt->vcpu, addr, err); | 1427 | emulate_pf(ctxt, addr, err); |
1340 | 1428 | ||
1341 | return ret; | 1429 | return ret; |
1342 | } | 1430 | } |
@@ -1355,14 +1443,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1355 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); | 1443 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); |
1356 | 1444 | ||
1357 | if (dt.size < index * 8 + 7) { | 1445 | if (dt.size < index * 8 + 7) { |
1358 | kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); | 1446 | emulate_gp(ctxt, selector & 0xfffc); |
1359 | return X86EMUL_PROPAGATE_FAULT; | 1447 | return X86EMUL_PROPAGATE_FAULT; |
1360 | } | 1448 | } |
1361 | 1449 | ||
1362 | addr = dt.address + index * 8; | 1450 | addr = dt.address + index * 8; |
1363 | ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); | 1451 | ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); |
1364 | if (ret == X86EMUL_PROPAGATE_FAULT) | 1452 | if (ret == X86EMUL_PROPAGATE_FAULT) |
1365 | kvm_inject_page_fault(ctxt->vcpu, addr, err); | 1453 | emulate_pf(ctxt, addr, err); |
1366 | 1454 | ||
1367 | return ret; | 1455 | return ret; |
1368 | } | 1456 | } |
@@ -1481,11 +1569,70 @@ load: | |||
1481 | ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); | 1569 | ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); |
1482 | return X86EMUL_CONTINUE; | 1570 | return X86EMUL_CONTINUE; |
1483 | exception: | 1571 | exception: |
1484 | kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); | 1572 | emulate_exception(ctxt, err_vec, err_code, true); |
1485 | return X86EMUL_PROPAGATE_FAULT; | 1573 | return X86EMUL_PROPAGATE_FAULT; |
1486 | } | 1574 | } |
1487 | 1575 | ||
1488 | static inline void emulate_push(struct x86_emulate_ctxt *ctxt) | 1576 | static inline int writeback(struct x86_emulate_ctxt *ctxt, |
1577 | struct x86_emulate_ops *ops) | ||
1578 | { | ||
1579 | int rc; | ||
1580 | struct decode_cache *c = &ctxt->decode; | ||
1581 | u32 err; | ||
1582 | |||
1583 | switch (c->dst.type) { | ||
1584 | case OP_REG: | ||
1585 | /* The 4-byte case *is* correct: | ||
1586 | * in 64-bit mode we zero-extend. | ||
1587 | */ | ||
1588 | switch (c->dst.bytes) { | ||
1589 | case 1: | ||
1590 | *(u8 *)c->dst.ptr = (u8)c->dst.val; | ||
1591 | break; | ||
1592 | case 2: | ||
1593 | *(u16 *)c->dst.ptr = (u16)c->dst.val; | ||
1594 | break; | ||
1595 | case 4: | ||
1596 | *c->dst.ptr = (u32)c->dst.val; | ||
1597 | break; /* 64b: zero-ext */ | ||
1598 | case 8: | ||
1599 | *c->dst.ptr = c->dst.val; | ||
1600 | break; | ||
1601 | } | ||
1602 | break; | ||
1603 | case OP_MEM: | ||
1604 | if (c->lock_prefix) | ||
1605 | rc = ops->cmpxchg_emulated( | ||
1606 | (unsigned long)c->dst.ptr, | ||
1607 | &c->dst.orig_val, | ||
1608 | &c->dst.val, | ||
1609 | c->dst.bytes, | ||
1610 | &err, | ||
1611 | ctxt->vcpu); | ||
1612 | else | ||
1613 | rc = ops->write_emulated( | ||
1614 | (unsigned long)c->dst.ptr, | ||
1615 | &c->dst.val, | ||
1616 | c->dst.bytes, | ||
1617 | &err, | ||
1618 | ctxt->vcpu); | ||
1619 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
1620 | emulate_pf(ctxt, | ||
1621 | (unsigned long)c->dst.ptr, err); | ||
1622 | if (rc != X86EMUL_CONTINUE) | ||
1623 | return rc; | ||
1624 | break; | ||
1625 | case OP_NONE: | ||
1626 | /* no writeback */ | ||
1627 | break; | ||
1628 | default: | ||
1629 | break; | ||
1630 | } | ||
1631 | return X86EMUL_CONTINUE; | ||
1632 | } | ||
1633 | |||
1634 | static inline void emulate_push(struct x86_emulate_ctxt *ctxt, | ||
1635 | struct x86_emulate_ops *ops) | ||
1489 | { | 1636 | { |
1490 | struct decode_cache *c = &ctxt->decode; | 1637 | struct decode_cache *c = &ctxt->decode; |
1491 | 1638 | ||
@@ -1493,7 +1640,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) | |||
1493 | c->dst.bytes = c->op_bytes; | 1640 | c->dst.bytes = c->op_bytes; |
1494 | c->dst.val = c->src.val; | 1641 | c->dst.val = c->src.val; |
1495 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); | 1642 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); |
1496 | c->dst.ptr = (void *) register_address(c, ss_base(ctxt), | 1643 | c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), |
1497 | c->regs[VCPU_REGS_RSP]); | 1644 | c->regs[VCPU_REGS_RSP]); |
1498 | } | 1645 | } |
1499 | 1646 | ||
@@ -1504,9 +1651,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, | |||
1504 | struct decode_cache *c = &ctxt->decode; | 1651 | struct decode_cache *c = &ctxt->decode; |
1505 | int rc; | 1652 | int rc; |
1506 | 1653 | ||
1507 | rc = ops->read_emulated(register_address(c, ss_base(ctxt), | 1654 | rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), |
1508 | c->regs[VCPU_REGS_RSP]), | 1655 | c->regs[VCPU_REGS_RSP]), |
1509 | dest, len, ctxt->vcpu); | 1656 | dest, len); |
1510 | if (rc != X86EMUL_CONTINUE) | 1657 | if (rc != X86EMUL_CONTINUE) |
1511 | return rc; | 1658 | return rc; |
1512 | 1659 | ||
@@ -1541,7 +1688,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, | |||
1541 | break; | 1688 | break; |
1542 | case X86EMUL_MODE_VM86: | 1689 | case X86EMUL_MODE_VM86: |
1543 | if (iopl < 3) { | 1690 | if (iopl < 3) { |
1544 | kvm_inject_gp(ctxt->vcpu, 0); | 1691 | emulate_gp(ctxt, 0); |
1545 | return X86EMUL_PROPAGATE_FAULT; | 1692 | return X86EMUL_PROPAGATE_FAULT; |
1546 | } | 1693 | } |
1547 | change_mask |= EFLG_IF; | 1694 | change_mask |= EFLG_IF; |
@@ -1557,15 +1704,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, | |||
1557 | return rc; | 1704 | return rc; |
1558 | } | 1705 | } |
1559 | 1706 | ||
1560 | static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) | 1707 | static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, |
1708 | struct x86_emulate_ops *ops, int seg) | ||
1561 | { | 1709 | { |
1562 | struct decode_cache *c = &ctxt->decode; | 1710 | struct decode_cache *c = &ctxt->decode; |
1563 | struct kvm_segment segment; | ||
1564 | 1711 | ||
1565 | kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); | 1712 | c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); |
1566 | 1713 | ||
1567 | c->src.val = segment.selector; | 1714 | emulate_push(ctxt, ops); |
1568 | emulate_push(ctxt); | ||
1569 | } | 1715 | } |
1570 | 1716 | ||
1571 | static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, | 1717 | static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, |
@@ -1583,19 +1729,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, | |||
1583 | return rc; | 1729 | return rc; |
1584 | } | 1730 | } |
1585 | 1731 | ||
1586 | static void emulate_pusha(struct x86_emulate_ctxt *ctxt) | 1732 | static int emulate_pusha(struct x86_emulate_ctxt *ctxt, |
1733 | struct x86_emulate_ops *ops) | ||
1587 | { | 1734 | { |
1588 | struct decode_cache *c = &ctxt->decode; | 1735 | struct decode_cache *c = &ctxt->decode; |
1589 | unsigned long old_esp = c->regs[VCPU_REGS_RSP]; | 1736 | unsigned long old_esp = c->regs[VCPU_REGS_RSP]; |
1737 | int rc = X86EMUL_CONTINUE; | ||
1590 | int reg = VCPU_REGS_RAX; | 1738 | int reg = VCPU_REGS_RAX; |
1591 | 1739 | ||
1592 | while (reg <= VCPU_REGS_RDI) { | 1740 | while (reg <= VCPU_REGS_RDI) { |
1593 | (reg == VCPU_REGS_RSP) ? | 1741 | (reg == VCPU_REGS_RSP) ? |
1594 | (c->src.val = old_esp) : (c->src.val = c->regs[reg]); | 1742 | (c->src.val = old_esp) : (c->src.val = c->regs[reg]); |
1595 | 1743 | ||
1596 | emulate_push(ctxt); | 1744 | emulate_push(ctxt, ops); |
1745 | |||
1746 | rc = writeback(ctxt, ops); | ||
1747 | if (rc != X86EMUL_CONTINUE) | ||
1748 | return rc; | ||
1749 | |||
1597 | ++reg; | 1750 | ++reg; |
1598 | } | 1751 | } |
1752 | |||
1753 | /* Disable writeback. */ | ||
1754 | c->dst.type = OP_NONE; | ||
1755 | |||
1756 | return rc; | ||
1599 | } | 1757 | } |
1600 | 1758 | ||
1601 | static int emulate_popa(struct x86_emulate_ctxt *ctxt, | 1759 | static int emulate_popa(struct x86_emulate_ctxt *ctxt, |
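With the ops pointer threaded through, emulate_pusha() above now flushes every staged push through writeback() and bails out on the first fault instead of silently continuing. A hedged, free-standing sketch of that control flow, where push_one() and flush() are hypothetical stand-ins for emulate_push() and writeback():

static int pusha_like(unsigned long *regs, int nregs, int sp_index,
		      unsigned long old_sp,
		      void (*push_one)(unsigned long val),
		      int (*flush)(void))
{
	int rc = 0;

	for (int reg = 0; reg < nregs; reg++) {
		/* the saved stack pointer is pushed, not the live one */
		push_one(reg == sp_index ? old_sp : regs[reg]);
		rc = flush();              /* commit this push immediately */
		if (rc)
			return rc;         /* propagate the fault to the caller */
	}
	return rc;
}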
@@ -1695,14 +1853,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | |||
1695 | old_eip = c->eip; | 1853 | old_eip = c->eip; |
1696 | c->eip = c->src.val; | 1854 | c->eip = c->src.val; |
1697 | c->src.val = old_eip; | 1855 | c->src.val = old_eip; |
1698 | emulate_push(ctxt); | 1856 | emulate_push(ctxt, ops); |
1699 | break; | 1857 | break; |
1700 | } | 1858 | } |
1701 | case 4: /* jmp abs */ | 1859 | case 4: /* jmp abs */ |
1702 | c->eip = c->src.val; | 1860 | c->eip = c->src.val; |
1703 | break; | 1861 | break; |
1704 | case 6: /* push */ | 1862 | case 6: /* push */ |
1705 | emulate_push(ctxt); | 1863 | emulate_push(ctxt, ops); |
1706 | break; | 1864 | break; |
1707 | } | 1865 | } |
1708 | return X86EMUL_CONTINUE; | 1866 | return X86EMUL_CONTINUE; |
@@ -1748,145 +1906,82 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, | |||
1748 | return rc; | 1906 | return rc; |
1749 | } | 1907 | } |
1750 | 1908 | ||
1751 | static inline int writeback(struct x86_emulate_ctxt *ctxt, | ||
1752 | struct x86_emulate_ops *ops) | ||
1753 | { | ||
1754 | int rc; | ||
1755 | struct decode_cache *c = &ctxt->decode; | ||
1756 | |||
1757 | switch (c->dst.type) { | ||
1758 | case OP_REG: | ||
1759 | /* The 4-byte case *is* correct: | ||
1760 | * in 64-bit mode we zero-extend. | ||
1761 | */ | ||
1762 | switch (c->dst.bytes) { | ||
1763 | case 1: | ||
1764 | *(u8 *)c->dst.ptr = (u8)c->dst.val; | ||
1765 | break; | ||
1766 | case 2: | ||
1767 | *(u16 *)c->dst.ptr = (u16)c->dst.val; | ||
1768 | break; | ||
1769 | case 4: | ||
1770 | *c->dst.ptr = (u32)c->dst.val; | ||
1771 | break; /* 64b: zero-ext */ | ||
1772 | case 8: | ||
1773 | *c->dst.ptr = c->dst.val; | ||
1774 | break; | ||
1775 | } | ||
1776 | break; | ||
1777 | case OP_MEM: | ||
1778 | if (c->lock_prefix) | ||
1779 | rc = ops->cmpxchg_emulated( | ||
1780 | (unsigned long)c->dst.ptr, | ||
1781 | &c->dst.orig_val, | ||
1782 | &c->dst.val, | ||
1783 | c->dst.bytes, | ||
1784 | ctxt->vcpu); | ||
1785 | else | ||
1786 | rc = ops->write_emulated( | ||
1787 | (unsigned long)c->dst.ptr, | ||
1788 | &c->dst.val, | ||
1789 | c->dst.bytes, | ||
1790 | ctxt->vcpu); | ||
1791 | if (rc != X86EMUL_CONTINUE) | ||
1792 | return rc; | ||
1793 | break; | ||
1794 | case OP_NONE: | ||
1795 | /* no writeback */ | ||
1796 | break; | ||
1797 | default: | ||
1798 | break; | ||
1799 | } | ||
1800 | return X86EMUL_CONTINUE; | ||
1801 | } | ||
1802 | |||
1803 | static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) | ||
1804 | { | ||
1805 | u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); | ||
1806 | /* | ||
1807 | * an sti; sti; sequence only disables interrupts for the first | ||
1808 | * instruction. So, if the last instruction, be it emulated or | ||
1809 | * not, left the system with the INT_STI flag enabled, it | ||
1810 | * means that the last instruction is an sti. We should not | ||
1811 | * leave the flag on in this case. The same goes for mov ss | ||
1812 | */ | ||
1813 | if (!(int_shadow & mask)) | ||
1814 | ctxt->interruptibility = mask; | ||
1815 | } | ||
1816 | |||
1817 | static inline void | 1909 | static inline void |
1818 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, | 1910 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, |
1819 | struct kvm_segment *cs, struct kvm_segment *ss) | 1911 | struct x86_emulate_ops *ops, struct desc_struct *cs, |
1912 | struct desc_struct *ss) | ||
1820 | { | 1913 | { |
1821 | memset(cs, 0, sizeof(struct kvm_segment)); | 1914 | memset(cs, 0, sizeof(struct desc_struct)); |
1822 | kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); | 1915 | ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); |
1823 | memset(ss, 0, sizeof(struct kvm_segment)); | 1916 | memset(ss, 0, sizeof(struct desc_struct)); |
1824 | 1917 | ||
1825 | cs->l = 0; /* will be adjusted later */ | 1918 | cs->l = 0; /* will be adjusted later */ |
1826 | cs->base = 0; /* flat segment */ | 1919 | set_desc_base(cs, 0); /* flat segment */ |
1827 | cs->g = 1; /* 4kb granularity */ | 1920 | cs->g = 1; /* 4kb granularity */ |
1828 | cs->limit = 0xffffffff; /* 4GB limit */ | 1921 | set_desc_limit(cs, 0xfffff); /* 4GB limit */ |
1829 | cs->type = 0x0b; /* Read, Execute, Accessed */ | 1922 | cs->type = 0x0b; /* Read, Execute, Accessed */ |
1830 | cs->s = 1; | 1923 | cs->s = 1; |
1831 | cs->dpl = 0; /* will be adjusted later */ | 1924 | cs->dpl = 0; /* will be adjusted later */ |
1832 | cs->present = 1; | 1925 | cs->p = 1; |
1833 | cs->db = 1; | 1926 | cs->d = 1; |
1834 | 1927 | ||
1835 | ss->unusable = 0; | 1928 | set_desc_base(ss, 0); /* flat segment */ |
1836 | ss->base = 0; /* flat segment */ | 1929 | set_desc_limit(ss, 0xfffff); /* 4GB limit */ |
1837 | ss->limit = 0xffffffff; /* 4GB limit */ | ||
1838 | ss->g = 1; /* 4kb granularity */ | 1930 | ss->g = 1; /* 4kb granularity */ |
1839 | ss->s = 1; | 1931 | ss->s = 1; |
1840 | ss->type = 0x03; /* Read/Write, Accessed */ | 1932 | ss->type = 0x03; /* Read/Write, Accessed */ |
1841 | ss->db = 1; /* 32bit stack segment */ | 1933 | ss->d = 1; /* 32bit stack segment */ |
1842 | ss->dpl = 0; | 1934 | ss->dpl = 0; |
1843 | ss->present = 1; | 1935 | ss->p = 1; |
1844 | } | 1936 | } |
1845 | 1937 | ||
1846 | static int | 1938 | static int |
1847 | emulate_syscall(struct x86_emulate_ctxt *ctxt) | 1939 | emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) |
1848 | { | 1940 | { |
1849 | struct decode_cache *c = &ctxt->decode; | 1941 | struct decode_cache *c = &ctxt->decode; |
1850 | struct kvm_segment cs, ss; | 1942 | struct desc_struct cs, ss; |
1851 | u64 msr_data; | 1943 | u64 msr_data; |
1944 | u16 cs_sel, ss_sel; | ||
1852 | 1945 | ||
1853 | /* syscall is not available in real mode */ | 1946 | /* syscall is not available in real mode */ |
1854 | if (ctxt->mode == X86EMUL_MODE_REAL || | 1947 | if (ctxt->mode == X86EMUL_MODE_REAL || |
1855 | ctxt->mode == X86EMUL_MODE_VM86) { | 1948 | ctxt->mode == X86EMUL_MODE_VM86) { |
1856 | kvm_queue_exception(ctxt->vcpu, UD_VECTOR); | 1949 | emulate_ud(ctxt); |
1857 | return X86EMUL_PROPAGATE_FAULT; | 1950 | return X86EMUL_PROPAGATE_FAULT; |
1858 | } | 1951 | } |
1859 | 1952 | ||
1860 | setup_syscalls_segments(ctxt, &cs, &ss); | 1953 | setup_syscalls_segments(ctxt, ops, &cs, &ss); |
1861 | 1954 | ||
1862 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); | 1955 | ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); |
1863 | msr_data >>= 32; | 1956 | msr_data >>= 32; |
1864 | cs.selector = (u16)(msr_data & 0xfffc); | 1957 | cs_sel = (u16)(msr_data & 0xfffc); |
1865 | ss.selector = (u16)(msr_data + 8); | 1958 | ss_sel = (u16)(msr_data + 8); |
1866 | 1959 | ||
1867 | if (is_long_mode(ctxt->vcpu)) { | 1960 | if (is_long_mode(ctxt->vcpu)) { |
1868 | cs.db = 0; | 1961 | cs.d = 0; |
1869 | cs.l = 1; | 1962 | cs.l = 1; |
1870 | } | 1963 | } |
1871 | kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); | 1964 | ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); |
1872 | kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); | 1965 | ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); |
1966 | ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); | ||
1967 | ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); | ||
1873 | 1968 | ||
1874 | c->regs[VCPU_REGS_RCX] = c->eip; | 1969 | c->regs[VCPU_REGS_RCX] = c->eip; |
1875 | if (is_long_mode(ctxt->vcpu)) { | 1970 | if (is_long_mode(ctxt->vcpu)) { |
1876 | #ifdef CONFIG_X86_64 | 1971 | #ifdef CONFIG_X86_64 |
1877 | c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; | 1972 | c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; |
1878 | 1973 | ||
1879 | kvm_x86_ops->get_msr(ctxt->vcpu, | 1974 | ops->get_msr(ctxt->vcpu, |
1880 | ctxt->mode == X86EMUL_MODE_PROT64 ? | 1975 | ctxt->mode == X86EMUL_MODE_PROT64 ? |
1881 | MSR_LSTAR : MSR_CSTAR, &msr_data); | 1976 | MSR_LSTAR : MSR_CSTAR, &msr_data); |
1882 | c->eip = msr_data; | 1977 | c->eip = msr_data; |
1883 | 1978 | ||
1884 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); | 1979 | ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); |
1885 | ctxt->eflags &= ~(msr_data | EFLG_RF); | 1980 | ctxt->eflags &= ~(msr_data | EFLG_RF); |
1886 | #endif | 1981 | #endif |
1887 | } else { | 1982 | } else { |
1888 | /* legacy mode */ | 1983 | /* legacy mode */ |
1889 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); | 1984 | ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); |
1890 | c->eip = (u32)msr_data; | 1985 | c->eip = (u32)msr_data; |
1891 | 1986 | ||
1892 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); | 1987 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); |
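The syscall path above now fills a struct desc_struct through the set_desc_base()/set_desc_limit() accessors instead of a struct kvm_segment, so the "flat 4GB" segment is expressed as base 0, raw limit 0xfffff and the granularity bit set. A small sketch of how that raw limit scales, mirroring what a desc_limit_scaled()-style helper computes (the struct below is illustrative, not the real descriptor layout):

#include <stdint.h>

struct flat_desc {
	uint32_t base;
	uint32_t limit;   /* 20-bit raw limit from the descriptor */
	unsigned g : 1;   /* granularity: 1 => limit counted in 4KiB pages */
};

static uint32_t limit_scaled(const struct flat_desc *d)
{
	return d->g ? (d->limit << 12) | 0xfff : d->limit;
}

/* { .base = 0, .limit = 0xfffff, .g = 1 } scales to 0xffffffff,
 * i.e. the flat 4GB code and stack segments set up above. */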
@@ -1896,15 +1991,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) | |||
1896 | } | 1991 | } |
1897 | 1992 | ||
1898 | static int | 1993 | static int |
1899 | emulate_sysenter(struct x86_emulate_ctxt *ctxt) | 1994 | emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) |
1900 | { | 1995 | { |
1901 | struct decode_cache *c = &ctxt->decode; | 1996 | struct decode_cache *c = &ctxt->decode; |
1902 | struct kvm_segment cs, ss; | 1997 | struct desc_struct cs, ss; |
1903 | u64 msr_data; | 1998 | u64 msr_data; |
1999 | u16 cs_sel, ss_sel; | ||
1904 | 2000 | ||
1905 | /* inject #GP if in real mode */ | 2001 | /* inject #GP if in real mode */ |
1906 | if (ctxt->mode == X86EMUL_MODE_REAL) { | 2002 | if (ctxt->mode == X86EMUL_MODE_REAL) { |
1907 | kvm_inject_gp(ctxt->vcpu, 0); | 2003 | emulate_gp(ctxt, 0); |
1908 | return X86EMUL_PROPAGATE_FAULT; | 2004 | return X86EMUL_PROPAGATE_FAULT; |
1909 | } | 2005 | } |
1910 | 2006 | ||
@@ -1912,67 +2008,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) | |||
1912 | * Therefore, we inject an #UD. | 2008 | * Therefore, we inject an #UD. |
1913 | */ | 2009 | */ |
1914 | if (ctxt->mode == X86EMUL_MODE_PROT64) { | 2010 | if (ctxt->mode == X86EMUL_MODE_PROT64) { |
1915 | kvm_queue_exception(ctxt->vcpu, UD_VECTOR); | 2011 | emulate_ud(ctxt); |
1916 | return X86EMUL_PROPAGATE_FAULT; | 2012 | return X86EMUL_PROPAGATE_FAULT; |
1917 | } | 2013 | } |
1918 | 2014 | ||
1919 | setup_syscalls_segments(ctxt, &cs, &ss); | 2015 | setup_syscalls_segments(ctxt, ops, &cs, &ss); |
1920 | 2016 | ||
1921 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); | 2017 | ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); |
1922 | switch (ctxt->mode) { | 2018 | switch (ctxt->mode) { |
1923 | case X86EMUL_MODE_PROT32: | 2019 | case X86EMUL_MODE_PROT32: |
1924 | if ((msr_data & 0xfffc) == 0x0) { | 2020 | if ((msr_data & 0xfffc) == 0x0) { |
1925 | kvm_inject_gp(ctxt->vcpu, 0); | 2021 | emulate_gp(ctxt, 0); |
1926 | return X86EMUL_PROPAGATE_FAULT; | 2022 | return X86EMUL_PROPAGATE_FAULT; |
1927 | } | 2023 | } |
1928 | break; | 2024 | break; |
1929 | case X86EMUL_MODE_PROT64: | 2025 | case X86EMUL_MODE_PROT64: |
1930 | if (msr_data == 0x0) { | 2026 | if (msr_data == 0x0) { |
1931 | kvm_inject_gp(ctxt->vcpu, 0); | 2027 | emulate_gp(ctxt, 0); |
1932 | return X86EMUL_PROPAGATE_FAULT; | 2028 | return X86EMUL_PROPAGATE_FAULT; |
1933 | } | 2029 | } |
1934 | break; | 2030 | break; |
1935 | } | 2031 | } |
1936 | 2032 | ||
1937 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); | 2033 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); |
1938 | cs.selector = (u16)msr_data; | 2034 | cs_sel = (u16)msr_data; |
1939 | cs.selector &= ~SELECTOR_RPL_MASK; | 2035 | cs_sel &= ~SELECTOR_RPL_MASK; |
1940 | ss.selector = cs.selector + 8; | 2036 | ss_sel = cs_sel + 8; |
1941 | ss.selector &= ~SELECTOR_RPL_MASK; | 2037 | ss_sel &= ~SELECTOR_RPL_MASK; |
1942 | if (ctxt->mode == X86EMUL_MODE_PROT64 | 2038 | if (ctxt->mode == X86EMUL_MODE_PROT64 |
1943 | || is_long_mode(ctxt->vcpu)) { | 2039 | || is_long_mode(ctxt->vcpu)) { |
1944 | cs.db = 0; | 2040 | cs.d = 0; |
1945 | cs.l = 1; | 2041 | cs.l = 1; |
1946 | } | 2042 | } |
1947 | 2043 | ||
1948 | kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); | 2044 | ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); |
1949 | kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); | 2045 | ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); |
2046 | ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); | ||
2047 | ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); | ||
1950 | 2048 | ||
1951 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); | 2049 | ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); |
1952 | c->eip = msr_data; | 2050 | c->eip = msr_data; |
1953 | 2051 | ||
1954 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); | 2052 | ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); |
1955 | c->regs[VCPU_REGS_RSP] = msr_data; | 2053 | c->regs[VCPU_REGS_RSP] = msr_data; |
1956 | 2054 | ||
1957 | return X86EMUL_CONTINUE; | 2055 | return X86EMUL_CONTINUE; |
1958 | } | 2056 | } |
1959 | 2057 | ||
1960 | static int | 2058 | static int |
1961 | emulate_sysexit(struct x86_emulate_ctxt *ctxt) | 2059 | emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) |
1962 | { | 2060 | { |
1963 | struct decode_cache *c = &ctxt->decode; | 2061 | struct decode_cache *c = &ctxt->decode; |
1964 | struct kvm_segment cs, ss; | 2062 | struct desc_struct cs, ss; |
1965 | u64 msr_data; | 2063 | u64 msr_data; |
1966 | int usermode; | 2064 | int usermode; |
2065 | u16 cs_sel, ss_sel; | ||
1967 | 2066 | ||
1968 | /* inject #GP if in real mode or Virtual 8086 mode */ | 2067 | /* inject #GP if in real mode or Virtual 8086 mode */ |
1969 | if (ctxt->mode == X86EMUL_MODE_REAL || | 2068 | if (ctxt->mode == X86EMUL_MODE_REAL || |
1970 | ctxt->mode == X86EMUL_MODE_VM86) { | 2069 | ctxt->mode == X86EMUL_MODE_VM86) { |
1971 | kvm_inject_gp(ctxt->vcpu, 0); | 2070 | emulate_gp(ctxt, 0); |
1972 | return X86EMUL_PROPAGATE_FAULT; | 2071 | return X86EMUL_PROPAGATE_FAULT; |
1973 | } | 2072 | } |
1974 | 2073 | ||
1975 | setup_syscalls_segments(ctxt, &cs, &ss); | 2074 | setup_syscalls_segments(ctxt, ops, &cs, &ss); |
1976 | 2075 | ||
1977 | if ((c->rex_prefix & 0x8) != 0x0) | 2076 | if ((c->rex_prefix & 0x8) != 0x0) |
1978 | usermode = X86EMUL_MODE_PROT64; | 2077 | usermode = X86EMUL_MODE_PROT64; |
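For SYSENTER the reworked code derives both selectors from IA32_SYSENTER_CS: CS is the MSR value with the RPL bits cleared, and SS sits one descriptor (8 bytes) above it in the GDT. A minimal sketch of that arithmetic, with the RPL mask inlined for illustration:

#include <stdint.h>

static void sysenter_selectors(uint64_t sysenter_cs_msr,
			       uint16_t *cs_sel, uint16_t *ss_sel)
{
	*cs_sel = (uint16_t)sysenter_cs_msr & ~0x3;   /* clear RPL */
	*ss_sel = (*cs_sel + 8) & ~0x3;               /* next descriptor slot */
}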
@@ -1981,35 +2080,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) | |||
1981 | 2080 | ||
1982 | cs.dpl = 3; | 2081 | cs.dpl = 3; |
1983 | ss.dpl = 3; | 2082 | ss.dpl = 3; |
1984 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); | 2083 | ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); |
1985 | switch (usermode) { | 2084 | switch (usermode) { |
1986 | case X86EMUL_MODE_PROT32: | 2085 | case X86EMUL_MODE_PROT32: |
1987 | cs.selector = (u16)(msr_data + 16); | 2086 | cs_sel = (u16)(msr_data + 16); |
1988 | if ((msr_data & 0xfffc) == 0x0) { | 2087 | if ((msr_data & 0xfffc) == 0x0) { |
1989 | kvm_inject_gp(ctxt->vcpu, 0); | 2088 | emulate_gp(ctxt, 0); |
1990 | return X86EMUL_PROPAGATE_FAULT; | 2089 | return X86EMUL_PROPAGATE_FAULT; |
1991 | } | 2090 | } |
1992 | ss.selector = (u16)(msr_data + 24); | 2091 | ss_sel = (u16)(msr_data + 24); |
1993 | break; | 2092 | break; |
1994 | case X86EMUL_MODE_PROT64: | 2093 | case X86EMUL_MODE_PROT64: |
1995 | cs.selector = (u16)(msr_data + 32); | 2094 | cs_sel = (u16)(msr_data + 32); |
1996 | if (msr_data == 0x0) { | 2095 | if (msr_data == 0x0) { |
1997 | kvm_inject_gp(ctxt->vcpu, 0); | 2096 | emulate_gp(ctxt, 0); |
1998 | return X86EMUL_PROPAGATE_FAULT; | 2097 | return X86EMUL_PROPAGATE_FAULT; |
1999 | } | 2098 | } |
2000 | ss.selector = cs.selector + 8; | 2099 | ss_sel = cs_sel + 8; |
2001 | cs.db = 0; | 2100 | cs.d = 0; |
2002 | cs.l = 1; | 2101 | cs.l = 1; |
2003 | break; | 2102 | break; |
2004 | } | 2103 | } |
2005 | cs.selector |= SELECTOR_RPL_MASK; | 2104 | cs_sel |= SELECTOR_RPL_MASK; |
2006 | ss.selector |= SELECTOR_RPL_MASK; | 2105 | ss_sel |= SELECTOR_RPL_MASK; |
2007 | 2106 | ||
2008 | kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); | 2107 | ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); |
2009 | kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); | 2108 | ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); |
2109 | ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); | ||
2110 | ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); | ||
2010 | 2111 | ||
2011 | c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; | 2112 | c->eip = c->regs[VCPU_REGS_RDX]; |
2012 | c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; | 2113 | c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; |
2013 | 2114 | ||
2014 | return X86EMUL_CONTINUE; | 2115 | return X86EMUL_CONTINUE; |
2015 | } | 2116 | } |
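SYSEXIT goes the other way: the user-mode CS and SS selectors are fixed offsets from IA32_SYSENTER_CS (+16/+24 for a 32-bit return, +32/+40 for a 64-bit one) and the RPL is forced to 3, which is exactly the cs_sel/ss_sel math in the hunk above. A compact sketch:

#include <stdint.h>

static void sysexit_selectors(uint16_t sysenter_cs, int to_64bit,
			      uint16_t *cs_sel, uint16_t *ss_sel)
{
	if (to_64bit) {
		*cs_sel = sysenter_cs + 32;
		*ss_sel = *cs_sel + 8;          /* i.e. +40 */
	} else {
		*cs_sel = sysenter_cs + 16;
		*ss_sel = sysenter_cs + 24;
	}
	*cs_sel |= 3;                           /* RPL = 3: returning to user mode */
	*ss_sel |= 3;
}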
@@ -2030,25 +2131,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, | |||
2030 | struct x86_emulate_ops *ops, | 2131 | struct x86_emulate_ops *ops, |
2031 | u16 port, u16 len) | 2132 | u16 port, u16 len) |
2032 | { | 2133 | { |
2033 | struct kvm_segment tr_seg; | 2134 | struct desc_struct tr_seg; |
2034 | int r; | 2135 | int r; |
2035 | u16 io_bitmap_ptr; | 2136 | u16 io_bitmap_ptr; |
2036 | u8 perm, bit_idx = port & 0x7; | 2137 | u8 perm, bit_idx = port & 0x7; |
2037 | unsigned mask = (1 << len) - 1; | 2138 | unsigned mask = (1 << len) - 1; |
2038 | 2139 | ||
2039 | kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); | 2140 | ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); |
2040 | if (tr_seg.unusable) | 2141 | if (!tr_seg.p) |
2041 | return false; | 2142 | return false; |
2042 | if (tr_seg.limit < 103) | 2143 | if (desc_limit_scaled(&tr_seg) < 103) |
2043 | return false; | 2144 | return false; |
2044 | r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, | 2145 | r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, |
2045 | NULL); | 2146 | ctxt->vcpu, NULL); |
2046 | if (r != X86EMUL_CONTINUE) | 2147 | if (r != X86EMUL_CONTINUE) |
2047 | return false; | 2148 | return false; |
2048 | if (io_bitmap_ptr + port/8 > tr_seg.limit) | 2149 | if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) |
2049 | return false; | 2150 | return false; |
2050 | r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, | 2151 | r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, |
2051 | ctxt->vcpu, NULL); | 2152 | &perm, 1, ctxt->vcpu, NULL); |
2052 | if (r != X86EMUL_CONTINUE) | 2153 | if (r != X86EMUL_CONTINUE) |
2053 | return false; | 2154 | return false; |
2054 | if ((perm >> bit_idx) & mask) | 2155 | if ((perm >> bit_idx) & mask) |
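The I/O-permission check above now works on the cached TR descriptor and its scaled limit, but the logic is unchanged: read the 16-bit I/O-bitmap offset at byte 102 of the TSS, fetch the permission byte covering the port, and deny if any bit in the accessed range is set. A simplified user-space model of that check, where read_guest() is a hypothetical byte-read helper and the limit checks are elided:

#include <stdbool.h>
#include <stdint.h>

static bool io_allowed(uint8_t (*read_guest)(uint32_t addr),
		       uint32_t tss_base, uint16_t port, uint16_t len)
{
	uint16_t bitmap_off = read_guest(tss_base + 102) |
			      (uint16_t)(read_guest(tss_base + 103) << 8);
	uint8_t perm = read_guest(tss_base + bitmap_off + port / 8);

	/* any set bit within the accessed range denies the access */
	return ((perm >> (port & 7)) & ((1u << len) - 1)) == 0;
}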
@@ -2066,17 +2167,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, | |||
2066 | return true; | 2167 | return true; |
2067 | } | 2168 | } |
2068 | 2169 | ||
2069 | static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt, | ||
2070 | struct x86_emulate_ops *ops, | ||
2071 | int seg) | ||
2072 | { | ||
2073 | struct desc_struct desc; | ||
2074 | if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu)) | ||
2075 | return get_desc_base(&desc); | ||
2076 | else | ||
2077 | return ~0; | ||
2078 | } | ||
2079 | |||
2080 | static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, | 2170 | static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, |
2081 | struct x86_emulate_ops *ops, | 2171 | struct x86_emulate_ops *ops, |
2082 | struct tss_segment_16 *tss) | 2172 | struct tss_segment_16 *tss) |
@@ -2165,7 +2255,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2165 | &err); | 2255 | &err); |
2166 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2256 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2167 | /* FIXME: need to provide precise fault address */ | 2257 | /* FIXME: need to provide precise fault address */ |
2168 | kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); | 2258 | emulate_pf(ctxt, old_tss_base, err); |
2169 | return ret; | 2259 | return ret; |
2170 | } | 2260 | } |
2171 | 2261 | ||
@@ -2175,7 +2265,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2175 | &err); | 2265 | &err); |
2176 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2266 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2177 | /* FIXME: need to provide precise fault address */ | 2267 | /* FIXME: need to provide precise fault address */ |
2178 | kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); | 2268 | emulate_pf(ctxt, old_tss_base, err); |
2179 | return ret; | 2269 | return ret; |
2180 | } | 2270 | } |
2181 | 2271 | ||
@@ -2183,7 +2273,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2183 | &err); | 2273 | &err); |
2184 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2274 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2185 | /* FIXME: need to provide precise fault address */ | 2275 | /* FIXME: need to provide precise fault address */ |
2186 | kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); | 2276 | emulate_pf(ctxt, new_tss_base, err); |
2187 | return ret; | 2277 | return ret; |
2188 | } | 2278 | } |
2189 | 2279 | ||
@@ -2196,7 +2286,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2196 | ctxt->vcpu, &err); | 2286 | ctxt->vcpu, &err); |
2197 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2287 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2198 | /* FIXME: need to provide precise fault address */ | 2288 | /* FIXME: need to provide precise fault address */ |
2199 | kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); | 2289 | emulate_pf(ctxt, new_tss_base, err); |
2200 | return ret; | 2290 | return ret; |
2201 | } | 2291 | } |
2202 | } | 2292 | } |
@@ -2238,7 +2328,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | |||
2238 | struct decode_cache *c = &ctxt->decode; | 2328 | struct decode_cache *c = &ctxt->decode; |
2239 | int ret; | 2329 | int ret; |
2240 | 2330 | ||
2241 | ops->set_cr(3, tss->cr3, ctxt->vcpu); | 2331 | if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { |
2332 | emulate_gp(ctxt, 0); | ||
2333 | return X86EMUL_PROPAGATE_FAULT; | ||
2334 | } | ||
2242 | c->eip = tss->eip; | 2335 | c->eip = tss->eip; |
2243 | ctxt->eflags = tss->eflags | 2; | 2336 | ctxt->eflags = tss->eflags | 2; |
2244 | c->regs[VCPU_REGS_RAX] = tss->eax; | 2337 | c->regs[VCPU_REGS_RAX] = tss->eax; |
@@ -2304,7 +2397,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2304 | &err); | 2397 | &err); |
2305 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2398 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2306 | /* FIXME: need to provide precise fault address */ | 2399 | /* FIXME: need to provide precise fault address */ |
2307 | kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); | 2400 | emulate_pf(ctxt, old_tss_base, err); |
2308 | return ret; | 2401 | return ret; |
2309 | } | 2402 | } |
2310 | 2403 | ||
@@ -2314,7 +2407,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2314 | &err); | 2407 | &err); |
2315 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2408 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2316 | /* FIXME: need to provide precise fault address */ | 2409 | /* FIXME: need to provide precise fault address */ |
2317 | kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); | 2410 | emulate_pf(ctxt, old_tss_base, err); |
2318 | return ret; | 2411 | return ret; |
2319 | } | 2412 | } |
2320 | 2413 | ||
@@ -2322,7 +2415,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2322 | &err); | 2415 | &err); |
2323 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2416 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2324 | /* FIXME: need to provide precise fault address */ | 2417 | /* FIXME: need to provide precise fault address */ |
2325 | kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); | 2418 | emulate_pf(ctxt, new_tss_base, err); |
2326 | return ret; | 2419 | return ret; |
2327 | } | 2420 | } |
2328 | 2421 | ||
@@ -2335,7 +2428,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2335 | ctxt->vcpu, &err); | 2428 | ctxt->vcpu, &err); |
2336 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2429 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2337 | /* FIXME: need to provide precise fault address */ | 2430 | /* FIXME: need to provide precise fault address */ |
2338 | kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); | 2431 | emulate_pf(ctxt, new_tss_base, err); |
2339 | return ret; | 2432 | return ret; |
2340 | } | 2433 | } |
2341 | } | 2434 | } |
@@ -2352,7 +2445,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2352 | int ret; | 2445 | int ret; |
2353 | u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); | 2446 | u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); |
2354 | ulong old_tss_base = | 2447 | ulong old_tss_base = |
2355 | get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); | 2448 | ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); |
2356 | u32 desc_limit; | 2449 | u32 desc_limit; |
2357 | 2450 | ||
2358 | /* FIXME: old_tss_base == ~0 ? */ | 2451 | /* FIXME: old_tss_base == ~0 ? */ |
@@ -2369,7 +2462,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2369 | if (reason != TASK_SWITCH_IRET) { | 2462 | if (reason != TASK_SWITCH_IRET) { |
2370 | if ((tss_selector & 3) > next_tss_desc.dpl || | 2463 | if ((tss_selector & 3) > next_tss_desc.dpl || |
2371 | ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { | 2464 | ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { |
2372 | kvm_inject_gp(ctxt->vcpu, 0); | 2465 | emulate_gp(ctxt, 0); |
2373 | return X86EMUL_PROPAGATE_FAULT; | 2466 | return X86EMUL_PROPAGATE_FAULT; |
2374 | } | 2467 | } |
2375 | } | 2468 | } |
@@ -2378,8 +2471,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2378 | if (!next_tss_desc.p || | 2471 | if (!next_tss_desc.p || |
2379 | ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || | 2472 | ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || |
2380 | desc_limit < 0x2b)) { | 2473 | desc_limit < 0x2b)) { |
2381 | kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, | 2474 | emulate_ts(ctxt, tss_selector & 0xfffc); |
2382 | tss_selector & 0xfffc); | ||
2383 | return X86EMUL_PROPAGATE_FAULT; | 2475 | return X86EMUL_PROPAGATE_FAULT; |
2384 | } | 2476 | } |
2385 | 2477 | ||
@@ -2425,7 +2517,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2425 | c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; | 2517 | c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; |
2426 | c->lock_prefix = 0; | 2518 | c->lock_prefix = 0; |
2427 | c->src.val = (unsigned long) error_code; | 2519 | c->src.val = (unsigned long) error_code; |
2428 | emulate_push(ctxt); | 2520 | emulate_push(ctxt, ops); |
2429 | } | 2521 | } |
2430 | 2522 | ||
2431 | return ret; | 2523 | return ret; |
@@ -2439,18 +2531,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2439 | struct decode_cache *c = &ctxt->decode; | 2531 | struct decode_cache *c = &ctxt->decode; |
2440 | int rc; | 2532 | int rc; |
2441 | 2533 | ||
2442 | memset(c, 0, sizeof(struct decode_cache)); | ||
2443 | c->eip = ctxt->eip; | 2534 | c->eip = ctxt->eip; |
2444 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | ||
2445 | c->dst.type = OP_NONE; | 2535 | c->dst.type = OP_NONE; |
2446 | 2536 | ||
2447 | rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, | 2537 | rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, |
2448 | has_error_code, error_code); | 2538 | has_error_code, error_code); |
2449 | 2539 | ||
2450 | if (rc == X86EMUL_CONTINUE) { | 2540 | if (rc == X86EMUL_CONTINUE) { |
2451 | memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); | ||
2452 | kvm_rip_write(ctxt->vcpu, c->eip); | ||
2453 | rc = writeback(ctxt, ops); | 2541 | rc = writeback(ctxt, ops); |
2542 | if (rc == X86EMUL_CONTINUE) | ||
2543 | ctxt->eip = c->eip; | ||
2454 | } | 2544 | } |
2455 | 2545 | ||
2456 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | 2546 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; |
@@ -2474,29 +2564,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2474 | int rc = X86EMUL_CONTINUE; | 2564 | int rc = X86EMUL_CONTINUE; |
2475 | int saved_dst_type = c->dst.type; | 2565 | int saved_dst_type = c->dst.type; |
2476 | 2566 | ||
2477 | ctxt->interruptibility = 0; | 2567 | ctxt->decode.mem_read.pos = 0; |
2478 | |||
2479 | /* Shadow copy of register state. Committed on successful emulation. | ||
2480 | * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't | ||
2481 | * modify them. | ||
2482 | */ | ||
2483 | |||
2484 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | ||
2485 | 2568 | ||
2486 | if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { | 2569 | if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { |
2487 | kvm_queue_exception(ctxt->vcpu, UD_VECTOR); | 2570 | emulate_ud(ctxt); |
2488 | goto done; | 2571 | goto done; |
2489 | } | 2572 | } |
2490 | 2573 | ||
2491 | /* LOCK prefix is allowed only with some instructions */ | 2574 | /* LOCK prefix is allowed only with some instructions */ |
2492 | if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { | 2575 | if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { |
2493 | kvm_queue_exception(ctxt->vcpu, UD_VECTOR); | 2576 | emulate_ud(ctxt); |
2494 | goto done; | 2577 | goto done; |
2495 | } | 2578 | } |
2496 | 2579 | ||
2497 | /* Privileged instruction can be executed only in CPL=0 */ | 2580 | /* Privileged instruction can be executed only in CPL=0 */ |
2498 | if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { | 2581 | if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { |
2499 | kvm_inject_gp(ctxt->vcpu, 0); | 2582 | emulate_gp(ctxt, 0); |
2500 | goto done; | 2583 | goto done; |
2501 | } | 2584 | } |
2502 | 2585 | ||
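The reworked prologue of x86_emulate_insn() above injects its exceptions through emulate_ud()/emulate_gp() rather than calling into kvm directly, but the guard ordering stays the same: opcodes invalid in 64-bit mode, then stray LOCK prefixes, then the CPL check for privileged instructions. A hedged sketch of that decision order with illustrative flag parameters:

enum fault { FAULT_NONE, FAULT_UD, FAULT_GP };

static enum fault pre_exec_checks(int long_mode, int insn_is_no64,
				  int lock_prefix, int lockable_to_mem,
				  int privileged, int cpl)
{
	if (long_mode && insn_is_no64)
		return FAULT_UD;        /* #UD: opcode invalid in 64-bit mode */
	if (lock_prefix && !lockable_to_mem)
		return FAULT_UD;        /* #UD: LOCK not allowed here */
	if (privileged && cpl != 0)
		return FAULT_GP;        /* #GP(0): privileged insn at CPL > 0 */
	return FAULT_NONE;
}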
@@ -2506,7 +2589,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2506 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { | 2589 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { |
2507 | string_done: | 2590 | string_done: |
2508 | ctxt->restart = false; | 2591 | ctxt->restart = false; |
2509 | kvm_rip_write(ctxt->vcpu, c->eip); | 2592 | ctxt->eip = c->eip; |
2510 | goto done; | 2593 | goto done; |
2511 | } | 2594 | } |
2512 | /* The second termination condition only applies for REPE | 2595 | /* The second termination condition only applies for REPE |
@@ -2529,20 +2612,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2529 | } | 2612 | } |
2530 | 2613 | ||
2531 | if (c->src.type == OP_MEM) { | 2614 | if (c->src.type == OP_MEM) { |
2532 | rc = ops->read_emulated((unsigned long)c->src.ptr, | 2615 | rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, |
2533 | &c->src.val, | 2616 | c->src.valptr, c->src.bytes); |
2534 | c->src.bytes, | ||
2535 | ctxt->vcpu); | ||
2536 | if (rc != X86EMUL_CONTINUE) | 2617 | if (rc != X86EMUL_CONTINUE) |
2537 | goto done; | 2618 | goto done; |
2538 | c->src.orig_val = c->src.val; | 2619 | c->src.orig_val = c->src.val; |
2539 | } | 2620 | } |
2540 | 2621 | ||
2541 | if (c->src2.type == OP_MEM) { | 2622 | if (c->src2.type == OP_MEM) { |
2542 | rc = ops->read_emulated((unsigned long)c->src2.ptr, | 2623 | rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, |
2543 | &c->src2.val, | 2624 | &c->src2.val, c->src2.bytes); |
2544 | c->src2.bytes, | ||
2545 | ctxt->vcpu); | ||
2546 | if (rc != X86EMUL_CONTINUE) | 2625 | if (rc != X86EMUL_CONTINUE) |
2547 | goto done; | 2626 | goto done; |
2548 | } | 2627 | } |
@@ -2553,8 +2632,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2553 | 2632 | ||
2554 | if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { | 2633 | if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { |
2555 | /* optimisation - avoid slow emulated read if Mov */ | 2634 | /* optimisation - avoid slow emulated read if Mov */ |
2556 | rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, | 2635 | rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, |
2557 | c->dst.bytes, ctxt->vcpu); | 2636 | &c->dst.val, c->dst.bytes); |
2558 | if (rc != X86EMUL_CONTINUE) | 2637 | if (rc != X86EMUL_CONTINUE) |
2559 | goto done; | 2638 | goto done; |
2560 | } | 2639 | } |
@@ -2571,7 +2650,7 @@ special_insn: | |||
2571 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | 2650 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); |
2572 | break; | 2651 | break; |
2573 | case 0x06: /* push es */ | 2652 | case 0x06: /* push es */ |
2574 | emulate_push_sreg(ctxt, VCPU_SREG_ES); | 2653 | emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); |
2575 | break; | 2654 | break; |
2576 | case 0x07: /* pop es */ | 2655 | case 0x07: /* pop es */ |
2577 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); | 2656 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); |
@@ -2583,14 +2662,14 @@ special_insn: | |||
2583 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | 2662 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); |
2584 | break; | 2663 | break; |
2585 | case 0x0e: /* push cs */ | 2664 | case 0x0e: /* push cs */ |
2586 | emulate_push_sreg(ctxt, VCPU_SREG_CS); | 2665 | emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); |
2587 | break; | 2666 | break; |
2588 | case 0x10 ... 0x15: | 2667 | case 0x10 ... 0x15: |
2589 | adc: /* adc */ | 2668 | adc: /* adc */ |
2590 | emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); | 2669 | emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); |
2591 | break; | 2670 | break; |
2592 | case 0x16: /* push ss */ | 2671 | case 0x16: /* push ss */ |
2593 | emulate_push_sreg(ctxt, VCPU_SREG_SS); | 2672 | emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); |
2594 | break; | 2673 | break; |
2595 | case 0x17: /* pop ss */ | 2674 | case 0x17: /* pop ss */ |
2596 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); | 2675 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); |
@@ -2602,7 +2681,7 @@ special_insn: | |||
2602 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); | 2681 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); |
2603 | break; | 2682 | break; |
2604 | case 0x1e: /* push ds */ | 2683 | case 0x1e: /* push ds */ |
2605 | emulate_push_sreg(ctxt, VCPU_SREG_DS); | 2684 | emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); |
2606 | break; | 2685 | break; |
2607 | case 0x1f: /* pop ds */ | 2686 | case 0x1f: /* pop ds */ |
2608 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); | 2687 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); |
@@ -2632,7 +2711,7 @@ special_insn: | |||
2632 | emulate_1op("dec", c->dst, ctxt->eflags); | 2711 | emulate_1op("dec", c->dst, ctxt->eflags); |
2633 | break; | 2712 | break; |
2634 | case 0x50 ... 0x57: /* push reg */ | 2713 | case 0x50 ... 0x57: /* push reg */ |
2635 | emulate_push(ctxt); | 2714 | emulate_push(ctxt, ops); |
2636 | break; | 2715 | break; |
2637 | case 0x58 ... 0x5f: /* pop reg */ | 2716 | case 0x58 ... 0x5f: /* pop reg */ |
2638 | pop_instruction: | 2717 | pop_instruction: |
@@ -2641,7 +2720,9 @@ special_insn: | |||
2641 | goto done; | 2720 | goto done; |
2642 | break; | 2721 | break; |
2643 | case 0x60: /* pusha */ | 2722 | case 0x60: /* pusha */ |
2644 | emulate_pusha(ctxt); | 2723 | rc = emulate_pusha(ctxt, ops); |
2724 | if (rc != X86EMUL_CONTINUE) | ||
2725 | goto done; | ||
2645 | break; | 2726 | break; |
2646 | case 0x61: /* popa */ | 2727 | case 0x61: /* popa */ |
2647 | rc = emulate_popa(ctxt, ops); | 2728 | rc = emulate_popa(ctxt, ops); |
@@ -2655,14 +2736,14 @@ special_insn: | |||
2655 | break; | 2736 | break; |
2656 | case 0x68: /* push imm */ | 2737 | case 0x68: /* push imm */ |
2657 | case 0x6a: /* push imm8 */ | 2738 | case 0x6a: /* push imm8 */ |
2658 | emulate_push(ctxt); | 2739 | emulate_push(ctxt, ops); |
2659 | break; | 2740 | break; |
2660 | case 0x6c: /* insb */ | 2741 | case 0x6c: /* insb */ |
2661 | case 0x6d: /* insw/insd */ | 2742 | case 0x6d: /* insw/insd */ |
2662 | c->dst.bytes = min(c->dst.bytes, 4u); | 2743 | c->dst.bytes = min(c->dst.bytes, 4u); |
2663 | if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], | 2744 | if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], |
2664 | c->dst.bytes)) { | 2745 | c->dst.bytes)) { |
2665 | kvm_inject_gp(ctxt->vcpu, 0); | 2746 | emulate_gp(ctxt, 0); |
2666 | goto done; | 2747 | goto done; |
2667 | } | 2748 | } |
2668 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, | 2749 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, |
@@ -2674,7 +2755,7 @@ special_insn: | |||
2674 | c->src.bytes = min(c->src.bytes, 4u); | 2755 | c->src.bytes = min(c->src.bytes, 4u); |
2675 | if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], | 2756 | if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], |
2676 | c->src.bytes)) { | 2757 | c->src.bytes)) { |
2677 | kvm_inject_gp(ctxt->vcpu, 0); | 2758 | emulate_gp(ctxt, 0); |
2678 | goto done; | 2759 | goto done; |
2679 | } | 2760 | } |
2680 | ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], | 2761 | ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], |
@@ -2707,6 +2788,7 @@ special_insn: | |||
2707 | } | 2788 | } |
2708 | break; | 2789 | break; |
2709 | case 0x84 ... 0x85: | 2790 | case 0x84 ... 0x85: |
2791 | test: | ||
2710 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | 2792 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); |
2711 | break; | 2793 | break; |
2712 | case 0x86 ... 0x87: /* xchg */ | 2794 | case 0x86 ... 0x87: /* xchg */ |
@@ -2735,18 +2817,13 @@ special_insn: | |||
2735 | break; | 2817 | break; |
2736 | case 0x88 ... 0x8b: /* mov */ | 2818 | case 0x88 ... 0x8b: /* mov */ |
2737 | goto mov; | 2819 | goto mov; |
2738 | case 0x8c: { /* mov r/m, sreg */ | 2820 | case 0x8c: /* mov r/m, sreg */ |
2739 | struct kvm_segment segreg; | 2821 | if (c->modrm_reg > VCPU_SREG_GS) { |
2740 | 2822 | emulate_ud(ctxt); | |
2741 | if (c->modrm_reg <= VCPU_SREG_GS) | ||
2742 | kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); | ||
2743 | else { | ||
2744 | kvm_queue_exception(ctxt->vcpu, UD_VECTOR); | ||
2745 | goto done; | 2823 | goto done; |
2746 | } | 2824 | } |
2747 | c->dst.val = segreg.selector; | 2825 | c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); |
2748 | break; | 2826 | break; |
2749 | } | ||
2750 | case 0x8d: /* lea r16/r32, m */ | 2827 | case 0x8d: /* lea r16/r32, m */ |
2751 | c->dst.val = c->modrm_ea; | 2828 | c->dst.val = c->modrm_ea; |
2752 | break; | 2829 | break; |
@@ -2757,12 +2834,12 @@ special_insn: | |||
2757 | 2834 | ||
2758 | if (c->modrm_reg == VCPU_SREG_CS || | 2835 | if (c->modrm_reg == VCPU_SREG_CS || |
2759 | c->modrm_reg > VCPU_SREG_GS) { | 2836 | c->modrm_reg > VCPU_SREG_GS) { |
2760 | kvm_queue_exception(ctxt->vcpu, UD_VECTOR); | 2837 | emulate_ud(ctxt); |
2761 | goto done; | 2838 | goto done; |
2762 | } | 2839 | } |
2763 | 2840 | ||
2764 | if (c->modrm_reg == VCPU_SREG_SS) | 2841 | if (c->modrm_reg == VCPU_SREG_SS) |
2765 | toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); | 2842 | ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; |
2766 | 2843 | ||
2767 | rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); | 2844 | rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); |
2768 | 2845 | ||
@@ -2775,19 +2852,19 @@ special_insn: | |||
2775 | goto done; | 2852 | goto done; |
2776 | break; | 2853 | break; |
2777 | case 0x90: /* nop / xchg r8,rax */ | 2854 | case 0x90: /* nop / xchg r8,rax */ |
2778 | if (!(c->rex_prefix & 1)) { /* nop */ | 2855 | if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { |
2779 | c->dst.type = OP_NONE; | 2856 | c->dst.type = OP_NONE; /* nop */ |
2780 | break; | 2857 | break; |
2781 | } | 2858 | } |
2782 | case 0x91 ... 0x97: /* xchg reg,rax */ | 2859 | case 0x91 ... 0x97: /* xchg reg,rax */ |
2783 | c->src.type = c->dst.type = OP_REG; | 2860 | c->src.type = OP_REG; |
2784 | c->src.bytes = c->dst.bytes = c->op_bytes; | 2861 | c->src.bytes = c->op_bytes; |
2785 | c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; | 2862 | c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; |
2786 | c->src.val = *(c->src.ptr); | 2863 | c->src.val = *(c->src.ptr); |
2787 | goto xchg; | 2864 | goto xchg; |
2788 | case 0x9c: /* pushf */ | 2865 | case 0x9c: /* pushf */ |
2789 | c->src.val = (unsigned long) ctxt->eflags; | 2866 | c->src.val = (unsigned long) ctxt->eflags; |
2790 | emulate_push(ctxt); | 2867 | emulate_push(ctxt, ops); |
2791 | break; | 2868 | break; |
2792 | case 0x9d: /* popf */ | 2869 | case 0x9d: /* popf */ |
2793 | c->dst.type = OP_REG; | 2870 | c->dst.type = OP_REG; |
@@ -2797,19 +2874,15 @@ special_insn: | |||
2797 | if (rc != X86EMUL_CONTINUE) | 2874 | if (rc != X86EMUL_CONTINUE) |
2798 | goto done; | 2875 | goto done; |
2799 | break; | 2876 | break; |
2800 | case 0xa0 ... 0xa1: /* mov */ | 2877 | case 0xa0 ... 0xa3: /* mov */ |
2801 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
2802 | c->dst.val = c->src.val; | ||
2803 | break; | ||
2804 | case 0xa2 ... 0xa3: /* mov */ | ||
2805 | c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; | ||
2806 | break; | ||
2807 | case 0xa4 ... 0xa5: /* movs */ | 2878 | case 0xa4 ... 0xa5: /* movs */ |
2808 | goto mov; | 2879 | goto mov; |
2809 | case 0xa6 ... 0xa7: /* cmps */ | 2880 | case 0xa6 ... 0xa7: /* cmps */ |
2810 | c->dst.type = OP_NONE; /* Disable writeback. */ | 2881 | c->dst.type = OP_NONE; /* Disable writeback. */ |
2811 | DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); | 2882 | DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); |
2812 | goto cmp; | 2883 | goto cmp; |
2884 | case 0xa8 ... 0xa9: /* test ax, imm */ | ||
2885 | goto test; | ||
2813 | case 0xaa ... 0xab: /* stos */ | 2886 | case 0xaa ... 0xab: /* stos */ |
2814 | c->dst.val = c->regs[VCPU_REGS_RAX]; | 2887 | c->dst.val = c->regs[VCPU_REGS_RAX]; |
2815 | break; | 2888 | break; |
@@ -2855,19 +2928,23 @@ special_insn: | |||
2855 | long int rel = c->src.val; | 2928 | long int rel = c->src.val; |
2856 | c->src.val = (unsigned long) c->eip; | 2929 | c->src.val = (unsigned long) c->eip; |
2857 | jmp_rel(c, rel); | 2930 | jmp_rel(c, rel); |
2858 | emulate_push(ctxt); | 2931 | emulate_push(ctxt, ops); |
2859 | break; | 2932 | break; |
2860 | } | 2933 | } |
2861 | case 0xe9: /* jmp rel */ | 2934 | case 0xe9: /* jmp rel */ |
2862 | goto jmp; | 2935 | goto jmp; |
2863 | case 0xea: /* jmp far */ | 2936 | case 0xea: { /* jmp far */ |
2937 | unsigned short sel; | ||
2864 | jump_far: | 2938 | jump_far: |
2865 | if (load_segment_descriptor(ctxt, ops, c->src2.val, | 2939 | memcpy(&sel, c->src.valptr + c->op_bytes, 2); |
2866 | VCPU_SREG_CS)) | 2940 | |
2941 | if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) | ||
2867 | goto done; | 2942 | goto done; |
2868 | 2943 | ||
2869 | c->eip = c->src.val; | 2944 | c->eip = 0; |
2945 | memcpy(&c->eip, c->src.valptr, c->op_bytes); | ||
2870 | break; | 2946 | break; |
2947 | } | ||
2871 | case 0xeb: | 2948 | case 0xeb: |
2872 | jmp: /* jmp rel short */ | 2949 | jmp: /* jmp rel short */ |
2873 | jmp_rel(c, c->src.val); | 2950 | jmp_rel(c, c->src.val); |
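The far-jump case above now pulls the 16-bit CS selector out of the raw immediate bytes (offset first, selector after op_bytes) instead of relying on src2, and zero-extends the offset into eip. A sketch of that unpacking, assuming a little-endian host as on x86:

#include <stdint.h>
#include <string.h>

static void decode_far_ptr(const uint8_t *imm, unsigned op_bytes,
			   uint64_t *new_eip, uint16_t *sel)
{
	*new_eip = 0;                          /* zero-extend a 2- or 4-byte offset */
	memcpy(new_eip, imm, op_bytes);
	memcpy(sel, imm + op_bytes, 2);        /* selector follows the offset */
}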
@@ -2879,20 +2956,20 @@ special_insn: | |||
2879 | do_io_in: | 2956 | do_io_in: |
2880 | c->dst.bytes = min(c->dst.bytes, 4u); | 2957 | c->dst.bytes = min(c->dst.bytes, 4u); |
2881 | if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { | 2958 | if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { |
2882 | kvm_inject_gp(ctxt->vcpu, 0); | 2959 | emulate_gp(ctxt, 0); |
2883 | goto done; | 2960 | goto done; |
2884 | } | 2961 | } |
2885 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, | 2962 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, |
2886 | &c->dst.val)) | 2963 | &c->dst.val)) |
2887 | goto done; /* IO is needed */ | 2964 | goto done; /* IO is needed */ |
2888 | break; | 2965 | break; |
2889 | case 0xee: /* out al,dx */ | 2966 | case 0xee: /* out dx,al */ |
2890 | case 0xef: /* out (e/r)ax,dx */ | 2967 | case 0xef: /* out dx,(e/r)ax */ |
2891 | c->src.val = c->regs[VCPU_REGS_RDX]; | 2968 | c->src.val = c->regs[VCPU_REGS_RDX]; |
2892 | do_io_out: | 2969 | do_io_out: |
2893 | c->dst.bytes = min(c->dst.bytes, 4u); | 2970 | c->dst.bytes = min(c->dst.bytes, 4u); |
2894 | if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { | 2971 | if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { |
2895 | kvm_inject_gp(ctxt->vcpu, 0); | 2972 | emulate_gp(ctxt, 0); |
2896 | goto done; | 2973 | goto done; |
2897 | } | 2974 | } |
2898 | ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, | 2975 | ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, |
@@ -2916,18 +2993,20 @@ special_insn: | |||
2916 | c->dst.type = OP_NONE; /* Disable writeback. */ | 2993 | c->dst.type = OP_NONE; /* Disable writeback. */ |
2917 | break; | 2994 | break; |
2918 | case 0xfa: /* cli */ | 2995 | case 0xfa: /* cli */ |
2919 | if (emulator_bad_iopl(ctxt, ops)) | 2996 | if (emulator_bad_iopl(ctxt, ops)) { |
2920 | kvm_inject_gp(ctxt->vcpu, 0); | 2997 | emulate_gp(ctxt, 0); |
2921 | else { | 2998 | goto done; |
2999 | } else { | ||
2922 | ctxt->eflags &= ~X86_EFLAGS_IF; | 3000 | ctxt->eflags &= ~X86_EFLAGS_IF; |
2923 | c->dst.type = OP_NONE; /* Disable writeback. */ | 3001 | c->dst.type = OP_NONE; /* Disable writeback. */ |
2924 | } | 3002 | } |
2925 | break; | 3003 | break; |
2926 | case 0xfb: /* sti */ | 3004 | case 0xfb: /* sti */ |
2927 | if (emulator_bad_iopl(ctxt, ops)) | 3005 | if (emulator_bad_iopl(ctxt, ops)) { |
2928 | kvm_inject_gp(ctxt->vcpu, 0); | 3006 | emulate_gp(ctxt, 0); |
2929 | else { | 3007 | goto done; |
2930 | toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); | 3008 | } else { |
3009 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; | ||
2931 | ctxt->eflags |= X86_EFLAGS_IF; | 3010 | ctxt->eflags |= X86_EFLAGS_IF; |
2932 | c->dst.type = OP_NONE; /* Disable writeback. */ | 3011 | c->dst.type = OP_NONE; /* Disable writeback. */ |
2933 | } | 3012 | } |
@@ -2964,11 +3043,12 @@ writeback: | |||
2964 | c->dst.type = saved_dst_type; | 3043 | c->dst.type = saved_dst_type; |
2965 | 3044 | ||
2966 | if ((c->d & SrcMask) == SrcSI) | 3045 | if ((c->d & SrcMask) == SrcSI) |
2967 | string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, | 3046 | string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), |
2968 | &c->src); | 3047 | VCPU_REGS_RSI, &c->src); |
2969 | 3048 | ||
2970 | if ((c->d & DstMask) == DstDI) | 3049 | if ((c->d & DstMask) == DstDI) |
2971 | string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); | 3050 | string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, |
3051 | &c->dst); | ||
2972 | 3052 | ||
2973 | if (c->rep_prefix && (c->d & String)) { | 3053 | if (c->rep_prefix && (c->d & String)) { |
2974 | struct read_cache *rc = &ctxt->decode.io_read; | 3054 | struct read_cache *rc = &ctxt->decode.io_read; |
@@ -2981,11 +3061,12 @@ writeback: | |||
2981 | (rc->end != 0 && rc->end == rc->pos)) | 3061 | (rc->end != 0 && rc->end == rc->pos)) |
2982 | ctxt->restart = false; | 3062 | ctxt->restart = false; |
2983 | } | 3063 | } |
2984 | 3064 | /* | |
2985 | /* Commit shadow register state. */ | 3065 | * reset read cache here in case string instruction is restarted |
2986 | memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); | 3066 | * without decoding |
2987 | kvm_rip_write(ctxt->vcpu, c->eip); | 3067 | */ |
2988 | ops->set_rflags(ctxt->vcpu, ctxt->eflags); | 3068 | ctxt->decode.mem_read.end = 0; |
3069 | ctxt->eip = c->eip; | ||
2989 | 3070 | ||
2990 | done: | 3071 | done: |
2991 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | 3072 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; |
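The tail of the writeback path above stops committing registers and rip to the vcpu (the caller now picks them up from ctxt->eip) and clears decode.mem_read.end so a string instruction restarted without re-decoding starts with an empty cache. A hypothetical model of such a per-instruction read cache, purely for illustration (the struct and backend() callback are assumptions, not the kernel's types):

#include <stdint.h>
#include <string.h>

struct read_cache_model {
	uint8_t data[1024];
	unsigned int pos, end;                 /* consumed / filled bytes */
};

static int cached_read(struct read_cache_model *rc,
		       int (*backend)(uint64_t addr, void *buf, unsigned int len),
		       uint64_t addr, void *dest, unsigned int len)
{
	if (rc->pos + len > rc->end) {         /* not cached yet: fetch it */
		int err = backend(addr, rc->data + rc->end, len);
		if (err)
			return err;
		rc->end += len;
	}
	memcpy(dest, rc->data + rc->pos, len); /* replay from the cache */
	rc->pos += len;
	return 0;
}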
@@ -3051,7 +3132,7 @@ twobyte_insn: | |||
3051 | c->dst.type = OP_NONE; | 3132 | c->dst.type = OP_NONE; |
3052 | break; | 3133 | break; |
3053 | case 5: /* not defined */ | 3134 | case 5: /* not defined */ |
3054 | kvm_queue_exception(ctxt->vcpu, UD_VECTOR); | 3135 | emulate_ud(ctxt); |
3055 | goto done; | 3136 | goto done; |
3056 | case 7: /* invlpg*/ | 3137 | case 7: /* invlpg*/ |
3057 | emulate_invlpg(ctxt->vcpu, c->modrm_ea); | 3138 | emulate_invlpg(ctxt->vcpu, c->modrm_ea); |
@@ -3063,7 +3144,7 @@ twobyte_insn: | |||
3063 | } | 3144 | } |
3064 | break; | 3145 | break; |
3065 | case 0x05: /* syscall */ | 3146 | case 0x05: /* syscall */ |
3066 | rc = emulate_syscall(ctxt); | 3147 | rc = emulate_syscall(ctxt, ops); |
3067 | if (rc != X86EMUL_CONTINUE) | 3148 | if (rc != X86EMUL_CONTINUE) |
3068 | goto done; | 3149 | goto done; |
3069 | else | 3150 | else |
@@ -3073,8 +3154,11 @@ twobyte_insn: | |||
3073 | emulate_clts(ctxt->vcpu); | 3154 | emulate_clts(ctxt->vcpu); |
3074 | c->dst.type = OP_NONE; | 3155 | c->dst.type = OP_NONE; |
3075 | break; | 3156 | break; |
3076 | case 0x08: /* invd */ | ||
3077 | case 0x09: /* wbinvd */ | 3157 | case 0x09: /* wbinvd */ |
3158 | kvm_emulate_wbinvd(ctxt->vcpu); | ||
3159 | c->dst.type = OP_NONE; | ||
3160 | break; | ||
3161 | case 0x08: /* invd */ | ||
3078 | case 0x0d: /* GrpP (prefetch) */ | 3162 | case 0x0d: /* GrpP (prefetch) */ |
3079 | case 0x18: /* Grp16 (prefetch/nop) */ | 3163 | case 0x18: /* Grp16 (prefetch/nop) */ |
3080 | c->dst.type = OP_NONE; | 3164 | c->dst.type = OP_NONE; |
@@ -3084,7 +3168,7 @@ twobyte_insn: | |||
3084 | case 1: | 3168 | case 1: |
3085 | case 5 ... 7: | 3169 | case 5 ... 7: |
3086 | case 9 ... 15: | 3170 | case 9 ... 15: |
3087 | kvm_queue_exception(ctxt->vcpu, UD_VECTOR); | 3171 | emulate_ud(ctxt); |
3088 | goto done; | 3172 | goto done; |
3089 | } | 3173 | } |
3090 | c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); | 3174 | c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); |
@@ -3093,31 +3177,42 @@ twobyte_insn: | |||
3093 | case 0x21: /* mov from dr to reg */ | 3177 | case 0x21: /* mov from dr to reg */ |
3094 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && | 3178 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && |
3095 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { | 3179 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { |
3096 | kvm_queue_exception(ctxt->vcpu, UD_VECTOR); | 3180 | emulate_ud(ctxt); |
3097 | goto done; | 3181 | goto done; |
3098 | } | 3182 | } |
3099 | emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); | 3183 | ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); |
3100 | c->dst.type = OP_NONE; /* no writeback */ | 3184 | c->dst.type = OP_NONE; /* no writeback */ |
3101 | break; | 3185 | break; |
3102 | case 0x22: /* mov reg, cr */ | 3186 | case 0x22: /* mov reg, cr */ |
3103 | ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); | 3187 | if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { |
3188 | emulate_gp(ctxt, 0); | ||
3189 | goto done; | ||
3190 | } | ||
3104 | c->dst.type = OP_NONE; | 3191 | c->dst.type = OP_NONE; |
3105 | break; | 3192 | break; |
3106 | case 0x23: /* mov from reg to dr */ | 3193 | case 0x23: /* mov from reg to dr */ |
3107 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && | 3194 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && |
3108 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { | 3195 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { |
3109 | kvm_queue_exception(ctxt->vcpu, UD_VECTOR); | 3196 | emulate_ud(ctxt); |
3197 | goto done; | ||
3198 | } | ||
3199 | |||
3200 | if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & | ||
3201 | ((ctxt->mode == X86EMUL_MODE_PROT64) ? | ||
3202 | ~0ULL : ~0U), ctxt->vcpu) < 0) { | ||
3203 | /* #UD condition is already handled by the code above */ | ||
3204 | emulate_gp(ctxt, 0); | ||
3110 | goto done; | 3205 | goto done; |
3111 | } | 3206 | } |
3112 | emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); | 3207 | |
3113 | c->dst.type = OP_NONE; /* no writeback */ | 3208 | c->dst.type = OP_NONE; /* no writeback */ |
3114 | break; | 3209 | break; |
3115 | case 0x30: | 3210 | case 0x30: |
3116 | /* wrmsr */ | 3211 | /* wrmsr */ |
3117 | msr_data = (u32)c->regs[VCPU_REGS_RAX] | 3212 | msr_data = (u32)c->regs[VCPU_REGS_RAX] |
3118 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); | 3213 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); |
3119 | if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { | 3214 | if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { |
3120 | kvm_inject_gp(ctxt->vcpu, 0); | 3215 | emulate_gp(ctxt, 0); |
3121 | goto done; | 3216 | goto done; |
3122 | } | 3217 | } |
3123 | rc = X86EMUL_CONTINUE; | 3218 | rc = X86EMUL_CONTINUE; |
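For mov-to-DR the emulator above now masks the source register before handing it to ops->set_dr(): outside 64-bit mode only the low 32 bits are meaningful, and a non-zero return from the hook is turned into #GP(0). The mask itself is just this (sketch):

#include <stdint.h>

static uint64_t dr_write_value(uint64_t reg, int long_mode)
{
	/* long mode keeps all 64 bits; otherwise truncate to 32 */
	return reg & (long_mode ? ~0ULL : (uint64_t)~0U);
}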
@@ -3125,8 +3220,8 @@ twobyte_insn: | |||
3125 | break; | 3220 | break; |
3126 | case 0x32: | 3221 | case 0x32: |
3127 | /* rdmsr */ | 3222 | /* rdmsr */ |
3128 | if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { | 3223 | if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { |
3129 | kvm_inject_gp(ctxt->vcpu, 0); | 3224 | emulate_gp(ctxt, 0); |
3130 | goto done; | 3225 | goto done; |
3131 | } else { | 3226 | } else { |
3132 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | 3227 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; |
@@ -3136,14 +3231,14 @@ twobyte_insn: | |||
3136 | c->dst.type = OP_NONE; | 3231 | c->dst.type = OP_NONE; |
3137 | break; | 3232 | break; |
3138 | case 0x34: /* sysenter */ | 3233 | case 0x34: /* sysenter */ |
3139 | rc = emulate_sysenter(ctxt); | 3234 | rc = emulate_sysenter(ctxt, ops); |
3140 | if (rc != X86EMUL_CONTINUE) | 3235 | if (rc != X86EMUL_CONTINUE) |
3141 | goto done; | 3236 | goto done; |
3142 | else | 3237 | else |
3143 | goto writeback; | 3238 | goto writeback; |
3144 | break; | 3239 | break; |
3145 | case 0x35: /* sysexit */ | 3240 | case 0x35: /* sysexit */ |
3146 | rc = emulate_sysexit(ctxt); | 3241 | rc = emulate_sysexit(ctxt, ops); |
3147 | if (rc != X86EMUL_CONTINUE) | 3242 | if (rc != X86EMUL_CONTINUE) |
3148 | goto done; | 3243 | goto done; |
3149 | else | 3244 | else |
@@ -3160,7 +3255,7 @@ twobyte_insn: | |||
3160 | c->dst.type = OP_NONE; | 3255 | c->dst.type = OP_NONE; |
3161 | break; | 3256 | break; |
3162 | case 0xa0: /* push fs */ | 3257 | case 0xa0: /* push fs */ |
3163 | emulate_push_sreg(ctxt, VCPU_SREG_FS); | 3258 | emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); |
3164 | break; | 3259 | break; |
3165 | case 0xa1: /* pop fs */ | 3260 | case 0xa1: /* pop fs */ |
3166 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); | 3261 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); |
@@ -3179,7 +3274,7 @@ twobyte_insn: | |||
3179 | emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); | 3274 | emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); |
3180 | break; | 3275 | break; |
3181 | case 0xa8: /* push gs */ | 3276 | case 0xa8: /* push gs */ |
3182 | emulate_push_sreg(ctxt, VCPU_SREG_GS); | 3277 | emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); |
3183 | break; | 3278 | break; |
3184 | case 0xa9: /* pop gs */ | 3279 | case 0xa9: /* pop gs */ |
3185 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); | 3280 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); |
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 0150affad25d..0fd6378981f4 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -5,6 +5,7 @@ | |||
5 | * Copyright (c) 2006 Intel Corporation | 5 | * Copyright (c) 2006 Intel Corporation |
6 | * Copyright (c) 2007 Keir Fraser, XenSource Inc | 6 | * Copyright (c) 2007 Keir Fraser, XenSource Inc |
7 | * Copyright (c) 2008 Intel Corporation | 7 | * Copyright (c) 2008 Intel Corporation |
8 | * Copyright 2009 Red Hat, Inc. and/or its affilates. | ||
8 | * | 9 | * |
9 | * Permission is hereby granted, free of charge, to any person obtaining a copy | 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
10 | * of this software and associated documentation files (the "Software"), to deal | 11 | * of this software and associated documentation files (the "Software"), to deal |
@@ -33,6 +34,7 @@ | |||
33 | 34 | ||
34 | #include <linux/kvm_host.h> | 35 | #include <linux/kvm_host.h> |
35 | #include <linux/slab.h> | 36 | #include <linux/slab.h> |
37 | #include <linux/workqueue.h> | ||
36 | 38 | ||
37 | #include "irq.h" | 39 | #include "irq.h" |
38 | #include "i8254.h" | 40 | #include "i8254.h" |
@@ -243,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) | |||
243 | { | 245 | { |
244 | struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, | 246 | struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, |
245 | irq_ack_notifier); | 247 | irq_ack_notifier); |
246 | raw_spin_lock(&ps->inject_lock); | 248 | int value; |
247 | if (atomic_dec_return(&ps->pit_timer.pending) < 0) | 249 | |
250 | spin_lock(&ps->inject_lock); | ||
251 | value = atomic_dec_return(&ps->pit_timer.pending); | ||
252 | if (value < 0) | ||
253 | /* spurious acks can be generated if, for example, the | ||
254 | * PIC is being reset. Handle it gracefully here | ||
255 | */ | ||
248 | atomic_inc(&ps->pit_timer.pending); | 256 | atomic_inc(&ps->pit_timer.pending); |
257 | else if (value > 0) | ||
258 | /* in this case, we had multiple outstanding pit interrupts | ||
259 | * that we needed to inject. Reinject | ||
260 | */ | ||
261 | queue_work(ps->pit->wq, &ps->pit->expired); | ||
249 | ps->irq_ack = 1; | 262 | ps->irq_ack = 1; |
250 | raw_spin_unlock(&ps->inject_lock); | 263 | spin_unlock(&ps->inject_lock); |
251 | } | 264 | } |
252 | 265 | ||
253 | void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) | 266 | void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) |
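The reworked kvm_pit_ack_irq() above treats a negative pending count as a spurious ack and a positive remainder as ticks still owed to the guest (work gets re-queued). A stand-alone sketch of that accounting, using user-space C11 atomics rather than kernel code; ack_one_tick() is a made-up name:

/* Pending-tick accounting: a spurious ack that would drive the counter
 * negative is undone, a positive remainder means reinject. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int pending = 1;          /* ticks not yet delivered */

/* Returns true when another injection should be scheduled. */
static bool ack_one_tick(void)
{
	int value = atomic_fetch_sub(&pending, 1) - 1;  /* like atomic_dec_return() */

	if (value < 0) {
		/* spurious ack, e.g. the PIC is being reset: restore the counter */
		atomic_fetch_add(&pending, 1);
		return false;
	}
	return value > 0;               /* outstanding ticks remain: reinject */
}

int main(void)
{
	printf("reinject after 1st ack? %d\n", ack_one_tick());  /* 0 */
	printf("reinject after 2nd ack? %d\n", ack_one_tick());  /* 0, spurious */
	return 0;
}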
@@ -263,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) | |||
263 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); | 276 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); |
264 | } | 277 | } |
265 | 278 | ||
266 | static void destroy_pit_timer(struct kvm_timer *pt) | 279 | static void destroy_pit_timer(struct kvm_pit *pit) |
267 | { | 280 | { |
268 | pr_debug("execute del timer!\n"); | 281 | hrtimer_cancel(&pit->pit_state.pit_timer.timer); |
269 | hrtimer_cancel(&pt->timer); | 282 | cancel_work_sync(&pit->expired); |
270 | } | 283 | } |
271 | 284 | ||
272 | static bool kpit_is_periodic(struct kvm_timer *ktimer) | 285 | static bool kpit_is_periodic(struct kvm_timer *ktimer) |
@@ -280,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = { | |||
280 | .is_periodic = kpit_is_periodic, | 293 | .is_periodic = kpit_is_periodic, |
281 | }; | 294 | }; |
282 | 295 | ||
296 | static void pit_do_work(struct work_struct *work) | ||
297 | { | ||
298 | struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); | ||
299 | struct kvm *kvm = pit->kvm; | ||
300 | struct kvm_vcpu *vcpu; | ||
301 | int i; | ||
302 | struct kvm_kpit_state *ps = &pit->pit_state; | ||
303 | int inject = 0; | ||
304 | |||
305 | /* Try to inject pending interrupts when | ||
306 | * last one has been acked. | ||
307 | */ | ||
308 | spin_lock(&ps->inject_lock); | ||
309 | if (ps->irq_ack) { | ||
310 | ps->irq_ack = 0; | ||
311 | inject = 1; | ||
312 | } | ||
313 | spin_unlock(&ps->inject_lock); | ||
314 | if (inject) { | ||
315 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); | ||
316 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); | ||
317 | |||
318 | /* | ||
319 | * Provides NMI watchdog support via Virtual Wire mode. | ||
320 | * The route is: PIT -> PIC -> LVT0 in NMI mode. | ||
321 | * | ||
322 | * Note: Our Virtual Wire implementation is simplified, only | ||
323 | * propagating PIT interrupts to all VCPUs when they have set | ||
324 | * LVT0 to NMI delivery. Other PIC interrupts are just sent to | ||
325 | * VCPU0, and only if its LVT0 is in EXTINT mode. | ||
326 | */ | ||
327 | if (kvm->arch.vapics_in_nmi_mode > 0) | ||
328 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
329 | kvm_apic_nmi_wd_deliver(vcpu); | ||
330 | } | ||
331 | } | ||
332 | |||
333 | static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) | ||
334 | { | ||
335 | struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); | ||
336 | struct kvm_pit *pt = ktimer->kvm->arch.vpit; | ||
337 | |||
338 | if (ktimer->reinject || !atomic_read(&ktimer->pending)) { | ||
339 | atomic_inc(&ktimer->pending); | ||
340 | queue_work(pt->wq, &pt->expired); | ||
341 | } | ||
342 | |||
343 | if (ktimer->t_ops->is_periodic(ktimer)) { | ||
344 | hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); | ||
345 | return HRTIMER_RESTART; | ||
346 | } else | ||
347 | return HRTIMER_NORESTART; | ||
348 | } | ||
349 | |||
283 | static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) | 350 | static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) |
284 | { | 351 | { |
285 | struct kvm_timer *pt = &ps->pit_timer; | 352 | struct kvm_timer *pt = &ps->pit_timer; |
@@ -291,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) | |||
291 | 358 | ||
292 | /* TODO The new value only affected after the retriggered */ | 359 | /* TODO The new value only affected after the retriggered */ |
293 | hrtimer_cancel(&pt->timer); | 360 | hrtimer_cancel(&pt->timer); |
361 | cancel_work_sync(&ps->pit->expired); | ||
294 | pt->period = interval; | 362 | pt->period = interval; |
295 | ps->is_periodic = is_period; | 363 | ps->is_periodic = is_period; |
296 | 364 | ||
297 | pt->timer.function = kvm_timer_fn; | 365 | pt->timer.function = pit_timer_fn; |
298 | pt->t_ops = &kpit_ops; | 366 | pt->t_ops = &kpit_ops; |
299 | pt->kvm = ps->pit->kvm; | 367 | pt->kvm = ps->pit->kvm; |
300 | pt->vcpu = pt->kvm->bsp_vcpu; | ||
301 | 368 | ||
302 | atomic_set(&pt->pending, 0); | 369 | atomic_set(&pt->pending, 0); |
303 | ps->irq_ack = 1; | 370 | ps->irq_ack = 1; |
@@ -346,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) | |||
346 | } | 413 | } |
347 | break; | 414 | break; |
348 | default: | 415 | default: |
349 | destroy_pit_timer(&ps->pit_timer); | 416 | destroy_pit_timer(kvm->arch.vpit); |
350 | } | 417 | } |
351 | } | 418 | } |
352 | 419 | ||
@@ -625,7 +692,15 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) | |||
625 | 692 | ||
626 | mutex_init(&pit->pit_state.lock); | 693 | mutex_init(&pit->pit_state.lock); |
627 | mutex_lock(&pit->pit_state.lock); | 694 | mutex_lock(&pit->pit_state.lock); |
628 | raw_spin_lock_init(&pit->pit_state.inject_lock); | 695 | spin_lock_init(&pit->pit_state.inject_lock); |
696 | |||
697 | pit->wq = create_singlethread_workqueue("kvm-pit-wq"); | ||
698 | if (!pit->wq) { | ||
699 | mutex_unlock(&pit->pit_state.lock); | ||
700 | kfree(pit); | ||
701 | return NULL; | ||
702 | } | ||
703 | INIT_WORK(&pit->expired, pit_do_work); | ||
629 | 704 | ||
630 | kvm->arch.vpit = pit; | 705 | kvm->arch.vpit = pit; |
631 | pit->kvm = kvm; | 706 | pit->kvm = kvm; |
@@ -677,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm) | |||
677 | struct hrtimer *timer; | 752 | struct hrtimer *timer; |
678 | 753 | ||
679 | if (kvm->arch.vpit) { | 754 | if (kvm->arch.vpit) { |
755 | kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); | ||
756 | kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, | ||
757 | &kvm->arch.vpit->speaker_dev); | ||
680 | kvm_unregister_irq_mask_notifier(kvm, 0, | 758 | kvm_unregister_irq_mask_notifier(kvm, 0, |
681 | &kvm->arch.vpit->mask_notifier); | 759 | &kvm->arch.vpit->mask_notifier); |
682 | kvm_unregister_irq_ack_notifier(kvm, | 760 | kvm_unregister_irq_ack_notifier(kvm, |
@@ -684,54 +762,10 @@ void kvm_free_pit(struct kvm *kvm) | |||
684 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | 762 | mutex_lock(&kvm->arch.vpit->pit_state.lock); |
685 | timer = &kvm->arch.vpit->pit_state.pit_timer.timer; | 763 | timer = &kvm->arch.vpit->pit_state.pit_timer.timer; |
686 | hrtimer_cancel(timer); | 764 | hrtimer_cancel(timer); |
765 | cancel_work_sync(&kvm->arch.vpit->expired); | ||
687 | kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); | 766 | kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); |
688 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | 767 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); |
768 | destroy_workqueue(kvm->arch.vpit->wq); | ||
689 | kfree(kvm->arch.vpit); | 769 | kfree(kvm->arch.vpit); |
690 | } | 770 | } |
691 | } | 771 | } |
692 | |||
693 | static void __inject_pit_timer_intr(struct kvm *kvm) | ||
694 | { | ||
695 | struct kvm_vcpu *vcpu; | ||
696 | int i; | ||
697 | |||
698 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); | ||
699 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); | ||
700 | |||
701 | /* | ||
702 | * Provides NMI watchdog support via Virtual Wire mode. | ||
703 | * The route is: PIT -> PIC -> LVT0 in NMI mode. | ||
704 | * | ||
705 | * Note: Our Virtual Wire implementation is simplified, only | ||
706 | * propagating PIT interrupts to all VCPUs when they have set | ||
707 | * LVT0 to NMI delivery. Other PIC interrupts are just sent to | ||
708 | * VCPU0, and only if its LVT0 is in EXTINT mode. | ||
709 | */ | ||
710 | if (kvm->arch.vapics_in_nmi_mode > 0) | ||
711 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
712 | kvm_apic_nmi_wd_deliver(vcpu); | ||
713 | } | ||
714 | |||
715 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) | ||
716 | { | ||
717 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | ||
718 | struct kvm *kvm = vcpu->kvm; | ||
719 | struct kvm_kpit_state *ps; | ||
720 | |||
721 | if (pit) { | ||
722 | int inject = 0; | ||
723 | ps = &pit->pit_state; | ||
724 | |||
725 | /* Try to inject pending interrupts when | ||
726 | * last one has been acked. | ||
727 | */ | ||
728 | raw_spin_lock(&ps->inject_lock); | ||
729 | if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { | ||
730 | ps->irq_ack = 0; | ||
731 | inject = 1; | ||
732 | } | ||
733 | raw_spin_unlock(&ps->inject_lock); | ||
734 | if (inject) | ||
735 | __inject_pit_timer_intr(kvm); | ||
736 | } | ||
737 | } | ||
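Taken together, the i8254.c changes above move interrupt injection out of the hrtimer callback: pit_timer_fn() only bumps a pending count and queues work on the PIT's workqueue, and pit_do_work() performs the injection in ordinary process context. A rough user-space analogue of that split using pthreads instead of a kernel workqueue (illustrative only; none of these names exist in the kernel):

/* "Timer" path: cheap bookkeeping plus a wakeup. Worker: the heavy,
 * sleepable part, analogous to pit_do_work(). */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static int pending;                     /* ticks queued by the "timer" */
static int done;

static void timer_tick(void)            /* stands in for the hrtimer callback */
{
	pthread_mutex_lock(&lock);
	pending++;
	pthread_cond_signal(&wake);     /* the queue_work() equivalent */
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)          /* stands in for pit_do_work() */
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!done || pending) {
		while (!pending && !done)
			pthread_cond_wait(&wake, &lock);
		while (pending) {
			pending--;
			pthread_mutex_unlock(&lock);
			printf("inject one PIT tick\n");  /* sleepable work */
			pthread_mutex_lock(&lock);
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, worker, NULL);
	timer_tick();
	timer_tick();
	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);
	pthread_join(tid, NULL);
	return 0;
}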
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 900d6b0ba7c2..46d08ca0b48f 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h | |||
@@ -27,7 +27,7 @@ struct kvm_kpit_state { | |||
27 | u32 speaker_data_on; | 27 | u32 speaker_data_on; |
28 | struct mutex lock; | 28 | struct mutex lock; |
29 | struct kvm_pit *pit; | 29 | struct kvm_pit *pit; |
30 | raw_spinlock_t inject_lock; | 30 | spinlock_t inject_lock; |
31 | unsigned long irq_ack; | 31 | unsigned long irq_ack; |
32 | struct kvm_irq_ack_notifier irq_ack_notifier; | 32 | struct kvm_irq_ack_notifier irq_ack_notifier; |
33 | }; | 33 | }; |
@@ -40,6 +40,8 @@ struct kvm_pit { | |||
40 | struct kvm_kpit_state pit_state; | 40 | struct kvm_kpit_state pit_state; |
41 | int irq_source_id; | 41 | int irq_source_id; |
42 | struct kvm_irq_mask_notifier mask_notifier; | 42 | struct kvm_irq_mask_notifier mask_notifier; |
43 | struct workqueue_struct *wq; | ||
44 | struct work_struct expired; | ||
43 | }; | 45 | }; |
44 | 46 | ||
45 | #define KVM_PIT_BASE_ADDRESS 0x40 | 47 | #define KVM_PIT_BASE_ADDRESS 0x40 |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 93825ff3338f..8d10c063d7f2 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -3,6 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Copyright (c) 2003-2004 Fabrice Bellard | 4 | * Copyright (c) 2003-2004 Fabrice Bellard |
5 | * Copyright (c) 2007 Intel Corporation | 5 | * Copyright (c) 2007 Intel Corporation |
6 | * Copyright 2009 Red Hat, Inc. and/or its affilates. | ||
6 | * | 7 | * |
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy | 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
8 | * of this software and associated documentation files (the "Software"), to deal | 9 | * of this software and associated documentation files (the "Software"), to deal |
@@ -33,6 +34,8 @@ | |||
33 | #include <linux/kvm_host.h> | 34 | #include <linux/kvm_host.h> |
34 | #include "trace.h" | 35 | #include "trace.h" |
35 | 36 | ||
37 | static void pic_irq_request(struct kvm *kvm, int level); | ||
38 | |||
36 | static void pic_lock(struct kvm_pic *s) | 39 | static void pic_lock(struct kvm_pic *s) |
37 | __acquires(&s->lock) | 40 | __acquires(&s->lock) |
38 | { | 41 | { |
@@ -43,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s) | |||
43 | __releases(&s->lock) | 46 | __releases(&s->lock) |
44 | { | 47 | { |
45 | bool wakeup = s->wakeup_needed; | 48 | bool wakeup = s->wakeup_needed; |
46 | struct kvm_vcpu *vcpu; | 49 | struct kvm_vcpu *vcpu, *found = NULL; |
50 | int i; | ||
47 | 51 | ||
48 | s->wakeup_needed = false; | 52 | s->wakeup_needed = false; |
49 | 53 | ||
50 | raw_spin_unlock(&s->lock); | 54 | raw_spin_unlock(&s->lock); |
51 | 55 | ||
52 | if (wakeup) { | 56 | if (wakeup) { |
53 | vcpu = s->kvm->bsp_vcpu; | 57 | kvm_for_each_vcpu(i, vcpu, s->kvm) { |
54 | if (vcpu) | 58 | if (kvm_apic_accept_pic_intr(vcpu)) { |
55 | kvm_vcpu_kick(vcpu); | 59 | found = vcpu; |
60 | break; | ||
61 | } | ||
62 | } | ||
63 | |||
64 | if (!found) | ||
65 | found = s->kvm->bsp_vcpu; | ||
66 | |||
67 | kvm_vcpu_kick(found); | ||
56 | } | 68 | } |
57 | } | 69 | } |
58 | 70 | ||
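The new pic_unlock() above no longer kicks the BSP unconditionally; it prefers a vcpu whose LAPIC will actually accept a PIC interrupt and falls back to the BSP only when none qualifies. A stand-alone sketch of that selection (plain C, not kernel code; struct vcpu and pick_wakeup_target() are invented for illustration):

/* Prefer the first CPU that can take a PIC interrupt, else kick the BSP. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct vcpu { int id; bool accepts_pic_intr; };

static struct vcpu *pick_wakeup_target(struct vcpu *vcpus, size_t n,
				       struct vcpu *bsp)
{
	for (size_t i = 0; i < n; i++)
		if (vcpus[i].accepts_pic_intr)
			return &vcpus[i];
	return bsp;                      /* nobody accepts: fall back to BSP */
}

int main(void)
{
	struct vcpu v[3] = { {0, false}, {1, true}, {2, false} };
	struct vcpu *t = pick_wakeup_target(v, 3, &v[0]);

	printf("kick vcpu %d\n", t->id); /* prints 1 */
	return 0;
}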
@@ -173,10 +185,7 @@ static void pic_update_irq(struct kvm_pic *s) | |||
173 | pic_set_irq1(&s->pics[0], 2, 0); | 185 | pic_set_irq1(&s->pics[0], 2, 0); |
174 | } | 186 | } |
175 | irq = pic_get_irq(&s->pics[0]); | 187 | irq = pic_get_irq(&s->pics[0]); |
176 | if (irq >= 0) | 188 | pic_irq_request(s->kvm, irq >= 0); |
177 | s->irq_request(s->irq_request_opaque, 1); | ||
178 | else | ||
179 | s->irq_request(s->irq_request_opaque, 0); | ||
180 | } | 189 | } |
181 | 190 | ||
182 | void kvm_pic_update_irq(struct kvm_pic *s) | 191 | void kvm_pic_update_irq(struct kvm_pic *s) |
@@ -261,8 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm) | |||
261 | void kvm_pic_reset(struct kvm_kpic_state *s) | 270 | void kvm_pic_reset(struct kvm_kpic_state *s) |
262 | { | 271 | { |
263 | int irq; | 272 | int irq; |
264 | struct kvm *kvm = s->pics_state->irq_request_opaque; | 273 | struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu; |
265 | struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; | ||
266 | u8 irr = s->irr, isr = s->imr; | 274 | u8 irr = s->irr, isr = s->imr; |
267 | 275 | ||
268 | s->last_irr = 0; | 276 | s->last_irr = 0; |
@@ -301,8 +309,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
301 | /* | 309 | /* |
302 | * deassert a pending interrupt | 310 | * deassert a pending interrupt |
303 | */ | 311 | */ |
304 | s->pics_state->irq_request(s->pics_state-> | 312 | pic_irq_request(s->pics_state->kvm, 0); |
305 | irq_request_opaque, 0); | ||
306 | s->init_state = 1; | 313 | s->init_state = 1; |
307 | s->init4 = val & 1; | 314 | s->init4 = val & 1; |
308 | if (val & 0x02) | 315 | if (val & 0x02) |
@@ -356,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
356 | } | 363 | } |
357 | } else | 364 | } else |
358 | switch (s->init_state) { | 365 | switch (s->init_state) { |
359 | case 0: /* normal mode */ | 366 | case 0: { /* normal mode */ |
367 | u8 imr_diff = s->imr ^ val, | ||
368 | off = (s == &s->pics_state->pics[0]) ? 0 : 8; | ||
360 | s->imr = val; | 369 | s->imr = val; |
370 | for (irq = 0; irq < PIC_NUM_PINS/2; irq++) | ||
371 | if (imr_diff & (1 << irq)) | ||
372 | kvm_fire_mask_notifiers( | ||
373 | s->pics_state->kvm, | ||
374 | SELECT_PIC(irq + off), | ||
375 | irq + off, | ||
376 | !!(s->imr & (1 << irq))); | ||
361 | pic_update_irq(s->pics_state); | 377 | pic_update_irq(s->pics_state); |
362 | break; | 378 | break; |
379 | } | ||
363 | case 1: | 380 | case 1: |
364 | s->irq_base = val & 0xf8; | 381 | s->irq_base = val & 0xf8; |
365 | s->init_state = 2; | 382 | s->init_state = 2; |
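The OCW1 path above now XORs the old and new IMR and fires mask notifiers only for pins whose mask state actually changed, with an offset of 8 for the slave PIC. A stand-alone sketch of that diff scan (plain C, not kernel code; report_mask_changes() stands in for kvm_fire_mask_notifiers()):

/* XOR old and new mask registers; report only the pins that changed,
 * together with their new masked/unmasked state. */
#include <stdint.h>
#include <stdio.h>

#define PIC_NUM_PINS 16

static void report_mask_changes(uint8_t old_imr, uint8_t new_imr, int off)
{
	uint8_t imr_diff = old_imr ^ new_imr;

	for (int irq = 0; irq < PIC_NUM_PINS / 2; irq++)
		if (imr_diff & (1 << irq))
			printf("irq %d %s\n", irq + off,
			       (new_imr & (1 << irq)) ? "masked" : "unmasked");
}

int main(void)
{
	report_mask_changes(0xff, 0xfe, 0);   /* irq 0 unmasked (master PIC) */
	report_mask_changes(0x00, 0x04, 8);   /* irq 10 masked (slave PIC) */
	return 0;
}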
@@ -518,9 +535,8 @@ static int picdev_read(struct kvm_io_device *this, | |||
518 | /* | 535 | /* |
519 | * callback when PIC0 irq status changed | 536 | * callback when PIC0 irq status changed |
520 | */ | 537 | */ |
521 | static void pic_irq_request(void *opaque, int level) | 538 | static void pic_irq_request(struct kvm *kvm, int level) |
522 | { | 539 | { |
523 | struct kvm *kvm = opaque; | ||
524 | struct kvm_vcpu *vcpu = kvm->bsp_vcpu; | 540 | struct kvm_vcpu *vcpu = kvm->bsp_vcpu; |
525 | struct kvm_pic *s = pic_irqchip(kvm); | 541 | struct kvm_pic *s = pic_irqchip(kvm); |
526 | int irq = pic_get_irq(&s->pics[0]); | 542 | int irq = pic_get_irq(&s->pics[0]); |
@@ -549,8 +565,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) | |||
549 | s->kvm = kvm; | 565 | s->kvm = kvm; |
550 | s->pics[0].elcr_mask = 0xf8; | 566 | s->pics[0].elcr_mask = 0xf8; |
551 | s->pics[1].elcr_mask = 0xde; | 567 | s->pics[1].elcr_mask = 0xde; |
552 | s->irq_request = pic_irq_request; | ||
553 | s->irq_request_opaque = kvm; | ||
554 | s->pics[0].pics_state = s; | 568 | s->pics[0].pics_state = s; |
555 | s->pics[1].pics_state = s; | 569 | s->pics[1].pics_state = s; |
556 | 570 | ||
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 96dfbb6ad2a9..2095a049835e 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c | |||
@@ -1,6 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * irq.c: API for in kernel interrupt controller | 2 | * irq.c: API for in kernel interrupt controller |
3 | * Copyright (c) 2007, Intel Corporation. | 3 | * Copyright (c) 2007, Intel Corporation. |
4 | * Copyright 2009 Red Hat, Inc. and/or its affilates. | ||
4 | * | 5 | * |
5 | * This program is free software; you can redistribute it and/or modify it | 6 | * This program is free software; you can redistribute it and/or modify it |
6 | * under the terms and conditions of the GNU General Public License, | 7 | * under the terms and conditions of the GNU General Public License, |
@@ -89,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); | |||
89 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) | 90 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) |
90 | { | 91 | { |
91 | kvm_inject_apic_timer_irqs(vcpu); | 92 | kvm_inject_apic_timer_irqs(vcpu); |
92 | kvm_inject_pit_timer_irqs(vcpu); | ||
93 | /* TODO: PIT, RTC etc. */ | 93 | /* TODO: PIT, RTC etc. */ |
94 | } | 94 | } |
95 | EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); | 95 | EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); |
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index cd1f362f413d..ffed06871c5c 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h | |||
@@ -38,8 +38,6 @@ | |||
38 | struct kvm; | 38 | struct kvm; |
39 | struct kvm_vcpu; | 39 | struct kvm_vcpu; |
40 | 40 | ||
41 | typedef void irq_request_func(void *opaque, int level); | ||
42 | |||
43 | struct kvm_kpic_state { | 41 | struct kvm_kpic_state { |
44 | u8 last_irr; /* edge detection */ | 42 | u8 last_irr; /* edge detection */ |
45 | u8 irr; /* interrupt request register */ | 43 | u8 irr; /* interrupt request register */ |
@@ -67,8 +65,6 @@ struct kvm_pic { | |||
67 | unsigned pending_acks; | 65 | unsigned pending_acks; |
68 | struct kvm *kvm; | 66 | struct kvm *kvm; |
69 | struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ | 67 | struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ |
70 | irq_request_func *irq_request; | ||
71 | void *irq_request_opaque; | ||
72 | int output; /* intr from master PIC */ | 68 | int output; /* intr from master PIC */ |
73 | struct kvm_io_device dev; | 69 | struct kvm_io_device dev; |
74 | void (*ack_notifier)(void *opaque, int irq); | 70 | void (*ack_notifier)(void *opaque, int irq); |
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index cff851cf5322..6491ac8e755b 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h | |||
@@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) | |||
36 | 36 | ||
37 | static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) | 37 | static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) |
38 | { | 38 | { |
39 | might_sleep(); /* on svm */ | ||
40 | |||
39 | if (!test_bit(VCPU_EXREG_PDPTR, | 41 | if (!test_bit(VCPU_EXREG_PDPTR, |
40 | (unsigned long *)&vcpu->arch.regs_avail)) | 42 | (unsigned long *)&vcpu->arch.regs_avail)) |
41 | kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); | 43 | kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); |
@@ -69,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) | |||
69 | return kvm_read_cr4_bits(vcpu, ~0UL); | 71 | return kvm_read_cr4_bits(vcpu, ~0UL); |
70 | } | 72 | } |
71 | 73 | ||
74 | static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) | ||
75 | { | ||
76 | return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) | ||
77 | | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); | ||
78 | } | ||
79 | |||
72 | #endif | 80 | #endif |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1eb7a4ae0c9c..77d8c0f4817d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -5,6 +5,7 @@ | |||
5 | * Copyright (C) 2006 Qumranet, Inc. | 5 | * Copyright (C) 2006 Qumranet, Inc. |
6 | * Copyright (C) 2007 Novell | 6 | * Copyright (C) 2007 Novell |
7 | * Copyright (C) 2007 Intel | 7 | * Copyright (C) 2007 Intel |
8 | * Copyright 2009 Red Hat, Inc. and/or its affilates. | ||
8 | * | 9 | * |
9 | * Authors: | 10 | * Authors: |
10 | * Dor Laor <dor.laor@qumranet.com> | 11 | * Dor Laor <dor.laor@qumranet.com> |
@@ -328,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | |||
328 | "dest_mode 0x%x, short_hand 0x%x\n", | 329 | "dest_mode 0x%x, short_hand 0x%x\n", |
329 | target, source, dest, dest_mode, short_hand); | 330 | target, source, dest, dest_mode, short_hand); |
330 | 331 | ||
331 | ASSERT(!target); | 332 | ASSERT(target); |
332 | switch (short_hand) { | 333 | switch (short_hand) { |
333 | case APIC_DEST_NOSHORT: | 334 | case APIC_DEST_NOSHORT: |
334 | if (dest_mode == 0) | 335 | if (dest_mode == 0) |
@@ -533,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) | |||
533 | struct kvm_vcpu *vcpu = apic->vcpu; | 534 | struct kvm_vcpu *vcpu = apic->vcpu; |
534 | struct kvm_run *run = vcpu->run; | 535 | struct kvm_run *run = vcpu->run; |
535 | 536 | ||
536 | set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); | 537 | kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); |
537 | run->tpr_access.rip = kvm_rip_read(vcpu); | 538 | run->tpr_access.rip = kvm_rip_read(vcpu); |
538 | run->tpr_access.is_write = write; | 539 | run->tpr_access.is_write = write; |
539 | } | 540 | } |
@@ -1106,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) | |||
1106 | u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); | 1107 | u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); |
1107 | int r = 0; | 1108 | int r = 0; |
1108 | 1109 | ||
1109 | if (kvm_vcpu_is_bsp(vcpu)) { | 1110 | if (!apic_hw_enabled(vcpu->arch.apic)) |
1110 | if (!apic_hw_enabled(vcpu->arch.apic)) | 1111 | r = 1; |
1111 | r = 1; | 1112 | if ((lvt0 & APIC_LVT_MASKED) == 0 && |
1112 | if ((lvt0 & APIC_LVT_MASKED) == 0 && | 1113 | GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) |
1113 | GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) | 1114 | r = 1; |
1114 | r = 1; | ||
1115 | } | ||
1116 | return r; | 1115 | return r; |
1117 | } | 1116 | } |
1118 | 1117 | ||
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b1ed0a1a5913..311f6dad8951 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -7,6 +7,7 @@ | |||
7 | * MMU support | 7 | * MMU support |
8 | * | 8 | * |
9 | * Copyright (C) 2006 Qumranet, Inc. | 9 | * Copyright (C) 2006 Qumranet, Inc. |
10 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | ||
10 | * | 11 | * |
11 | * Authors: | 12 | * Authors: |
12 | * Yaniv Kamay <yaniv@qumranet.com> | 13 | * Yaniv Kamay <yaniv@qumranet.com> |
@@ -32,6 +33,7 @@ | |||
32 | #include <linux/compiler.h> | 33 | #include <linux/compiler.h> |
33 | #include <linux/srcu.h> | 34 | #include <linux/srcu.h> |
34 | #include <linux/slab.h> | 35 | #include <linux/slab.h> |
36 | #include <linux/uaccess.h> | ||
35 | 37 | ||
36 | #include <asm/page.h> | 38 | #include <asm/page.h> |
37 | #include <asm/cmpxchg.h> | 39 | #include <asm/cmpxchg.h> |
@@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644); | |||
90 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | 92 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 |
91 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | 93 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 |
92 | 94 | ||
93 | #define VALID_PAGE(x) ((x) != INVALID_PAGE) | ||
94 | |||
95 | #define PT64_LEVEL_BITS 9 | 95 | #define PT64_LEVEL_BITS 9 |
96 | 96 | ||
97 | #define PT64_LEVEL_SHIFT(level) \ | 97 | #define PT64_LEVEL_SHIFT(level) \ |
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator { | |||
173 | shadow_walk_okay(&(_walker)); \ | 173 | shadow_walk_okay(&(_walker)); \ |
174 | shadow_walk_next(&(_walker))) | 174 | shadow_walk_next(&(_walker))) |
175 | 175 | ||
176 | typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); | 176 | typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); |
177 | 177 | ||
178 | static struct kmem_cache *pte_chain_cache; | 178 | static struct kmem_cache *pte_chain_cache; |
179 | static struct kmem_cache *rmap_desc_cache; | 179 | static struct kmem_cache *rmap_desc_cache; |
@@ -281,13 +281,38 @@ static gfn_t pse36_gfn_delta(u32 gpte) | |||
281 | 281 | ||
282 | static void __set_spte(u64 *sptep, u64 spte) | 282 | static void __set_spte(u64 *sptep, u64 spte) |
283 | { | 283 | { |
284 | set_64bit(sptep, spte); | ||
285 | } | ||
286 | |||
287 | static u64 __xchg_spte(u64 *sptep, u64 new_spte) | ||
288 | { | ||
284 | #ifdef CONFIG_X86_64 | 289 | #ifdef CONFIG_X86_64 |
285 | set_64bit((unsigned long *)sptep, spte); | 290 | return xchg(sptep, new_spte); |
286 | #else | 291 | #else |
287 | set_64bit((unsigned long long *)sptep, spte); | 292 | u64 old_spte; |
293 | |||
294 | do { | ||
295 | old_spte = *sptep; | ||
296 | } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); | ||
297 | |||
298 | return old_spte; | ||
288 | #endif | 299 | #endif |
289 | } | 300 | } |
290 | 301 | ||
302 | static void update_spte(u64 *sptep, u64 new_spte) | ||
303 | { | ||
304 | u64 old_spte; | ||
305 | |||
306 | if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || | ||
307 | !is_rmap_spte(*sptep)) | ||
308 | __set_spte(sptep, new_spte); | ||
309 | else { | ||
310 | old_spte = __xchg_spte(sptep, new_spte); | ||
311 | if (old_spte & shadow_accessed_mask) | ||
312 | mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); | ||
313 | } | ||
314 | } | ||
315 | |||
291 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | 316 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, |
292 | struct kmem_cache *base_cache, int min) | 317 | struct kmem_cache *base_cache, int min) |
293 | { | 318 | { |
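On 32-bit hosts __xchg_spte() above has no native 64-bit xchg, so it loops on cmpxchg64 until the swap succeeds against an unchanged value. A stand-alone sketch of that retry loop using C11 atomics (not kernel code; xchg64() is a made-up helper):

/* Emulate a 64-bit atomic exchange with a compare-and-swap retry loop:
 * keep retrying until the value we read is still the value we replace. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t xchg64(_Atomic uint64_t *p, uint64_t new_val)
{
	uint64_t old_val = atomic_load(p);

	/* On failure, old_val is reloaded with the current contents of *p. */
	while (!atomic_compare_exchange_weak(p, &old_val, new_val))
		;
	return old_val;
}

int main(void)
{
	_Atomic uint64_t spte = 0x8000000000000123ULL;
	uint64_t old = xchg64(&spte, 0);

	printf("old spte %#llx, now %#llx\n",
	       (unsigned long long)old, (unsigned long long)atomic_load(&spte));
	return 0;
}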
@@ -304,10 +329,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | |||
304 | return 0; | 329 | return 0; |
305 | } | 330 | } |
306 | 331 | ||
307 | static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) | 332 | static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, |
333 | struct kmem_cache *cache) | ||
308 | { | 334 | { |
309 | while (mc->nobjs) | 335 | while (mc->nobjs) |
310 | kfree(mc->objects[--mc->nobjs]); | 336 | kmem_cache_free(cache, mc->objects[--mc->nobjs]); |
311 | } | 337 | } |
312 | 338 | ||
313 | static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, | 339 | static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, |
@@ -355,10 +381,11 @@ out: | |||
355 | 381 | ||
356 | static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) | 382 | static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) |
357 | { | 383 | { |
358 | mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); | 384 | mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); |
359 | mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); | 385 | mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); |
360 | mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); | 386 | mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); |
361 | mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); | 387 | mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, |
388 | mmu_page_header_cache); | ||
362 | } | 389 | } |
363 | 390 | ||
364 | static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | 391 | static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, |
@@ -379,7 +406,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) | |||
379 | 406 | ||
380 | static void mmu_free_pte_chain(struct kvm_pte_chain *pc) | 407 | static void mmu_free_pte_chain(struct kvm_pte_chain *pc) |
381 | { | 408 | { |
382 | kfree(pc); | 409 | kmem_cache_free(pte_chain_cache, pc); |
383 | } | 410 | } |
384 | 411 | ||
385 | static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) | 412 | static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) |
@@ -390,7 +417,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) | |||
390 | 417 | ||
391 | static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) | 418 | static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) |
392 | { | 419 | { |
393 | kfree(rd); | 420 | kmem_cache_free(rmap_desc_cache, rd); |
421 | } | ||
422 | |||
423 | static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) | ||
424 | { | ||
425 | if (!sp->role.direct) | ||
426 | return sp->gfns[index]; | ||
427 | |||
428 | return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); | ||
429 | } | ||
430 | |||
431 | static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) | ||
432 | { | ||
433 | if (sp->role.direct) | ||
434 | BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); | ||
435 | else | ||
436 | sp->gfns[index] = gfn; | ||
394 | } | 437 | } |
395 | 438 | ||
396 | /* | 439 | /* |
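kvm_mmu_page_get_gfn() above recovers the gfn of a direct-mapped entry arithmetically instead of keeping a per-entry gfns[] array: each index of a level-N table spans 512^(N-1) guest frames past the page's base gfn. A stand-alone sketch of that computation (plain C; direct_gfn() is invented for illustration):

/* gfn of entry 'index' in a direct-mapped shadow page at 'level'. */
#include <stdint.h>
#include <stdio.h>

#define PT64_LEVEL_BITS 9

static uint64_t direct_gfn(uint64_t base_gfn, int index, int level)
{
	return base_gfn + ((uint64_t)index << ((level - 1) * PT64_LEVEL_BITS));
}

int main(void)
{
	/* index 3 of a level-2 (2MB) table: 3 * 512 frames past the base */
	printf("gfn = %llu\n",
	       (unsigned long long)direct_gfn(0x1000, 3, 2));   /* 0x1000 + 1536 */
	return 0;
}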
@@ -403,8 +446,8 @@ static int *slot_largepage_idx(gfn_t gfn, | |||
403 | { | 446 | { |
404 | unsigned long idx; | 447 | unsigned long idx; |
405 | 448 | ||
406 | idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - | 449 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - |
407 | (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); | 450 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); |
408 | return &slot->lpage_info[level - 2][idx].write_count; | 451 | return &slot->lpage_info[level - 2][idx].write_count; |
409 | } | 452 | } |
410 | 453 | ||
@@ -414,9 +457,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) | |||
414 | int *write_count; | 457 | int *write_count; |
415 | int i; | 458 | int i; |
416 | 459 | ||
417 | gfn = unalias_gfn(kvm, gfn); | 460 | slot = gfn_to_memslot(kvm, gfn); |
418 | |||
419 | slot = gfn_to_memslot_unaliased(kvm, gfn); | ||
420 | for (i = PT_DIRECTORY_LEVEL; | 461 | for (i = PT_DIRECTORY_LEVEL; |
421 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 462 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
422 | write_count = slot_largepage_idx(gfn, slot, i); | 463 | write_count = slot_largepage_idx(gfn, slot, i); |
@@ -430,8 +471,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | |||
430 | int *write_count; | 471 | int *write_count; |
431 | int i; | 472 | int i; |
432 | 473 | ||
433 | gfn = unalias_gfn(kvm, gfn); | 474 | slot = gfn_to_memslot(kvm, gfn); |
434 | slot = gfn_to_memslot_unaliased(kvm, gfn); | ||
435 | for (i = PT_DIRECTORY_LEVEL; | 475 | for (i = PT_DIRECTORY_LEVEL; |
436 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 476 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
437 | write_count = slot_largepage_idx(gfn, slot, i); | 477 | write_count = slot_largepage_idx(gfn, slot, i); |
@@ -447,8 +487,7 @@ static int has_wrprotected_page(struct kvm *kvm, | |||
447 | struct kvm_memory_slot *slot; | 487 | struct kvm_memory_slot *slot; |
448 | int *largepage_idx; | 488 | int *largepage_idx; |
449 | 489 | ||
450 | gfn = unalias_gfn(kvm, gfn); | 490 | slot = gfn_to_memslot(kvm, gfn); |
451 | slot = gfn_to_memslot_unaliased(kvm, gfn); | ||
452 | if (slot) { | 491 | if (slot) { |
453 | largepage_idx = slot_largepage_idx(gfn, slot, level); | 492 | largepage_idx = slot_largepage_idx(gfn, slot, level); |
454 | return *largepage_idx; | 493 | return *largepage_idx; |
@@ -501,7 +540,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | |||
501 | 540 | ||
502 | /* | 541 | /* |
503 | * Take gfn and return the reverse mapping to it. | 542 | * Take gfn and return the reverse mapping to it. |
504 | * Note: gfn must be unaliased before this function get called | ||
505 | */ | 543 | */ |
506 | 544 | ||
507 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) | 545 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) |
@@ -513,8 +551,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) | |||
513 | if (likely(level == PT_PAGE_TABLE_LEVEL)) | 551 | if (likely(level == PT_PAGE_TABLE_LEVEL)) |
514 | return &slot->rmap[gfn - slot->base_gfn]; | 552 | return &slot->rmap[gfn - slot->base_gfn]; |
515 | 553 | ||
516 | idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - | 554 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - |
517 | (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); | 555 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); |
518 | 556 | ||
519 | return &slot->lpage_info[level - 2][idx].rmap_pde; | 557 | return &slot->lpage_info[level - 2][idx].rmap_pde; |
520 | } | 558 | } |
@@ -541,9 +579,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
541 | 579 | ||
542 | if (!is_rmap_spte(*spte)) | 580 | if (!is_rmap_spte(*spte)) |
543 | return count; | 581 | return count; |
544 | gfn = unalias_gfn(vcpu->kvm, gfn); | ||
545 | sp = page_header(__pa(spte)); | 582 | sp = page_header(__pa(spte)); |
546 | sp->gfns[spte - sp->spt] = gfn; | 583 | kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); |
547 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); | 584 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); |
548 | if (!*rmapp) { | 585 | if (!*rmapp) { |
549 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); | 586 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); |
@@ -600,19 +637,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
600 | struct kvm_rmap_desc *desc; | 637 | struct kvm_rmap_desc *desc; |
601 | struct kvm_rmap_desc *prev_desc; | 638 | struct kvm_rmap_desc *prev_desc; |
602 | struct kvm_mmu_page *sp; | 639 | struct kvm_mmu_page *sp; |
603 | pfn_t pfn; | 640 | gfn_t gfn; |
604 | unsigned long *rmapp; | 641 | unsigned long *rmapp; |
605 | int i; | 642 | int i; |
606 | 643 | ||
607 | if (!is_rmap_spte(*spte)) | ||
608 | return; | ||
609 | sp = page_header(__pa(spte)); | 644 | sp = page_header(__pa(spte)); |
610 | pfn = spte_to_pfn(*spte); | 645 | gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); |
611 | if (*spte & shadow_accessed_mask) | 646 | rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); |
612 | kvm_set_pfn_accessed(pfn); | ||
613 | if (is_writable_pte(*spte)) | ||
614 | kvm_set_pfn_dirty(pfn); | ||
615 | rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); | ||
616 | if (!*rmapp) { | 647 | if (!*rmapp) { |
617 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); | 648 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); |
618 | BUG(); | 649 | BUG(); |
@@ -644,6 +675,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
644 | } | 675 | } |
645 | } | 676 | } |
646 | 677 | ||
678 | static void set_spte_track_bits(u64 *sptep, u64 new_spte) | ||
679 | { | ||
680 | pfn_t pfn; | ||
681 | u64 old_spte = *sptep; | ||
682 | |||
683 | if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || | ||
684 | old_spte & shadow_accessed_mask) { | ||
685 | __set_spte(sptep, new_spte); | ||
686 | } else | ||
687 | old_spte = __xchg_spte(sptep, new_spte); | ||
688 | |||
689 | if (!is_rmap_spte(old_spte)) | ||
690 | return; | ||
691 | pfn = spte_to_pfn(old_spte); | ||
692 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) | ||
693 | kvm_set_pfn_accessed(pfn); | ||
694 | if (is_writable_pte(old_spte)) | ||
695 | kvm_set_pfn_dirty(pfn); | ||
696 | } | ||
697 | |||
698 | static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) | ||
699 | { | ||
700 | set_spte_track_bits(sptep, new_spte); | ||
701 | rmap_remove(kvm, sptep); | ||
702 | } | ||
703 | |||
647 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) | 704 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) |
648 | { | 705 | { |
649 | struct kvm_rmap_desc *desc; | 706 | struct kvm_rmap_desc *desc; |
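set_spte_track_bits()/drop_spte() above make sure that whatever accessed and dirty information the old spte carried is pushed out to the backing page before the entry is replaced. A much-simplified stand-alone sketch of that bookkeeping (plain C; the bit masks and the mark_* helpers are hypothetical stand-ins for the kernel's definitions and for kvm_set_pfn_accessed()/kvm_set_pfn_dirty()):

/* When an spte is replaced, the bits of the value that was actually present
 * decide whether the backing page is reported as accessed and/or dirty. */
#include <stdint.h>
#include <stdio.h>

#define SPTE_PRESENT   (1ULL << 0)   /* hypothetical bit layout */
#define SPTE_WRITABLE  (1ULL << 1)
#define SPTE_ACCESSED  (1ULL << 5)

static void mark_accessed(uint64_t spte) { printf("accessed: %#llx\n", (unsigned long long)spte); }
static void mark_dirty(uint64_t spte)    { printf("dirty:    %#llx\n", (unsigned long long)spte); }

static uint64_t replace_spte(uint64_t *sptep, uint64_t new_spte)
{
	uint64_t old_spte = *sptep;      /* the kernel uses an atomic xchg here */

	*sptep = new_spte;
	if (old_spte & SPTE_PRESENT) {
		if (old_spte & SPTE_ACCESSED)
			mark_accessed(old_spte);
		if (old_spte & SPTE_WRITABLE)
			mark_dirty(old_spte);
	}
	return old_spte;
}

int main(void)
{
	uint64_t spte = SPTE_PRESENT | SPTE_WRITABLE | SPTE_ACCESSED;

	replace_spte(&spte, 0);          /* drop it: reports accessed + dirty */
	return 0;
}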
@@ -676,7 +733,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
676 | u64 *spte; | 733 | u64 *spte; |
677 | int i, write_protected = 0; | 734 | int i, write_protected = 0; |
678 | 735 | ||
679 | gfn = unalias_gfn(kvm, gfn); | ||
680 | rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); | 736 | rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); |
681 | 737 | ||
682 | spte = rmap_next(kvm, rmapp, NULL); | 738 | spte = rmap_next(kvm, rmapp, NULL); |
@@ -685,7 +741,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
685 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 741 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
686 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | 742 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); |
687 | if (is_writable_pte(*spte)) { | 743 | if (is_writable_pte(*spte)) { |
688 | __set_spte(spte, *spte & ~PT_WRITABLE_MASK); | 744 | update_spte(spte, *spte & ~PT_WRITABLE_MASK); |
689 | write_protected = 1; | 745 | write_protected = 1; |
690 | } | 746 | } |
691 | spte = rmap_next(kvm, rmapp, spte); | 747 | spte = rmap_next(kvm, rmapp, spte); |
@@ -709,9 +765,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
709 | BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); | 765 | BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); |
710 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); | 766 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); |
711 | if (is_writable_pte(*spte)) { | 767 | if (is_writable_pte(*spte)) { |
712 | rmap_remove(kvm, spte); | 768 | drop_spte(kvm, spte, |
769 | shadow_trap_nonpresent_pte); | ||
713 | --kvm->stat.lpages; | 770 | --kvm->stat.lpages; |
714 | __set_spte(spte, shadow_trap_nonpresent_pte); | ||
715 | spte = NULL; | 771 | spte = NULL; |
716 | write_protected = 1; | 772 | write_protected = 1; |
717 | } | 773 | } |
@@ -731,8 +787,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
731 | while ((spte = rmap_next(kvm, rmapp, NULL))) { | 787 | while ((spte = rmap_next(kvm, rmapp, NULL))) { |
732 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 788 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
733 | rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); | 789 | rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); |
734 | rmap_remove(kvm, spte); | 790 | drop_spte(kvm, spte, shadow_trap_nonpresent_pte); |
735 | __set_spte(spte, shadow_trap_nonpresent_pte); | ||
736 | need_tlb_flush = 1; | 791 | need_tlb_flush = 1; |
737 | } | 792 | } |
738 | return need_tlb_flush; | 793 | return need_tlb_flush; |
@@ -754,8 +809,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
754 | rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); | 809 | rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); |
755 | need_flush = 1; | 810 | need_flush = 1; |
756 | if (pte_write(*ptep)) { | 811 | if (pte_write(*ptep)) { |
757 | rmap_remove(kvm, spte); | 812 | drop_spte(kvm, spte, shadow_trap_nonpresent_pte); |
758 | __set_spte(spte, shadow_trap_nonpresent_pte); | ||
759 | spte = rmap_next(kvm, rmapp, NULL); | 813 | spte = rmap_next(kvm, rmapp, NULL); |
760 | } else { | 814 | } else { |
761 | new_spte = *spte &~ (PT64_BASE_ADDR_MASK); | 815 | new_spte = *spte &~ (PT64_BASE_ADDR_MASK); |
@@ -763,9 +817,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
763 | 817 | ||
764 | new_spte &= ~PT_WRITABLE_MASK; | 818 | new_spte &= ~PT_WRITABLE_MASK; |
765 | new_spte &= ~SPTE_HOST_WRITEABLE; | 819 | new_spte &= ~SPTE_HOST_WRITEABLE; |
766 | if (is_writable_pte(*spte)) | 820 | new_spte &= ~shadow_accessed_mask; |
767 | kvm_set_pfn_dirty(spte_to_pfn(*spte)); | 821 | set_spte_track_bits(spte, new_spte); |
768 | __set_spte(spte, new_spte); | ||
769 | spte = rmap_next(kvm, rmapp, spte); | 822 | spte = rmap_next(kvm, rmapp, spte); |
770 | } | 823 | } |
771 | } | 824 | } |
@@ -799,8 +852,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | |||
799 | ret = handler(kvm, &memslot->rmap[gfn_offset], data); | 852 | ret = handler(kvm, &memslot->rmap[gfn_offset], data); |
800 | 853 | ||
801 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { | 854 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { |
802 | int idx = gfn_offset; | 855 | unsigned long idx; |
803 | idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); | 856 | int sh; |
857 | |||
858 | sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); | ||
859 | idx = ((memslot->base_gfn+gfn_offset) >> sh) - | ||
860 | (memslot->base_gfn >> sh); | ||
804 | ret |= handler(kvm, | 861 | ret |= handler(kvm, |
805 | &memslot->lpage_info[j][idx].rmap_pde, | 862 | &memslot->lpage_info[j][idx].rmap_pde, |
806 | data); | 863 | data); |
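The kvm_handle_hva() hunk above recomputes the hugepage index from absolute gfns rather than from the slot offset alone, so a slot whose base gfn is not hugepage aligned indexes lpage_info the same way slot_largepage_idx() does. A small stand-alone check of the difference (plain C; SH is a stand-in for KVM_HPAGE_GFN_SHIFT at the 2MB level):

/* For an unaligned slot the offset-only form and the absolute-gfn form
 * can pick different lpage_info entries. */
#include <stdint.h>
#include <stdio.h>

#define SH 9   /* 2MB level: 512 base frames per hugepage */

int main(void)
{
	uint64_t base_gfn = 0x1ff, gfn = 0x200;      /* slot not 2MB aligned */
	uint64_t offset = gfn - base_gfn;

	printf("offset-only idx: %llu\n",
	       (unsigned long long)(offset >> SH));                    /* 0 */
	printf("absolute-gfn idx: %llu\n",
	       (unsigned long long)((gfn >> SH) - (base_gfn >> SH)));  /* 1 */
	return 0;
}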
@@ -863,7 +920,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
863 | 920 | ||
864 | sp = page_header(__pa(spte)); | 921 | sp = page_header(__pa(spte)); |
865 | 922 | ||
866 | gfn = unalias_gfn(vcpu->kvm, gfn); | ||
867 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); | 923 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); |
868 | 924 | ||
869 | kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); | 925 | kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); |
@@ -894,10 +950,12 @@ static int is_empty_shadow_page(u64 *spt) | |||
894 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 950 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
895 | { | 951 | { |
896 | ASSERT(is_empty_shadow_page(sp->spt)); | 952 | ASSERT(is_empty_shadow_page(sp->spt)); |
953 | hlist_del(&sp->hash_link); | ||
897 | list_del(&sp->link); | 954 | list_del(&sp->link); |
898 | __free_page(virt_to_page(sp->spt)); | 955 | __free_page(virt_to_page(sp->spt)); |
899 | __free_page(virt_to_page(sp->gfns)); | 956 | if (!sp->role.direct) |
900 | kfree(sp); | 957 | __free_page(virt_to_page(sp->gfns)); |
958 | kmem_cache_free(mmu_page_header_cache, sp); | ||
901 | ++kvm->arch.n_free_mmu_pages; | 959 | ++kvm->arch.n_free_mmu_pages; |
902 | } | 960 | } |
903 | 961 | ||
@@ -907,13 +965,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) | |||
907 | } | 965 | } |
908 | 966 | ||
909 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | 967 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, |
910 | u64 *parent_pte) | 968 | u64 *parent_pte, int direct) |
911 | { | 969 | { |
912 | struct kvm_mmu_page *sp; | 970 | struct kvm_mmu_page *sp; |
913 | 971 | ||
914 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); | 972 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); |
915 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | 973 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); |
916 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | 974 | if (!direct) |
975 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, | ||
976 | PAGE_SIZE); | ||
917 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | 977 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); |
918 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | 978 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); |
919 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); | 979 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); |
@@ -998,7 +1058,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, | |||
998 | BUG(); | 1058 | BUG(); |
999 | } | 1059 | } |
1000 | 1060 | ||
1001 | |||
1002 | static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) | 1061 | static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) |
1003 | { | 1062 | { |
1004 | struct kvm_pte_chain *pte_chain; | 1063 | struct kvm_pte_chain *pte_chain; |
@@ -1008,63 +1067,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) | |||
1008 | 1067 | ||
1009 | if (!sp->multimapped && sp->parent_pte) { | 1068 | if (!sp->multimapped && sp->parent_pte) { |
1010 | parent_sp = page_header(__pa(sp->parent_pte)); | 1069 | parent_sp = page_header(__pa(sp->parent_pte)); |
1011 | fn(parent_sp); | 1070 | fn(parent_sp, sp->parent_pte); |
1012 | mmu_parent_walk(parent_sp, fn); | ||
1013 | return; | 1071 | return; |
1014 | } | 1072 | } |
1073 | |||
1015 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | 1074 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) |
1016 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | 1075 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { |
1017 | if (!pte_chain->parent_ptes[i]) | 1076 | u64 *spte = pte_chain->parent_ptes[i]; |
1077 | |||
1078 | if (!spte) | ||
1018 | break; | 1079 | break; |
1019 | parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); | 1080 | parent_sp = page_header(__pa(spte)); |
1020 | fn(parent_sp); | 1081 | fn(parent_sp, spte); |
1021 | mmu_parent_walk(parent_sp, fn); | ||
1022 | } | 1082 | } |
1023 | } | 1083 | } |
1024 | 1084 | ||
1025 | static void kvm_mmu_update_unsync_bitmap(u64 *spte) | 1085 | static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); |
1086 | static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) | ||
1026 | { | 1087 | { |
1027 | unsigned int index; | 1088 | mmu_parent_walk(sp, mark_unsync); |
1028 | struct kvm_mmu_page *sp = page_header(__pa(spte)); | ||
1029 | |||
1030 | index = spte - sp->spt; | ||
1031 | if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) | ||
1032 | sp->unsync_children++; | ||
1033 | WARN_ON(!sp->unsync_children); | ||
1034 | } | 1089 | } |
1035 | 1090 | ||
1036 | static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) | 1091 | static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) |
1037 | { | 1092 | { |
1038 | struct kvm_pte_chain *pte_chain; | 1093 | unsigned int index; |
1039 | struct hlist_node *node; | ||
1040 | int i; | ||
1041 | 1094 | ||
1042 | if (!sp->parent_pte) | 1095 | index = spte - sp->spt; |
1096 | if (__test_and_set_bit(index, sp->unsync_child_bitmap)) | ||
1043 | return; | 1097 | return; |
1044 | 1098 | if (sp->unsync_children++) | |
1045 | if (!sp->multimapped) { | ||
1046 | kvm_mmu_update_unsync_bitmap(sp->parent_pte); | ||
1047 | return; | 1099 | return; |
1048 | } | 1100 | kvm_mmu_mark_parents_unsync(sp); |
1049 | |||
1050 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
1051 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
1052 | if (!pte_chain->parent_ptes[i]) | ||
1053 | break; | ||
1054 | kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); | ||
1055 | } | ||
1056 | } | ||
1057 | |||
1058 | static int unsync_walk_fn(struct kvm_mmu_page *sp) | ||
1059 | { | ||
1060 | kvm_mmu_update_parents_unsync(sp); | ||
1061 | return 1; | ||
1062 | } | ||
1063 | |||
1064 | static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) | ||
1065 | { | ||
1066 | mmu_parent_walk(sp, unsync_walk_fn); | ||
1067 | kvm_mmu_update_parents_unsync(sp); | ||
1068 | } | 1101 | } |
1069 | 1102 | ||
1070 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | 1103 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, |
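The rewritten mark_unsync() above walks up to the parents only when a page's unsync_children count goes from zero to non-zero, so repeated marks terminate early instead of re-walking the whole ancestor chain. A much-simplified stand-alone sketch of that rule (plain C; a single parent pointer replaces the kernel's parent-pte chains and child bitmaps):

/* Propagate "has unsynced children" upward only on the 0 -> 1 transition. */
#include <stdio.h>

struct page {
	const char *name;
	struct page *parent;
	unsigned int unsync_children;
};

static void mark_unsync(struct page *sp)
{
	if (sp->unsync_children++)           /* already flagged: stop here */
		return;
	printf("%s now has unsync children\n", sp->name);
	if (sp->parent)
		mark_unsync(sp->parent);
}

int main(void)
{
	struct page root = { "root", NULL, 0 };
	struct page mid  = { "mid",  &root, 0 };

	mark_unsync(&mid);   /* propagates: mid, then root */
	mark_unsync(&mid);   /* counter already non-zero: no further output */
	return 0;
}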
@@ -1077,7 +1110,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | |||
1077 | } | 1110 | } |
1078 | 1111 | ||
1079 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, | 1112 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, |
1080 | struct kvm_mmu_page *sp) | 1113 | struct kvm_mmu_page *sp, bool clear_unsync) |
1081 | { | 1114 | { |
1082 | return 1; | 1115 | return 1; |
1083 | } | 1116 | } |
@@ -1123,35 +1156,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, | |||
1123 | int i, ret, nr_unsync_leaf = 0; | 1156 | int i, ret, nr_unsync_leaf = 0; |
1124 | 1157 | ||
1125 | for_each_unsync_children(sp->unsync_child_bitmap, i) { | 1158 | for_each_unsync_children(sp->unsync_child_bitmap, i) { |
1159 | struct kvm_mmu_page *child; | ||
1126 | u64 ent = sp->spt[i]; | 1160 | u64 ent = sp->spt[i]; |
1127 | 1161 | ||
1128 | if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { | 1162 | if (!is_shadow_present_pte(ent) || is_large_pte(ent)) |
1129 | struct kvm_mmu_page *child; | 1163 | goto clear_child_bitmap; |
1130 | child = page_header(ent & PT64_BASE_ADDR_MASK); | 1164 | |
1131 | 1165 | child = page_header(ent & PT64_BASE_ADDR_MASK); | |
1132 | if (child->unsync_children) { | 1166 | |
1133 | if (mmu_pages_add(pvec, child, i)) | 1167 | if (child->unsync_children) { |
1134 | return -ENOSPC; | 1168 | if (mmu_pages_add(pvec, child, i)) |
1135 | 1169 | return -ENOSPC; | |
1136 | ret = __mmu_unsync_walk(child, pvec); | 1170 | |
1137 | if (!ret) | 1171 | ret = __mmu_unsync_walk(child, pvec); |
1138 | __clear_bit(i, sp->unsync_child_bitmap); | 1172 | if (!ret) |
1139 | else if (ret > 0) | 1173 | goto clear_child_bitmap; |
1140 | nr_unsync_leaf += ret; | 1174 | else if (ret > 0) |
1141 | else | 1175 | nr_unsync_leaf += ret; |
1142 | return ret; | 1176 | else |
1143 | } | 1177 | return ret; |
1178 | } else if (child->unsync) { | ||
1179 | nr_unsync_leaf++; | ||
1180 | if (mmu_pages_add(pvec, child, i)) | ||
1181 | return -ENOSPC; | ||
1182 | } else | ||
1183 | goto clear_child_bitmap; | ||
1144 | 1184 | ||
1145 | if (child->unsync) { | 1185 | continue; |
1146 | nr_unsync_leaf++; | 1186 | |
1147 | if (mmu_pages_add(pvec, child, i)) | 1187 | clear_child_bitmap: |
1148 | return -ENOSPC; | 1188 | __clear_bit(i, sp->unsync_child_bitmap); |
1149 | } | 1189 | sp->unsync_children--; |
1150 | } | 1190 | WARN_ON((int)sp->unsync_children < 0); |
1151 | } | 1191 | } |
1152 | 1192 | ||
1153 | if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) | ||
1154 | sp->unsync_children = 0; | ||
1155 | 1193 | ||
1156 | return nr_unsync_leaf; | 1194 | return nr_unsync_leaf; |
1157 | } | 1195 | } |
@@ -1166,26 +1204,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, | |||
1166 | return __mmu_unsync_walk(sp, pvec); | 1204 | return __mmu_unsync_walk(sp, pvec); |
1167 | } | 1205 | } |
1168 | 1206 | ||
1169 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | ||
1170 | { | ||
1171 | unsigned index; | ||
1172 | struct hlist_head *bucket; | ||
1173 | struct kvm_mmu_page *sp; | ||
1174 | struct hlist_node *node; | ||
1175 | |||
1176 | pgprintk("%s: looking for gfn %lx\n", __func__, gfn); | ||
1177 | index = kvm_page_table_hashfn(gfn); | ||
1178 | bucket = &kvm->arch.mmu_page_hash[index]; | ||
1179 | hlist_for_each_entry(sp, node, bucket, hash_link) | ||
1180 | if (sp->gfn == gfn && !sp->role.direct | ||
1181 | && !sp->role.invalid) { | ||
1182 | pgprintk("%s: found role %x\n", | ||
1183 | __func__, sp->role.word); | ||
1184 | return sp; | ||
1185 | } | ||
1186 | return NULL; | ||
1187 | } | ||
1188 | |||
1189 | static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1207 | static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
1190 | { | 1208 | { |
1191 | WARN_ON(!sp->unsync); | 1209 | WARN_ON(!sp->unsync); |
@@ -1194,20 +1212,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
1194 | --kvm->stat.mmu_unsync; | 1212 | --kvm->stat.mmu_unsync; |
1195 | } | 1213 | } |
1196 | 1214 | ||
1197 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); | 1215 | static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, |
1216 | struct list_head *invalid_list); | ||
1217 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, | ||
1218 | struct list_head *invalid_list); | ||
1198 | 1219 | ||
1199 | static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | 1220 | #define for_each_gfn_sp(kvm, sp, gfn, pos) \ |
1221 | hlist_for_each_entry(sp, pos, \ | ||
1222 | &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ | ||
1223 | if ((sp)->gfn != (gfn)) {} else | ||
1224 | |||
1225 | #define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ | ||
1226 | hlist_for_each_entry(sp, pos, \ | ||
1227 | &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ | ||
1228 | if ((sp)->gfn != (gfn) || (sp)->role.direct || \ | ||
1229 | (sp)->role.invalid) {} else | ||
1230 | |||
1231 | /* @sp->gfn should be write-protected at the call site */ | ||
1232 | static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | ||
1233 | struct list_head *invalid_list, bool clear_unsync) | ||
1200 | { | 1234 | { |
1201 | if (sp->role.cr4_pae != !!is_pae(vcpu)) { | 1235 | if (sp->role.cr4_pae != !!is_pae(vcpu)) { |
1202 | kvm_mmu_zap_page(vcpu->kvm, sp); | 1236 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); |
1203 | return 1; | 1237 | return 1; |
1204 | } | 1238 | } |
1205 | 1239 | ||
1206 | if (rmap_write_protect(vcpu->kvm, sp->gfn)) | 1240 | if (clear_unsync) |
1207 | kvm_flush_remote_tlbs(vcpu->kvm); | 1241 | kvm_unlink_unsync_page(vcpu->kvm, sp); |
1208 | kvm_unlink_unsync_page(vcpu->kvm, sp); | 1242 | |
1209 | if (vcpu->arch.mmu.sync_page(vcpu, sp)) { | 1243 | if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { |
1210 | kvm_mmu_zap_page(vcpu->kvm, sp); | 1244 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); |
1211 | return 1; | 1245 | return 1; |
1212 | } | 1246 | } |
1213 | 1247 | ||
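The for_each_gfn_sp() and for_each_gfn_indirect_valid_sp() macros above rely on the "if (skip) {} else" trick: the statement that follows the macro becomes the filtered loop body, and the empty then-branch avoids a dangling-else hazard. A stand-alone sketch of the same idiom (plain C; the struct and macro here are invented, not the kernel's):

/* Loop-with-filter macro: entries failing the filter fall into the empty
 * branch, everything else executes the caller's statement. */
#include <stdio.h>

struct sp { int gfn; int valid; };

#define for_each_valid_sp_with_gfn(array, n, p, wanted_gfn)            \
	for ((p) = (array); (p) < (array) + (n); (p)++)                 \
		if ((p)->gfn != (wanted_gfn) || !(p)->valid) {} else

int main(void)
{
	struct sp pages[] = { {1, 1}, {2, 0}, {2, 1}, {3, 1} };
	struct sp *p;

	for_each_valid_sp_with_gfn(pages, 4, p, 2)
		printf("visit page at index %ld\n", (long)(p - pages));  /* 2 */
	return 0;
}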
@@ -1215,6 +1249,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
1215 | return 0; | 1249 | return 0; |
1216 | } | 1250 | } |
1217 | 1251 | ||
1252 | static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, | ||
1253 | struct kvm_mmu_page *sp) | ||
1254 | { | ||
1255 | LIST_HEAD(invalid_list); | ||
1256 | int ret; | ||
1257 | |||
1258 | ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); | ||
1259 | if (ret) | ||
1260 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
1261 | |||
1262 | return ret; | ||
1263 | } | ||
1264 | |||
1265 | static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | ||
1266 | struct list_head *invalid_list) | ||
1267 | { | ||
1268 | return __kvm_sync_page(vcpu, sp, invalid_list, true); | ||
1269 | } | ||
1270 | |||
1271 | /* @gfn should be write-protected at the call site */ | ||
1272 | static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
1273 | { | ||
1274 | struct kvm_mmu_page *s; | ||
1275 | struct hlist_node *node; | ||
1276 | LIST_HEAD(invalid_list); | ||
1277 | bool flush = false; | ||
1278 | |||
1279 | for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { | ||
1280 | if (!s->unsync) | ||
1281 | continue; | ||
1282 | |||
1283 | WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); | ||
1284 | if ((s->role.cr4_pae != !!is_pae(vcpu)) || | ||
1285 | (vcpu->arch.mmu.sync_page(vcpu, s, true))) { | ||
1286 | kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); | ||
1287 | continue; | ||
1288 | } | ||
1289 | kvm_unlink_unsync_page(vcpu->kvm, s); | ||
1290 | flush = true; | ||
1291 | } | ||
1292 | |||
1293 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
1294 | if (flush) | ||
1295 | kvm_mmu_flush_tlb(vcpu); | ||
1296 | } | ||
1297 | |||
1218 | struct mmu_page_path { | 1298 | struct mmu_page_path { |
1219 | struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; | 1299 | struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; |
1220 | unsigned int idx[PT64_ROOT_LEVEL-1]; | 1300 | unsigned int idx[PT64_ROOT_LEVEL-1]; |
@@ -1281,6 +1361,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, | |||
1281 | struct kvm_mmu_page *sp; | 1361 | struct kvm_mmu_page *sp; |
1282 | struct mmu_page_path parents; | 1362 | struct mmu_page_path parents; |
1283 | struct kvm_mmu_pages pages; | 1363 | struct kvm_mmu_pages pages; |
1364 | LIST_HEAD(invalid_list); | ||
1284 | 1365 | ||
1285 | kvm_mmu_pages_init(parent, &parents, &pages); | 1366 | kvm_mmu_pages_init(parent, &parents, &pages); |
1286 | while (mmu_unsync_walk(parent, &pages)) { | 1367 | while (mmu_unsync_walk(parent, &pages)) { |
@@ -1293,9 +1374,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, | |||
1293 | kvm_flush_remote_tlbs(vcpu->kvm); | 1374 | kvm_flush_remote_tlbs(vcpu->kvm); |
1294 | 1375 | ||
1295 | for_each_sp(pages, sp, parents, i) { | 1376 | for_each_sp(pages, sp, parents, i) { |
1296 | kvm_sync_page(vcpu, sp); | 1377 | kvm_sync_page(vcpu, sp, &invalid_list); |
1297 | mmu_pages_clear_parents(&parents); | 1378 | mmu_pages_clear_parents(&parents); |
1298 | } | 1379 | } |
1380 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
1299 | cond_resched_lock(&vcpu->kvm->mmu_lock); | 1381 | cond_resched_lock(&vcpu->kvm->mmu_lock); |
1300 | kvm_mmu_pages_init(parent, &parents, &pages); | 1382 | kvm_mmu_pages_init(parent, &parents, &pages); |
1301 | } | 1383 | } |
@@ -1310,11 +1392,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1310 | u64 *parent_pte) | 1392 | u64 *parent_pte) |
1311 | { | 1393 | { |
1312 | union kvm_mmu_page_role role; | 1394 | union kvm_mmu_page_role role; |
1313 | unsigned index; | ||
1314 | unsigned quadrant; | 1395 | unsigned quadrant; |
1315 | struct hlist_head *bucket; | ||
1316 | struct kvm_mmu_page *sp; | 1396 | struct kvm_mmu_page *sp; |
1317 | struct hlist_node *node, *tmp; | 1397 | struct hlist_node *node; |
1398 | bool need_sync = false; | ||
1318 | 1399 | ||
1319 | role = vcpu->arch.mmu.base_role; | 1400 | role = vcpu->arch.mmu.base_role; |
1320 | role.level = level; | 1401 | role.level = level; |
@@ -1322,40 +1403,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1322 | if (role.direct) | 1403 | if (role.direct) |
1323 | role.cr4_pae = 0; | 1404 | role.cr4_pae = 0; |
1324 | role.access = access; | 1405 | role.access = access; |
1325 | if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { | 1406 | if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { |
1326 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); | 1407 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); |
1327 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | 1408 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; |
1328 | role.quadrant = quadrant; | 1409 | role.quadrant = quadrant; |
1329 | } | 1410 | } |
1330 | index = kvm_page_table_hashfn(gfn); | 1411 | for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { |
1331 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 1412 | if (!need_sync && sp->unsync) |
1332 | hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) | 1413 | need_sync = true; |
1333 | if (sp->gfn == gfn) { | ||
1334 | if (sp->unsync) | ||
1335 | if (kvm_sync_page(vcpu, sp)) | ||
1336 | continue; | ||
1337 | 1414 | ||
1338 | if (sp->role.word != role.word) | 1415 | if (sp->role.word != role.word) |
1339 | continue; | 1416 | continue; |
1340 | 1417 | ||
1341 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | 1418 | if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) |
1342 | if (sp->unsync_children) { | 1419 | break; |
1343 | set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); | 1420 | |
1344 | kvm_mmu_mark_parents_unsync(sp); | 1421 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); |
1345 | } | 1422 | if (sp->unsync_children) { |
1346 | trace_kvm_mmu_get_page(sp, false); | 1423 | kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); |
1347 | return sp; | 1424 | kvm_mmu_mark_parents_unsync(sp); |
1348 | } | 1425 | } else if (sp->unsync) |
1426 | kvm_mmu_mark_parents_unsync(sp); | ||
1427 | |||
1428 | trace_kvm_mmu_get_page(sp, false); | ||
1429 | return sp; | ||
1430 | } | ||
1349 | ++vcpu->kvm->stat.mmu_cache_miss; | 1431 | ++vcpu->kvm->stat.mmu_cache_miss; |
1350 | sp = kvm_mmu_alloc_page(vcpu, parent_pte); | 1432 | sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); |
1351 | if (!sp) | 1433 | if (!sp) |
1352 | return sp; | 1434 | return sp; |
1353 | sp->gfn = gfn; | 1435 | sp->gfn = gfn; |
1354 | sp->role = role; | 1436 | sp->role = role; |
1355 | hlist_add_head(&sp->hash_link, bucket); | 1437 | hlist_add_head(&sp->hash_link, |
1438 | &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); | ||
1356 | if (!direct) { | 1439 | if (!direct) { |
1357 | if (rmap_write_protect(vcpu->kvm, gfn)) | 1440 | if (rmap_write_protect(vcpu->kvm, gfn)) |
1358 | kvm_flush_remote_tlbs(vcpu->kvm); | 1441 | kvm_flush_remote_tlbs(vcpu->kvm); |
1442 | if (level > PT_PAGE_TABLE_LEVEL && need_sync) | ||
1443 | kvm_sync_pages(vcpu, gfn); | ||
1444 | |||
1359 | account_shadowed(vcpu->kvm, gfn); | 1445 | account_shadowed(vcpu->kvm, gfn); |
1360 | } | 1446 | } |
1361 | if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) | 1447 | if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) |
@@ -1402,6 +1488,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) | |||
1402 | --iterator->level; | 1488 | --iterator->level; |
1403 | } | 1489 | } |
1404 | 1490 | ||
1491 | static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) | ||
1492 | { | ||
1493 | u64 spte; | ||
1494 | |||
1495 | spte = __pa(sp->spt) | ||
1496 | | PT_PRESENT_MASK | PT_ACCESSED_MASK | ||
1497 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
1498 | __set_spte(sptep, spte); | ||
1499 | } | ||
1500 | |||
1501 | static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) | ||
1502 | { | ||
1503 | if (is_large_pte(*sptep)) { | ||
1504 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); | ||
1505 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1506 | } | ||
1507 | } | ||
1508 | |||
1509 | static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, | ||
1510 | unsigned direct_access) | ||
1511 | { | ||
1512 | if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { | ||
1513 | struct kvm_mmu_page *child; | ||
1514 | |||
1515 | /* | ||
1516 | * For the direct sp, if the guest pte's dirty bit | ||
1517 | * changed from clean to dirty, it will corrupt the | ||
1518 | * sp's access: writes would be allowed through the read-only sp, | ||
1519 | * so we should update the spte at this point to get | ||
1520 | * a new sp with the correct access. | ||
1521 | */ | ||
1522 | child = page_header(*sptep & PT64_BASE_ADDR_MASK); | ||
1523 | if (child->role.access == direct_access) | ||
1524 | return; | ||
1525 | |||
1526 | mmu_page_remove_parent_pte(child, sptep); | ||
1527 | __set_spte(sptep, shadow_trap_nonpresent_pte); | ||
1528 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1529 | } | ||
1530 | } | ||
1531 | |||
1405 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | 1532 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, |
1406 | struct kvm_mmu_page *sp) | 1533 | struct kvm_mmu_page *sp) |
1407 | { | 1534 | { |
@@ -1422,7 +1549,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |||
1422 | } else { | 1549 | } else { |
1423 | if (is_large_pte(ent)) | 1550 | if (is_large_pte(ent)) |
1424 | --kvm->stat.lpages; | 1551 | --kvm->stat.lpages; |
1425 | rmap_remove(kvm, &pt[i]); | 1552 | drop_spte(kvm, &pt[i], |
1553 | shadow_trap_nonpresent_pte); | ||
1426 | } | 1554 | } |
1427 | } | 1555 | } |
1428 | pt[i] = shadow_trap_nonpresent_pte; | 1556 | pt[i] = shadow_trap_nonpresent_pte; |
@@ -1464,7 +1592,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
1464 | } | 1592 | } |
1465 | 1593 | ||
1466 | static int mmu_zap_unsync_children(struct kvm *kvm, | 1594 | static int mmu_zap_unsync_children(struct kvm *kvm, |
1467 | struct kvm_mmu_page *parent) | 1595 | struct kvm_mmu_page *parent, |
1596 | struct list_head *invalid_list) | ||
1468 | { | 1597 | { |
1469 | int i, zapped = 0; | 1598 | int i, zapped = 0; |
1470 | struct mmu_page_path parents; | 1599 | struct mmu_page_path parents; |
@@ -1478,7 +1607,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm, | |||
1478 | struct kvm_mmu_page *sp; | 1607 | struct kvm_mmu_page *sp; |
1479 | 1608 | ||
1480 | for_each_sp(pages, sp, parents, i) { | 1609 | for_each_sp(pages, sp, parents, i) { |
1481 | kvm_mmu_zap_page(kvm, sp); | 1610 | kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); |
1482 | mmu_pages_clear_parents(&parents); | 1611 | mmu_pages_clear_parents(&parents); |
1483 | zapped++; | 1612 | zapped++; |
1484 | } | 1613 | } |
@@ -1488,32 +1617,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm, | |||
1488 | return zapped; | 1617 | return zapped; |
1489 | } | 1618 | } |
1490 | 1619 | ||
1491 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1620 | static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, |
1621 | struct list_head *invalid_list) | ||
1492 | { | 1622 | { |
1493 | int ret; | 1623 | int ret; |
1494 | 1624 | ||
1495 | trace_kvm_mmu_zap_page(sp); | 1625 | trace_kvm_mmu_prepare_zap_page(sp); |
1496 | ++kvm->stat.mmu_shadow_zapped; | 1626 | ++kvm->stat.mmu_shadow_zapped; |
1497 | ret = mmu_zap_unsync_children(kvm, sp); | 1627 | ret = mmu_zap_unsync_children(kvm, sp, invalid_list); |
1498 | kvm_mmu_page_unlink_children(kvm, sp); | 1628 | kvm_mmu_page_unlink_children(kvm, sp); |
1499 | kvm_mmu_unlink_parents(kvm, sp); | 1629 | kvm_mmu_unlink_parents(kvm, sp); |
1500 | kvm_flush_remote_tlbs(kvm); | ||
1501 | if (!sp->role.invalid && !sp->role.direct) | 1630 | if (!sp->role.invalid && !sp->role.direct) |
1502 | unaccount_shadowed(kvm, sp->gfn); | 1631 | unaccount_shadowed(kvm, sp->gfn); |
1503 | if (sp->unsync) | 1632 | if (sp->unsync) |
1504 | kvm_unlink_unsync_page(kvm, sp); | 1633 | kvm_unlink_unsync_page(kvm, sp); |
1505 | if (!sp->root_count) { | 1634 | if (!sp->root_count) { |
1506 | hlist_del(&sp->hash_link); | 1635 | /* Count self */ |
1507 | kvm_mmu_free_page(kvm, sp); | 1636 | ret++; |
1637 | list_move(&sp->link, invalid_list); | ||
1508 | } else { | 1638 | } else { |
1509 | sp->role.invalid = 1; | ||
1510 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | 1639 | list_move(&sp->link, &kvm->arch.active_mmu_pages); |
1511 | kvm_reload_remote_mmus(kvm); | 1640 | kvm_reload_remote_mmus(kvm); |
1512 | } | 1641 | } |
1642 | |||
1643 | sp->role.invalid = 1; | ||
1513 | kvm_mmu_reset_last_pte_updated(kvm); | 1644 | kvm_mmu_reset_last_pte_updated(kvm); |
1514 | return ret; | 1645 | return ret; |
1515 | } | 1646 | } |
1516 | 1647 | ||
1648 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, | ||
1649 | struct list_head *invalid_list) | ||
1650 | { | ||
1651 | struct kvm_mmu_page *sp; | ||
1652 | |||
1653 | if (list_empty(invalid_list)) | ||
1654 | return; | ||
1655 | |||
1656 | kvm_flush_remote_tlbs(kvm); | ||
1657 | |||
1658 | do { | ||
1659 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); | ||
1660 | WARN_ON(!sp->role.invalid || sp->root_count); | ||
1661 | kvm_mmu_free_page(kvm, sp); | ||
1662 | } while (!list_empty(invalid_list)); | ||
1663 | |||
1664 | } | ||
1665 | |||
1517 | /* | 1666 | /* |
1518 | * Changing the number of mmu pages allocated to the vm | 1667 | * Changing the number of mmu pages allocated to the vm |
1519 | * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock | 1668 | * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock |
@@ -1521,6 +1670,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
1521 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) | 1670 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) |
1522 | { | 1671 | { |
1523 | int used_pages; | 1672 | int used_pages; |
1673 | LIST_HEAD(invalid_list); | ||
1524 | 1674 | ||
1525 | used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; | 1675 | used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; |
1526 | used_pages = max(0, used_pages); | 1676 | used_pages = max(0, used_pages); |
@@ -1538,9 +1688,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) | |||
1538 | 1688 | ||
1539 | page = container_of(kvm->arch.active_mmu_pages.prev, | 1689 | page = container_of(kvm->arch.active_mmu_pages.prev, |
1540 | struct kvm_mmu_page, link); | 1690 | struct kvm_mmu_page, link); |
1541 | used_pages -= kvm_mmu_zap_page(kvm, page); | 1691 | used_pages -= kvm_mmu_prepare_zap_page(kvm, page, |
1542 | used_pages--; | 1692 | &invalid_list); |
1543 | } | 1693 | } |
1694 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | ||
1544 | kvm_nr_mmu_pages = used_pages; | 1695 | kvm_nr_mmu_pages = used_pages; |
1545 | kvm->arch.n_free_mmu_pages = 0; | 1696 | kvm->arch.n_free_mmu_pages = 0; |
1546 | } | 1697 | } |
@@ -1553,47 +1704,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) | |||
1553 | 1704 | ||
1554 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | 1705 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) |
1555 | { | 1706 | { |
1556 | unsigned index; | ||
1557 | struct hlist_head *bucket; | ||
1558 | struct kvm_mmu_page *sp; | 1707 | struct kvm_mmu_page *sp; |
1559 | struct hlist_node *node, *n; | 1708 | struct hlist_node *node; |
1709 | LIST_HEAD(invalid_list); | ||
1560 | int r; | 1710 | int r; |
1561 | 1711 | ||
1562 | pgprintk("%s: looking for gfn %lx\n", __func__, gfn); | 1712 | pgprintk("%s: looking for gfn %lx\n", __func__, gfn); |
1563 | r = 0; | 1713 | r = 0; |
1564 | index = kvm_page_table_hashfn(gfn); | 1714 | |
1565 | bucket = &kvm->arch.mmu_page_hash[index]; | 1715 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { |
1566 | restart: | 1716 | pgprintk("%s: gfn %lx role %x\n", __func__, gfn, |
1567 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) | 1717 | sp->role.word); |
1568 | if (sp->gfn == gfn && !sp->role.direct) { | 1718 | r = 1; |
1569 | pgprintk("%s: gfn %lx role %x\n", __func__, gfn, | 1719 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); |
1570 | sp->role.word); | 1720 | } |
1571 | r = 1; | 1721 | kvm_mmu_commit_zap_page(kvm, &invalid_list); |
1572 | if (kvm_mmu_zap_page(kvm, sp)) | ||
1573 | goto restart; | ||
1574 | } | ||
1575 | return r; | 1722 | return r; |
1576 | } | 1723 | } |
1577 | 1724 | ||
1578 | static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) | 1725 | static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) |
1579 | { | 1726 | { |
1580 | unsigned index; | ||
1581 | struct hlist_head *bucket; | ||
1582 | struct kvm_mmu_page *sp; | 1727 | struct kvm_mmu_page *sp; |
1583 | struct hlist_node *node, *nn; | 1728 | struct hlist_node *node; |
1729 | LIST_HEAD(invalid_list); | ||
1584 | 1730 | ||
1585 | index = kvm_page_table_hashfn(gfn); | 1731 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { |
1586 | bucket = &kvm->arch.mmu_page_hash[index]; | 1732 | pgprintk("%s: zap %lx %x\n", |
1587 | restart: | 1733 | __func__, gfn, sp->role.word); |
1588 | hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { | 1734 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); |
1589 | if (sp->gfn == gfn && !sp->role.direct | ||
1590 | && !sp->role.invalid) { | ||
1591 | pgprintk("%s: zap %lx %x\n", | ||
1592 | __func__, gfn, sp->role.word); | ||
1593 | if (kvm_mmu_zap_page(kvm, sp)) | ||
1594 | goto restart; | ||
1595 | } | ||
1596 | } | 1735 | } |
1736 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | ||
1597 | } | 1737 | } |
1598 | 1738 | ||
1599 | static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | 1739 | static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) |
@@ -1723,47 +1863,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) | |||
1723 | } | 1863 | } |
1724 | EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); | 1864 | EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); |
1725 | 1865 | ||
1726 | static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | 1866 | static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) |
1727 | { | 1867 | { |
1728 | unsigned index; | ||
1729 | struct hlist_head *bucket; | ||
1730 | struct kvm_mmu_page *s; | ||
1731 | struct hlist_node *node, *n; | ||
1732 | |||
1733 | index = kvm_page_table_hashfn(sp->gfn); | ||
1734 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | ||
1735 | /* don't unsync if pagetable is shadowed with multiple roles */ | ||
1736 | hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { | ||
1737 | if (s->gfn != sp->gfn || s->role.direct) | ||
1738 | continue; | ||
1739 | if (s->role.word != sp->role.word) | ||
1740 | return 1; | ||
1741 | } | ||
1742 | trace_kvm_mmu_unsync_page(sp); | 1868 | trace_kvm_mmu_unsync_page(sp); |
1743 | ++vcpu->kvm->stat.mmu_unsync; | 1869 | ++vcpu->kvm->stat.mmu_unsync; |
1744 | sp->unsync = 1; | 1870 | sp->unsync = 1; |
1745 | 1871 | ||
1746 | kvm_mmu_mark_parents_unsync(sp); | 1872 | kvm_mmu_mark_parents_unsync(sp); |
1747 | |||
1748 | mmu_convert_notrap(sp); | 1873 | mmu_convert_notrap(sp); |
1749 | return 0; | 1874 | } |
1875 | |||
1876 | static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
1877 | { | ||
1878 | struct kvm_mmu_page *s; | ||
1879 | struct hlist_node *node; | ||
1880 | |||
1881 | for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { | ||
1882 | if (s->unsync) | ||
1883 | continue; | ||
1884 | WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); | ||
1885 | __kvm_unsync_page(vcpu, s); | ||
1886 | } | ||
1750 | } | 1887 | } |
1751 | 1888 | ||
1752 | static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, | 1889 | static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, |
1753 | bool can_unsync) | 1890 | bool can_unsync) |
1754 | { | 1891 | { |
1755 | struct kvm_mmu_page *shadow; | 1892 | struct kvm_mmu_page *s; |
1893 | struct hlist_node *node; | ||
1894 | bool need_unsync = false; | ||
1756 | 1895 | ||
1757 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); | 1896 | for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { |
1758 | if (shadow) { | 1897 | if (!can_unsync) |
1759 | if (shadow->role.level != PT_PAGE_TABLE_LEVEL) | ||
1760 | return 1; | 1898 | return 1; |
1761 | if (shadow->unsync) | 1899 | |
1762 | return 0; | 1900 | if (s->role.level != PT_PAGE_TABLE_LEVEL) |
1763 | if (can_unsync && oos_shadow) | 1901 | return 1; |
1764 | return kvm_unsync_page(vcpu, shadow); | 1902 | |
1765 | return 1; | 1903 | if (!need_unsync && !s->unsync) { |
1904 | if (!oos_shadow) | ||
1905 | return 1; | ||
1906 | need_unsync = true; | ||
1907 | } | ||
1766 | } | 1908 | } |
1909 | if (need_unsync) | ||
1910 | kvm_unsync_pages(vcpu, gfn); | ||
1767 | return 0; | 1911 | return 0; |
1768 | } | 1912 | } |
1769 | 1913 | ||
@@ -1804,13 +1948,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1804 | spte |= (u64)pfn << PAGE_SHIFT; | 1948 | spte |= (u64)pfn << PAGE_SHIFT; |
1805 | 1949 | ||
1806 | if ((pte_access & ACC_WRITE_MASK) | 1950 | if ((pte_access & ACC_WRITE_MASK) |
1807 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | 1951 | || (!tdp_enabled && write_fault && !is_write_protection(vcpu) |
1952 | && !user_fault)) { | ||
1808 | 1953 | ||
1809 | if (level > PT_PAGE_TABLE_LEVEL && | 1954 | if (level > PT_PAGE_TABLE_LEVEL && |
1810 | has_wrprotected_page(vcpu->kvm, gfn, level)) { | 1955 | has_wrprotected_page(vcpu->kvm, gfn, level)) { |
1811 | ret = 1; | 1956 | ret = 1; |
1812 | spte = shadow_trap_nonpresent_pte; | 1957 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); |
1813 | goto set_pte; | 1958 | goto done; |
1814 | } | 1959 | } |
1815 | 1960 | ||
1816 | spte |= PT_WRITABLE_MASK; | 1961 | spte |= PT_WRITABLE_MASK; |
@@ -1841,7 +1986,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1841 | mark_page_dirty(vcpu->kvm, gfn); | 1986 | mark_page_dirty(vcpu->kvm, gfn); |
1842 | 1987 | ||
1843 | set_pte: | 1988 | set_pte: |
1844 | __set_spte(sptep, spte); | 1989 | if (is_writable_pte(*sptep) && !is_writable_pte(spte)) |
1990 | kvm_set_pfn_dirty(pfn); | ||
1991 | update_spte(sptep, spte); | ||
1992 | done: | ||
1845 | return ret; | 1993 | return ret; |
1846 | } | 1994 | } |
1847 | 1995 | ||
@@ -1853,7 +2001,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1853 | bool reset_host_protection) | 2001 | bool reset_host_protection) |
1854 | { | 2002 | { |
1855 | int was_rmapped = 0; | 2003 | int was_rmapped = 0; |
1856 | int was_writable = is_writable_pte(*sptep); | ||
1857 | int rmap_count; | 2004 | int rmap_count; |
1858 | 2005 | ||
1859 | pgprintk("%s: spte %llx access %x write_fault %d" | 2006 | pgprintk("%s: spte %llx access %x write_fault %d" |
@@ -1878,8 +2025,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1878 | } else if (pfn != spte_to_pfn(*sptep)) { | 2025 | } else if (pfn != spte_to_pfn(*sptep)) { |
1879 | pgprintk("hfn old %lx new %lx\n", | 2026 | pgprintk("hfn old %lx new %lx\n", |
1880 | spte_to_pfn(*sptep), pfn); | 2027 | spte_to_pfn(*sptep), pfn); |
1881 | rmap_remove(vcpu->kvm, sptep); | 2028 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); |
1882 | __set_spte(sptep, shadow_trap_nonpresent_pte); | ||
1883 | kvm_flush_remote_tlbs(vcpu->kvm); | 2029 | kvm_flush_remote_tlbs(vcpu->kvm); |
1884 | } else | 2030 | } else |
1885 | was_rmapped = 1; | 2031 | was_rmapped = 1; |
@@ -1890,7 +2036,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1890 | reset_host_protection)) { | 2036 | reset_host_protection)) { |
1891 | if (write_fault) | 2037 | if (write_fault) |
1892 | *ptwrite = 1; | 2038 | *ptwrite = 1; |
1893 | kvm_x86_ops->tlb_flush(vcpu); | 2039 | kvm_mmu_flush_tlb(vcpu); |
1894 | } | 2040 | } |
1895 | 2041 | ||
1896 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); | 2042 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); |
@@ -1904,15 +2050,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1904 | page_header_update_slot(vcpu->kvm, sptep, gfn); | 2050 | page_header_update_slot(vcpu->kvm, sptep, gfn); |
1905 | if (!was_rmapped) { | 2051 | if (!was_rmapped) { |
1906 | rmap_count = rmap_add(vcpu, sptep, gfn); | 2052 | rmap_count = rmap_add(vcpu, sptep, gfn); |
1907 | kvm_release_pfn_clean(pfn); | ||
1908 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) | 2053 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) |
1909 | rmap_recycle(vcpu, sptep, gfn); | 2054 | rmap_recycle(vcpu, sptep, gfn); |
1910 | } else { | ||
1911 | if (was_writable) | ||
1912 | kvm_release_pfn_dirty(pfn); | ||
1913 | else | ||
1914 | kvm_release_pfn_clean(pfn); | ||
1915 | } | 2055 | } |
2056 | kvm_release_pfn_clean(pfn); | ||
1916 | if (speculative) { | 2057 | if (speculative) { |
1917 | vcpu->arch.last_pte_updated = sptep; | 2058 | vcpu->arch.last_pte_updated = sptep; |
1918 | vcpu->arch.last_pte_gfn = gfn; | 2059 | vcpu->arch.last_pte_gfn = gfn; |
@@ -1941,7 +2082,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
1941 | } | 2082 | } |
1942 | 2083 | ||
1943 | if (*iterator.sptep == shadow_trap_nonpresent_pte) { | 2084 | if (*iterator.sptep == shadow_trap_nonpresent_pte) { |
1944 | pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; | 2085 | u64 base_addr = iterator.addr; |
2086 | |||
2087 | base_addr &= PT64_LVL_ADDR_MASK(iterator.level); | ||
2088 | pseudo_gfn = base_addr >> PAGE_SHIFT; | ||
1945 | sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, | 2089 | sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, |
1946 | iterator.level - 1, | 2090 | iterator.level - 1, |
1947 | 1, ACC_ALL, iterator.sptep); | 2091 | 1, ACC_ALL, iterator.sptep); |
@@ -1960,6 +2104,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
1960 | return pt_write; | 2104 | return pt_write; |
1961 | } | 2105 | } |
1962 | 2106 | ||
2107 | static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) | ||
2108 | { | ||
2109 | char buf[1]; | ||
2110 | void __user *hva; | ||
2111 | int r; | ||
2112 | |||
2113 | /* Touch the page, so that a SIGBUS is sent for the hwpoisoned page */ | ||
2114 | hva = (void __user *)gfn_to_hva(kvm, gfn); | ||
2115 | r = copy_from_user(buf, hva, 1); | ||
2116 | } | ||
2117 | |||
2118 | static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) | ||
2119 | { | ||
2120 | kvm_release_pfn_clean(pfn); | ||
2121 | if (is_hwpoison_pfn(pfn)) { | ||
2122 | kvm_send_hwpoison_signal(kvm, gfn); | ||
2123 | return 0; | ||
2124 | } else if (is_fault_pfn(pfn)) | ||
2125 | return -EFAULT; | ||
2126 | |||
2127 | return 1; | ||
2128 | } | ||
2129 | |||
1963 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | 2130 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) |
1964 | { | 2131 | { |
1965 | int r; | 2132 | int r; |
@@ -1983,10 +2150,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
1983 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2150 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1984 | 2151 | ||
1985 | /* mmio */ | 2152 | /* mmio */ |
1986 | if (is_error_pfn(pfn)) { | 2153 | if (is_error_pfn(pfn)) |
1987 | kvm_release_pfn_clean(pfn); | 2154 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); |
1988 | return 1; | ||
1989 | } | ||
1990 | 2155 | ||
1991 | spin_lock(&vcpu->kvm->mmu_lock); | 2156 | spin_lock(&vcpu->kvm->mmu_lock); |
1992 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 2157 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
@@ -2009,6 +2174,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
2009 | { | 2174 | { |
2010 | int i; | 2175 | int i; |
2011 | struct kvm_mmu_page *sp; | 2176 | struct kvm_mmu_page *sp; |
2177 | LIST_HEAD(invalid_list); | ||
2012 | 2178 | ||
2013 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | 2179 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
2014 | return; | 2180 | return; |
@@ -2018,8 +2184,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
2018 | 2184 | ||
2019 | sp = page_header(root); | 2185 | sp = page_header(root); |
2020 | --sp->root_count; | 2186 | --sp->root_count; |
2021 | if (!sp->root_count && sp->role.invalid) | 2187 | if (!sp->root_count && sp->role.invalid) { |
2022 | kvm_mmu_zap_page(vcpu->kvm, sp); | 2188 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); |
2189 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
2190 | } | ||
2023 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | 2191 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; |
2024 | spin_unlock(&vcpu->kvm->mmu_lock); | 2192 | spin_unlock(&vcpu->kvm->mmu_lock); |
2025 | return; | 2193 | return; |
@@ -2032,10 +2200,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
2032 | sp = page_header(root); | 2200 | sp = page_header(root); |
2033 | --sp->root_count; | 2201 | --sp->root_count; |
2034 | if (!sp->root_count && sp->role.invalid) | 2202 | if (!sp->root_count && sp->role.invalid) |
2035 | kvm_mmu_zap_page(vcpu->kvm, sp); | 2203 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, |
2204 | &invalid_list); | ||
2036 | } | 2205 | } |
2037 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; | 2206 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; |
2038 | } | 2207 | } |
2208 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
2039 | spin_unlock(&vcpu->kvm->mmu_lock); | 2209 | spin_unlock(&vcpu->kvm->mmu_lock); |
2040 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | 2210 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; |
2041 | } | 2211 | } |
@@ -2045,7 +2215,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) | |||
2045 | int ret = 0; | 2215 | int ret = 0; |
2046 | 2216 | ||
2047 | if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { | 2217 | if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { |
2048 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); | 2218 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
2049 | ret = 1; | 2219 | ret = 1; |
2050 | } | 2220 | } |
2051 | 2221 | ||
@@ -2073,6 +2243,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
2073 | root_gfn = 0; | 2243 | root_gfn = 0; |
2074 | } | 2244 | } |
2075 | spin_lock(&vcpu->kvm->mmu_lock); | 2245 | spin_lock(&vcpu->kvm->mmu_lock); |
2246 | kvm_mmu_free_some_pages(vcpu); | ||
2076 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, | 2247 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, |
2077 | PT64_ROOT_LEVEL, direct, | 2248 | PT64_ROOT_LEVEL, direct, |
2078 | ACC_ALL, NULL); | 2249 | ACC_ALL, NULL); |
@@ -2103,6 +2274,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
2103 | root_gfn = i << 30; | 2274 | root_gfn = i << 30; |
2104 | } | 2275 | } |
2105 | spin_lock(&vcpu->kvm->mmu_lock); | 2276 | spin_lock(&vcpu->kvm->mmu_lock); |
2277 | kvm_mmu_free_some_pages(vcpu); | ||
2106 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | 2278 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, |
2107 | PT32_ROOT_LEVEL, direct, | 2279 | PT32_ROOT_LEVEL, direct, |
2108 | ACC_ALL, NULL); | 2280 | ACC_ALL, NULL); |
@@ -2198,10 +2370,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
2198 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2370 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2199 | smp_rmb(); | 2371 | smp_rmb(); |
2200 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2372 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
2201 | if (is_error_pfn(pfn)) { | 2373 | if (is_error_pfn(pfn)) |
2202 | kvm_release_pfn_clean(pfn); | 2374 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); |
2203 | return 1; | ||
2204 | } | ||
2205 | spin_lock(&vcpu->kvm->mmu_lock); | 2375 | spin_lock(&vcpu->kvm->mmu_lock); |
2206 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 2376 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
2207 | goto out_unlock; | 2377 | goto out_unlock; |
@@ -2243,7 +2413,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) | |||
2243 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | 2413 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) |
2244 | { | 2414 | { |
2245 | ++vcpu->stat.tlb_flush; | 2415 | ++vcpu->stat.tlb_flush; |
2246 | kvm_x86_ops->tlb_flush(vcpu); | 2416 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); |
2247 | } | 2417 | } |
2248 | 2418 | ||
2249 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | 2419 | static void paging_new_cr3(struct kvm_vcpu *vcpu) |
@@ -2457,10 +2627,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) | |||
2457 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) | 2627 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) |
2458 | { | 2628 | { |
2459 | ASSERT(vcpu); | 2629 | ASSERT(vcpu); |
2460 | if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { | 2630 | if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
2631 | /* mmu.free() should set root_hpa = INVALID_PAGE */ | ||
2461 | vcpu->arch.mmu.free(vcpu); | 2632 | vcpu->arch.mmu.free(vcpu); |
2462 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
2463 | } | ||
2464 | } | 2633 | } |
2465 | 2634 | ||
2466 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) | 2635 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) |
@@ -2477,9 +2646,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) | |||
2477 | r = mmu_topup_memory_caches(vcpu); | 2646 | r = mmu_topup_memory_caches(vcpu); |
2478 | if (r) | 2647 | if (r) |
2479 | goto out; | 2648 | goto out; |
2480 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2481 | kvm_mmu_free_some_pages(vcpu); | ||
2482 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2483 | r = mmu_alloc_roots(vcpu); | 2649 | r = mmu_alloc_roots(vcpu); |
2484 | spin_lock(&vcpu->kvm->mmu_lock); | 2650 | spin_lock(&vcpu->kvm->mmu_lock); |
2485 | mmu_sync_roots(vcpu); | 2651 | mmu_sync_roots(vcpu); |
@@ -2508,7 +2674,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | |||
2508 | pte = *spte; | 2674 | pte = *spte; |
2509 | if (is_shadow_present_pte(pte)) { | 2675 | if (is_shadow_present_pte(pte)) { |
2510 | if (is_last_spte(pte, sp->role.level)) | 2676 | if (is_last_spte(pte, sp->role.level)) |
2511 | rmap_remove(vcpu->kvm, spte); | 2677 | drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); |
2512 | else { | 2678 | else { |
2513 | child = page_header(pte & PT64_BASE_ADDR_MASK); | 2679 | child = page_header(pte & PT64_BASE_ADDR_MASK); |
2514 | mmu_page_remove_parent_pte(child, spte); | 2680 | mmu_page_remove_parent_pte(child, spte); |
@@ -2529,6 +2695,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | |||
2529 | return; | 2695 | return; |
2530 | } | 2696 | } |
2531 | 2697 | ||
2698 | if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) | ||
2699 | return; | ||
2700 | |||
2532 | ++vcpu->kvm->stat.mmu_pte_updated; | 2701 | ++vcpu->kvm->stat.mmu_pte_updated; |
2533 | if (!sp->role.cr4_pae) | 2702 | if (!sp->role.cr4_pae) |
2534 | paging32_update_pte(vcpu, sp, spte, new); | 2703 | paging32_update_pte(vcpu, sp, spte, new); |
@@ -2549,11 +2718,15 @@ static bool need_remote_flush(u64 old, u64 new) | |||
2549 | return (old & ~new & PT64_PERM_MASK) != 0; | 2718 | return (old & ~new & PT64_PERM_MASK) != 0; |
2550 | } | 2719 | } |
2551 | 2720 | ||
2552 | static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) | 2721 | static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, |
2722 | bool remote_flush, bool local_flush) | ||
2553 | { | 2723 | { |
2554 | if (need_remote_flush(old, new)) | 2724 | if (zap_page) |
2725 | return; | ||
2726 | |||
2727 | if (remote_flush) | ||
2555 | kvm_flush_remote_tlbs(vcpu->kvm); | 2728 | kvm_flush_remote_tlbs(vcpu->kvm); |
2556 | else | 2729 | else if (local_flush) |
2557 | kvm_mmu_flush_tlb(vcpu); | 2730 | kvm_mmu_flush_tlb(vcpu); |
2558 | } | 2731 | } |
2559 | 2732 | ||
@@ -2603,10 +2776,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2603 | bool guest_initiated) | 2776 | bool guest_initiated) |
2604 | { | 2777 | { |
2605 | gfn_t gfn = gpa >> PAGE_SHIFT; | 2778 | gfn_t gfn = gpa >> PAGE_SHIFT; |
2779 | union kvm_mmu_page_role mask = { .word = 0 }; | ||
2606 | struct kvm_mmu_page *sp; | 2780 | struct kvm_mmu_page *sp; |
2607 | struct hlist_node *node, *n; | 2781 | struct hlist_node *node; |
2608 | struct hlist_head *bucket; | 2782 | LIST_HEAD(invalid_list); |
2609 | unsigned index; | ||
2610 | u64 entry, gentry; | 2783 | u64 entry, gentry; |
2611 | u64 *spte; | 2784 | u64 *spte; |
2612 | unsigned offset = offset_in_page(gpa); | 2785 | unsigned offset = offset_in_page(gpa); |
@@ -2619,6 +2792,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2619 | int npte; | 2792 | int npte; |
2620 | int r; | 2793 | int r; |
2621 | int invlpg_counter; | 2794 | int invlpg_counter; |
2795 | bool remote_flush, local_flush, zap_page; | ||
2796 | |||
2797 | zap_page = remote_flush = local_flush = false; | ||
2622 | 2798 | ||
2623 | pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); | 2799 | pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); |
2624 | 2800 | ||
@@ -2674,13 +2850,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2674 | vcpu->arch.last_pte_updated = NULL; | 2850 | vcpu->arch.last_pte_updated = NULL; |
2675 | } | 2851 | } |
2676 | } | 2852 | } |
2677 | index = kvm_page_table_hashfn(gfn); | ||
2678 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | ||
2679 | 2853 | ||
2680 | restart: | 2854 | mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; |
2681 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { | 2855 | for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { |
2682 | if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) | ||
2683 | continue; | ||
2684 | pte_size = sp->role.cr4_pae ? 8 : 4; | 2856 | pte_size = sp->role.cr4_pae ? 8 : 4; |
2685 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | 2857 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); |
2686 | misaligned |= bytes < 4; | 2858 | misaligned |= bytes < 4; |
@@ -2697,8 +2869,8 @@ restart: | |||
2697 | */ | 2869 | */ |
2698 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | 2870 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", |
2699 | gpa, bytes, sp->role.word); | 2871 | gpa, bytes, sp->role.word); |
2700 | if (kvm_mmu_zap_page(vcpu->kvm, sp)) | 2872 | zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, |
2701 | goto restart; | 2873 | &invalid_list); |
2702 | ++vcpu->kvm->stat.mmu_flooded; | 2874 | ++vcpu->kvm->stat.mmu_flooded; |
2703 | continue; | 2875 | continue; |
2704 | } | 2876 | } |
@@ -2722,16 +2894,22 @@ restart: | |||
2722 | if (quadrant != sp->role.quadrant) | 2894 | if (quadrant != sp->role.quadrant) |
2723 | continue; | 2895 | continue; |
2724 | } | 2896 | } |
2897 | local_flush = true; | ||
2725 | spte = &sp->spt[page_offset / sizeof(*spte)]; | 2898 | spte = &sp->spt[page_offset / sizeof(*spte)]; |
2726 | while (npte--) { | 2899 | while (npte--) { |
2727 | entry = *spte; | 2900 | entry = *spte; |
2728 | mmu_pte_write_zap_pte(vcpu, sp, spte); | 2901 | mmu_pte_write_zap_pte(vcpu, sp, spte); |
2729 | if (gentry) | 2902 | if (gentry && |
2903 | !((sp->role.word ^ vcpu->arch.mmu.base_role.word) | ||
2904 | & mask.word)) | ||
2730 | mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); | 2905 | mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); |
2731 | mmu_pte_write_flush_tlb(vcpu, entry, *spte); | 2906 | if (!remote_flush && need_remote_flush(entry, *spte)) |
2907 | remote_flush = true; | ||
2732 | ++spte; | 2908 | ++spte; |
2733 | } | 2909 | } |
2734 | } | 2910 | } |
2911 | mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); | ||
2912 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
2735 | kvm_mmu_audit(vcpu, "post pte write"); | 2913 | kvm_mmu_audit(vcpu, "post pte write"); |
2736 | spin_unlock(&vcpu->kvm->mmu_lock); | 2914 | spin_unlock(&vcpu->kvm->mmu_lock); |
2737 | if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { | 2915 | if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { |
@@ -2759,15 +2937,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); | |||
2759 | 2937 | ||
2760 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | 2938 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) |
2761 | { | 2939 | { |
2762 | while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && | 2940 | int free_pages; |
2941 | LIST_HEAD(invalid_list); | ||
2942 | |||
2943 | free_pages = vcpu->kvm->arch.n_free_mmu_pages; | ||
2944 | while (free_pages < KVM_REFILL_PAGES && | ||
2763 | !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { | 2945 | !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { |
2764 | struct kvm_mmu_page *sp; | 2946 | struct kvm_mmu_page *sp; |
2765 | 2947 | ||
2766 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | 2948 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, |
2767 | struct kvm_mmu_page, link); | 2949 | struct kvm_mmu_page, link); |
2768 | kvm_mmu_zap_page(vcpu->kvm, sp); | 2950 | free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, |
2951 | &invalid_list); | ||
2769 | ++vcpu->kvm->stat.mmu_recycled; | 2952 | ++vcpu->kvm->stat.mmu_recycled; |
2770 | } | 2953 | } |
2954 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
2771 | } | 2955 | } |
2772 | 2956 | ||
2773 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | 2957 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) |
@@ -2795,11 +2979,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | |||
2795 | return 1; | 2979 | return 1; |
2796 | case EMULATE_DO_MMIO: | 2980 | case EMULATE_DO_MMIO: |
2797 | ++vcpu->stat.mmio_exits; | 2981 | ++vcpu->stat.mmio_exits; |
2798 | return 0; | 2982 | /* fall through */ |
2799 | case EMULATE_FAIL: | 2983 | case EMULATE_FAIL: |
2800 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
2801 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | ||
2802 | vcpu->run->internal.ndata = 0; | ||
2803 | return 0; | 2984 | return 0; |
2804 | default: | 2985 | default: |
2805 | BUG(); | 2986 | BUG(); |
@@ -2896,7 +3077,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
2896 | pt = sp->spt; | 3077 | pt = sp->spt; |
2897 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | 3078 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) |
2898 | /* avoid RMW */ | 3079 | /* avoid RMW */ |
2899 | if (pt[i] & PT_WRITABLE_MASK) | 3080 | if (is_writable_pte(pt[i])) |
2900 | pt[i] &= ~PT_WRITABLE_MASK; | 3081 | pt[i] &= ~PT_WRITABLE_MASK; |
2901 | } | 3082 | } |
2902 | kvm_flush_remote_tlbs(kvm); | 3083 | kvm_flush_remote_tlbs(kvm); |
@@ -2905,25 +3086,26 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
2905 | void kvm_mmu_zap_all(struct kvm *kvm) | 3086 | void kvm_mmu_zap_all(struct kvm *kvm) |
2906 | { | 3087 | { |
2907 | struct kvm_mmu_page *sp, *node; | 3088 | struct kvm_mmu_page *sp, *node; |
3089 | LIST_HEAD(invalid_list); | ||
2908 | 3090 | ||
2909 | spin_lock(&kvm->mmu_lock); | 3091 | spin_lock(&kvm->mmu_lock); |
2910 | restart: | 3092 | restart: |
2911 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) | 3093 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) |
2912 | if (kvm_mmu_zap_page(kvm, sp)) | 3094 | if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) |
2913 | goto restart; | 3095 | goto restart; |
2914 | 3096 | ||
3097 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | ||
2915 | spin_unlock(&kvm->mmu_lock); | 3098 | spin_unlock(&kvm->mmu_lock); |
2916 | |||
2917 | kvm_flush_remote_tlbs(kvm); | ||
2918 | } | 3099 | } |
2919 | 3100 | ||
2920 | static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) | 3101 | static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, |
3102 | struct list_head *invalid_list) | ||
2921 | { | 3103 | { |
2922 | struct kvm_mmu_page *page; | 3104 | struct kvm_mmu_page *page; |
2923 | 3105 | ||
2924 | page = container_of(kvm->arch.active_mmu_pages.prev, | 3106 | page = container_of(kvm->arch.active_mmu_pages.prev, |
2925 | struct kvm_mmu_page, link); | 3107 | struct kvm_mmu_page, link); |
2926 | return kvm_mmu_zap_page(kvm, page) + 1; | 3108 | return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); |
2927 | } | 3109 | } |
2928 | 3110 | ||
2929 | static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) | 3111 | static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) |
@@ -2936,6 +3118,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) | |||
2936 | 3118 | ||
2937 | list_for_each_entry(kvm, &vm_list, vm_list) { | 3119 | list_for_each_entry(kvm, &vm_list, vm_list) { |
2938 | int npages, idx, freed_pages; | 3120 | int npages, idx, freed_pages; |
3121 | LIST_HEAD(invalid_list); | ||
2939 | 3122 | ||
2940 | idx = srcu_read_lock(&kvm->srcu); | 3123 | idx = srcu_read_lock(&kvm->srcu); |
2941 | spin_lock(&kvm->mmu_lock); | 3124 | spin_lock(&kvm->mmu_lock); |
@@ -2943,12 +3126,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) | |||
2943 | kvm->arch.n_free_mmu_pages; | 3126 | kvm->arch.n_free_mmu_pages; |
2944 | cache_count += npages; | 3127 | cache_count += npages; |
2945 | if (!kvm_freed && nr_to_scan > 0 && npages > 0) { | 3128 | if (!kvm_freed && nr_to_scan > 0 && npages > 0) { |
2946 | freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); | 3129 | freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, |
3130 | &invalid_list); | ||
2947 | cache_count -= freed_pages; | 3131 | cache_count -= freed_pages; |
2948 | kvm_freed = kvm; | 3132 | kvm_freed = kvm; |
2949 | } | 3133 | } |
2950 | nr_to_scan--; | 3134 | nr_to_scan--; |
2951 | 3135 | ||
3136 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | ||
2952 | spin_unlock(&kvm->mmu_lock); | 3137 | spin_unlock(&kvm->mmu_lock); |
2953 | srcu_read_unlock(&kvm->srcu, idx); | 3138 | srcu_read_unlock(&kvm->srcu, idx); |
2954 | } | 3139 | } |
@@ -3074,7 +3259,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, | |||
3074 | 3259 | ||
3075 | static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) | 3260 | static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) |
3076 | { | 3261 | { |
3077 | kvm_set_cr3(vcpu, vcpu->arch.cr3); | 3262 | (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); |
3078 | return 1; | 3263 | return 1; |
3079 | } | 3264 | } |
3080 | 3265 | ||
@@ -3331,9 +3516,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | |||
3331 | struct kvm_mmu_page *rev_sp; | 3516 | struct kvm_mmu_page *rev_sp; |
3332 | gfn_t gfn; | 3517 | gfn_t gfn; |
3333 | 3518 | ||
3334 | if (*sptep & PT_WRITABLE_MASK) { | 3519 | if (is_writable_pte(*sptep)) { |
3335 | rev_sp = page_header(__pa(sptep)); | 3520 | rev_sp = page_header(__pa(sptep)); |
3336 | gfn = rev_sp->gfns[sptep - rev_sp->spt]; | 3521 | gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); |
3337 | 3522 | ||
3338 | if (!gfn_to_memslot(kvm, gfn)) { | 3523 | if (!gfn_to_memslot(kvm, gfn)) { |
3339 | if (!printk_ratelimit()) | 3524 | if (!printk_ratelimit()) |
@@ -3347,8 +3532,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | |||
3347 | return; | 3532 | return; |
3348 | } | 3533 | } |
3349 | 3534 | ||
3350 | rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], | 3535 | rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); |
3351 | rev_sp->role.level); | ||
3352 | if (!*rmapp) { | 3536 | if (!*rmapp) { |
3353 | if (!printk_ratelimit()) | 3537 | if (!printk_ratelimit()) |
3354 | return; | 3538 | return; |
@@ -3381,7 +3565,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) | |||
3381 | 3565 | ||
3382 | if (!(ent & PT_PRESENT_MASK)) | 3566 | if (!(ent & PT_PRESENT_MASK)) |
3383 | continue; | 3567 | continue; |
3384 | if (!(ent & PT_WRITABLE_MASK)) | 3568 | if (!is_writable_pte(ent)) |
3385 | continue; | 3569 | continue; |
3386 | inspect_spte_has_rmap(vcpu->kvm, &pt[i]); | 3570 | inspect_spte_has_rmap(vcpu->kvm, &pt[i]); |
3387 | } | 3571 | } |
@@ -3409,13 +3593,12 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) | |||
3409 | if (sp->unsync) | 3593 | if (sp->unsync) |
3410 | continue; | 3594 | continue; |
3411 | 3595 | ||
3412 | gfn = unalias_gfn(vcpu->kvm, sp->gfn); | 3596 | slot = gfn_to_memslot(vcpu->kvm, sp->gfn); |
3413 | slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); | ||
3414 | rmapp = &slot->rmap[gfn - slot->base_gfn]; | 3597 | rmapp = &slot->rmap[gfn - slot->base_gfn]; |
3415 | 3598 | ||
3416 | spte = rmap_next(vcpu->kvm, rmapp, NULL); | 3599 | spte = rmap_next(vcpu->kvm, rmapp, NULL); |
3417 | while (spte) { | 3600 | while (spte) { |
3418 | if (*spte & PT_WRITABLE_MASK) | 3601 | if (is_writable_pte(*spte)) |
3419 | printk(KERN_ERR "%s: (%s) shadow page has " | 3602 | printk(KERN_ERR "%s: (%s) shadow page has " |
3420 | "writable mappings: gfn %lx role %x\n", | 3603 | "writable mappings: gfn %lx role %x\n", |
3421 | __func__, audit_msg, sp->gfn, | 3604 | __func__, audit_msg, sp->gfn, |
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 42f07b1bfbc9..3aab0f0930ef 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h | |||
@@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, | |||
190 | TP_ARGS(sp) | 190 | TP_ARGS(sp) |
191 | ); | 191 | ); |
192 | 192 | ||
193 | DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, | 193 | DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, |
194 | TP_PROTO(struct kvm_mmu_page *sp), | 194 | TP_PROTO(struct kvm_mmu_page *sp), |
195 | 195 | ||
196 | TP_ARGS(sp) | 196 | TP_ARGS(sp) |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2331bdc2b549..51ef9097960d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -7,6 +7,7 @@ | |||
7 | * MMU support | 7 | * MMU support |
8 | * | 8 | * |
9 | * Copyright (C) 2006 Qumranet, Inc. | 9 | * Copyright (C) 2006 Qumranet, Inc. |
10 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. | ||
10 | * | 11 | * |
11 | * Authors: | 12 | * Authors: |
12 | * Yaniv Kamay <yaniv@qumranet.com> | 13 | * Yaniv Kamay <yaniv@qumranet.com> |
@@ -118,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker, | |||
118 | { | 119 | { |
119 | pt_element_t pte; | 120 | pt_element_t pte; |
120 | gfn_t table_gfn; | 121 | gfn_t table_gfn; |
121 | unsigned index, pt_access, pte_access; | 122 | unsigned index, pt_access, uninitialized_var(pte_access); |
122 | gpa_t pte_gpa; | 123 | gpa_t pte_gpa; |
123 | int rsvd_fault = 0; | 124 | bool eperm, present, rsvd_fault; |
124 | 125 | ||
125 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, | 126 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, |
126 | fetch_fault); | 127 | fetch_fault); |
127 | walk: | 128 | walk: |
129 | present = true; | ||
130 | eperm = rsvd_fault = false; | ||
128 | walker->level = vcpu->arch.mmu.root_level; | 131 | walker->level = vcpu->arch.mmu.root_level; |
129 | pte = vcpu->arch.cr3; | 132 | pte = vcpu->arch.cr3; |
130 | #if PTTYPE == 64 | 133 | #if PTTYPE == 64 |
131 | if (!is_long_mode(vcpu)) { | 134 | if (!is_long_mode(vcpu)) { |
132 | pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); | 135 | pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); |
133 | trace_kvm_mmu_paging_element(pte, walker->level); | 136 | trace_kvm_mmu_paging_element(pte, walker->level); |
134 | if (!is_present_gpte(pte)) | 137 | if (!is_present_gpte(pte)) { |
135 | goto not_present; | 138 | present = false; |
139 | goto error; | ||
140 | } | ||
136 | --walker->level; | 141 | --walker->level; |
137 | } | 142 | } |
138 | #endif | 143 | #endif |
@@ -150,37 +155,42 @@ walk: | |||
150 | walker->table_gfn[walker->level - 1] = table_gfn; | 155 | walker->table_gfn[walker->level - 1] = table_gfn; |
151 | walker->pte_gpa[walker->level - 1] = pte_gpa; | 156 | walker->pte_gpa[walker->level - 1] = pte_gpa; |
152 | 157 | ||
153 | if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) | 158 | if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { |
154 | goto not_present; | 159 | present = false; |
160 | break; | ||
161 | } | ||
155 | 162 | ||
156 | trace_kvm_mmu_paging_element(pte, walker->level); | 163 | trace_kvm_mmu_paging_element(pte, walker->level); |
157 | 164 | ||
158 | if (!is_present_gpte(pte)) | 165 | if (!is_present_gpte(pte)) { |
159 | goto not_present; | 166 | present = false; |
167 | break; | ||
168 | } | ||
160 | 169 | ||
161 | rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); | 170 | if (is_rsvd_bits_set(vcpu, pte, walker->level)) { |
162 | if (rsvd_fault) | 171 | rsvd_fault = true; |
163 | goto access_error; | 172 | break; |
173 | } | ||
164 | 174 | ||
165 | if (write_fault && !is_writable_pte(pte)) | 175 | if (write_fault && !is_writable_pte(pte)) |
166 | if (user_fault || is_write_protection(vcpu)) | 176 | if (user_fault || is_write_protection(vcpu)) |
167 | goto access_error; | 177 | eperm = true; |
168 | 178 | ||
169 | if (user_fault && !(pte & PT_USER_MASK)) | 179 | if (user_fault && !(pte & PT_USER_MASK)) |
170 | goto access_error; | 180 | eperm = true; |
171 | 181 | ||
172 | #if PTTYPE == 64 | 182 | #if PTTYPE == 64 |
173 | if (fetch_fault && (pte & PT64_NX_MASK)) | 183 | if (fetch_fault && (pte & PT64_NX_MASK)) |
174 | goto access_error; | 184 | eperm = true; |
175 | #endif | 185 | #endif |
176 | 186 | ||
177 | if (!(pte & PT_ACCESSED_MASK)) { | 187 | if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { |
178 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, | 188 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, |
179 | sizeof(pte)); | 189 | sizeof(pte)); |
180 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
181 | if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, | 190 | if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, |
182 | index, pte, pte|PT_ACCESSED_MASK)) | 191 | index, pte, pte|PT_ACCESSED_MASK)) |
183 | goto walk; | 192 | goto walk; |
193 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
184 | pte |= PT_ACCESSED_MASK; | 194 | pte |= PT_ACCESSED_MASK; |
185 | } | 195 | } |
186 | 196 | ||
@@ -213,15 +223,18 @@ walk: | |||
213 | --walker->level; | 223 | --walker->level; |
214 | } | 224 | } |
215 | 225 | ||
226 | if (!present || eperm || rsvd_fault) | ||
227 | goto error; | ||
228 | |||
216 | if (write_fault && !is_dirty_gpte(pte)) { | 229 | if (write_fault && !is_dirty_gpte(pte)) { |
217 | bool ret; | 230 | bool ret; |
218 | 231 | ||
219 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); | 232 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); |
220 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
221 | ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, | 233 | ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, |
222 | pte|PT_DIRTY_MASK); | 234 | pte|PT_DIRTY_MASK); |
223 | if (ret) | 235 | if (ret) |
224 | goto walk; | 236 | goto walk; |
237 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
225 | pte |= PT_DIRTY_MASK; | 238 | pte |= PT_DIRTY_MASK; |
226 | walker->ptes[walker->level - 1] = pte; | 239 | walker->ptes[walker->level - 1] = pte; |
227 | } | 240 | } |
@@ -229,22 +242,18 @@ walk: | |||
229 | walker->pt_access = pt_access; | 242 | walker->pt_access = pt_access; |
230 | walker->pte_access = pte_access; | 243 | walker->pte_access = pte_access; |
231 | pgprintk("%s: pte %llx pte_access %x pt_access %x\n", | 244 | pgprintk("%s: pte %llx pte_access %x pt_access %x\n", |
232 | __func__, (u64)pte, pt_access, pte_access); | 245 | __func__, (u64)pte, pte_access, pt_access); |
233 | return 1; | 246 | return 1; |
234 | 247 | ||
235 | not_present: | 248 | error: |
236 | walker->error_code = 0; | 249 | walker->error_code = 0; |
237 | goto err; | 250 | if (present) |
238 | 251 | walker->error_code |= PFERR_PRESENT_MASK; | |
239 | access_error: | ||
240 | walker->error_code = PFERR_PRESENT_MASK; | ||
241 | |||
242 | err: | ||
243 | if (write_fault) | 252 | if (write_fault) |
244 | walker->error_code |= PFERR_WRITE_MASK; | 253 | walker->error_code |= PFERR_WRITE_MASK; |
245 | if (user_fault) | 254 | if (user_fault) |
246 | walker->error_code |= PFERR_USER_MASK; | 255 | walker->error_code |= PFERR_USER_MASK; |
247 | if (fetch_fault) | 256 | if (fetch_fault && is_nx(vcpu)) |
248 | walker->error_code |= PFERR_FETCH_MASK; | 257 | walker->error_code |= PFERR_FETCH_MASK; |
249 | if (rsvd_fault) | 258 | if (rsvd_fault) |
250 | walker->error_code |= PFERR_RSVD_MASK; | 259 | walker->error_code |= PFERR_RSVD_MASK; |
@@ -252,7 +261,7 @@ err: | |||
252 | return 0; | 261 | return 0; |
253 | } | 262 | } |
254 | 263 | ||
255 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | 264 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
256 | u64 *spte, const void *pte) | 265 | u64 *spte, const void *pte) |
257 | { | 266 | { |
258 | pt_element_t gpte; | 267 | pt_element_t gpte; |
@@ -263,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
263 | gpte = *(const pt_element_t *)pte; | 272 | gpte = *(const pt_element_t *)pte; |
264 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { | 273 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { |
265 | if (!is_present_gpte(gpte)) { | 274 | if (!is_present_gpte(gpte)) { |
266 | if (page->unsync) | 275 | if (sp->unsync) |
267 | new_spte = shadow_trap_nonpresent_pte; | 276 | new_spte = shadow_trap_nonpresent_pte; |
268 | else | 277 | else |
269 | new_spte = shadow_notrap_nonpresent_pte; | 278 | new_spte = shadow_notrap_nonpresent_pte; |
@@ -272,7 +281,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
272 | return; | 281 | return; |
273 | } | 282 | } |
274 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); | 283 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); |
275 | pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); | 284 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
276 | if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) | 285 | if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) |
277 | return; | 286 | return; |
278 | pfn = vcpu->arch.update_pte.pfn; | 287 | pfn = vcpu->arch.update_pte.pfn; |
@@ -285,11 +294,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
285 | * we call mmu_set_spte() with reset_host_protection = true because | 294 | * we call mmu_set_spte() with reset_host_protection = true because |
286 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). | 295 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). |
287 | */ | 296 | */ |
288 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, | 297 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, |
289 | gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, | 298 | is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, |
290 | gpte_to_gfn(gpte), pfn, true, true); | 299 | gpte_to_gfn(gpte), pfn, true, true); |
291 | } | 300 | } |
292 | 301 | ||
302 | static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, | ||
303 | struct guest_walker *gw, int level) | ||
304 | { | ||
305 | int r; | ||
306 | pt_element_t curr_pte; | ||
307 | |||
308 | r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], | ||
309 | &curr_pte, sizeof(curr_pte)); | ||
310 | return r || curr_pte != gw->ptes[level - 1]; | ||
311 | } | ||
312 | |||
293 | /* | 313 | /* |
294 | * Fetch a shadow pte for a specific level in the paging hierarchy. | 314 | * Fetch a shadow pte for a specific level in the paging hierarchy. |
295 | */ | 315 | */ |
@@ -299,75 +319,86 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
299 | int *ptwrite, pfn_t pfn) | 319 | int *ptwrite, pfn_t pfn) |
300 | { | 320 | { |
301 | unsigned access = gw->pt_access; | 321 | unsigned access = gw->pt_access; |
302 | struct kvm_mmu_page *shadow_page; | 322 | struct kvm_mmu_page *sp = NULL; |
303 | u64 spte, *sptep = NULL; | 323 | bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); |
304 | int direct; | 324 | int top_level; |
305 | gfn_t table_gfn; | 325 | unsigned direct_access; |
306 | int r; | 326 | struct kvm_shadow_walk_iterator it; |
307 | int level; | ||
308 | pt_element_t curr_pte; | ||
309 | struct kvm_shadow_walk_iterator iterator; | ||
310 | 327 | ||
311 | if (!is_present_gpte(gw->ptes[gw->level - 1])) | 328 | if (!is_present_gpte(gw->ptes[gw->level - 1])) |
312 | return NULL; | 329 | return NULL; |
313 | 330 | ||
314 | for_each_shadow_entry(vcpu, addr, iterator) { | 331 | direct_access = gw->pt_access & gw->pte_access; |
315 | level = iterator.level; | 332 | if (!dirty) |
316 | sptep = iterator.sptep; | 333 | direct_access &= ~ACC_WRITE_MASK; |
317 | if (iterator.level == hlevel) { | ||
318 | mmu_set_spte(vcpu, sptep, access, | ||
319 | gw->pte_access & access, | ||
320 | user_fault, write_fault, | ||
321 | gw->ptes[gw->level-1] & PT_DIRTY_MASK, | ||
322 | ptwrite, level, | ||
323 | gw->gfn, pfn, false, true); | ||
324 | break; | ||
325 | } | ||
326 | 334 | ||
327 | if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) | 335 | top_level = vcpu->arch.mmu.root_level; |
328 | continue; | 336 | if (top_level == PT32E_ROOT_LEVEL) |
337 | top_level = PT32_ROOT_LEVEL; | ||
338 | /* | ||
339 | * Verify that the top-level gpte is still there. Since the page | ||
340 | * is a root page, it is either write protected (and cannot be | ||
341 | * changed from now on) or it is invalid (in which case, we don't | ||
342 | * really care if it changes underneath us after this point). | ||
343 | */ | ||
344 | if (FNAME(gpte_changed)(vcpu, gw, top_level)) | ||
345 | goto out_gpte_changed; | ||
329 | 346 | ||
330 | if (is_large_pte(*sptep)) { | 347 | for (shadow_walk_init(&it, vcpu, addr); |
331 | rmap_remove(vcpu->kvm, sptep); | 348 | shadow_walk_okay(&it) && it.level > gw->level; |
332 | __set_spte(sptep, shadow_trap_nonpresent_pte); | 349 | shadow_walk_next(&it)) { |
333 | kvm_flush_remote_tlbs(vcpu->kvm); | 350 | gfn_t table_gfn; |
334 | } | ||
335 | 351 | ||
336 | if (level <= gw->level) { | 352 | drop_large_spte(vcpu, it.sptep); |
337 | int delta = level - gw->level + 1; | 353 | |
338 | direct = 1; | 354 | sp = NULL; |
339 | if (!is_dirty_gpte(gw->ptes[level - delta])) | 355 | if (!is_shadow_present_pte(*it.sptep)) { |
340 | access &= ~ACC_WRITE_MASK; | 356 | table_gfn = gw->table_gfn[it.level - 2]; |
341 | table_gfn = gpte_to_gfn(gw->ptes[level - delta]); | 357 | sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, |
342 | /* advance table_gfn when emulating 1gb pages with 4k */ | 358 | false, access, it.sptep); |
343 | if (delta == 0) | ||
344 | table_gfn += PT_INDEX(addr, level); | ||
345 | access &= gw->pte_access; | ||
346 | } else { | ||
347 | direct = 0; | ||
348 | table_gfn = gw->table_gfn[level - 2]; | ||
349 | } | ||
350 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | ||
351 | direct, access, sptep); | ||
352 | if (!direct) { | ||
353 | r = kvm_read_guest_atomic(vcpu->kvm, | ||
354 | gw->pte_gpa[level - 2], | ||
355 | &curr_pte, sizeof(curr_pte)); | ||
356 | if (r || curr_pte != gw->ptes[level - 2]) { | ||
357 | kvm_mmu_put_page(shadow_page, sptep); | ||
358 | kvm_release_pfn_clean(pfn); | ||
359 | sptep = NULL; | ||
360 | break; | ||
361 | } | ||
362 | } | 359 | } |
363 | 360 | ||
364 | spte = __pa(shadow_page->spt) | 361 | /* |
365 | | PT_PRESENT_MASK | PT_ACCESSED_MASK | 362 | * Verify that the gpte in the page we've just write |
366 | | PT_WRITABLE_MASK | PT_USER_MASK; | 363 | * protected is still there. |
367 | *sptep = spte; | 364 | */ |
365 | if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) | ||
366 | goto out_gpte_changed; | ||
367 | |||
368 | if (sp) | ||
369 | link_shadow_page(it.sptep, sp); | ||
368 | } | 370 | } |
369 | 371 | ||
370 | return sptep; | 372 | for (; |
373 | shadow_walk_okay(&it) && it.level > hlevel; | ||
374 | shadow_walk_next(&it)) { | ||
375 | gfn_t direct_gfn; | ||
376 | |||
377 | validate_direct_spte(vcpu, it.sptep, direct_access); | ||
378 | |||
379 | drop_large_spte(vcpu, it.sptep); | ||
380 | |||
381 | if (is_shadow_present_pte(*it.sptep)) | ||
382 | continue; | ||
383 | |||
384 | direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); | ||
385 | |||
386 | sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, | ||
387 | true, direct_access, it.sptep); | ||
388 | link_shadow_page(it.sptep, sp); | ||
389 | } | ||
390 | |||
391 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, | ||
392 | user_fault, write_fault, dirty, ptwrite, it.level, | ||
393 | gw->gfn, pfn, false, true); | ||
394 | |||
395 | return it.sptep; | ||
396 | |||
397 | out_gpte_changed: | ||
398 | if (sp) | ||
399 | kvm_mmu_put_page(sp, it.sptep); | ||
400 | kvm_release_pfn_clean(pfn); | ||
401 | return NULL; | ||
371 | } | 402 | } |
372 | 403 | ||
373 | /* | 404 | /* |
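The rewritten FNAME(fetch) above splits the walk into an indirect phase (shadowing the guest's own page tables down to gw->level, re-checking each guest pte with FNAME(gpte_changed) after it has been write protected) and a direct phase (filling the remaining levels down to hlevel with direct pages before installing the leaf spte). The core snapshot-and-recheck idea can be shown in isolation; the code below is a self-contained toy model, not the kernel helpers:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Toy stand-in for the guest-pte snapshot kept in struct guest_walker. */
	struct toy_walker {
		uint64_t ptes[4];		/* snapshot taken during the guest walk */
	};

	/* Pretend re-read of the live guest pte for a level (always succeeds here). */
	static int toy_read_gpte(const uint64_t *live, int level, uint64_t *out)
	{
		*out = live[level - 1];
		return 0;
	}

	/* Mirrors the shape of FNAME(gpte_changed): a failed read or a mismatch
	 * against the snapshot means the guest raced with us and we must bail out. */
	static bool toy_gpte_changed(const struct toy_walker *gw,
				     const uint64_t *live, int level)
	{
		uint64_t curr;

		if (toy_read_gpte(live, level, &curr))
			return true;
		return curr != gw->ptes[level - 1];
	}

	int main(void)
	{
		struct toy_walker gw = { .ptes = { 0x1003, 0x2003, 0x3003, 0x4003 } };
		uint64_t live[4]     = { 0x1003, 0x2003, 0x3003, 0x4003 };

		live[1] = 0;	/* guest rewrote its level-2 pte behind our back */
		printf("level 3 changed: %d\n", toy_gpte_changed(&gw, live, 3));	/* 0 */
		printf("level 2 changed: %d\n", toy_gpte_changed(&gw, live, 2));	/* 1 */
		return 0;
	}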
@@ -431,11 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
431 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); | 462 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); |
432 | 463 | ||
433 | /* mmio */ | 464 | /* mmio */ |
434 | if (is_error_pfn(pfn)) { | 465 | if (is_error_pfn(pfn)) |
435 | pgprintk("gfn %lx is mmio\n", walker.gfn); | 466 | return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); |
436 | kvm_release_pfn_clean(pfn); | ||
437 | return 1; | ||
438 | } | ||
439 | 467 | ||
440 | spin_lock(&vcpu->kvm->mmu_lock); | 468 | spin_lock(&vcpu->kvm->mmu_lock); |
441 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 469 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
@@ -443,6 +471,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
443 | kvm_mmu_free_some_pages(vcpu); | 471 | kvm_mmu_free_some_pages(vcpu); |
444 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 472 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
445 | level, &write_pt, pfn); | 473 | level, &write_pt, pfn); |
474 | (void)sptep; | ||
446 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, | 475 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, |
447 | sptep, *sptep, write_pt); | 476 | sptep, *sptep, write_pt); |
448 | 477 | ||
@@ -464,6 +493,7 @@ out_unlock: | |||
464 | static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | 493 | static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) |
465 | { | 494 | { |
466 | struct kvm_shadow_walk_iterator iterator; | 495 | struct kvm_shadow_walk_iterator iterator; |
496 | struct kvm_mmu_page *sp; | ||
467 | gpa_t pte_gpa = -1; | 497 | gpa_t pte_gpa = -1; |
468 | int level; | 498 | int level; |
469 | u64 *sptep; | 499 | u64 *sptep; |
@@ -475,10 +505,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
475 | level = iterator.level; | 505 | level = iterator.level; |
476 | sptep = iterator.sptep; | 506 | sptep = iterator.sptep; |
477 | 507 | ||
508 | sp = page_header(__pa(sptep)); | ||
478 | if (is_last_spte(*sptep, level)) { | 509 | if (is_last_spte(*sptep, level)) { |
479 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | ||
480 | int offset, shift; | 510 | int offset, shift; |
481 | 511 | ||
512 | if (!sp->unsync) | ||
513 | break; | ||
514 | |||
482 | shift = PAGE_SHIFT - | 515 | shift = PAGE_SHIFT - |
483 | (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; | 516 | (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; |
484 | offset = sp->role.quadrant << shift; | 517 | offset = sp->role.quadrant << shift; |
@@ -487,16 +520,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
487 | pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); | 520 | pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); |
488 | 521 | ||
489 | if (is_shadow_present_pte(*sptep)) { | 522 | if (is_shadow_present_pte(*sptep)) { |
490 | rmap_remove(vcpu->kvm, sptep); | ||
491 | if (is_large_pte(*sptep)) | 523 | if (is_large_pte(*sptep)) |
492 | --vcpu->kvm->stat.lpages; | 524 | --vcpu->kvm->stat.lpages; |
525 | drop_spte(vcpu->kvm, sptep, | ||
526 | shadow_trap_nonpresent_pte); | ||
493 | need_flush = 1; | 527 | need_flush = 1; |
494 | } | 528 | } else |
495 | __set_spte(sptep, shadow_trap_nonpresent_pte); | 529 | __set_spte(sptep, shadow_trap_nonpresent_pte); |
496 | break; | 530 | break; |
497 | } | 531 | } |
498 | 532 | ||
499 | if (!is_shadow_present_pte(*sptep)) | 533 | if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) |
500 | break; | 534 | break; |
501 | } | 535 | } |
502 | 536 | ||
@@ -570,9 +604,9 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | |||
570 | * Using the cached information from sp->gfns is safe because: | 604 | * Using the cached information from sp->gfns is safe because: |
571 | * - The spte has a reference to the struct page, so the pfn for a given gfn | 605 | * - The spte has a reference to the struct page, so the pfn for a given gfn |
572 | * can't change unless all sptes pointing to it are nuked first. | 606 | * can't change unless all sptes pointing to it are nuked first. |
573 | * - Alias changes zap the entire shadow cache. | ||
574 | */ | 607 | */ |
575 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | 608 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
609 | bool clear_unsync) | ||
576 | { | 610 | { |
577 | int i, offset, nr_present; | 611 | int i, offset, nr_present; |
578 | bool reset_host_protection; | 612 | bool reset_host_protection; |
@@ -580,6 +614,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
580 | 614 | ||
581 | offset = nr_present = 0; | 615 | offset = nr_present = 0; |
582 | 616 | ||
617 | /* direct kvm_mmu_page cannot be unsync. */ | ||
618 | BUG_ON(sp->role.direct); | ||
619 | |||
583 | if (PTTYPE == 32) | 620 | if (PTTYPE == 32) |
584 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | 621 | offset = sp->role.quadrant << PT64_LEVEL_BITS; |
585 | 622 | ||
@@ -589,7 +626,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
589 | unsigned pte_access; | 626 | unsigned pte_access; |
590 | pt_element_t gpte; | 627 | pt_element_t gpte; |
591 | gpa_t pte_gpa; | 628 | gpa_t pte_gpa; |
592 | gfn_t gfn = sp->gfns[i]; | 629 | gfn_t gfn; |
593 | 630 | ||
594 | if (!is_shadow_present_pte(sp->spt[i])) | 631 | if (!is_shadow_present_pte(sp->spt[i])) |
595 | continue; | 632 | continue; |
@@ -600,16 +637,17 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
600 | sizeof(pt_element_t))) | 637 | sizeof(pt_element_t))) |
601 | return -EINVAL; | 638 | return -EINVAL; |
602 | 639 | ||
603 | if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || | 640 | gfn = gpte_to_gfn(gpte); |
604 | !(gpte & PT_ACCESSED_MASK)) { | 641 | if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) |
642 | || gfn != sp->gfns[i] || !is_present_gpte(gpte) | ||
643 | || !(gpte & PT_ACCESSED_MASK)) { | ||
605 | u64 nonpresent; | 644 | u64 nonpresent; |
606 | 645 | ||
607 | rmap_remove(vcpu->kvm, &sp->spt[i]); | 646 | if (is_present_gpte(gpte) || !clear_unsync) |
608 | if (is_present_gpte(gpte)) | ||
609 | nonpresent = shadow_trap_nonpresent_pte; | 647 | nonpresent = shadow_trap_nonpresent_pte; |
610 | else | 648 | else |
611 | nonpresent = shadow_notrap_nonpresent_pte; | 649 | nonpresent = shadow_notrap_nonpresent_pte; |
612 | __set_spte(&sp->spt[i], nonpresent); | 650 | drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); |
613 | continue; | 651 | continue; |
614 | } | 652 | } |
615 | 653 | ||
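The sync_page() hunks above tighten the validation of cached guest ptes: a shadow entry is now also dropped when the gpte has reserved bits set or when its gfn no longer matches the cached sp->gfns[i], and the choice of not-present replacement depends on clear_unsync. The shape of that keep-or-drop predicate, with placeholder bit layouts standing in for the kernel's helpers, might look like:

	#include <stdbool.h>
	#include <stdint.h>

	/* Illustrative bit layout only (analogues of PT_PRESENT_MASK / PT_ACCESSED_MASK). */
	#define TOY_GPTE_PRESENT  (1ULL << 0)
	#define TOY_GPTE_ACCESSED (1ULL << 5)

	/* Crude stand-ins for is_rsvd_bits_set() and gpte_to_gfn(). */
	static bool toy_rsvd_bits_set(uint64_t gpte)
	{
		return (gpte >> 52) & 0x7ffULL;		/* pretend bits 52-62 are reserved */
	}

	static uint64_t toy_gpte_to_gfn(uint64_t gpte)
	{
		return (gpte >> 12) & ((1ULL << 40) - 1);
	}

	/* Keep the cached spte only if the guest pte still passes every check. */
	static bool toy_keep_cached_spte(uint64_t gpte, uint64_t cached_gfn)
	{
		return !toy_rsvd_bits_set(gpte) &&
		       toy_gpte_to_gfn(gpte) == cached_gfn &&
		       (gpte & TOY_GPTE_PRESENT) &&
		       (gpte & TOY_GPTE_ACCESSED);
	}

	int main(void)
	{
		uint64_t gpte = (0x1234ULL << 12) | TOY_GPTE_PRESENT | TOY_GPTE_ACCESSED;

		return toy_keep_cached_spte(gpte, 0x1234) ? 0 : 1;	/* exits 0: entry kept */
	}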
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd26..bc5b9b8d4a33 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -4,6 +4,7 @@ | |||
4 | * AMD SVM support | 4 | * AMD SVM support |
5 | * | 5 | * |
6 | * Copyright (C) 2006 Qumranet, Inc. | 6 | * Copyright (C) 2006 Qumranet, Inc. |
7 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. | ||
7 | * | 8 | * |
8 | * Authors: | 9 | * Authors: |
9 | * Yaniv Kamay <yaniv@qumranet.com> | 10 | * Yaniv Kamay <yaniv@qumranet.com> |
@@ -130,7 +131,7 @@ static struct svm_direct_access_msrs { | |||
130 | u32 index; /* Index of the MSR */ | 131 | u32 index; /* Index of the MSR */ |
131 | bool always; /* True if intercept is always on */ | 132 | bool always; /* True if intercept is always on */ |
132 | } direct_access_msrs[] = { | 133 | } direct_access_msrs[] = { |
133 | { .index = MSR_K6_STAR, .always = true }, | 134 | { .index = MSR_STAR, .always = true }, |
134 | { .index = MSR_IA32_SYSENTER_CS, .always = true }, | 135 | { .index = MSR_IA32_SYSENTER_CS, .always = true }, |
135 | #ifdef CONFIG_X86_64 | 136 | #ifdef CONFIG_X86_64 |
136 | { .index = MSR_GS_BASE, .always = true }, | 137 | { .index = MSR_GS_BASE, .always = true }, |
@@ -285,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) | |||
285 | 286 | ||
286 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | 287 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) |
287 | { | 288 | { |
289 | vcpu->arch.efer = efer; | ||
288 | if (!npt_enabled && !(efer & EFER_LMA)) | 290 | if (!npt_enabled && !(efer & EFER_LMA)) |
289 | efer &= ~EFER_LME; | 291 | efer &= ~EFER_LME; |
290 | 292 | ||
291 | to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; | 293 | to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; |
292 | vcpu->arch.efer = efer; | ||
293 | } | 294 | } |
294 | 295 | ||
295 | static int is_external_interrupt(u32 info) | 296 | static int is_external_interrupt(u32 info) |
@@ -383,8 +384,7 @@ static void svm_init_erratum_383(void) | |||
383 | int err; | 384 | int err; |
384 | u64 val; | 385 | u64 val; |
385 | 386 | ||
386 | /* Only Fam10h is affected */ | 387 | if (!cpu_has_amd_erratum(amd_erratum_383)) |
387 | if (boot_cpu_data.x86 != 0x10) | ||
388 | return; | 388 | return; |
389 | 389 | ||
390 | /* Use _safe variants to not break nested virtualization */ | 390 | /* Use _safe variants to not break nested virtualization */ |
@@ -640,7 +640,7 @@ static __init int svm_hardware_setup(void) | |||
640 | 640 | ||
641 | if (nested) { | 641 | if (nested) { |
642 | printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); | 642 | printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); |
643 | kvm_enable_efer_bits(EFER_SVME); | 643 | kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); |
644 | } | 644 | } |
645 | 645 | ||
646 | for_each_possible_cpu(cpu) { | 646 | for_each_possible_cpu(cpu) { |
@@ -806,7 +806,7 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
806 | * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. | 806 | * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. |
807 | */ | 807 | */ |
808 | svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; | 808 | svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; |
809 | kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); | 809 | (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); |
810 | 810 | ||
811 | save->cr4 = X86_CR4_PAE; | 811 | save->cr4 = X86_CR4_PAE; |
812 | /* rdx = ?? */ | 812 | /* rdx = ?? */ |
@@ -903,13 +903,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
903 | svm->asid_generation = 0; | 903 | svm->asid_generation = 0; |
904 | init_vmcb(svm); | 904 | init_vmcb(svm); |
905 | 905 | ||
906 | fx_init(&svm->vcpu); | 906 | err = fx_init(&svm->vcpu); |
907 | if (err) | ||
908 | goto free_page4; | ||
909 | |||
907 | svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | 910 | svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; |
908 | if (kvm_vcpu_is_bsp(&svm->vcpu)) | 911 | if (kvm_vcpu_is_bsp(&svm->vcpu)) |
909 | svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; | 912 | svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; |
910 | 913 | ||
911 | return &svm->vcpu; | 914 | return &svm->vcpu; |
912 | 915 | ||
916 | free_page4: | ||
917 | __free_page(hsave_page); | ||
913 | free_page3: | 918 | free_page3: |
914 | __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); | 919 | __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); |
915 | free_page2: | 920 | free_page2: |
@@ -1488,7 +1493,7 @@ static void svm_handle_mce(struct vcpu_svm *svm) | |||
1488 | */ | 1493 | */ |
1489 | pr_err("KVM: Guest triggered AMD Erratum 383\n"); | 1494 | pr_err("KVM: Guest triggered AMD Erratum 383\n"); |
1490 | 1495 | ||
1491 | set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); | 1496 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); |
1492 | 1497 | ||
1493 | return; | 1498 | return; |
1494 | } | 1499 | } |
@@ -1535,7 +1540,7 @@ static int io_interception(struct vcpu_svm *svm) | |||
1535 | string = (io_info & SVM_IOIO_STR_MASK) != 0; | 1540 | string = (io_info & SVM_IOIO_STR_MASK) != 0; |
1536 | in = (io_info & SVM_IOIO_TYPE_MASK) != 0; | 1541 | in = (io_info & SVM_IOIO_TYPE_MASK) != 0; |
1537 | if (string || in) | 1542 | if (string || in) |
1538 | return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); | 1543 | return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; |
1539 | 1544 | ||
1540 | port = io_info >> 16; | 1545 | port = io_info >> 16; |
1541 | size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; | 1546 | size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; |
@@ -1957,7 +1962,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1957 | svm->vmcb->save.cr3 = hsave->save.cr3; | 1962 | svm->vmcb->save.cr3 = hsave->save.cr3; |
1958 | svm->vcpu.arch.cr3 = hsave->save.cr3; | 1963 | svm->vcpu.arch.cr3 = hsave->save.cr3; |
1959 | } else { | 1964 | } else { |
1960 | kvm_set_cr3(&svm->vcpu, hsave->save.cr3); | 1965 | (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); |
1961 | } | 1966 | } |
1962 | kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); | 1967 | kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); |
1963 | kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); | 1968 | kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); |
@@ -2080,7 +2085,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2080 | svm->vmcb->save.cr3 = nested_vmcb->save.cr3; | 2085 | svm->vmcb->save.cr3 = nested_vmcb->save.cr3; |
2081 | svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; | 2086 | svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; |
2082 | } else | 2087 | } else |
2083 | kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); | 2088 | (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); |
2084 | 2089 | ||
2085 | /* Guest paging mode is active - reset mmu */ | 2090 | /* Guest paging mode is active - reset mmu */ |
2086 | kvm_mmu_reset_context(&svm->vcpu); | 2091 | kvm_mmu_reset_context(&svm->vcpu); |
@@ -2386,16 +2391,12 @@ static int iret_interception(struct vcpu_svm *svm) | |||
2386 | 2391 | ||
2387 | static int invlpg_interception(struct vcpu_svm *svm) | 2392 | static int invlpg_interception(struct vcpu_svm *svm) |
2388 | { | 2393 | { |
2389 | if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) | 2394 | return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; |
2390 | pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); | ||
2391 | return 1; | ||
2392 | } | 2395 | } |
2393 | 2396 | ||
2394 | static int emulate_on_interception(struct vcpu_svm *svm) | 2397 | static int emulate_on_interception(struct vcpu_svm *svm) |
2395 | { | 2398 | { |
2396 | if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) | 2399 | return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; |
2397 | pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); | ||
2398 | return 1; | ||
2399 | } | 2400 | } |
2400 | 2401 | ||
2401 | static int cr8_write_interception(struct vcpu_svm *svm) | 2402 | static int cr8_write_interception(struct vcpu_svm *svm) |
@@ -2431,7 +2432,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
2431 | *data = tsc_offset + native_read_tsc(); | 2432 | *data = tsc_offset + native_read_tsc(); |
2432 | break; | 2433 | break; |
2433 | } | 2434 | } |
2434 | case MSR_K6_STAR: | 2435 | case MSR_STAR: |
2435 | *data = svm->vmcb->save.star; | 2436 | *data = svm->vmcb->save.star; |
2436 | break; | 2437 | break; |
2437 | #ifdef CONFIG_X86_64 | 2438 | #ifdef CONFIG_X86_64 |
@@ -2555,7 +2556,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
2555 | 2556 | ||
2556 | break; | 2557 | break; |
2557 | } | 2558 | } |
2558 | case MSR_K6_STAR: | 2559 | case MSR_STAR: |
2559 | svm->vmcb->save.star = data; | 2560 | svm->vmcb->save.star = data; |
2560 | break; | 2561 | break; |
2561 | #ifdef CONFIG_X86_64 | 2562 | #ifdef CONFIG_X86_64 |
@@ -2726,6 +2727,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { | |||
2726 | [SVM_EXIT_NPF] = pf_interception, | 2727 | [SVM_EXIT_NPF] = pf_interception, |
2727 | }; | 2728 | }; |
2728 | 2729 | ||
2730 | void dump_vmcb(struct kvm_vcpu *vcpu) | ||
2731 | { | ||
2732 | struct vcpu_svm *svm = to_svm(vcpu); | ||
2733 | struct vmcb_control_area *control = &svm->vmcb->control; | ||
2734 | struct vmcb_save_area *save = &svm->vmcb->save; | ||
2735 | |||
2736 | pr_err("VMCB Control Area:\n"); | ||
2737 | pr_err("cr_read: %04x\n", control->intercept_cr_read); | ||
2738 | pr_err("cr_write: %04x\n", control->intercept_cr_write); | ||
2739 | pr_err("dr_read: %04x\n", control->intercept_dr_read); | ||
2740 | pr_err("dr_write: %04x\n", control->intercept_dr_write); | ||
2741 | pr_err("exceptions: %08x\n", control->intercept_exceptions); | ||
2742 | pr_err("intercepts: %016llx\n", control->intercept); | ||
2743 | pr_err("pause filter count: %d\n", control->pause_filter_count); | ||
2744 | pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); | ||
2745 | pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); | ||
2746 | pr_err("tsc_offset: %016llx\n", control->tsc_offset); | ||
2747 | pr_err("asid: %d\n", control->asid); | ||
2748 | pr_err("tlb_ctl: %d\n", control->tlb_ctl); | ||
2749 | pr_err("int_ctl: %08x\n", control->int_ctl); | ||
2750 | pr_err("int_vector: %08x\n", control->int_vector); | ||
2751 | pr_err("int_state: %08x\n", control->int_state); | ||
2752 | pr_err("exit_code: %08x\n", control->exit_code); | ||
2753 | pr_err("exit_info1: %016llx\n", control->exit_info_1); | ||
2754 | pr_err("exit_info2: %016llx\n", control->exit_info_2); | ||
2755 | pr_err("exit_int_info: %08x\n", control->exit_int_info); | ||
2756 | pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); | ||
2757 | pr_err("nested_ctl: %lld\n", control->nested_ctl); | ||
2758 | pr_err("nested_cr3: %016llx\n", control->nested_cr3); | ||
2759 | pr_err("event_inj: %08x\n", control->event_inj); | ||
2760 | pr_err("event_inj_err: %08x\n", control->event_inj_err); | ||
2761 | pr_err("lbr_ctl: %lld\n", control->lbr_ctl); | ||
2762 | pr_err("next_rip: %016llx\n", control->next_rip); | ||
2763 | pr_err("VMCB State Save Area:\n"); | ||
2764 | pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", | ||
2765 | save->es.selector, save->es.attrib, | ||
2766 | save->es.limit, save->es.base); | ||
2767 | pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", | ||
2768 | save->cs.selector, save->cs.attrib, | ||
2769 | save->cs.limit, save->cs.base); | ||
2770 | pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", | ||
2771 | save->ss.selector, save->ss.attrib, | ||
2772 | save->ss.limit, save->ss.base); | ||
2773 | pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", | ||
2774 | save->ds.selector, save->ds.attrib, | ||
2775 | save->ds.limit, save->ds.base); | ||
2776 | pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", | ||
2777 | save->fs.selector, save->fs.attrib, | ||
2778 | save->fs.limit, save->fs.base); | ||
2779 | pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", | ||
2780 | save->gs.selector, save->gs.attrib, | ||
2781 | save->gs.limit, save->gs.base); | ||
2782 | pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", | ||
2783 | save->gdtr.selector, save->gdtr.attrib, | ||
2784 | save->gdtr.limit, save->gdtr.base); | ||
2785 | pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", | ||
2786 | save->ldtr.selector, save->ldtr.attrib, | ||
2787 | save->ldtr.limit, save->ldtr.base); | ||
2788 | pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", | ||
2789 | save->idtr.selector, save->idtr.attrib, | ||
2790 | save->idtr.limit, save->idtr.base); | ||
2791 | pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", | ||
2792 | save->tr.selector, save->tr.attrib, | ||
2793 | save->tr.limit, save->tr.base); | ||
2794 | pr_err("cpl: %d efer: %016llx\n", | ||
2795 | save->cpl, save->efer); | ||
2796 | pr_err("cr0: %016llx cr2: %016llx\n", | ||
2797 | save->cr0, save->cr2); | ||
2798 | pr_err("cr3: %016llx cr4: %016llx\n", | ||
2799 | save->cr3, save->cr4); | ||
2800 | pr_err("dr6: %016llx dr7: %016llx\n", | ||
2801 | save->dr6, save->dr7); | ||
2802 | pr_err("rip: %016llx rflags: %016llx\n", | ||
2803 | save->rip, save->rflags); | ||
2804 | pr_err("rsp: %016llx rax: %016llx\n", | ||
2805 | save->rsp, save->rax); | ||
2806 | pr_err("star: %016llx lstar: %016llx\n", | ||
2807 | save->star, save->lstar); | ||
2808 | pr_err("cstar: %016llx sfmask: %016llx\n", | ||
2809 | save->cstar, save->sfmask); | ||
2810 | pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", | ||
2811 | save->kernel_gs_base, save->sysenter_cs); | ||
2812 | pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", | ||
2813 | save->sysenter_esp, save->sysenter_eip); | ||
2814 | pr_err("gpat: %016llx dbgctl: %016llx\n", | ||
2815 | save->g_pat, save->dbgctl); | ||
2816 | pr_err("br_from: %016llx br_to: %016llx\n", | ||
2817 | save->br_from, save->br_to); | ||
2818 | pr_err("excp_from: %016llx excp_to: %016llx\n", | ||
2819 | save->last_excp_from, save->last_excp_to); | ||
2820 | |||
2821 | } | ||
2822 | |||
2729 | static int handle_exit(struct kvm_vcpu *vcpu) | 2823 | static int handle_exit(struct kvm_vcpu *vcpu) |
2730 | { | 2824 | { |
2731 | struct vcpu_svm *svm = to_svm(vcpu); | 2825 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -2770,6 +2864,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) | |||
2770 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 2864 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
2771 | kvm_run->fail_entry.hardware_entry_failure_reason | 2865 | kvm_run->fail_entry.hardware_entry_failure_reason |
2772 | = svm->vmcb->control.exit_code; | 2866 | = svm->vmcb->control.exit_code; |
2867 | pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); | ||
2868 | dump_vmcb(vcpu); | ||
2773 | return 0; | 2869 | return 0; |
2774 | } | 2870 | } |
2775 | 2871 | ||
@@ -2826,9 +2922,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | |||
2826 | { | 2922 | { |
2827 | struct vmcb_control_area *control; | 2923 | struct vmcb_control_area *control; |
2828 | 2924 | ||
2829 | trace_kvm_inj_virq(irq); | ||
2830 | |||
2831 | ++svm->vcpu.stat.irq_injections; | ||
2832 | control = &svm->vmcb->control; | 2925 | control = &svm->vmcb->control; |
2833 | control->int_vector = irq; | 2926 | control->int_vector = irq; |
2834 | control->int_ctl &= ~V_INTR_PRIO_MASK; | 2927 | control->int_ctl &= ~V_INTR_PRIO_MASK; |
@@ -2842,6 +2935,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) | |||
2842 | 2935 | ||
2843 | BUG_ON(!(gif_set(svm))); | 2936 | BUG_ON(!(gif_set(svm))); |
2844 | 2937 | ||
2938 | trace_kvm_inj_virq(vcpu->arch.interrupt.nr); | ||
2939 | ++vcpu->stat.irq_injections; | ||
2940 | |||
2845 | svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | | 2941 | svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | |
2846 | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; | 2942 | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; |
2847 | } | 2943 | } |
@@ -3327,6 +3423,11 @@ static bool svm_rdtscp_supported(void) | |||
3327 | return false; | 3423 | return false; |
3328 | } | 3424 | } |
3329 | 3425 | ||
3426 | static bool svm_has_wbinvd_exit(void) | ||
3427 | { | ||
3428 | return true; | ||
3429 | } | ||
3430 | |||
3330 | static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) | 3431 | static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) |
3331 | { | 3432 | { |
3332 | struct vcpu_svm *svm = to_svm(vcpu); | 3433 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -3411,6 +3512,8 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
3411 | .rdtscp_supported = svm_rdtscp_supported, | 3512 | .rdtscp_supported = svm_rdtscp_supported, |
3412 | 3513 | ||
3413 | .set_supported_cpuid = svm_set_supported_cpuid, | 3514 | .set_supported_cpuid = svm_set_supported_cpuid, |
3515 | |||
3516 | .has_wbinvd_exit = svm_has_wbinvd_exit, | ||
3414 | }; | 3517 | }; |
3415 | 3518 | ||
3416 | static int __init svm_init(void) | 3519 | static int __init svm_init(void) |
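Several svm.c hunks above (io, invlpg, and generic emulate interceptions) switch from always returning 1 to returning the EMULATE_DONE comparison directly, and a failed VMRUN now prints the whole VMCB via the new dump_vmcb() before exiting to userspace. The return convention these hunks rely on, sketched with toy stand-ins for the kernel's enum and handler (assumed semantics: 1 resumes the guest, 0 returns to userspace):

	/* Stand-ins only; the real enum lives in the KVM emulator headers. */
	enum toy_emulate_result { TOY_EMULATE_DONE, TOY_EMULATE_DO_MMIO, TOY_EMULATE_FAIL };

	static enum toy_emulate_result toy_emulate_instruction(void)
	{
		return TOY_EMULATE_DONE;
	}

	/* New style: surface emulation failure as 0 so the exit bubbles up,
	 * instead of logging and unconditionally returning 1 as before. */
	static int toy_emulate_on_interception(void)
	{
		return toy_emulate_instruction() == TOY_EMULATE_DONE;
	}

	int main(void)
	{
		return toy_emulate_on_interception() ? 0 : 1;
	}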
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 4ddadb1a5ffe..e16a0dbe74d8 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c | |||
@@ -1,3 +1,17 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * timer support | ||
8 | * | ||
9 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. | ||
10 | * | ||
11 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
12 | * the COPYING file in the top-level directory. | ||
13 | */ | ||
14 | |||
1 | #include <linux/kvm_host.h> | 15 | #include <linux/kvm_host.h> |
2 | #include <linux/kvm.h> | 16 | #include <linux/kvm.h> |
3 | #include <linux/hrtimer.h> | 17 | #include <linux/hrtimer.h> |
@@ -18,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) | |||
18 | if (ktimer->reinject || !atomic_read(&ktimer->pending)) { | 32 | if (ktimer->reinject || !atomic_read(&ktimer->pending)) { |
19 | atomic_inc(&ktimer->pending); | 33 | atomic_inc(&ktimer->pending); |
20 | /* FIXME: this code should not know anything about vcpus */ | 34 | /* FIXME: this code should not know anything about vcpus */ |
21 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); | 35 | kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); |
22 | } | 36 | } |
23 | 37 | ||
24 | if (waitqueue_active(q)) | 38 | if (waitqueue_active(q)) |
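The timer.c hunk replaces the open-coded set_bit() on vcpu->requests with kvm_make_request(); the wrapper is assumed to be equivalent to setting the same request bit, just routed through a single helper. A non-atomic stand-in for illustration (the real helper uses an atomic set_bit):

	/* Non-atomic toy version; the name and shape are assumptions, not kvm_host.h. */
	static inline void toy_make_request(int req, unsigned long *requests)
	{
		*requests |= 1UL << req;
	}

	int main(void)
	{
		unsigned long requests = 0;

		toy_make_request(3, &requests);		/* e.g. a pending-timer style request */
		return requests == (1UL << 3) ? 0 : 1;
	}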
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee03679efe78..49b25eee25ac 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -5,6 +5,7 @@ | |||
5 | * machines without emulation or binary translation. | 5 | * machines without emulation or binary translation. |
6 | * | 6 | * |
7 | * Copyright (C) 2006 Qumranet, Inc. | 7 | * Copyright (C) 2006 Qumranet, Inc. |
8 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. | ||
8 | * | 9 | * |
9 | * Authors: | 10 | * Authors: |
10 | * Avi Kivity <avi@qumranet.com> | 11 | * Avi Kivity <avi@qumranet.com> |
@@ -36,6 +37,8 @@ | |||
36 | #include <asm/vmx.h> | 37 | #include <asm/vmx.h> |
37 | #include <asm/virtext.h> | 38 | #include <asm/virtext.h> |
38 | #include <asm/mce.h> | 39 | #include <asm/mce.h> |
40 | #include <asm/i387.h> | ||
41 | #include <asm/xcr.h> | ||
39 | 42 | ||
40 | #include "trace.h" | 43 | #include "trace.h" |
41 | 44 | ||
@@ -63,6 +66,9 @@ module_param_named(unrestricted_guest, | |||
63 | static int __read_mostly emulate_invalid_guest_state = 0; | 66 | static int __read_mostly emulate_invalid_guest_state = 0; |
64 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); | 67 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); |
65 | 68 | ||
69 | static int __read_mostly vmm_exclusive = 1; | ||
70 | module_param(vmm_exclusive, bool, S_IRUGO); | ||
71 | |||
66 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ | 72 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ |
67 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) | 73 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) |
68 | #define KVM_GUEST_CR0_MASK \ | 74 | #define KVM_GUEST_CR0_MASK \ |
@@ -173,10 +179,13 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | |||
173 | 179 | ||
174 | static int init_rmode(struct kvm *kvm); | 180 | static int init_rmode(struct kvm *kvm); |
175 | static u64 construct_eptp(unsigned long root_hpa); | 181 | static u64 construct_eptp(unsigned long root_hpa); |
182 | static void kvm_cpu_vmxon(u64 addr); | ||
183 | static void kvm_cpu_vmxoff(void); | ||
176 | 184 | ||
177 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | 185 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); |
178 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | 186 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); |
179 | static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); | 187 | static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); |
188 | static DEFINE_PER_CPU(struct desc_ptr, host_gdt); | ||
180 | 189 | ||
181 | static unsigned long *vmx_io_bitmap_a; | 190 | static unsigned long *vmx_io_bitmap_a; |
182 | static unsigned long *vmx_io_bitmap_b; | 191 | static unsigned long *vmx_io_bitmap_b; |
@@ -231,14 +240,14 @@ static u64 host_efer; | |||
231 | static void ept_save_pdptrs(struct kvm_vcpu *vcpu); | 240 | static void ept_save_pdptrs(struct kvm_vcpu *vcpu); |
232 | 241 | ||
233 | /* | 242 | /* |
234 | * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it | 243 | * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it |
235 | * away by decrementing the array size. | 244 | * away by decrementing the array size. |
236 | */ | 245 | */ |
237 | static const u32 vmx_msr_index[] = { | 246 | static const u32 vmx_msr_index[] = { |
238 | #ifdef CONFIG_X86_64 | 247 | #ifdef CONFIG_X86_64 |
239 | MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, | 248 | MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, |
240 | #endif | 249 | #endif |
241 | MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, | 250 | MSR_EFER, MSR_TSC_AUX, MSR_STAR, |
242 | }; | 251 | }; |
243 | #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) | 252 | #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) |
244 | 253 | ||
@@ -334,6 +343,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void) | |||
334 | return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; | 343 | return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; |
335 | } | 344 | } |
336 | 345 | ||
346 | static inline bool cpu_has_vmx_ept_4levels(void) | ||
347 | { | ||
348 | return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; | ||
349 | } | ||
350 | |||
337 | static inline bool cpu_has_vmx_invept_individual_addr(void) | 351 | static inline bool cpu_has_vmx_invept_individual_addr(void) |
338 | { | 352 | { |
339 | return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; | 353 | return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; |
@@ -349,6 +363,16 @@ static inline bool cpu_has_vmx_invept_global(void) | |||
349 | return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; | 363 | return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; |
350 | } | 364 | } |
351 | 365 | ||
366 | static inline bool cpu_has_vmx_invvpid_single(void) | ||
367 | { | ||
368 | return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; | ||
369 | } | ||
370 | |||
371 | static inline bool cpu_has_vmx_invvpid_global(void) | ||
372 | { | ||
373 | return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; | ||
374 | } | ||
375 | |||
352 | static inline bool cpu_has_vmx_ept(void) | 376 | static inline bool cpu_has_vmx_ept(void) |
353 | { | 377 | { |
354 | return vmcs_config.cpu_based_2nd_exec_ctrl & | 378 | return vmcs_config.cpu_based_2nd_exec_ctrl & |
@@ -389,6 +413,12 @@ static inline bool cpu_has_virtual_nmis(void) | |||
389 | return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; | 413 | return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; |
390 | } | 414 | } |
391 | 415 | ||
416 | static inline bool cpu_has_vmx_wbinvd_exit(void) | ||
417 | { | ||
418 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
419 | SECONDARY_EXEC_WBINVD_EXITING; | ||
420 | } | ||
421 | |||
392 | static inline bool report_flexpriority(void) | 422 | static inline bool report_flexpriority(void) |
393 | { | 423 | { |
394 | return flexpriority_enabled; | 424 | return flexpriority_enabled; |
@@ -453,6 +483,19 @@ static void vmcs_clear(struct vmcs *vmcs) | |||
453 | vmcs, phys_addr); | 483 | vmcs, phys_addr); |
454 | } | 484 | } |
455 | 485 | ||
486 | static void vmcs_load(struct vmcs *vmcs) | ||
487 | { | ||
488 | u64 phys_addr = __pa(vmcs); | ||
489 | u8 error; | ||
490 | |||
491 | asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" | ||
492 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | ||
493 | : "cc", "memory"); | ||
494 | if (error) | ||
495 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", | ||
496 | vmcs, phys_addr); | ||
497 | } | ||
498 | |||
456 | static void __vcpu_clear(void *arg) | 499 | static void __vcpu_clear(void *arg) |
457 | { | 500 | { |
458 | struct vcpu_vmx *vmx = arg; | 501 | struct vcpu_vmx *vmx = arg; |
@@ -475,12 +518,27 @@ static void vcpu_clear(struct vcpu_vmx *vmx) | |||
475 | smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); | 518 | smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); |
476 | } | 519 | } |
477 | 520 | ||
478 | static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) | 521 | static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) |
479 | { | 522 | { |
480 | if (vmx->vpid == 0) | 523 | if (vmx->vpid == 0) |
481 | return; | 524 | return; |
482 | 525 | ||
483 | __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); | 526 | if (cpu_has_vmx_invvpid_single()) |
527 | __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); | ||
528 | } | ||
529 | |||
530 | static inline void vpid_sync_vcpu_global(void) | ||
531 | { | ||
532 | if (cpu_has_vmx_invvpid_global()) | ||
533 | __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); | ||
534 | } | ||
535 | |||
536 | static inline void vpid_sync_context(struct vcpu_vmx *vmx) | ||
537 | { | ||
538 | if (cpu_has_vmx_invvpid_single()) | ||
539 | vpid_sync_vcpu_single(vmx); | ||
540 | else | ||
541 | vpid_sync_vcpu_global(); | ||
484 | } | 542 | } |
485 | 543 | ||
486 | static inline void ept_sync_global(void) | 544 | static inline void ept_sync_global(void) |
@@ -812,6 +870,9 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) | |||
812 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); | 870 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); |
813 | } | 871 | } |
814 | #endif | 872 | #endif |
873 | if (current_thread_info()->status & TS_USEDFPU) | ||
874 | clts(); | ||
875 | load_gdt(&__get_cpu_var(host_gdt)); | ||
815 | } | 876 | } |
816 | 877 | ||
817 | static void vmx_load_host_state(struct vcpu_vmx *vmx) | 878 | static void vmx_load_host_state(struct vcpu_vmx *vmx) |
@@ -828,35 +889,30 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) | |||
828 | static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | 889 | static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) |
829 | { | 890 | { |
830 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 891 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
831 | u64 phys_addr = __pa(vmx->vmcs); | ||
832 | u64 tsc_this, delta, new_offset; | 892 | u64 tsc_this, delta, new_offset; |
893 | u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); | ||
833 | 894 | ||
834 | if (vcpu->cpu != cpu) { | 895 | if (!vmm_exclusive) |
896 | kvm_cpu_vmxon(phys_addr); | ||
897 | else if (vcpu->cpu != cpu) | ||
835 | vcpu_clear(vmx); | 898 | vcpu_clear(vmx); |
836 | kvm_migrate_timers(vcpu); | ||
837 | set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); | ||
838 | local_irq_disable(); | ||
839 | list_add(&vmx->local_vcpus_link, | ||
840 | &per_cpu(vcpus_on_cpu, cpu)); | ||
841 | local_irq_enable(); | ||
842 | } | ||
843 | 899 | ||
844 | if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { | 900 | if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { |
845 | u8 error; | ||
846 | |||
847 | per_cpu(current_vmcs, cpu) = vmx->vmcs; | 901 | per_cpu(current_vmcs, cpu) = vmx->vmcs; |
848 | asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" | 902 | vmcs_load(vmx->vmcs); |
849 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | ||
850 | : "cc"); | ||
851 | if (error) | ||
852 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", | ||
853 | vmx->vmcs, phys_addr); | ||
854 | } | 903 | } |
855 | 904 | ||
856 | if (vcpu->cpu != cpu) { | 905 | if (vcpu->cpu != cpu) { |
857 | struct desc_ptr dt; | 906 | struct desc_ptr dt; |
858 | unsigned long sysenter_esp; | 907 | unsigned long sysenter_esp; |
859 | 908 | ||
909 | kvm_migrate_timers(vcpu); | ||
910 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | ||
911 | local_irq_disable(); | ||
912 | list_add(&vmx->local_vcpus_link, | ||
913 | &per_cpu(vcpus_on_cpu, cpu)); | ||
914 | local_irq_enable(); | ||
915 | |||
860 | vcpu->cpu = cpu; | 916 | vcpu->cpu = cpu; |
861 | /* | 917 | /* |
862 | * Linux uses per-cpu TSS and GDT, so set these when switching | 918 | * Linux uses per-cpu TSS and GDT, so set these when switching |
@@ -884,6 +940,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
884 | static void vmx_vcpu_put(struct kvm_vcpu *vcpu) | 940 | static void vmx_vcpu_put(struct kvm_vcpu *vcpu) |
885 | { | 941 | { |
886 | __vmx_load_host_state(to_vmx(vcpu)); | 942 | __vmx_load_host_state(to_vmx(vcpu)); |
943 | if (!vmm_exclusive) { | ||
944 | __vcpu_clear(to_vmx(vcpu)); | ||
945 | kvm_cpu_vmxoff(); | ||
946 | } | ||
887 | } | 947 | } |
888 | 948 | ||
889 | static void vmx_fpu_activate(struct kvm_vcpu *vcpu) | 949 | static void vmx_fpu_activate(struct kvm_vcpu *vcpu) |
@@ -1057,10 +1117,10 @@ static void setup_msrs(struct vcpu_vmx *vmx) | |||
1057 | if (index >= 0 && vmx->rdtscp_enabled) | 1117 | if (index >= 0 && vmx->rdtscp_enabled) |
1058 | move_msr_up(vmx, index, save_nmsrs++); | 1118 | move_msr_up(vmx, index, save_nmsrs++); |
1059 | /* | 1119 | /* |
1060 | * MSR_K6_STAR is only needed on long mode guests, and only | 1120 | * MSR_STAR is only needed on long mode guests, and only |
1061 | * if efer.sce is enabled. | 1121 | * if efer.sce is enabled. |
1062 | */ | 1122 | */ |
1063 | index = __find_msr_index(vmx, MSR_K6_STAR); | 1123 | index = __find_msr_index(vmx, MSR_STAR); |
1064 | if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) | 1124 | if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) |
1065 | move_msr_up(vmx, index, save_nmsrs++); | 1125 | move_msr_up(vmx, index, save_nmsrs++); |
1066 | } | 1126 | } |
@@ -1286,6 +1346,13 @@ static __init int vmx_disabled_by_bios(void) | |||
1286 | /* locked but not enabled */ | 1346 | /* locked but not enabled */ |
1287 | } | 1347 | } |
1288 | 1348 | ||
1349 | static void kvm_cpu_vmxon(u64 addr) | ||
1350 | { | ||
1351 | asm volatile (ASM_VMX_VMXON_RAX | ||
1352 | : : "a"(&addr), "m"(addr) | ||
1353 | : "memory", "cc"); | ||
1354 | } | ||
1355 | |||
1289 | static int hardware_enable(void *garbage) | 1356 | static int hardware_enable(void *garbage) |
1290 | { | 1357 | { |
1291 | int cpu = raw_smp_processor_id(); | 1358 | int cpu = raw_smp_processor_id(); |
@@ -1308,11 +1375,13 @@ static int hardware_enable(void *garbage) | |||
1308 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); | 1375 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); |
1309 | } | 1376 | } |
1310 | write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ | 1377 | write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ |
1311 | asm volatile (ASM_VMX_VMXON_RAX | ||
1312 | : : "a"(&phys_addr), "m"(phys_addr) | ||
1313 | : "memory", "cc"); | ||
1314 | 1378 | ||
1315 | ept_sync_global(); | 1379 | if (vmm_exclusive) { |
1380 | kvm_cpu_vmxon(phys_addr); | ||
1381 | ept_sync_global(); | ||
1382 | } | ||
1383 | |||
1384 | store_gdt(&__get_cpu_var(host_gdt)); | ||
1316 | 1385 | ||
1317 | return 0; | 1386 | return 0; |
1318 | } | 1387 | } |
@@ -1334,13 +1403,15 @@ static void vmclear_local_vcpus(void) | |||
1334 | static void kvm_cpu_vmxoff(void) | 1403 | static void kvm_cpu_vmxoff(void) |
1335 | { | 1404 | { |
1336 | asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); | 1405 | asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); |
1337 | write_cr4(read_cr4() & ~X86_CR4_VMXE); | ||
1338 | } | 1406 | } |
1339 | 1407 | ||
1340 | static void hardware_disable(void *garbage) | 1408 | static void hardware_disable(void *garbage) |
1341 | { | 1409 | { |
1342 | vmclear_local_vcpus(); | 1410 | if (vmm_exclusive) { |
1343 | kvm_cpu_vmxoff(); | 1411 | vmclear_local_vcpus(); |
1412 | kvm_cpu_vmxoff(); | ||
1413 | } | ||
1414 | write_cr4(read_cr4() & ~X86_CR4_VMXE); | ||
1344 | } | 1415 | } |
1345 | 1416 | ||
1346 | static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, | 1417 | static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, |
@@ -1539,7 +1610,8 @@ static __init int hardware_setup(void) | |||
1539 | if (!cpu_has_vmx_vpid()) | 1610 | if (!cpu_has_vmx_vpid()) |
1540 | enable_vpid = 0; | 1611 | enable_vpid = 0; |
1541 | 1612 | ||
1542 | if (!cpu_has_vmx_ept()) { | 1613 | if (!cpu_has_vmx_ept() || |
1614 | !cpu_has_vmx_ept_4levels()) { | ||
1543 | enable_ept = 0; | 1615 | enable_ept = 0; |
1544 | enable_unrestricted_guest = 0; | 1616 | enable_unrestricted_guest = 0; |
1545 | } | 1617 | } |
@@ -1628,7 +1700,7 @@ static gva_t rmode_tss_base(struct kvm *kvm) | |||
1628 | gfn_t base_gfn; | 1700 | gfn_t base_gfn; |
1629 | 1701 | ||
1630 | slots = kvm_memslots(kvm); | 1702 | slots = kvm_memslots(kvm); |
1631 | base_gfn = kvm->memslots->memslots[0].base_gfn + | 1703 | base_gfn = slots->memslots[0].base_gfn + |
1632 | kvm->memslots->memslots[0].npages - 3; | 1704 | kvm->memslots->memslots[0].npages - 3; |
1633 | return base_gfn << PAGE_SHIFT; | 1705 | return base_gfn << PAGE_SHIFT; |
1634 | } | 1706 | } |
@@ -1759,9 +1831,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu) | |||
1759 | 1831 | ||
1760 | static void vmx_flush_tlb(struct kvm_vcpu *vcpu) | 1832 | static void vmx_flush_tlb(struct kvm_vcpu *vcpu) |
1761 | { | 1833 | { |
1762 | vpid_sync_vcpu_all(to_vmx(vcpu)); | 1834 | vpid_sync_context(to_vmx(vcpu)); |
1763 | if (enable_ept) | 1835 | if (enable_ept) { |
1836 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
1837 | return; | ||
1764 | ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); | 1838 | ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); |
1839 | } | ||
1765 | } | 1840 | } |
1766 | 1841 | ||
1767 | static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) | 1842 | static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) |
@@ -2507,7 +2582,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2507 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); | 2582 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); |
2508 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ | 2583 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ |
2509 | 2584 | ||
2510 | vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ | 2585 | vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ |
2511 | vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ | 2586 | vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ |
2512 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ | 2587 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ |
2513 | 2588 | ||
@@ -2599,21 +2674,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2599 | 2674 | ||
2600 | static int init_rmode(struct kvm *kvm) | 2675 | static int init_rmode(struct kvm *kvm) |
2601 | { | 2676 | { |
2677 | int idx, ret = 0; | ||
2678 | |||
2679 | idx = srcu_read_lock(&kvm->srcu); | ||
2602 | if (!init_rmode_tss(kvm)) | 2680 | if (!init_rmode_tss(kvm)) |
2603 | return 0; | 2681 | goto exit; |
2604 | if (!init_rmode_identity_map(kvm)) | 2682 | if (!init_rmode_identity_map(kvm)) |
2605 | return 0; | 2683 | goto exit; |
2606 | return 1; | 2684 | |
2685 | ret = 1; | ||
2686 | exit: | ||
2687 | srcu_read_unlock(&kvm->srcu, idx); | ||
2688 | return ret; | ||
2607 | } | 2689 | } |
2608 | 2690 | ||
2609 | static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | 2691 | static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) |
2610 | { | 2692 | { |
2611 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2693 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2612 | u64 msr; | 2694 | u64 msr; |
2613 | int ret, idx; | 2695 | int ret; |
2614 | 2696 | ||
2615 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); | 2697 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); |
2616 | idx = srcu_read_lock(&vcpu->kvm->srcu); | ||
2617 | if (!init_rmode(vmx->vcpu.kvm)) { | 2698 | if (!init_rmode(vmx->vcpu.kvm)) { |
2618 | ret = -ENOMEM; | 2699 | ret = -ENOMEM; |
2619 | goto out; | 2700 | goto out; |
@@ -2630,7 +2711,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2630 | msr |= MSR_IA32_APICBASE_BSP; | 2711 | msr |= MSR_IA32_APICBASE_BSP; |
2631 | kvm_set_apic_base(&vmx->vcpu, msr); | 2712 | kvm_set_apic_base(&vmx->vcpu, msr); |
2632 | 2713 | ||
2633 | fx_init(&vmx->vcpu); | 2714 | ret = fx_init(&vmx->vcpu); |
2715 | if (ret != 0) | ||
2716 | goto out; | ||
2634 | 2717 | ||
2635 | seg_setup(VCPU_SREG_CS); | 2718 | seg_setup(VCPU_SREG_CS); |
2636 | /* | 2719 | /* |
@@ -2713,7 +2796,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2713 | vmx_fpu_activate(&vmx->vcpu); | 2796 | vmx_fpu_activate(&vmx->vcpu); |
2714 | update_exception_bitmap(&vmx->vcpu); | 2797 | update_exception_bitmap(&vmx->vcpu); |
2715 | 2798 | ||
2716 | vpid_sync_vcpu_all(vmx); | 2799 | vpid_sync_context(vmx); |
2717 | 2800 | ||
2718 | ret = 0; | 2801 | ret = 0; |
2719 | 2802 | ||
@@ -2721,7 +2804,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2721 | vmx->emulation_required = 0; | 2804 | vmx->emulation_required = 0; |
2722 | 2805 | ||
2723 | out: | 2806 | out: |
2724 | srcu_read_unlock(&vcpu->kvm->srcu, idx); | ||
2725 | return ret; | 2807 | return ret; |
2726 | } | 2808 | } |
2727 | 2809 | ||
@@ -2826,9 +2908,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) | |||
2826 | { | 2908 | { |
2827 | if (!cpu_has_virtual_nmis()) | 2909 | if (!cpu_has_virtual_nmis()) |
2828 | return to_vmx(vcpu)->soft_vnmi_blocked; | 2910 | return to_vmx(vcpu)->soft_vnmi_blocked; |
2829 | else | 2911 | return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; |
2830 | return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | ||
2831 | GUEST_INTR_STATE_NMI); | ||
2832 | } | 2912 | } |
2833 | 2913 | ||
2834 | static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) | 2914 | static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) |
@@ -3070,7 +3150,7 @@ static int handle_io(struct kvm_vcpu *vcpu) | |||
3070 | ++vcpu->stat.io_exits; | 3150 | ++vcpu->stat.io_exits; |
3071 | 3151 | ||
3072 | if (string || in) | 3152 | if (string || in) |
3073 | return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); | 3153 | return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; |
3074 | 3154 | ||
3075 | port = exit_qualification >> 16; | 3155 | port = exit_qualification >> 16; |
3076 | size = (exit_qualification & 7) + 1; | 3156 | size = (exit_qualification & 7) + 1; |
@@ -3090,11 +3170,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
3090 | hypercall[2] = 0xc1; | 3170 | hypercall[2] = 0xc1; |
3091 | } | 3171 | } |
3092 | 3172 | ||
3173 | static void complete_insn_gp(struct kvm_vcpu *vcpu, int err) | ||
3174 | { | ||
3175 | if (err) | ||
3176 | kvm_inject_gp(vcpu, 0); | ||
3177 | else | ||
3178 | skip_emulated_instruction(vcpu); | ||
3179 | } | ||
3180 | |||
3093 | static int handle_cr(struct kvm_vcpu *vcpu) | 3181 | static int handle_cr(struct kvm_vcpu *vcpu) |
3094 | { | 3182 | { |
3095 | unsigned long exit_qualification, val; | 3183 | unsigned long exit_qualification, val; |
3096 | int cr; | 3184 | int cr; |
3097 | int reg; | 3185 | int reg; |
3186 | int err; | ||
3098 | 3187 | ||
3099 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 3188 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
3100 | cr = exit_qualification & 15; | 3189 | cr = exit_qualification & 15; |
@@ -3105,16 +3194,16 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3105 | trace_kvm_cr_write(cr, val); | 3194 | trace_kvm_cr_write(cr, val); |
3106 | switch (cr) { | 3195 | switch (cr) { |
3107 | case 0: | 3196 | case 0: |
3108 | kvm_set_cr0(vcpu, val); | 3197 | err = kvm_set_cr0(vcpu, val); |
3109 | skip_emulated_instruction(vcpu); | 3198 | complete_insn_gp(vcpu, err); |
3110 | return 1; | 3199 | return 1; |
3111 | case 3: | 3200 | case 3: |
3112 | kvm_set_cr3(vcpu, val); | 3201 | err = kvm_set_cr3(vcpu, val); |
3113 | skip_emulated_instruction(vcpu); | 3202 | complete_insn_gp(vcpu, err); |
3114 | return 1; | 3203 | return 1; |
3115 | case 4: | 3204 | case 4: |
3116 | kvm_set_cr4(vcpu, val); | 3205 | err = kvm_set_cr4(vcpu, val); |
3117 | skip_emulated_instruction(vcpu); | 3206 | complete_insn_gp(vcpu, err); |
3118 | return 1; | 3207 | return 1; |
3119 | case 8: { | 3208 | case 8: { |
3120 | u8 cr8_prev = kvm_get_cr8(vcpu); | 3209 | u8 cr8_prev = kvm_get_cr8(vcpu); |
@@ -3321,30 +3410,25 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) | |||
3321 | static int handle_wbinvd(struct kvm_vcpu *vcpu) | 3410 | static int handle_wbinvd(struct kvm_vcpu *vcpu) |
3322 | { | 3411 | { |
3323 | skip_emulated_instruction(vcpu); | 3412 | skip_emulated_instruction(vcpu); |
3324 | /* TODO: Add support for VT-d/pass-through device */ | 3413 | kvm_emulate_wbinvd(vcpu); |
3325 | return 1; | 3414 | return 1; |
3326 | } | 3415 | } |
3327 | 3416 | ||
3328 | static int handle_apic_access(struct kvm_vcpu *vcpu) | 3417 | static int handle_xsetbv(struct kvm_vcpu *vcpu) |
3329 | { | 3418 | { |
3330 | unsigned long exit_qualification; | 3419 | u64 new_bv = kvm_read_edx_eax(vcpu); |
3331 | enum emulation_result er; | 3420 | u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); |
3332 | unsigned long offset; | ||
3333 | 3421 | ||
3334 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 3422 | if (kvm_set_xcr(vcpu, index, new_bv) == 0) |
3335 | offset = exit_qualification & 0xffful; | 3423 | skip_emulated_instruction(vcpu); |
3336 | |||
3337 | er = emulate_instruction(vcpu, 0, 0, 0); | ||
3338 | |||
3339 | if (er != EMULATE_DONE) { | ||
3340 | printk(KERN_ERR | ||
3341 | "Fail to handle apic access vmexit! Offset is 0x%lx\n", | ||
3342 | offset); | ||
3343 | return -ENOEXEC; | ||
3344 | } | ||
3345 | return 1; | 3424 | return 1; |
3346 | } | 3425 | } |
3347 | 3426 | ||
3427 | static int handle_apic_access(struct kvm_vcpu *vcpu) | ||
3428 | { | ||
3429 | return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; | ||
3430 | } | ||
3431 | |||
3348 | static int handle_task_switch(struct kvm_vcpu *vcpu) | 3432 | static int handle_task_switch(struct kvm_vcpu *vcpu) |
3349 | { | 3433 | { |
3350 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3434 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
@@ -3554,13 +3638,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
3554 | goto out; | 3638 | goto out; |
3555 | } | 3639 | } |
3556 | 3640 | ||
3557 | if (err != EMULATE_DONE) { | 3641 | if (err != EMULATE_DONE) |
3558 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | 3642 | return 0; |
3559 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | ||
3560 | vcpu->run->internal.ndata = 0; | ||
3561 | ret = 0; | ||
3562 | goto out; | ||
3563 | } | ||
3564 | 3643 | ||
3565 | if (signal_pending(current)) | 3644 | if (signal_pending(current)) |
3566 | goto out; | 3645 | goto out; |
@@ -3623,6 +3702,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3623 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | 3702 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, |
3624 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | 3703 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, |
3625 | [EXIT_REASON_WBINVD] = handle_wbinvd, | 3704 | [EXIT_REASON_WBINVD] = handle_wbinvd, |
3705 | [EXIT_REASON_XSETBV] = handle_xsetbv, | ||
3626 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, | 3706 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, |
3627 | [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, | 3707 | [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, |
3628 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, | 3708 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, |
@@ -3656,6 +3736,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
3656 | if (enable_ept && is_paging(vcpu)) | 3736 | if (enable_ept && is_paging(vcpu)) |
3657 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | 3737 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); |
3658 | 3738 | ||
3739 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { | ||
3740 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; | ||
3741 | vcpu->run->fail_entry.hardware_entry_failure_reason | ||
3742 | = exit_reason; | ||
3743 | return 0; | ||
3744 | } | ||
3745 | |||
3659 | if (unlikely(vmx->fail)) { | 3746 | if (unlikely(vmx->fail)) { |
3660 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 3747 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
3661 | vcpu->run->fail_entry.hardware_entry_failure_reason | 3748 | vcpu->run->fail_entry.hardware_entry_failure_reason |
@@ -3861,11 +3948,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
3861 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | 3948 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) |
3862 | vmx_set_interrupt_shadow(vcpu, 0); | 3949 | vmx_set_interrupt_shadow(vcpu, 0); |
3863 | 3950 | ||
3864 | /* | ||
3865 | * Loading guest fpu may have cleared host cr0.ts | ||
3866 | */ | ||
3867 | vmcs_writel(HOST_CR0, read_cr0()); | ||
3868 | |||
3869 | asm( | 3951 | asm( |
3870 | /* Store host registers */ | 3952 | /* Store host registers */ |
3871 | "push %%"R"dx; push %%"R"bp;" | 3953 | "push %%"R"dx; push %%"R"bp;" |
@@ -4001,6 +4083,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | |||
4001 | kmem_cache_free(kvm_vcpu_cache, vmx); | 4083 | kmem_cache_free(kvm_vcpu_cache, vmx); |
4002 | } | 4084 | } |
4003 | 4085 | ||
4086 | static inline void vmcs_init(struct vmcs *vmcs) | ||
4087 | { | ||
4088 | u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); | ||
4089 | |||
4090 | if (!vmm_exclusive) | ||
4091 | kvm_cpu_vmxon(phys_addr); | ||
4092 | |||
4093 | vmcs_clear(vmcs); | ||
4094 | |||
4095 | if (!vmm_exclusive) | ||
4096 | kvm_cpu_vmxoff(); | ||
4097 | } | ||
4098 | |||
4004 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | 4099 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) |
4005 | { | 4100 | { |
4006 | int err; | 4101 | int err; |
@@ -4026,7 +4121,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
4026 | if (!vmx->vmcs) | 4121 | if (!vmx->vmcs) |
4027 | goto free_msrs; | 4122 | goto free_msrs; |
4028 | 4123 | ||
4029 | vmcs_clear(vmx->vmcs); | 4124 | vmcs_init(vmx->vmcs); |
4030 | 4125 | ||
4031 | cpu = get_cpu(); | 4126 | cpu = get_cpu(); |
4032 | vmx_vcpu_load(&vmx->vcpu, cpu); | 4127 | vmx_vcpu_load(&vmx->vcpu, cpu); |
@@ -4265,6 +4360,8 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
4265 | .rdtscp_supported = vmx_rdtscp_supported, | 4360 | .rdtscp_supported = vmx_rdtscp_supported, |
4266 | 4361 | ||
4267 | .set_supported_cpuid = vmx_set_supported_cpuid, | 4362 | .set_supported_cpuid = vmx_set_supported_cpuid, |
4363 | |||
4364 | .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, | ||
4268 | }; | 4365 | }; |
4269 | 4366 | ||
4270 | static int __init vmx_init(void) | 4367 | static int __init vmx_init(void) |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7fa89c39c64f..25f19078b321 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -6,6 +6,7 @@ | |||
6 | * Copyright (C) 2006 Qumranet, Inc. | 6 | * Copyright (C) 2006 Qumranet, Inc. |
7 | * Copyright (C) 2008 Qumranet, Inc. | 7 | * Copyright (C) 2008 Qumranet, Inc. |
8 | * Copyright IBM Corporation, 2008 | 8 | * Copyright IBM Corporation, 2008 |
9 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. | ||
9 | * | 10 | * |
10 | * Authors: | 11 | * Authors: |
11 | * Avi Kivity <avi@qumranet.com> | 12 | * Avi Kivity <avi@qumranet.com> |
@@ -41,17 +42,19 @@ | |||
41 | #include <linux/srcu.h> | 42 | #include <linux/srcu.h> |
42 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
43 | #include <linux/perf_event.h> | 44 | #include <linux/perf_event.h> |
45 | #include <linux/uaccess.h> | ||
44 | #include <trace/events/kvm.h> | 46 | #include <trace/events/kvm.h> |
45 | 47 | ||
46 | #define CREATE_TRACE_POINTS | 48 | #define CREATE_TRACE_POINTS |
47 | #include "trace.h" | 49 | #include "trace.h" |
48 | 50 | ||
49 | #include <asm/debugreg.h> | 51 | #include <asm/debugreg.h> |
50 | #include <asm/uaccess.h> | ||
51 | #include <asm/msr.h> | 52 | #include <asm/msr.h> |
52 | #include <asm/desc.h> | 53 | #include <asm/desc.h> |
53 | #include <asm/mtrr.h> | 54 | #include <asm/mtrr.h> |
54 | #include <asm/mce.h> | 55 | #include <asm/mce.h> |
56 | #include <asm/i387.h> | ||
57 | #include <asm/xcr.h> | ||
55 | 58 | ||
56 | #define MAX_IO_MSRS 256 | 59 | #define MAX_IO_MSRS 256 |
57 | #define CR0_RESERVED_BITS \ | 60 | #define CR0_RESERVED_BITS \ |
@@ -62,6 +65,7 @@ | |||
62 | (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | 65 | (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ |
63 | | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | 66 | | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ |
64 | | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ | 67 | | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ |
68 | | X86_CR4_OSXSAVE \ | ||
65 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) | 69 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) |
66 | 70 | ||
67 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) | 71 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) |
@@ -147,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
147 | { NULL } | 151 | { NULL } |
148 | }; | 152 | }; |
149 | 153 | ||
154 | u64 __read_mostly host_xcr0; | ||
155 | |||
156 | static inline u32 bit(int bitno) | ||
157 | { | ||
158 | return 1 << (bitno & 31); | ||
159 | } | ||
160 | |||
150 | static void kvm_on_user_return(struct user_return_notifier *urn) | 161 | static void kvm_on_user_return(struct user_return_notifier *urn) |
151 | { | 162 | { |
152 | unsigned slot; | 163 | unsigned slot; |
@@ -285,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, | |||
285 | prev_nr = vcpu->arch.exception.nr; | 296 | prev_nr = vcpu->arch.exception.nr; |
286 | if (prev_nr == DF_VECTOR) { | 297 | if (prev_nr == DF_VECTOR) { |
287 | /* triple fault -> shutdown */ | 298 | /* triple fault -> shutdown */ |
288 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); | 299 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
289 | return; | 300 | return; |
290 | } | 301 | } |
291 | class1 = exception_class(prev_nr); | 302 | class1 = exception_class(prev_nr); |
@@ -414,121 +425,163 @@ out: | |||
414 | return changed; | 425 | return changed; |
415 | } | 426 | } |
416 | 427 | ||
417 | void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | 428 | int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
418 | { | 429 | { |
430 | unsigned long old_cr0 = kvm_read_cr0(vcpu); | ||
431 | unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | | ||
432 | X86_CR0_CD | X86_CR0_NW; | ||
433 | |||
419 | cr0 |= X86_CR0_ET; | 434 | cr0 |= X86_CR0_ET; |
420 | 435 | ||
421 | #ifdef CONFIG_X86_64 | 436 | #ifdef CONFIG_X86_64 |
422 | if (cr0 & 0xffffffff00000000UL) { | 437 | if (cr0 & 0xffffffff00000000UL) |
423 | kvm_inject_gp(vcpu, 0); | 438 | return 1; |
424 | return; | ||
425 | } | ||
426 | #endif | 439 | #endif |
427 | 440 | ||
428 | cr0 &= ~CR0_RESERVED_BITS; | 441 | cr0 &= ~CR0_RESERVED_BITS; |
429 | 442 | ||
430 | if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { | 443 | if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) |
431 | kvm_inject_gp(vcpu, 0); | 444 | return 1; |
432 | return; | ||
433 | } | ||
434 | 445 | ||
435 | if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { | 446 | if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) |
436 | kvm_inject_gp(vcpu, 0); | 447 | return 1; |
437 | return; | ||
438 | } | ||
439 | 448 | ||
440 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { | 449 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { |
441 | #ifdef CONFIG_X86_64 | 450 | #ifdef CONFIG_X86_64 |
442 | if ((vcpu->arch.efer & EFER_LME)) { | 451 | if ((vcpu->arch.efer & EFER_LME)) { |
443 | int cs_db, cs_l; | 452 | int cs_db, cs_l; |
444 | 453 | ||
445 | if (!is_pae(vcpu)) { | 454 | if (!is_pae(vcpu)) |
446 | kvm_inject_gp(vcpu, 0); | 455 | return 1; |
447 | return; | ||
448 | } | ||
449 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | 456 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); |
450 | if (cs_l) { | 457 | if (cs_l) |
451 | kvm_inject_gp(vcpu, 0); | 458 | return 1; |
452 | return; | ||
453 | |||
454 | } | ||
455 | } else | 459 | } else |
456 | #endif | 460 | #endif |
457 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { | 461 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) |
458 | kvm_inject_gp(vcpu, 0); | 462 | return 1; |
459 | return; | ||
460 | } | ||
461 | |||
462 | } | 463 | } |
463 | 464 | ||
464 | kvm_x86_ops->set_cr0(vcpu, cr0); | 465 | kvm_x86_ops->set_cr0(vcpu, cr0); |
465 | 466 | ||
466 | kvm_mmu_reset_context(vcpu); | 467 | if ((cr0 ^ old_cr0) & update_bits) |
467 | return; | 468 | kvm_mmu_reset_context(vcpu); |
469 | return 0; | ||
468 | } | 470 | } |
469 | EXPORT_SYMBOL_GPL(kvm_set_cr0); | 471 | EXPORT_SYMBOL_GPL(kvm_set_cr0); |
470 | 472 | ||
471 | void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) | 473 | void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) |
472 | { | 474 | { |
473 | kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); | 475 | (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); |
474 | } | 476 | } |
475 | EXPORT_SYMBOL_GPL(kvm_lmsw); | 477 | EXPORT_SYMBOL_GPL(kvm_lmsw); |
476 | 478 | ||
477 | void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 479 | int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) |
478 | { | 480 | { |
479 | unsigned long old_cr4 = kvm_read_cr4(vcpu); | 481 | u64 xcr0; |
480 | unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; | ||
481 | 482 | ||
482 | if (cr4 & CR4_RESERVED_BITS) { | 483 | /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ |
484 | if (index != XCR_XFEATURE_ENABLED_MASK) | ||
485 | return 1; | ||
486 | xcr0 = xcr; | ||
487 | if (kvm_x86_ops->get_cpl(vcpu) != 0) | ||
488 | return 1; | ||
489 | if (!(xcr0 & XSTATE_FP)) | ||
490 | return 1; | ||
491 | if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) | ||
492 | return 1; | ||
493 | if (xcr0 & ~host_xcr0) | ||
494 | return 1; | ||
495 | vcpu->arch.xcr0 = xcr0; | ||
496 | vcpu->guest_xcr0_loaded = 0; | ||
497 | return 0; | ||
498 | } | ||
499 | |||
500 | int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) | ||
501 | { | ||
502 | if (__kvm_set_xcr(vcpu, index, xcr)) { | ||
483 | kvm_inject_gp(vcpu, 0); | 503 | kvm_inject_gp(vcpu, 0); |
504 | return 1; | ||
505 | } | ||
506 | return 0; | ||
507 | } | ||
508 | EXPORT_SYMBOL_GPL(kvm_set_xcr); | ||
509 | |||
510 | static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) | ||
511 | { | ||
512 | struct kvm_cpuid_entry2 *best; | ||
513 | |||
514 | best = kvm_find_cpuid_entry(vcpu, 1, 0); | ||
515 | return best && (best->ecx & bit(X86_FEATURE_XSAVE)); | ||
516 | } | ||
517 | |||
518 | static void update_cpuid(struct kvm_vcpu *vcpu) | ||
519 | { | ||
520 | struct kvm_cpuid_entry2 *best; | ||
521 | |||
522 | best = kvm_find_cpuid_entry(vcpu, 1, 0); | ||
523 | if (!best) | ||
484 | return; | 524 | return; |
525 | |||
526 | /* Update OSXSAVE bit */ | ||
527 | if (cpu_has_xsave && best->function == 0x1) { | ||
528 | best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); | ||
529 | if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) | ||
530 | best->ecx |= bit(X86_FEATURE_OSXSAVE); | ||
485 | } | 531 | } |
532 | } | ||
533 | |||
534 | int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
535 | { | ||
536 | unsigned long old_cr4 = kvm_read_cr4(vcpu); | ||
537 | unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; | ||
538 | |||
539 | if (cr4 & CR4_RESERVED_BITS) | ||
540 | return 1; | ||
541 | |||
542 | if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) | ||
543 | return 1; | ||
486 | 544 | ||
487 | if (is_long_mode(vcpu)) { | 545 | if (is_long_mode(vcpu)) { |
488 | if (!(cr4 & X86_CR4_PAE)) { | 546 | if (!(cr4 & X86_CR4_PAE)) |
489 | kvm_inject_gp(vcpu, 0); | 547 | return 1; |
490 | return; | ||
491 | } | ||
492 | } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) | 548 | } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) |
493 | && ((cr4 ^ old_cr4) & pdptr_bits) | 549 | && ((cr4 ^ old_cr4) & pdptr_bits) |
494 | && !load_pdptrs(vcpu, vcpu->arch.cr3)) { | 550 | && !load_pdptrs(vcpu, vcpu->arch.cr3)) |
495 | kvm_inject_gp(vcpu, 0); | 551 | return 1; |
496 | return; | 552 | |
497 | } | 553 | if (cr4 & X86_CR4_VMXE) |
554 | return 1; | ||
498 | 555 | ||
499 | if (cr4 & X86_CR4_VMXE) { | ||
500 | kvm_inject_gp(vcpu, 0); | ||
501 | return; | ||
502 | } | ||
503 | kvm_x86_ops->set_cr4(vcpu, cr4); | 556 | kvm_x86_ops->set_cr4(vcpu, cr4); |
504 | vcpu->arch.cr4 = cr4; | 557 | |
505 | kvm_mmu_reset_context(vcpu); | 558 | if ((cr4 ^ old_cr4) & pdptr_bits) |
559 | kvm_mmu_reset_context(vcpu); | ||
560 | |||
561 | if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) | ||
562 | update_cpuid(vcpu); | ||
563 | |||
564 | return 0; | ||
506 | } | 565 | } |
507 | EXPORT_SYMBOL_GPL(kvm_set_cr4); | 566 | EXPORT_SYMBOL_GPL(kvm_set_cr4); |
508 | 567 | ||
509 | void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 568 | int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
510 | { | 569 | { |
511 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { | 570 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { |
512 | kvm_mmu_sync_roots(vcpu); | 571 | kvm_mmu_sync_roots(vcpu); |
513 | kvm_mmu_flush_tlb(vcpu); | 572 | kvm_mmu_flush_tlb(vcpu); |
514 | return; | 573 | return 0; |
515 | } | 574 | } |
516 | 575 | ||
517 | if (is_long_mode(vcpu)) { | 576 | if (is_long_mode(vcpu)) { |
518 | if (cr3 & CR3_L_MODE_RESERVED_BITS) { | 577 | if (cr3 & CR3_L_MODE_RESERVED_BITS) |
519 | kvm_inject_gp(vcpu, 0); | 578 | return 1; |
520 | return; | ||
521 | } | ||
522 | } else { | 579 | } else { |
523 | if (is_pae(vcpu)) { | 580 | if (is_pae(vcpu)) { |
524 | if (cr3 & CR3_PAE_RESERVED_BITS) { | 581 | if (cr3 & CR3_PAE_RESERVED_BITS) |
525 | kvm_inject_gp(vcpu, 0); | 582 | return 1; |
526 | return; | 583 | if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) |
527 | } | 584 | return 1; |
528 | if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { | ||
529 | kvm_inject_gp(vcpu, 0); | ||
530 | return; | ||
531 | } | ||
532 | } | 585 | } |
533 | /* | 586 | /* |
534 | * We don't check reserved bits in nonpae mode, because | 587 | * We don't check reserved bits in nonpae mode, because |
@@ -546,24 +599,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
546 | * to debug) behavior on the guest side. | 599 | * to debug) behavior on the guest side. |
547 | */ | 600 | */ |
548 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) | 601 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) |
549 | kvm_inject_gp(vcpu, 0); | 602 | return 1; |
550 | else { | 603 | vcpu->arch.cr3 = cr3; |
551 | vcpu->arch.cr3 = cr3; | 604 | vcpu->arch.mmu.new_cr3(vcpu); |
552 | vcpu->arch.mmu.new_cr3(vcpu); | 605 | return 0; |
553 | } | ||
554 | } | 606 | } |
555 | EXPORT_SYMBOL_GPL(kvm_set_cr3); | 607 | EXPORT_SYMBOL_GPL(kvm_set_cr3); |
556 | 608 | ||
557 | void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | 609 | int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) |
558 | { | 610 | { |
559 | if (cr8 & CR8_RESERVED_BITS) { | 611 | if (cr8 & CR8_RESERVED_BITS) |
560 | kvm_inject_gp(vcpu, 0); | 612 | return 1; |
561 | return; | ||
562 | } | ||
563 | if (irqchip_in_kernel(vcpu->kvm)) | 613 | if (irqchip_in_kernel(vcpu->kvm)) |
564 | kvm_lapic_set_tpr(vcpu, cr8); | 614 | kvm_lapic_set_tpr(vcpu, cr8); |
565 | else | 615 | else |
566 | vcpu->arch.cr8 = cr8; | 616 | vcpu->arch.cr8 = cr8; |
617 | return 0; | ||
618 | } | ||
619 | |||
620 | void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | ||
621 | { | ||
622 | if (__kvm_set_cr8(vcpu, cr8)) | ||
623 | kvm_inject_gp(vcpu, 0); | ||
567 | } | 624 | } |
568 | EXPORT_SYMBOL_GPL(kvm_set_cr8); | 625 | EXPORT_SYMBOL_GPL(kvm_set_cr8); |
569 | 626 | ||
@@ -576,7 +633,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) | |||
576 | } | 633 | } |
577 | EXPORT_SYMBOL_GPL(kvm_get_cr8); | 634 | EXPORT_SYMBOL_GPL(kvm_get_cr8); |
578 | 635 | ||
579 | int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) | 636 | static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) |
580 | { | 637 | { |
581 | switch (dr) { | 638 | switch (dr) { |
582 | case 0 ... 3: | 639 | case 0 ... 3: |
@@ -585,29 +642,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) | |||
585 | vcpu->arch.eff_db[dr] = val; | 642 | vcpu->arch.eff_db[dr] = val; |
586 | break; | 643 | break; |
587 | case 4: | 644 | case 4: |
588 | if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { | 645 | if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) |
589 | kvm_queue_exception(vcpu, UD_VECTOR); | 646 | return 1; /* #UD */ |
590 | return 1; | ||
591 | } | ||
592 | /* fall through */ | 647 | /* fall through */ |
593 | case 6: | 648 | case 6: |
594 | if (val & 0xffffffff00000000ULL) { | 649 | if (val & 0xffffffff00000000ULL) |
595 | kvm_inject_gp(vcpu, 0); | 650 | return -1; /* #GP */ |
596 | return 1; | ||
597 | } | ||
598 | vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; | 651 | vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; |
599 | break; | 652 | break; |
600 | case 5: | 653 | case 5: |
601 | if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { | 654 | if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) |
602 | kvm_queue_exception(vcpu, UD_VECTOR); | 655 | return 1; /* #UD */ |
603 | return 1; | ||
604 | } | ||
605 | /* fall through */ | 656 | /* fall through */ |
606 | default: /* 7 */ | 657 | default: /* 7 */ |
607 | if (val & 0xffffffff00000000ULL) { | 658 | if (val & 0xffffffff00000000ULL) |
608 | kvm_inject_gp(vcpu, 0); | 659 | return -1; /* #GP */ |
609 | return 1; | ||
610 | } | ||
611 | vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; | 660 | vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; |
612 | if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { | 661 | if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { |
613 | kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); | 662 | kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); |
@@ -618,28 +667,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) | |||
618 | 667 | ||
619 | return 0; | 668 | return 0; |
620 | } | 669 | } |
670 | |||
671 | int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) | ||
672 | { | ||
673 | int res; | ||
674 | |||
675 | res = __kvm_set_dr(vcpu, dr, val); | ||
676 | if (res > 0) | ||
677 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
678 | else if (res < 0) | ||
679 | kvm_inject_gp(vcpu, 0); | ||
680 | |||
681 | return res; | ||
682 | } | ||
621 | EXPORT_SYMBOL_GPL(kvm_set_dr); | 683 | EXPORT_SYMBOL_GPL(kvm_set_dr); |
622 | 684 | ||
623 | int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) | 685 | static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) |
624 | { | 686 | { |
625 | switch (dr) { | 687 | switch (dr) { |
626 | case 0 ... 3: | 688 | case 0 ... 3: |
627 | *val = vcpu->arch.db[dr]; | 689 | *val = vcpu->arch.db[dr]; |
628 | break; | 690 | break; |
629 | case 4: | 691 | case 4: |
630 | if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { | 692 | if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) |
631 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
632 | return 1; | 693 | return 1; |
633 | } | ||
634 | /* fall through */ | 694 | /* fall through */ |
635 | case 6: | 695 | case 6: |
636 | *val = vcpu->arch.dr6; | 696 | *val = vcpu->arch.dr6; |
637 | break; | 697 | break; |
638 | case 5: | 698 | case 5: |
639 | if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { | 699 | if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) |
640 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
641 | return 1; | 700 | return 1; |
642 | } | ||
643 | /* fall through */ | 701 | /* fall through */ |
644 | default: /* 7 */ | 702 | default: /* 7 */ |
645 | *val = vcpu->arch.dr7; | 703 | *val = vcpu->arch.dr7; |
@@ -648,12 +706,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) | |||
648 | 706 | ||
649 | return 0; | 707 | return 0; |
650 | } | 708 | } |
651 | EXPORT_SYMBOL_GPL(kvm_get_dr); | ||
652 | 709 | ||
653 | static inline u32 bit(int bitno) | 710 | int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) |
654 | { | 711 | { |
655 | return 1 << (bitno & 31); | 712 | if (_kvm_get_dr(vcpu, dr, val)) { |
713 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
714 | return 1; | ||
715 | } | ||
716 | return 0; | ||
656 | } | 717 | } |
718 | EXPORT_SYMBOL_GPL(kvm_get_dr); | ||
657 | 719 | ||
658 | /* | 720 | /* |
659 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS | 721 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS |
@@ -671,7 +733,7 @@ static u32 msrs_to_save[] = { | |||
671 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, | 733 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, |
672 | HV_X64_MSR_APIC_ASSIST_PAGE, | 734 | HV_X64_MSR_APIC_ASSIST_PAGE, |
673 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | 735 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, |
674 | MSR_K6_STAR, | 736 | MSR_STAR, |
675 | #ifdef CONFIG_X86_64 | 737 | #ifdef CONFIG_X86_64 |
676 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | 738 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, |
677 | #endif | 739 | #endif |
@@ -682,10 +744,14 @@ static unsigned num_msrs_to_save; | |||
682 | 744 | ||
683 | static u32 emulated_msrs[] = { | 745 | static u32 emulated_msrs[] = { |
684 | MSR_IA32_MISC_ENABLE, | 746 | MSR_IA32_MISC_ENABLE, |
747 | MSR_IA32_MCG_STATUS, | ||
748 | MSR_IA32_MCG_CTL, | ||
685 | }; | 749 | }; |
686 | 750 | ||
687 | static int set_efer(struct kvm_vcpu *vcpu, u64 efer) | 751 | static int set_efer(struct kvm_vcpu *vcpu, u64 efer) |
688 | { | 752 | { |
753 | u64 old_efer = vcpu->arch.efer; | ||
754 | |||
689 | if (efer & efer_reserved_bits) | 755 | if (efer & efer_reserved_bits) |
690 | return 1; | 756 | return 1; |
691 | 757 | ||
@@ -714,11 +780,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
714 | 780 | ||
715 | kvm_x86_ops->set_efer(vcpu, efer); | 781 | kvm_x86_ops->set_efer(vcpu, efer); |
716 | 782 | ||
717 | vcpu->arch.efer = efer; | ||
718 | |||
719 | vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; | 783 | vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; |
720 | kvm_mmu_reset_context(vcpu); | 784 | kvm_mmu_reset_context(vcpu); |
721 | 785 | ||
786 | /* Update reserved bits */ | ||
787 | if ((efer ^ old_efer) & EFER_NX) | ||
788 | kvm_mmu_reset_context(vcpu); | ||
789 | |||
722 | return 0; | 790 | return 0; |
723 | } | 791 | } |
724 | 792 | ||
@@ -882,7 +950,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v) | |||
882 | 950 | ||
883 | if (!vcpu->time_page) | 951 | if (!vcpu->time_page) |
884 | return 0; | 952 | return 0; |
885 | set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); | 953 | kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); |
886 | return 1; | 954 | return 1; |
887 | } | 955 | } |
888 | 956 | ||
@@ -1524,16 +1592,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, | |||
1524 | { | 1592 | { |
1525 | int i, idx; | 1593 | int i, idx; |
1526 | 1594 | ||
1527 | vcpu_load(vcpu); | ||
1528 | |||
1529 | idx = srcu_read_lock(&vcpu->kvm->srcu); | 1595 | idx = srcu_read_lock(&vcpu->kvm->srcu); |
1530 | for (i = 0; i < msrs->nmsrs; ++i) | 1596 | for (i = 0; i < msrs->nmsrs; ++i) |
1531 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) | 1597 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) |
1532 | break; | 1598 | break; |
1533 | srcu_read_unlock(&vcpu->kvm->srcu, idx); | 1599 | srcu_read_unlock(&vcpu->kvm->srcu, idx); |
1534 | 1600 | ||
1535 | vcpu_put(vcpu); | ||
1536 | |||
1537 | return i; | 1601 | return i; |
1538 | } | 1602 | } |
1539 | 1603 | ||
@@ -1618,6 +1682,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
1618 | case KVM_CAP_PCI_SEGMENT: | 1682 | case KVM_CAP_PCI_SEGMENT: |
1619 | case KVM_CAP_DEBUGREGS: | 1683 | case KVM_CAP_DEBUGREGS: |
1620 | case KVM_CAP_X86_ROBUST_SINGLESTEP: | 1684 | case KVM_CAP_X86_ROBUST_SINGLESTEP: |
1685 | case KVM_CAP_XSAVE: | ||
1621 | r = 1; | 1686 | r = 1; |
1622 | break; | 1687 | break; |
1623 | case KVM_CAP_COALESCED_MMIO: | 1688 | case KVM_CAP_COALESCED_MMIO: |
@@ -1641,6 +1706,9 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
1641 | case KVM_CAP_MCE: | 1706 | case KVM_CAP_MCE: |
1642 | r = KVM_MAX_MCE_BANKS; | 1707 | r = KVM_MAX_MCE_BANKS; |
1643 | break; | 1708 | break; |
1709 | case KVM_CAP_XCRS: | ||
1710 | r = cpu_has_xsave; | ||
1711 | break; | ||
1644 | default: | 1712 | default: |
1645 | r = 0; | 1713 | r = 0; |
1646 | break; | 1714 | break; |
@@ -1717,8 +1785,28 @@ out: | |||
1717 | return r; | 1785 | return r; |
1718 | } | 1786 | } |
1719 | 1787 | ||
1788 | static void wbinvd_ipi(void *garbage) | ||
1789 | { | ||
1790 | wbinvd(); | ||
1791 | } | ||
1792 | |||
1793 | static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) | ||
1794 | { | ||
1795 | return vcpu->kvm->arch.iommu_domain && | ||
1796 | !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); | ||
1797 | } | ||
1798 | |||
1720 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | 1799 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) |
1721 | { | 1800 | { |
1801 | /* Address WBINVD may be executed by guest */ | ||
1802 | if (need_emulate_wbinvd(vcpu)) { | ||
1803 | if (kvm_x86_ops->has_wbinvd_exit()) | ||
1804 | cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); | ||
1805 | else if (vcpu->cpu != -1 && vcpu->cpu != cpu) | ||
1806 | smp_call_function_single(vcpu->cpu, | ||
1807 | wbinvd_ipi, NULL, 1); | ||
1808 | } | ||
1809 | |||
1722 | kvm_x86_ops->vcpu_load(vcpu, cpu); | 1810 | kvm_x86_ops->vcpu_load(vcpu, cpu); |
1723 | if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { | 1811 | if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { |
1724 | unsigned long khz = cpufreq_quick_get(cpu); | 1812 | unsigned long khz = cpufreq_quick_get(cpu); |
@@ -1731,8 +1819,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
1731 | 1819 | ||
1732 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | 1820 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) |
1733 | { | 1821 | { |
1734 | kvm_put_guest_fpu(vcpu); | ||
1735 | kvm_x86_ops->vcpu_put(vcpu); | 1822 | kvm_x86_ops->vcpu_put(vcpu); |
1823 | kvm_put_guest_fpu(vcpu); | ||
1736 | } | 1824 | } |
1737 | 1825 | ||
1738 | static int is_efer_nx(void) | 1826 | static int is_efer_nx(void) |
@@ -1781,7 +1869,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, | |||
1781 | if (copy_from_user(cpuid_entries, entries, | 1869 | if (copy_from_user(cpuid_entries, entries, |
1782 | cpuid->nent * sizeof(struct kvm_cpuid_entry))) | 1870 | cpuid->nent * sizeof(struct kvm_cpuid_entry))) |
1783 | goto out_free; | 1871 | goto out_free; |
1784 | vcpu_load(vcpu); | ||
1785 | for (i = 0; i < cpuid->nent; i++) { | 1872 | for (i = 0; i < cpuid->nent; i++) { |
1786 | vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; | 1873 | vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; |
1787 | vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; | 1874 | vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; |
@@ -1799,7 +1886,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, | |||
1799 | r = 0; | 1886 | r = 0; |
1800 | kvm_apic_set_version(vcpu); | 1887 | kvm_apic_set_version(vcpu); |
1801 | kvm_x86_ops->cpuid_update(vcpu); | 1888 | kvm_x86_ops->cpuid_update(vcpu); |
1802 | vcpu_put(vcpu); | 1889 | update_cpuid(vcpu); |
1803 | 1890 | ||
1804 | out_free: | 1891 | out_free: |
1805 | vfree(cpuid_entries); | 1892 | vfree(cpuid_entries); |
@@ -1820,11 +1907,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, | |||
1820 | if (copy_from_user(&vcpu->arch.cpuid_entries, entries, | 1907 | if (copy_from_user(&vcpu->arch.cpuid_entries, entries, |
1821 | cpuid->nent * sizeof(struct kvm_cpuid_entry2))) | 1908 | cpuid->nent * sizeof(struct kvm_cpuid_entry2))) |
1822 | goto out; | 1909 | goto out; |
1823 | vcpu_load(vcpu); | ||
1824 | vcpu->arch.cpuid_nent = cpuid->nent; | 1910 | vcpu->arch.cpuid_nent = cpuid->nent; |
1825 | kvm_apic_set_version(vcpu); | 1911 | kvm_apic_set_version(vcpu); |
1826 | kvm_x86_ops->cpuid_update(vcpu); | 1912 | kvm_x86_ops->cpuid_update(vcpu); |
1827 | vcpu_put(vcpu); | 1913 | update_cpuid(vcpu); |
1828 | return 0; | 1914 | return 0; |
1829 | 1915 | ||
1830 | out: | 1916 | out: |
@@ -1837,7 +1923,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, | |||
1837 | { | 1923 | { |
1838 | int r; | 1924 | int r; |
1839 | 1925 | ||
1840 | vcpu_load(vcpu); | ||
1841 | r = -E2BIG; | 1926 | r = -E2BIG; |
1842 | if (cpuid->nent < vcpu->arch.cpuid_nent) | 1927 | if (cpuid->nent < vcpu->arch.cpuid_nent) |
1843 | goto out; | 1928 | goto out; |
@@ -1849,7 +1934,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, | |||
1849 | 1934 | ||
1850 | out: | 1935 | out: |
1851 | cpuid->nent = vcpu->arch.cpuid_nent; | 1936 | cpuid->nent = vcpu->arch.cpuid_nent; |
1852 | vcpu_put(vcpu); | ||
1853 | return r; | 1937 | return r; |
1854 | } | 1938 | } |
1855 | 1939 | ||
@@ -1901,13 +1985,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1901 | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); | 1985 | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); |
1902 | /* cpuid 1.ecx */ | 1986 | /* cpuid 1.ecx */ |
1903 | const u32 kvm_supported_word4_x86_features = | 1987 | const u32 kvm_supported_word4_x86_features = |
1904 | F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | | 1988 | F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | |
1905 | 0 /* DS-CPL, VMX, SMX, EST */ | | 1989 | 0 /* DS-CPL, VMX, SMX, EST */ | |
1906 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | | 1990 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | |
1907 | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | | 1991 | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | |
1908 | 0 /* Reserved, DCA */ | F(XMM4_1) | | 1992 | 0 /* Reserved, DCA */ | F(XMM4_1) | |
1909 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | | 1993 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | |
1910 | 0 /* Reserved, XSAVE, OSXSAVE */; | 1994 | 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); |
1911 | /* cpuid 0x80000001.ecx */ | 1995 | /* cpuid 0x80000001.ecx */ |
1912 | const u32 kvm_supported_word6_x86_features = | 1996 | const u32 kvm_supported_word6_x86_features = |
1913 | F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | | 1997 | F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | |
@@ -1922,7 +2006,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1922 | 2006 | ||
1923 | switch (function) { | 2007 | switch (function) { |
1924 | case 0: | 2008 | case 0: |
1925 | entry->eax = min(entry->eax, (u32)0xb); | 2009 | entry->eax = min(entry->eax, (u32)0xd); |
1926 | break; | 2010 | break; |
1927 | case 1: | 2011 | case 1: |
1928 | entry->edx &= kvm_supported_word0_x86_features; | 2012 | entry->edx &= kvm_supported_word0_x86_features; |
@@ -1980,6 +2064,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1980 | } | 2064 | } |
1981 | break; | 2065 | break; |
1982 | } | 2066 | } |
2067 | case 0xd: { | ||
2068 | int i; | ||
2069 | |||
2070 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
2071 | for (i = 1; *nent < maxnent; ++i) { | ||
2072 | if (entry[i - 1].eax == 0 && i != 2) | ||
2073 | break; | ||
2074 | do_cpuid_1_ent(&entry[i], function, i); | ||
2075 | entry[i].flags |= | ||
2076 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
2077 | ++*nent; | ||
2078 | } | ||
2079 | break; | ||
2080 | } | ||
1983 | case KVM_CPUID_SIGNATURE: { | 2081 | case KVM_CPUID_SIGNATURE: { |
1984 | char signature[12] = "KVMKVMKVM\0\0"; | 2082 | char signature[12] = "KVMKVMKVM\0\0"; |
1985 | u32 *sigptr = (u32 *)signature; | 2083 | u32 *sigptr = (u32 *)signature; |
@@ -2081,9 +2179,7 @@ out: | |||
2081 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | 2179 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, |
2082 | struct kvm_lapic_state *s) | 2180 | struct kvm_lapic_state *s) |
2083 | { | 2181 | { |
2084 | vcpu_load(vcpu); | ||
2085 | memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); | 2182 | memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); |
2086 | vcpu_put(vcpu); | ||
2087 | 2183 | ||
2088 | return 0; | 2184 | return 0; |
2089 | } | 2185 | } |
@@ -2091,11 +2187,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | |||
2091 | static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, | 2187 | static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, |
2092 | struct kvm_lapic_state *s) | 2188 | struct kvm_lapic_state *s) |
2093 | { | 2189 | { |
2094 | vcpu_load(vcpu); | ||
2095 | memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); | 2190 | memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); |
2096 | kvm_apic_post_state_restore(vcpu); | 2191 | kvm_apic_post_state_restore(vcpu); |
2097 | update_cr8_intercept(vcpu); | 2192 | update_cr8_intercept(vcpu); |
2098 | vcpu_put(vcpu); | ||
2099 | 2193 | ||
2100 | return 0; | 2194 | return 0; |
2101 | } | 2195 | } |
@@ -2107,20 +2201,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | |||
2107 | return -EINVAL; | 2201 | return -EINVAL; |
2108 | if (irqchip_in_kernel(vcpu->kvm)) | 2202 | if (irqchip_in_kernel(vcpu->kvm)) |
2109 | return -ENXIO; | 2203 | return -ENXIO; |
2110 | vcpu_load(vcpu); | ||
2111 | 2204 | ||
2112 | kvm_queue_interrupt(vcpu, irq->irq, false); | 2205 | kvm_queue_interrupt(vcpu, irq->irq, false); |
2113 | 2206 | ||
2114 | vcpu_put(vcpu); | ||
2115 | |||
2116 | return 0; | 2207 | return 0; |
2117 | } | 2208 | } |
2118 | 2209 | ||
2119 | static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) | 2210 | static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) |
2120 | { | 2211 | { |
2121 | vcpu_load(vcpu); | ||
2122 | kvm_inject_nmi(vcpu); | 2212 | kvm_inject_nmi(vcpu); |
2123 | vcpu_put(vcpu); | ||
2124 | 2213 | ||
2125 | return 0; | 2214 | return 0; |
2126 | } | 2215 | } |
@@ -2140,7 +2229,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, | |||
2140 | int r; | 2229 | int r; |
2141 | unsigned bank_num = mcg_cap & 0xff, bank; | 2230 | unsigned bank_num = mcg_cap & 0xff, bank; |
2142 | 2231 | ||
2143 | vcpu_load(vcpu); | ||
2144 | r = -EINVAL; | 2232 | r = -EINVAL; |
2145 | if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) | 2233 | if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) |
2146 | goto out; | 2234 | goto out; |
@@ -2155,7 +2243,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, | |||
2155 | for (bank = 0; bank < bank_num; bank++) | 2243 | for (bank = 0; bank < bank_num; bank++) |
2156 | vcpu->arch.mce_banks[bank*4] = ~(u64)0; | 2244 | vcpu->arch.mce_banks[bank*4] = ~(u64)0; |
2157 | out: | 2245 | out: |
2158 | vcpu_put(vcpu); | ||
2159 | return r; | 2246 | return r; |
2160 | } | 2247 | } |
2161 | 2248 | ||
@@ -2188,7 +2275,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, | |||
2188 | printk(KERN_DEBUG "kvm: set_mce: " | 2275 | printk(KERN_DEBUG "kvm: set_mce: " |
2189 | "injects mce exception while " | 2276 | "injects mce exception while " |
2190 | "previous one is in progress!\n"); | 2277 | "previous one is in progress!\n"); |
2191 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); | 2278 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
2192 | return 0; | 2279 | return 0; |
2193 | } | 2280 | } |
2194 | if (banks[1] & MCI_STATUS_VAL) | 2281 | if (banks[1] & MCI_STATUS_VAL) |
@@ -2213,8 +2300,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, | |||
2213 | static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | 2300 | static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, |
2214 | struct kvm_vcpu_events *events) | 2301 | struct kvm_vcpu_events *events) |
2215 | { | 2302 | { |
2216 | vcpu_load(vcpu); | ||
2217 | |||
2218 | events->exception.injected = | 2303 | events->exception.injected = |
2219 | vcpu->arch.exception.pending && | 2304 | vcpu->arch.exception.pending && |
2220 | !kvm_exception_is_soft(vcpu->arch.exception.nr); | 2305 | !kvm_exception_is_soft(vcpu->arch.exception.nr); |
@@ -2239,8 +2324,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | |||
2239 | events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | 2324 | events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING |
2240 | | KVM_VCPUEVENT_VALID_SIPI_VECTOR | 2325 | | KVM_VCPUEVENT_VALID_SIPI_VECTOR |
2241 | | KVM_VCPUEVENT_VALID_SHADOW); | 2326 | | KVM_VCPUEVENT_VALID_SHADOW); |
2242 | |||
2243 | vcpu_put(vcpu); | ||
2244 | } | 2327 | } |
2245 | 2328 | ||
2246 | static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | 2329 | static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, |
@@ -2251,8 +2334,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | |||
2251 | | KVM_VCPUEVENT_VALID_SHADOW)) | 2334 | | KVM_VCPUEVENT_VALID_SHADOW)) |
2252 | return -EINVAL; | 2335 | return -EINVAL; |
2253 | 2336 | ||
2254 | vcpu_load(vcpu); | ||
2255 | |||
2256 | vcpu->arch.exception.pending = events->exception.injected; | 2337 | vcpu->arch.exception.pending = events->exception.injected; |
2257 | vcpu->arch.exception.nr = events->exception.nr; | 2338 | vcpu->arch.exception.nr = events->exception.nr; |
2258 | vcpu->arch.exception.has_error_code = events->exception.has_error_code; | 2339 | vcpu->arch.exception.has_error_code = events->exception.has_error_code; |
@@ -2275,22 +2356,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | |||
2275 | if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) | 2356 | if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) |
2276 | vcpu->arch.sipi_vector = events->sipi_vector; | 2357 | vcpu->arch.sipi_vector = events->sipi_vector; |
2277 | 2358 | ||
2278 | vcpu_put(vcpu); | ||
2279 | |||
2280 | return 0; | 2359 | return 0; |
2281 | } | 2360 | } |
2282 | 2361 | ||
2283 | static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, | 2362 | static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, |
2284 | struct kvm_debugregs *dbgregs) | 2363 | struct kvm_debugregs *dbgregs) |
2285 | { | 2364 | { |
2286 | vcpu_load(vcpu); | ||
2287 | |||
2288 | memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); | 2365 | memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); |
2289 | dbgregs->dr6 = vcpu->arch.dr6; | 2366 | dbgregs->dr6 = vcpu->arch.dr6; |
2290 | dbgregs->dr7 = vcpu->arch.dr7; | 2367 | dbgregs->dr7 = vcpu->arch.dr7; |
2291 | dbgregs->flags = 0; | 2368 | dbgregs->flags = 0; |
2292 | |||
2293 | vcpu_put(vcpu); | ||
2294 | } | 2369 | } |
2295 | 2370 | ||
2296 | static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, | 2371 | static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, |
@@ -2299,40 +2374,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, | |||
2299 | if (dbgregs->flags) | 2374 | if (dbgregs->flags) |
2300 | return -EINVAL; | 2375 | return -EINVAL; |
2301 | 2376 | ||
2302 | vcpu_load(vcpu); | ||
2303 | |||
2304 | memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); | 2377 | memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); |
2305 | vcpu->arch.dr6 = dbgregs->dr6; | 2378 | vcpu->arch.dr6 = dbgregs->dr6; |
2306 | vcpu->arch.dr7 = dbgregs->dr7; | 2379 | vcpu->arch.dr7 = dbgregs->dr7; |
2307 | 2380 | ||
2308 | vcpu_put(vcpu); | 2381 | return 0; |
2382 | } | ||
2383 | |||
2384 | static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, | ||
2385 | struct kvm_xsave *guest_xsave) | ||
2386 | { | ||
2387 | if (cpu_has_xsave) | ||
2388 | memcpy(guest_xsave->region, | ||
2389 | &vcpu->arch.guest_fpu.state->xsave, | ||
2390 | sizeof(struct xsave_struct)); | ||
2391 | else { | ||
2392 | memcpy(guest_xsave->region, | ||
2393 | &vcpu->arch.guest_fpu.state->fxsave, | ||
2394 | sizeof(struct i387_fxsave_struct)); | ||
2395 | *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = | ||
2396 | XSTATE_FPSSE; | ||
2397 | } | ||
2398 | } | ||
2399 | |||
2400 | static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, | ||
2401 | struct kvm_xsave *guest_xsave) | ||
2402 | { | ||
2403 | u64 xstate_bv = | ||
2404 | *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; | ||
2309 | 2405 | ||
2406 | if (cpu_has_xsave) | ||
2407 | memcpy(&vcpu->arch.guest_fpu.state->xsave, | ||
2408 | guest_xsave->region, sizeof(struct xsave_struct)); | ||
2409 | else { | ||
2410 | if (xstate_bv & ~XSTATE_FPSSE) | ||
2411 | return -EINVAL; | ||
2412 | memcpy(&vcpu->arch.guest_fpu.state->fxsave, | ||
2413 | guest_xsave->region, sizeof(struct i387_fxsave_struct)); | ||
2414 | } | ||
2310 | return 0; | 2415 | return 0; |
2311 | } | 2416 | } |
2312 | 2417 | ||
2418 | static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, | ||
2419 | struct kvm_xcrs *guest_xcrs) | ||
2420 | { | ||
2421 | if (!cpu_has_xsave) { | ||
2422 | guest_xcrs->nr_xcrs = 0; | ||
2423 | return; | ||
2424 | } | ||
2425 | |||
2426 | guest_xcrs->nr_xcrs = 1; | ||
2427 | guest_xcrs->flags = 0; | ||
2428 | guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; | ||
2429 | guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; | ||
2430 | } | ||
2431 | |||
2432 | static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, | ||
2433 | struct kvm_xcrs *guest_xcrs) | ||
2434 | { | ||
2435 | int i, r = 0; | ||
2436 | |||
2437 | if (!cpu_has_xsave) | ||
2438 | return -EINVAL; | ||
2439 | |||
2440 | if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) | ||
2441 | return -EINVAL; | ||
2442 | |||
2443 | for (i = 0; i < guest_xcrs->nr_xcrs; i++) | ||
2444 | /* Only support XCR0 currently */ | ||
2445 | if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { | ||
2446 | r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, | ||
2447 | guest_xcrs->xcrs[0].value); | ||
2448 | break; | ||
2449 | } | ||
2450 | if (r) | ||
2451 | r = -EINVAL; | ||
2452 | return r; | ||
2453 | } | ||
2454 | |||
2313 | long kvm_arch_vcpu_ioctl(struct file *filp, | 2455 | long kvm_arch_vcpu_ioctl(struct file *filp, |
2314 | unsigned int ioctl, unsigned long arg) | 2456 | unsigned int ioctl, unsigned long arg) |
2315 | { | 2457 | { |
2316 | struct kvm_vcpu *vcpu = filp->private_data; | 2458 | struct kvm_vcpu *vcpu = filp->private_data; |
2317 | void __user *argp = (void __user *)arg; | 2459 | void __user *argp = (void __user *)arg; |
2318 | int r; | 2460 | int r; |
2319 | struct kvm_lapic_state *lapic = NULL; | 2461 | union { |
2462 | struct kvm_lapic_state *lapic; | ||
2463 | struct kvm_xsave *xsave; | ||
2464 | struct kvm_xcrs *xcrs; | ||
2465 | void *buffer; | ||
2466 | } u; | ||
2320 | 2467 | ||
2468 | u.buffer = NULL; | ||
2321 | switch (ioctl) { | 2469 | switch (ioctl) { |
2322 | case KVM_GET_LAPIC: { | 2470 | case KVM_GET_LAPIC: { |
2323 | r = -EINVAL; | 2471 | r = -EINVAL; |
2324 | if (!vcpu->arch.apic) | 2472 | if (!vcpu->arch.apic) |
2325 | goto out; | 2473 | goto out; |
2326 | lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); | 2474 | u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); |
2327 | 2475 | ||
2328 | r = -ENOMEM; | 2476 | r = -ENOMEM; |
2329 | if (!lapic) | 2477 | if (!u.lapic) |
2330 | goto out; | 2478 | goto out; |
2331 | r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); | 2479 | r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); |
2332 | if (r) | 2480 | if (r) |
2333 | goto out; | 2481 | goto out; |
2334 | r = -EFAULT; | 2482 | r = -EFAULT; |
2335 | if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) | 2483 | if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) |
2336 | goto out; | 2484 | goto out; |
2337 | r = 0; | 2485 | r = 0; |
2338 | break; | 2486 | break; |
@@ -2341,14 +2489,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2341 | r = -EINVAL; | 2489 | r = -EINVAL; |
2342 | if (!vcpu->arch.apic) | 2490 | if (!vcpu->arch.apic) |
2343 | goto out; | 2491 | goto out; |
2344 | lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); | 2492 | u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); |
2345 | r = -ENOMEM; | 2493 | r = -ENOMEM; |
2346 | if (!lapic) | 2494 | if (!u.lapic) |
2347 | goto out; | 2495 | goto out; |
2348 | r = -EFAULT; | 2496 | r = -EFAULT; |
2349 | if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) | 2497 | if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) |
2350 | goto out; | 2498 | goto out; |
2351 | r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); | 2499 | r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); |
2352 | if (r) | 2500 | if (r) |
2353 | goto out; | 2501 | goto out; |
2354 | r = 0; | 2502 | r = 0; |
@@ -2464,9 +2612,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2464 | r = -EFAULT; | 2612 | r = -EFAULT; |
2465 | if (copy_from_user(&mce, argp, sizeof mce)) | 2613 | if (copy_from_user(&mce, argp, sizeof mce)) |
2466 | goto out; | 2614 | goto out; |
2467 | vcpu_load(vcpu); | ||
2468 | r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); | 2615 | r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); |
2469 | vcpu_put(vcpu); | ||
2470 | break; | 2616 | break; |
2471 | } | 2617 | } |
2472 | case KVM_GET_VCPU_EVENTS: { | 2618 | case KVM_GET_VCPU_EVENTS: { |
@@ -2513,11 +2659,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2513 | r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); | 2659 | r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); |
2514 | break; | 2660 | break; |
2515 | } | 2661 | } |
2662 | case KVM_GET_XSAVE: { | ||
2663 | u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); | ||
2664 | r = -ENOMEM; | ||
2665 | if (!u.xsave) | ||
2666 | break; | ||
2667 | |||
2668 | kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); | ||
2669 | |||
2670 | r = -EFAULT; | ||
2671 | if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) | ||
2672 | break; | ||
2673 | r = 0; | ||
2674 | break; | ||
2675 | } | ||
2676 | case KVM_SET_XSAVE: { | ||
2677 | u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); | ||
2678 | r = -ENOMEM; | ||
2679 | if (!u.xsave) | ||
2680 | break; | ||
2681 | |||
2682 | r = -EFAULT; | ||
2683 | if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) | ||
2684 | break; | ||
2685 | |||
2686 | r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); | ||
2687 | break; | ||
2688 | } | ||
2689 | case KVM_GET_XCRS: { | ||
2690 | u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); | ||
2691 | r = -ENOMEM; | ||
2692 | if (!u.xcrs) | ||
2693 | break; | ||
2694 | |||
2695 | kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); | ||
2696 | |||
2697 | r = -EFAULT; | ||
2698 | if (copy_to_user(argp, u.xcrs, | ||
2699 | sizeof(struct kvm_xcrs))) | ||
2700 | break; | ||
2701 | r = 0; | ||
2702 | break; | ||
2703 | } | ||
2704 | case KVM_SET_XCRS: { | ||
2705 | u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); | ||
2706 | r = -ENOMEM; | ||
2707 | if (!u.xcrs) | ||
2708 | break; | ||
2709 | |||
2710 | r = -EFAULT; | ||
2711 | if (copy_from_user(u.xcrs, argp, | ||
2712 | sizeof(struct kvm_xcrs))) | ||
2713 | break; | ||
2714 | |||
2715 | r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); | ||
2716 | break; | ||
2717 | } | ||
2516 | default: | 2718 | default: |
2517 | r = -EINVAL; | 2719 | r = -EINVAL; |
2518 | } | 2720 | } |
2519 | out: | 2721 | out: |
2520 | kfree(lapic); | 2722 | kfree(u.buffer); |
2521 | return r; | 2723 | return r; |
2522 | } | 2724 | } |
2523 | 2725 | ||
@@ -2560,115 +2762,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) | |||
2560 | return kvm->arch.n_alloc_mmu_pages; | 2762 | return kvm->arch.n_alloc_mmu_pages; |
2561 | } | 2763 | } |
2562 | 2764 | ||
2563 | gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) | ||
2564 | { | ||
2565 | int i; | ||
2566 | struct kvm_mem_alias *alias; | ||
2567 | struct kvm_mem_aliases *aliases; | ||
2568 | |||
2569 | aliases = kvm_aliases(kvm); | ||
2570 | |||
2571 | for (i = 0; i < aliases->naliases; ++i) { | ||
2572 | alias = &aliases->aliases[i]; | ||
2573 | if (alias->flags & KVM_ALIAS_INVALID) | ||
2574 | continue; | ||
2575 | if (gfn >= alias->base_gfn | ||
2576 | && gfn < alias->base_gfn + alias->npages) | ||
2577 | return alias->target_gfn + gfn - alias->base_gfn; | ||
2578 | } | ||
2579 | return gfn; | ||
2580 | } | ||
2581 | |||
2582 | gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) | ||
2583 | { | ||
2584 | int i; | ||
2585 | struct kvm_mem_alias *alias; | ||
2586 | struct kvm_mem_aliases *aliases; | ||
2587 | |||
2588 | aliases = kvm_aliases(kvm); | ||
2589 | |||
2590 | for (i = 0; i < aliases->naliases; ++i) { | ||
2591 | alias = &aliases->aliases[i]; | ||
2592 | if (gfn >= alias->base_gfn | ||
2593 | && gfn < alias->base_gfn + alias->npages) | ||
2594 | return alias->target_gfn + gfn - alias->base_gfn; | ||
2595 | } | ||
2596 | return gfn; | ||
2597 | } | ||
2598 | |||
2599 | /* | ||
2600 | * Set a new alias region. Aliases map a portion of physical memory into | ||
2601 | * another portion. This is useful for memory windows, for example the PC | ||
2602 | * VGA region. | ||
2603 | */ | ||
2604 | static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, | ||
2605 | struct kvm_memory_alias *alias) | ||
2606 | { | ||
2607 | int r, n; | ||
2608 | struct kvm_mem_alias *p; | ||
2609 | struct kvm_mem_aliases *aliases, *old_aliases; | ||
2610 | |||
2611 | r = -EINVAL; | ||
2612 | /* General sanity checks */ | ||
2613 | if (alias->memory_size & (PAGE_SIZE - 1)) | ||
2614 | goto out; | ||
2615 | if (alias->guest_phys_addr & (PAGE_SIZE - 1)) | ||
2616 | goto out; | ||
2617 | if (alias->slot >= KVM_ALIAS_SLOTS) | ||
2618 | goto out; | ||
2619 | if (alias->guest_phys_addr + alias->memory_size | ||
2620 | < alias->guest_phys_addr) | ||
2621 | goto out; | ||
2622 | if (alias->target_phys_addr + alias->memory_size | ||
2623 | < alias->target_phys_addr) | ||
2624 | goto out; | ||
2625 | |||
2626 | r = -ENOMEM; | ||
2627 | aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); | ||
2628 | if (!aliases) | ||
2629 | goto out; | ||
2630 | |||
2631 | mutex_lock(&kvm->slots_lock); | ||
2632 | |||
2633 | /* invalidate any gfn reference in case of deletion/shrinking */ | ||
2634 | memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); | ||
2635 | aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; | ||
2636 | old_aliases = kvm->arch.aliases; | ||
2637 | rcu_assign_pointer(kvm->arch.aliases, aliases); | ||
2638 | synchronize_srcu_expedited(&kvm->srcu); | ||
2639 | kvm_mmu_zap_all(kvm); | ||
2640 | kfree(old_aliases); | ||
2641 | |||
2642 | r = -ENOMEM; | ||
2643 | aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); | ||
2644 | if (!aliases) | ||
2645 | goto out_unlock; | ||
2646 | |||
2647 | memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); | ||
2648 | |||
2649 | p = &aliases->aliases[alias->slot]; | ||
2650 | p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; | ||
2651 | p->npages = alias->memory_size >> PAGE_SHIFT; | ||
2652 | p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; | ||
2653 | p->flags &= ~(KVM_ALIAS_INVALID); | ||
2654 | |||
2655 | for (n = KVM_ALIAS_SLOTS; n > 0; --n) | ||
2656 | if (aliases->aliases[n - 1].npages) | ||
2657 | break; | ||
2658 | aliases->naliases = n; | ||
2659 | |||
2660 | old_aliases = kvm->arch.aliases; | ||
2661 | rcu_assign_pointer(kvm->arch.aliases, aliases); | ||
2662 | synchronize_srcu_expedited(&kvm->srcu); | ||
2663 | kfree(old_aliases); | ||
2664 | r = 0; | ||
2665 | |||
2666 | out_unlock: | ||
2667 | mutex_unlock(&kvm->slots_lock); | ||
2668 | out: | ||
2669 | return r; | ||
2670 | } | ||
2671 | |||
2672 | static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | 2765 | static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) |
2673 | { | 2766 | { |
2674 | int r; | 2767 | int r; |
@@ -2797,7 +2890,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
2797 | struct kvm_memory_slot *memslot; | 2890 | struct kvm_memory_slot *memslot; |
2798 | unsigned long n; | 2891 | unsigned long n; |
2799 | unsigned long is_dirty = 0; | 2892 | unsigned long is_dirty = 0; |
2800 | unsigned long *dirty_bitmap = NULL; | ||
2801 | 2893 | ||
2802 | mutex_lock(&kvm->slots_lock); | 2894 | mutex_lock(&kvm->slots_lock); |
2803 | 2895 | ||
@@ -2812,27 +2904,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
2812 | 2904 | ||
2813 | n = kvm_dirty_bitmap_bytes(memslot); | 2905 | n = kvm_dirty_bitmap_bytes(memslot); |
2814 | 2906 | ||
2815 | r = -ENOMEM; | ||
2816 | dirty_bitmap = vmalloc(n); | ||
2817 | if (!dirty_bitmap) | ||
2818 | goto out; | ||
2819 | memset(dirty_bitmap, 0, n); | ||
2820 | |||
2821 | for (i = 0; !is_dirty && i < n/sizeof(long); i++) | 2907 | for (i = 0; !is_dirty && i < n/sizeof(long); i++) |
2822 | is_dirty = memslot->dirty_bitmap[i]; | 2908 | is_dirty = memslot->dirty_bitmap[i]; |
2823 | 2909 | ||
2824 | /* If nothing is dirty, don't bother messing with page tables. */ | 2910 | /* If nothing is dirty, don't bother messing with page tables. */ |
2825 | if (is_dirty) { | 2911 | if (is_dirty) { |
2826 | struct kvm_memslots *slots, *old_slots; | 2912 | struct kvm_memslots *slots, *old_slots; |
2913 | unsigned long *dirty_bitmap; | ||
2827 | 2914 | ||
2828 | spin_lock(&kvm->mmu_lock); | 2915 | spin_lock(&kvm->mmu_lock); |
2829 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | 2916 | kvm_mmu_slot_remove_write_access(kvm, log->slot); |
2830 | spin_unlock(&kvm->mmu_lock); | 2917 | spin_unlock(&kvm->mmu_lock); |
2831 | 2918 | ||
2832 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | 2919 | r = -ENOMEM; |
2833 | if (!slots) | 2920 | dirty_bitmap = vmalloc(n); |
2834 | goto out_free; | 2921 | if (!dirty_bitmap) |
2922 | goto out; | ||
2923 | memset(dirty_bitmap, 0, n); | ||
2835 | 2924 | ||
2925 | r = -ENOMEM; | ||
2926 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | ||
2927 | if (!slots) { | ||
2928 | vfree(dirty_bitmap); | ||
2929 | goto out; | ||
2930 | } | ||
2836 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | 2931 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); |
2837 | slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; | 2932 | slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; |
2838 | 2933 | ||
@@ -2841,13 +2936,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
2841 | synchronize_srcu_expedited(&kvm->srcu); | 2936 | synchronize_srcu_expedited(&kvm->srcu); |
2842 | dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; | 2937 | dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; |
2843 | kfree(old_slots); | 2938 | kfree(old_slots); |
2939 | |||
2940 | r = -EFAULT; | ||
2941 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { | ||
2942 | vfree(dirty_bitmap); | ||
2943 | goto out; | ||
2944 | } | ||
2945 | vfree(dirty_bitmap); | ||
2946 | } else { | ||
2947 | r = -EFAULT; | ||
2948 | if (clear_user(log->dirty_bitmap, n)) | ||
2949 | goto out; | ||
2844 | } | 2950 | } |
2845 | 2951 | ||
2846 | r = 0; | 2952 | r = 0; |
2847 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) | ||
2848 | r = -EFAULT; | ||
2849 | out_free: | ||
2850 | vfree(dirty_bitmap); | ||
2851 | out: | 2953 | out: |
2852 | mutex_unlock(&kvm->slots_lock); | 2954 | mutex_unlock(&kvm->slots_lock); |
2853 | return r; | 2955 | return r; |
@@ -2867,7 +2969,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
2867 | union { | 2969 | union { |
2868 | struct kvm_pit_state ps; | 2970 | struct kvm_pit_state ps; |
2869 | struct kvm_pit_state2 ps2; | 2971 | struct kvm_pit_state2 ps2; |
2870 | struct kvm_memory_alias alias; | ||
2871 | struct kvm_pit_config pit_config; | 2972 | struct kvm_pit_config pit_config; |
2872 | } u; | 2973 | } u; |
2873 | 2974 | ||
@@ -2888,22 +2989,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
2888 | goto out; | 2989 | goto out; |
2889 | break; | 2990 | break; |
2890 | } | 2991 | } |
2891 | case KVM_SET_MEMORY_REGION: { | ||
2892 | struct kvm_memory_region kvm_mem; | ||
2893 | struct kvm_userspace_memory_region kvm_userspace_mem; | ||
2894 | |||
2895 | r = -EFAULT; | ||
2896 | if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) | ||
2897 | goto out; | ||
2898 | kvm_userspace_mem.slot = kvm_mem.slot; | ||
2899 | kvm_userspace_mem.flags = kvm_mem.flags; | ||
2900 | kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; | ||
2901 | kvm_userspace_mem.memory_size = kvm_mem.memory_size; | ||
2902 | r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); | ||
2903 | if (r) | ||
2904 | goto out; | ||
2905 | break; | ||
2906 | } | ||
2907 | case KVM_SET_NR_MMU_PAGES: | 2992 | case KVM_SET_NR_MMU_PAGES: |
2908 | r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); | 2993 | r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); |
2909 | if (r) | 2994 | if (r) |
@@ -2912,14 +2997,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
2912 | case KVM_GET_NR_MMU_PAGES: | 2997 | case KVM_GET_NR_MMU_PAGES: |
2913 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); | 2998 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); |
2914 | break; | 2999 | break; |
2915 | case KVM_SET_MEMORY_ALIAS: | ||
2916 | r = -EFAULT; | ||
2917 | if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) | ||
2918 | goto out; | ||
2919 | r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); | ||
2920 | if (r) | ||
2921 | goto out; | ||
2922 | break; | ||
2923 | case KVM_CREATE_IRQCHIP: { | 3000 | case KVM_CREATE_IRQCHIP: { |
2924 | struct kvm_pic *vpic; | 3001 | struct kvm_pic *vpic; |
2925 | 3002 | ||
@@ -3259,7 +3336,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, | |||
3259 | } | 3336 | } |
3260 | ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); | 3337 | ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); |
3261 | if (ret < 0) { | 3338 | if (ret < 0) { |
3262 | r = X86EMUL_UNHANDLEABLE; | 3339 | r = X86EMUL_IO_NEEDED; |
3263 | goto out; | 3340 | goto out; |
3264 | } | 3341 | } |
3265 | 3342 | ||
@@ -3315,7 +3392,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, | |||
3315 | } | 3392 | } |
3316 | ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); | 3393 | ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); |
3317 | if (ret < 0) { | 3394 | if (ret < 0) { |
3318 | r = X86EMUL_UNHANDLEABLE; | 3395 | r = X86EMUL_IO_NEEDED; |
3319 | goto out; | 3396 | goto out; |
3320 | } | 3397 | } |
3321 | 3398 | ||
@@ -3330,10 +3407,10 @@ out: | |||
3330 | static int emulator_read_emulated(unsigned long addr, | 3407 | static int emulator_read_emulated(unsigned long addr, |
3331 | void *val, | 3408 | void *val, |
3332 | unsigned int bytes, | 3409 | unsigned int bytes, |
3410 | unsigned int *error_code, | ||
3333 | struct kvm_vcpu *vcpu) | 3411 | struct kvm_vcpu *vcpu) |
3334 | { | 3412 | { |
3335 | gpa_t gpa; | 3413 | gpa_t gpa; |
3336 | u32 error_code; | ||
3337 | 3414 | ||
3338 | if (vcpu->mmio_read_completed) { | 3415 | if (vcpu->mmio_read_completed) { |
3339 | memcpy(val, vcpu->mmio_data, bytes); | 3416 | memcpy(val, vcpu->mmio_data, bytes); |
@@ -3343,12 +3420,10 @@ static int emulator_read_emulated(unsigned long addr, | |||
3343 | return X86EMUL_CONTINUE; | 3420 | return X86EMUL_CONTINUE; |
3344 | } | 3421 | } |
3345 | 3422 | ||
3346 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); | 3423 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); |
3347 | 3424 | ||
3348 | if (gpa == UNMAPPED_GVA) { | 3425 | if (gpa == UNMAPPED_GVA) |
3349 | kvm_inject_page_fault(vcpu, addr, error_code); | ||
3350 | return X86EMUL_PROPAGATE_FAULT; | 3426 | return X86EMUL_PROPAGATE_FAULT; |
3351 | } | ||
3352 | 3427 | ||
3353 | /* For APIC access vmexit */ | 3428 | /* For APIC access vmexit */ |
3354 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | 3429 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) |
@@ -3370,11 +3445,12 @@ mmio: | |||
3370 | trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); | 3445 | trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); |
3371 | 3446 | ||
3372 | vcpu->mmio_needed = 1; | 3447 | vcpu->mmio_needed = 1; |
3373 | vcpu->mmio_phys_addr = gpa; | 3448 | vcpu->run->exit_reason = KVM_EXIT_MMIO; |
3374 | vcpu->mmio_size = bytes; | 3449 | vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; |
3375 | vcpu->mmio_is_write = 0; | 3450 | vcpu->run->mmio.len = vcpu->mmio_size = bytes; |
3451 | vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; | ||
3376 | 3452 | ||
3377 | return X86EMUL_UNHANDLEABLE; | 3453 | return X86EMUL_IO_NEEDED; |
3378 | } | 3454 | } |
3379 | 3455 | ||
3380 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | 3456 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, |
@@ -3392,17 +3468,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3392 | static int emulator_write_emulated_onepage(unsigned long addr, | 3468 | static int emulator_write_emulated_onepage(unsigned long addr, |
3393 | const void *val, | 3469 | const void *val, |
3394 | unsigned int bytes, | 3470 | unsigned int bytes, |
3471 | unsigned int *error_code, | ||
3395 | struct kvm_vcpu *vcpu) | 3472 | struct kvm_vcpu *vcpu) |
3396 | { | 3473 | { |
3397 | gpa_t gpa; | 3474 | gpa_t gpa; |
3398 | u32 error_code; | ||
3399 | 3475 | ||
3400 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); | 3476 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); |
3401 | 3477 | ||
3402 | if (gpa == UNMAPPED_GVA) { | 3478 | if (gpa == UNMAPPED_GVA) |
3403 | kvm_inject_page_fault(vcpu, addr, error_code); | ||
3404 | return X86EMUL_PROPAGATE_FAULT; | 3479 | return X86EMUL_PROPAGATE_FAULT; |
3405 | } | ||
3406 | 3480 | ||
3407 | /* For APIC access vmexit */ | 3481 | /* For APIC access vmexit */ |
3408 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | 3482 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) |
@@ -3420,10 +3494,11 @@ mmio: | |||
3420 | return X86EMUL_CONTINUE; | 3494 | return X86EMUL_CONTINUE; |
3421 | 3495 | ||
3422 | vcpu->mmio_needed = 1; | 3496 | vcpu->mmio_needed = 1; |
3423 | vcpu->mmio_phys_addr = gpa; | 3497 | vcpu->run->exit_reason = KVM_EXIT_MMIO; |
3424 | vcpu->mmio_size = bytes; | 3498 | vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; |
3425 | vcpu->mmio_is_write = 1; | 3499 | vcpu->run->mmio.len = vcpu->mmio_size = bytes; |
3426 | memcpy(vcpu->mmio_data, val, bytes); | 3500 | vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; |
3501 | memcpy(vcpu->run->mmio.data, val, bytes); | ||
3427 | 3502 | ||
3428 | return X86EMUL_CONTINUE; | 3503 | return X86EMUL_CONTINUE; |
3429 | } | 3504 | } |
@@ -3431,6 +3506,7 @@ mmio: | |||
3431 | int emulator_write_emulated(unsigned long addr, | 3506 | int emulator_write_emulated(unsigned long addr, |
3432 | const void *val, | 3507 | const void *val, |
3433 | unsigned int bytes, | 3508 | unsigned int bytes, |
3509 | unsigned int *error_code, | ||
3434 | struct kvm_vcpu *vcpu) | 3510 | struct kvm_vcpu *vcpu) |
3435 | { | 3511 | { |
3436 | /* Crossing a page boundary? */ | 3512 | /* Crossing a page boundary? */ |
@@ -3438,16 +3514,17 @@ int emulator_write_emulated(unsigned long addr, | |||
3438 | int rc, now; | 3514 | int rc, now; |
3439 | 3515 | ||
3440 | now = -addr & ~PAGE_MASK; | 3516 | now = -addr & ~PAGE_MASK; |
3441 | rc = emulator_write_emulated_onepage(addr, val, now, vcpu); | 3517 | rc = emulator_write_emulated_onepage(addr, val, now, error_code, |
3518 | vcpu); | ||
3442 | if (rc != X86EMUL_CONTINUE) | 3519 | if (rc != X86EMUL_CONTINUE) |
3443 | return rc; | 3520 | return rc; |
3444 | addr += now; | 3521 | addr += now; |
3445 | val += now; | 3522 | val += now; |
3446 | bytes -= now; | 3523 | bytes -= now; |
3447 | } | 3524 | } |
3448 | return emulator_write_emulated_onepage(addr, val, bytes, vcpu); | 3525 | return emulator_write_emulated_onepage(addr, val, bytes, error_code, |
3526 | vcpu); | ||
3449 | } | 3527 | } |
3450 | EXPORT_SYMBOL_GPL(emulator_write_emulated); | ||
3451 | 3528 | ||
3452 | #define CMPXCHG_TYPE(t, ptr, old, new) \ | 3529 | #define CMPXCHG_TYPE(t, ptr, old, new) \ |
3453 | (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) | 3530 | (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) |
@@ -3463,6 +3540,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
3463 | const void *old, | 3540 | const void *old, |
3464 | const void *new, | 3541 | const void *new, |
3465 | unsigned int bytes, | 3542 | unsigned int bytes, |
3543 | unsigned int *error_code, | ||
3466 | struct kvm_vcpu *vcpu) | 3544 | struct kvm_vcpu *vcpu) |
3467 | { | 3545 | { |
3468 | gpa_t gpa; | 3546 | gpa_t gpa; |
@@ -3484,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
3484 | goto emul_write; | 3562 | goto emul_write; |
3485 | 3563 | ||
3486 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 3564 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); |
3565 | if (is_error_page(page)) { | ||
3566 | kvm_release_page_clean(page); | ||
3567 | goto emul_write; | ||
3568 | } | ||
3487 | 3569 | ||
3488 | kaddr = kmap_atomic(page, KM_USER0); | 3570 | kaddr = kmap_atomic(page, KM_USER0); |
3489 | kaddr += offset_in_page(gpa); | 3571 | kaddr += offset_in_page(gpa); |
@@ -3516,7 +3598,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
3516 | emul_write: | 3598 | emul_write: |
3517 | printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); | 3599 | printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); |
3518 | 3600 | ||
3519 | return emulator_write_emulated(addr, new, bytes, vcpu); | 3601 | return emulator_write_emulated(addr, new, bytes, error_code, vcpu); |
3520 | } | 3602 | } |
3521 | 3603 | ||
3522 | static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) | 3604 | static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) |
@@ -3604,42 +3686,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | |||
3604 | return X86EMUL_CONTINUE; | 3686 | return X86EMUL_CONTINUE; |
3605 | } | 3687 | } |
3606 | 3688 | ||
3607 | int emulate_clts(struct kvm_vcpu *vcpu) | 3689 | int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) |
3608 | { | 3690 | { |
3609 | kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); | 3691 | if (!need_emulate_wbinvd(vcpu)) |
3610 | kvm_x86_ops->fpu_activate(vcpu); | 3692 | return X86EMUL_CONTINUE; |
3693 | |||
3694 | if (kvm_x86_ops->has_wbinvd_exit()) { | ||
3695 | smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, | ||
3696 | wbinvd_ipi, NULL, 1); | ||
3697 | cpumask_clear(vcpu->arch.wbinvd_dirty_mask); | ||
3698 | } | ||
3699 | wbinvd(); | ||
3611 | return X86EMUL_CONTINUE; | 3700 | return X86EMUL_CONTINUE; |
3612 | } | 3701 | } |
3702 | EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); | ||
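kvm_emulate_wbinvd() above only broadcasts the flush to the CPUs recorded in wbinvd_dirty_mask, clears the mask, and then flushes locally. A small stand-alone sketch of that bookkeeping, assuming a simple 64-bit mask and a flush_one_cpu() stub in place of the real IPI-driven wbinvd (illustrative names only):

#include <stdint.h>
#include <stdio.h>

#define NCPUS 64

static uint64_t wbinvd_dirty;                 /* one bit per CPU the vCPU ran on */

static void flush_one_cpu(int cpu)            /* stub for the wbinvd IPI */
{
        printf("wbinvd on cpu %d\n", cpu);
}

static void mark_cpu_dirty(int cpu)           /* called when the vCPU touches a CPU */
{
        wbinvd_dirty |= UINT64_C(1) << cpu;
}

static void emulate_wbinvd(int current_cpu)
{
        for (int cpu = 0; cpu < NCPUS; cpu++)
                if (wbinvd_dirty & (UINT64_C(1) << cpu))
                        flush_one_cpu(cpu);
        wbinvd_dirty = 0;                     /* mask is clean again            */
        flush_one_cpu(current_cpu);           /* and flush locally, like wbinvd() */
}

int main(void)
{
        mark_cpu_dirty(2);
        mark_cpu_dirty(5);
        emulate_wbinvd(0);
        return 0;
}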
3613 | 3703 | ||
3614 | int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) | 3704 | int emulate_clts(struct kvm_vcpu *vcpu) |
3615 | { | 3705 | { |
3616 | return kvm_get_dr(ctxt->vcpu, dr, dest); | 3706 | kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); |
3707 | kvm_x86_ops->fpu_activate(vcpu); | ||
3708 | return X86EMUL_CONTINUE; | ||
3617 | } | 3709 | } |
3618 | 3710 | ||
3619 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) | 3711 | int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) |
3620 | { | 3712 | { |
3621 | unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; | 3713 | return _kvm_get_dr(vcpu, dr, dest); |
3622 | |||
3623 | return kvm_set_dr(ctxt->vcpu, dr, value & mask); | ||
3624 | } | 3714 | } |
3625 | 3715 | ||
3626 | void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) | 3716 | int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) |
3627 | { | 3717 | { |
3628 | u8 opcodes[4]; | ||
3629 | unsigned long rip = kvm_rip_read(vcpu); | ||
3630 | unsigned long rip_linear; | ||
3631 | |||
3632 | if (!printk_ratelimit()) | ||
3633 | return; | ||
3634 | 3718 | ||
3635 | rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); | 3719 | return __kvm_set_dr(vcpu, dr, value); |
3636 | |||
3637 | kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); | ||
3638 | |||
3639 | printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", | ||
3640 | context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); | ||
3641 | } | 3720 | } |
3642 | EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); | ||
3643 | 3721 | ||
3644 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) | 3722 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) |
3645 | { | 3723 | { |
@@ -3674,27 +3752,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) | |||
3674 | return value; | 3752 | return value; |
3675 | } | 3753 | } |
3676 | 3754 | ||
3677 | static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) | 3755 | static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) |
3678 | { | 3756 | { |
3757 | int res = 0; | ||
3758 | |||
3679 | switch (cr) { | 3759 | switch (cr) { |
3680 | case 0: | 3760 | case 0: |
3681 | kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); | 3761 | res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); |
3682 | break; | 3762 | break; |
3683 | case 2: | 3763 | case 2: |
3684 | vcpu->arch.cr2 = val; | 3764 | vcpu->arch.cr2 = val; |
3685 | break; | 3765 | break; |
3686 | case 3: | 3766 | case 3: |
3687 | kvm_set_cr3(vcpu, val); | 3767 | res = kvm_set_cr3(vcpu, val); |
3688 | break; | 3768 | break; |
3689 | case 4: | 3769 | case 4: |
3690 | kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); | 3770 | res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); |
3691 | break; | 3771 | break; |
3692 | case 8: | 3772 | case 8: |
3693 | kvm_set_cr8(vcpu, val & 0xfUL); | 3773 | res = __kvm_set_cr8(vcpu, val & 0xfUL); |
3694 | break; | 3774 | break; |
3695 | default: | 3775 | default: |
3696 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); | 3776 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); |
3777 | res = -1; | ||
3697 | } | 3778 | } |
3779 | |||
3780 | return res; | ||
3698 | } | 3781 | } |
3699 | 3782 | ||
3700 | static int emulator_get_cpl(struct kvm_vcpu *vcpu) | 3783 | static int emulator_get_cpl(struct kvm_vcpu *vcpu) |
@@ -3707,6 +3790,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) | |||
3707 | kvm_x86_ops->get_gdt(vcpu, dt); | 3790 | kvm_x86_ops->get_gdt(vcpu, dt); |
3708 | } | 3791 | } |
3709 | 3792 | ||
3793 | static unsigned long emulator_get_cached_segment_base(int seg, | ||
3794 | struct kvm_vcpu *vcpu) | ||
3795 | { | ||
3796 | return get_segment_base(vcpu, seg); | ||
3797 | } | ||
3798 | |||
3710 | static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, | 3799 | static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, |
3711 | struct kvm_vcpu *vcpu) | 3800 | struct kvm_vcpu *vcpu) |
3712 | { | 3801 | { |
@@ -3779,11 +3868,6 @@ static void emulator_set_segment_selector(u16 sel, int seg, | |||
3779 | kvm_set_segment(vcpu, &kvm_seg, seg); | 3868 | kvm_set_segment(vcpu, &kvm_seg, seg); |
3780 | } | 3869 | } |
3781 | 3870 | ||
3782 | static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | ||
3783 | { | ||
3784 | kvm_x86_ops->set_rflags(vcpu, rflags); | ||
3785 | } | ||
3786 | |||
3787 | static struct x86_emulate_ops emulate_ops = { | 3871 | static struct x86_emulate_ops emulate_ops = { |
3788 | .read_std = kvm_read_guest_virt_system, | 3872 | .read_std = kvm_read_guest_virt_system, |
3789 | .write_std = kvm_write_guest_virt_system, | 3873 | .write_std = kvm_write_guest_virt_system, |
@@ -3797,11 +3881,15 @@ static struct x86_emulate_ops emulate_ops = { | |||
3797 | .set_cached_descriptor = emulator_set_cached_descriptor, | 3881 | .set_cached_descriptor = emulator_set_cached_descriptor, |
3798 | .get_segment_selector = emulator_get_segment_selector, | 3882 | .get_segment_selector = emulator_get_segment_selector, |
3799 | .set_segment_selector = emulator_set_segment_selector, | 3883 | .set_segment_selector = emulator_set_segment_selector, |
3884 | .get_cached_segment_base = emulator_get_cached_segment_base, | ||
3800 | .get_gdt = emulator_get_gdt, | 3885 | .get_gdt = emulator_get_gdt, |
3801 | .get_cr = emulator_get_cr, | 3886 | .get_cr = emulator_get_cr, |
3802 | .set_cr = emulator_set_cr, | 3887 | .set_cr = emulator_set_cr, |
3803 | .cpl = emulator_get_cpl, | 3888 | .cpl = emulator_get_cpl, |
3804 | .set_rflags = emulator_set_rflags, | 3889 | .get_dr = emulator_get_dr, |
3890 | .set_dr = emulator_set_dr, | ||
3891 | .set_msr = kvm_set_msr, | ||
3892 | .get_msr = kvm_get_msr, | ||
3805 | }; | 3893 | }; |
3806 | 3894 | ||
3807 | static void cache_all_regs(struct kvm_vcpu *vcpu) | 3895 | static void cache_all_regs(struct kvm_vcpu *vcpu) |
@@ -3812,14 +3900,75 @@ static void cache_all_regs(struct kvm_vcpu *vcpu) | |||
3812 | vcpu->arch.regs_dirty = ~0; | 3900 | vcpu->arch.regs_dirty = ~0; |
3813 | } | 3901 | } |
3814 | 3902 | ||
3903 | static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) | ||
3904 | { | ||
3905 | u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); | ||
3906 | /* | ||
3907 | * an sti; sti; sequence only disables interrupts for the first | ||
3908 | * instruction. So, if the last instruction, be it emulated or | ||
3909 | * not, left the system with the INT_STI flag enabled, it | ||
3910 | * means that the last instruction is an sti. We should not | ||
3911 | * leave the flag on in this case. The same goes for mov ss | ||
3912 | */ | ||
3913 | if (!(int_shadow & mask)) | ||
3914 | kvm_x86_ops->set_interrupt_shadow(vcpu, mask); | ||
3915 | } | ||
3916 | |||
3917 | static void inject_emulated_exception(struct kvm_vcpu *vcpu) | ||
3918 | { | ||
3919 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; | ||
3920 | if (ctxt->exception == PF_VECTOR) | ||
3921 | kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); | ||
3922 | else if (ctxt->error_code_valid) | ||
3923 | kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); | ||
3924 | else | ||
3925 | kvm_queue_exception(vcpu, ctxt->exception); | ||
3926 | } | ||
3927 | |||
3928 | static int handle_emulation_failure(struct kvm_vcpu *vcpu) | ||
3929 | { | ||
3930 | ++vcpu->stat.insn_emulation_fail; | ||
3931 | trace_kvm_emulate_insn_failed(vcpu); | ||
3932 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
3933 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | ||
3934 | vcpu->run->internal.ndata = 0; | ||
3935 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
3936 | return EMULATE_FAIL; | ||
3937 | } | ||
3938 | |||
3939 | static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | ||
3940 | { | ||
3941 | gpa_t gpa; | ||
3942 | |||
3943 | if (tdp_enabled) | ||
3944 | return false; | ||
3945 | |||
3946 | /* | ||
3947 | * If emulation failed because of an access to a shadowed | ||
3948 | * page table, try to unshadow the page and re-enter the | ||
3949 | * guest to let the CPU execute the instruction. | ||
3950 | */ | ||
3951 | if (kvm_mmu_unprotect_page_virt(vcpu, gva)) | ||
3952 | return true; | ||
3953 | |||
3954 | gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); | ||
3955 | |||
3956 | if (gpa == UNMAPPED_GVA) | ||
3957 | return true; /* let cpu generate fault */ | ||
3958 | |||
3959 | if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) | ||
3960 | return true; | ||
3961 | |||
3962 | return false; | ||
3963 | } | ||
3964 | |||
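For reference, the decision reexecute_instruction() above makes once emulation has failed can be summed up as: with TDP there is nothing to unprotect, so give up; otherwise retry in the guest if the write hit a shadowed page table, if the address does not even translate (the CPU will raise the fault itself), or if it lands in ordinary guest RAM; only an MMIO address that cannot be emulated is reported as a failure. A stand-alone restatement of that logic, with stubbed helpers rather than the kernel functions:

#include <stdbool.h>

#define UNMAPPED_GVA (~0UL)

/* Stubs standing in for the kernel helpers used above. */
static bool tdp_enabled;
static bool unprotect_shadowed_page(unsigned long gva) { (void)gva; return false; }
static unsigned long gva_to_gpa(unsigned long gva)     { return gva; }
static bool backed_by_guest_ram(unsigned long gpa)     { (void)gpa; return true; }

/* true: re-enter the guest and retry; false: report an emulation failure. */
static bool worth_reexecuting(unsigned long gva)
{
        unsigned long gpa;

        if (tdp_enabled)
                return false;                 /* no shadow pages to unprotect    */
        if (unprotect_shadowed_page(gva))
                return true;                  /* write hit a shadowed page table */
        gpa = gva_to_gpa(gva);
        if (gpa == UNMAPPED_GVA)
                return true;                  /* let the CPU raise the fault     */
        return backed_by_guest_ram(gpa);      /* RAM: retry; MMIO: give up       */
}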
3815 | int emulate_instruction(struct kvm_vcpu *vcpu, | 3965 | int emulate_instruction(struct kvm_vcpu *vcpu, |
3816 | unsigned long cr2, | 3966 | unsigned long cr2, |
3817 | u16 error_code, | 3967 | u16 error_code, |
3818 | int emulation_type) | 3968 | int emulation_type) |
3819 | { | 3969 | { |
3820 | int r, shadow_mask; | 3970 | int r; |
3821 | struct decode_cache *c; | 3971 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; |
3822 | struct kvm_run *run = vcpu->run; | ||
3823 | 3972 | ||
3824 | kvm_clear_exception_queue(vcpu); | 3973 | kvm_clear_exception_queue(vcpu); |
3825 | vcpu->arch.mmio_fault_cr2 = cr2; | 3974 | vcpu->arch.mmio_fault_cr2 = cr2; |
@@ -3831,8 +3980,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
3831 | */ | 3980 | */ |
3832 | cache_all_regs(vcpu); | 3981 | cache_all_regs(vcpu); |
3833 | 3982 | ||
3834 | vcpu->mmio_is_write = 0; | ||
3835 | |||
3836 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { | 3983 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { |
3837 | int cs_db, cs_l; | 3984 | int cs_db, cs_l; |
3838 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | 3985 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); |
@@ -3846,13 +3993,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
3846 | ? X86EMUL_MODE_VM86 : cs_l | 3993 | ? X86EMUL_MODE_VM86 : cs_l |
3847 | ? X86EMUL_MODE_PROT64 : cs_db | 3994 | ? X86EMUL_MODE_PROT64 : cs_db |
3848 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | 3995 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; |
3996 | memset(c, 0, sizeof(struct decode_cache)); | ||
3997 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | ||
3998 | vcpu->arch.emulate_ctxt.interruptibility = 0; | ||
3999 | vcpu->arch.emulate_ctxt.exception = -1; | ||
3849 | 4000 | ||
3850 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | 4001 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); |
3851 | trace_kvm_emulate_insn_start(vcpu); | 4002 | trace_kvm_emulate_insn_start(vcpu); |
3852 | 4003 | ||
3853 | /* Only allow emulation of specific instructions on #UD | 4004 | /* Only allow emulation of specific instructions on #UD |
3854 | * (namely VMMCALL, sysenter, sysexit, syscall)*/ | 4005 | * (namely VMMCALL, sysenter, sysexit, syscall)*/ |
3855 | c = &vcpu->arch.emulate_ctxt.decode; | ||
3856 | if (emulation_type & EMULTYPE_TRAP_UD) { | 4006 | if (emulation_type & EMULTYPE_TRAP_UD) { |
3857 | if (!c->twobyte) | 4007 | if (!c->twobyte) |
3858 | return EMULATE_FAIL; | 4008 | return EMULATE_FAIL; |
@@ -3880,11 +4030,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
3880 | 4030 | ||
3881 | ++vcpu->stat.insn_emulation; | 4031 | ++vcpu->stat.insn_emulation; |
3882 | if (r) { | 4032 | if (r) { |
3883 | ++vcpu->stat.insn_emulation_fail; | 4033 | if (reexecute_instruction(vcpu, cr2)) |
3884 | trace_kvm_emulate_insn_failed(vcpu); | ||
3885 | if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) | ||
3886 | return EMULATE_DONE; | 4034 | return EMULATE_DONE; |
3887 | return EMULATE_FAIL; | 4035 | if (emulation_type & EMULTYPE_SKIP) |
4036 | return EMULATE_FAIL; | ||
4037 | return handle_emulation_failure(vcpu); | ||
3888 | } | 4038 | } |
3889 | } | 4039 | } |
3890 | 4040 | ||
@@ -3893,48 +4043,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
3893 | return EMULATE_DONE; | 4043 | return EMULATE_DONE; |
3894 | } | 4044 | } |
3895 | 4045 | ||
4046 | /* this is needed for the vmware backdoor interface to work since it | ||
4047 | changes register values during the IO operation */ | ||
4048 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | ||
4049 | |||
3896 | restart: | 4050 | restart: |
3897 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | 4051 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); |
3898 | shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; | ||
3899 | 4052 | ||
3900 | if (r == 0) | 4053 | if (r) { /* emulation failed */ |
3901 | kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); | 4054 | if (reexecute_instruction(vcpu, cr2)) |
4055 | return EMULATE_DONE; | ||
3902 | 4056 | ||
3903 | if (vcpu->arch.pio.count) { | 4057 | return handle_emulation_failure(vcpu); |
3904 | if (!vcpu->arch.pio.in) | ||
3905 | vcpu->arch.pio.count = 0; | ||
3906 | return EMULATE_DO_MMIO; | ||
3907 | } | 4058 | } |
3908 | 4059 | ||
3909 | if (r || vcpu->mmio_is_write) { | 4060 | toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); |
3910 | run->exit_reason = KVM_EXIT_MMIO; | 4061 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); |
3911 | run->mmio.phys_addr = vcpu->mmio_phys_addr; | 4062 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); |
3912 | memcpy(run->mmio.data, vcpu->mmio_data, 8); | 4063 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); |
3913 | run->mmio.len = vcpu->mmio_size; | 4064 | |
3914 | run->mmio.is_write = vcpu->mmio_is_write; | 4065 | if (vcpu->arch.emulate_ctxt.exception >= 0) { |
4066 | inject_emulated_exception(vcpu); | ||
4067 | return EMULATE_DONE; | ||
3915 | } | 4068 | } |
3916 | 4069 | ||
3917 | if (r) { | 4070 | if (vcpu->arch.pio.count) { |
3918 | if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) | 4071 | if (!vcpu->arch.pio.in) |
3919 | goto done; | 4072 | vcpu->arch.pio.count = 0; |
3920 | if (!vcpu->mmio_needed) { | ||
3921 | ++vcpu->stat.insn_emulation_fail; | ||
3922 | trace_kvm_emulate_insn_failed(vcpu); | ||
3923 | kvm_report_emulation_failure(vcpu, "mmio"); | ||
3924 | return EMULATE_FAIL; | ||
3925 | } | ||
3926 | return EMULATE_DO_MMIO; | 4073 | return EMULATE_DO_MMIO; |
3927 | } | 4074 | } |
3928 | 4075 | ||
3929 | if (vcpu->mmio_is_write) { | 4076 | if (vcpu->mmio_needed) { |
3930 | vcpu->mmio_needed = 0; | 4077 | if (vcpu->mmio_is_write) |
4078 | vcpu->mmio_needed = 0; | ||
3931 | return EMULATE_DO_MMIO; | 4079 | return EMULATE_DO_MMIO; |
3932 | } | 4080 | } |
3933 | 4081 | ||
3934 | done: | ||
3935 | if (vcpu->arch.exception.pending) | ||
3936 | vcpu->arch.emulate_ctxt.restart = false; | ||
3937 | |||
3938 | if (vcpu->arch.emulate_ctxt.restart) | 4082 | if (vcpu->arch.emulate_ctxt.restart) |
3939 | goto restart; | 4083 | goto restart; |
3940 | 4084 | ||
@@ -4108,6 +4252,9 @@ int kvm_arch_init(void *opaque) | |||
4108 | 4252 | ||
4109 | perf_register_guest_info_callbacks(&kvm_guest_cbs); | 4253 | perf_register_guest_info_callbacks(&kvm_guest_cbs); |
4110 | 4254 | ||
4255 | if (cpu_has_xsave) | ||
4256 | host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
4257 | |||
4111 | return 0; | 4258 | return 0; |
4112 | 4259 | ||
4113 | out: | 4260 | out: |
@@ -4270,7 +4417,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) | |||
4270 | 4417 | ||
4271 | kvm_x86_ops->patch_hypercall(vcpu, instruction); | 4418 | kvm_x86_ops->patch_hypercall(vcpu, instruction); |
4272 | 4419 | ||
4273 | return emulator_write_emulated(rip, instruction, 3, vcpu); | 4420 | return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); |
4274 | } | 4421 | } |
4275 | 4422 | ||
4276 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | 4423 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) |
@@ -4506,59 +4653,78 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) | |||
4506 | } | 4653 | } |
4507 | } | 4654 | } |
4508 | 4655 | ||
4656 | static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) | ||
4657 | { | ||
4658 | if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && | ||
4659 | !vcpu->guest_xcr0_loaded) { | ||
4660 | /* kvm_set_xcr() also depends on this */ | ||
4661 | xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); | ||
4662 | vcpu->guest_xcr0_loaded = 1; | ||
4663 | } | ||
4664 | } | ||
4665 | |||
4666 | static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) | ||
4667 | { | ||
4668 | if (vcpu->guest_xcr0_loaded) { | ||
4669 | if (vcpu->arch.xcr0 != host_xcr0) | ||
4670 | xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); | ||
4671 | vcpu->guest_xcr0_loaded = 0; | ||
4672 | } | ||
4673 | } | ||
4674 | |||
4509 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | 4675 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu) |
4510 | { | 4676 | { |
4511 | int r; | 4677 | int r; |
4512 | bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && | 4678 | bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && |
4513 | vcpu->run->request_interrupt_window; | 4679 | vcpu->run->request_interrupt_window; |
4514 | 4680 | ||
4515 | if (vcpu->requests) | ||
4516 | if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) | ||
4517 | kvm_mmu_unload(vcpu); | ||
4518 | |||
4519 | r = kvm_mmu_reload(vcpu); | ||
4520 | if (unlikely(r)) | ||
4521 | goto out; | ||
4522 | |||
4523 | if (vcpu->requests) { | 4681 | if (vcpu->requests) { |
4524 | if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) | 4682 | if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) |
4683 | kvm_mmu_unload(vcpu); | ||
4684 | if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) | ||
4525 | __kvm_migrate_timers(vcpu); | 4685 | __kvm_migrate_timers(vcpu); |
4526 | if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) | 4686 | if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) |
4527 | kvm_write_guest_time(vcpu); | 4687 | kvm_write_guest_time(vcpu); |
4528 | if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) | 4688 | if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) |
4529 | kvm_mmu_sync_roots(vcpu); | 4689 | kvm_mmu_sync_roots(vcpu); |
4530 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) | 4690 | if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) |
4531 | kvm_x86_ops->tlb_flush(vcpu); | 4691 | kvm_x86_ops->tlb_flush(vcpu); |
4532 | if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, | 4692 | if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { |
4533 | &vcpu->requests)) { | ||
4534 | vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; | 4693 | vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; |
4535 | r = 0; | 4694 | r = 0; |
4536 | goto out; | 4695 | goto out; |
4537 | } | 4696 | } |
4538 | if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { | 4697 | if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { |
4539 | vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; | 4698 | vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; |
4540 | r = 0; | 4699 | r = 0; |
4541 | goto out; | 4700 | goto out; |
4542 | } | 4701 | } |
4543 | if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { | 4702 | if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { |
4544 | vcpu->fpu_active = 0; | 4703 | vcpu->fpu_active = 0; |
4545 | kvm_x86_ops->fpu_deactivate(vcpu); | 4704 | kvm_x86_ops->fpu_deactivate(vcpu); |
4546 | } | 4705 | } |
4547 | } | 4706 | } |
4548 | 4707 | ||
4708 | r = kvm_mmu_reload(vcpu); | ||
4709 | if (unlikely(r)) | ||
4710 | goto out; | ||
4711 | |||
4549 | preempt_disable(); | 4712 | preempt_disable(); |
4550 | 4713 | ||
4551 | kvm_x86_ops->prepare_guest_switch(vcpu); | 4714 | kvm_x86_ops->prepare_guest_switch(vcpu); |
4552 | if (vcpu->fpu_active) | 4715 | if (vcpu->fpu_active) |
4553 | kvm_load_guest_fpu(vcpu); | 4716 | kvm_load_guest_fpu(vcpu); |
4717 | kvm_load_guest_xcr0(vcpu); | ||
4554 | 4718 | ||
4555 | local_irq_disable(); | 4719 | atomic_set(&vcpu->guest_mode, 1); |
4720 | smp_wmb(); | ||
4556 | 4721 | ||
4557 | clear_bit(KVM_REQ_KICK, &vcpu->requests); | 4722 | local_irq_disable(); |
4558 | smp_mb__after_clear_bit(); | ||
4559 | 4723 | ||
4560 | if (vcpu->requests || need_resched() || signal_pending(current)) { | 4724 | if (!atomic_read(&vcpu->guest_mode) || vcpu->requests |
4561 | set_bit(KVM_REQ_KICK, &vcpu->requests); | 4725 | || need_resched() || signal_pending(current)) { |
4726 | atomic_set(&vcpu->guest_mode, 0); | ||
4727 | smp_wmb(); | ||
4562 | local_irq_enable(); | 4728 | local_irq_enable(); |
4563 | preempt_enable(); | 4729 | preempt_enable(); |
4564 | r = 1; | 4730 | r = 1; |
@@ -4603,7 +4769,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
4603 | if (hw_breakpoint_active()) | 4769 | if (hw_breakpoint_active()) |
4604 | hw_breakpoint_restore(); | 4770 | hw_breakpoint_restore(); |
4605 | 4771 | ||
4606 | set_bit(KVM_REQ_KICK, &vcpu->requests); | 4772 | atomic_set(&vcpu->guest_mode, 0); |
4773 | smp_wmb(); | ||
4607 | local_irq_enable(); | 4774 | local_irq_enable(); |
4608 | 4775 | ||
4609 | ++vcpu->stat.exits; | 4776 | ++vcpu->stat.exits; |
@@ -4665,7 +4832,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
4665 | srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); | 4832 | srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); |
4666 | kvm_vcpu_block(vcpu); | 4833 | kvm_vcpu_block(vcpu); |
4667 | vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); | 4834 | vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); |
4668 | if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) | 4835 | if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) |
4669 | { | 4836 | { |
4670 | switch(vcpu->arch.mp_state) { | 4837 | switch(vcpu->arch.mp_state) { |
4671 | case KVM_MP_STATE_HALTED: | 4838 | case KVM_MP_STATE_HALTED: |
@@ -4717,8 +4884,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
4717 | int r; | 4884 | int r; |
4718 | sigset_t sigsaved; | 4885 | sigset_t sigsaved; |
4719 | 4886 | ||
4720 | vcpu_load(vcpu); | ||
4721 | |||
4722 | if (vcpu->sigset_active) | 4887 | if (vcpu->sigset_active) |
4723 | sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); | 4888 | sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); |
4724 | 4889 | ||
@@ -4743,7 +4908,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
4743 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); | 4908 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); |
4744 | r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); | 4909 | r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); |
4745 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | 4910 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); |
4746 | if (r == EMULATE_DO_MMIO) { | 4911 | if (r != EMULATE_DONE) { |
4747 | r = 0; | 4912 | r = 0; |
4748 | goto out; | 4913 | goto out; |
4749 | } | 4914 | } |
@@ -4759,14 +4924,11 @@ out: | |||
4759 | if (vcpu->sigset_active) | 4924 | if (vcpu->sigset_active) |
4760 | sigprocmask(SIG_SETMASK, &sigsaved, NULL); | 4925 | sigprocmask(SIG_SETMASK, &sigsaved, NULL); |
4761 | 4926 | ||
4762 | vcpu_put(vcpu); | ||
4763 | return r; | 4927 | return r; |
4764 | } | 4928 | } |
4765 | 4929 | ||
4766 | int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | 4930 | int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) |
4767 | { | 4931 | { |
4768 | vcpu_load(vcpu); | ||
4769 | |||
4770 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); | 4932 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
4771 | regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); | 4933 | regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); |
4772 | regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); | 4934 | regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); |
@@ -4789,15 +4951,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
4789 | regs->rip = kvm_rip_read(vcpu); | 4951 | regs->rip = kvm_rip_read(vcpu); |
4790 | regs->rflags = kvm_get_rflags(vcpu); | 4952 | regs->rflags = kvm_get_rflags(vcpu); |
4791 | 4953 | ||
4792 | vcpu_put(vcpu); | ||
4793 | |||
4794 | return 0; | 4954 | return 0; |
4795 | } | 4955 | } |
4796 | 4956 | ||
4797 | int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | 4957 | int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) |
4798 | { | 4958 | { |
4799 | vcpu_load(vcpu); | ||
4800 | |||
4801 | kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); | 4959 | kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); |
4802 | kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); | 4960 | kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); |
4803 | kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); | 4961 | kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); |
@@ -4822,8 +4980,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
4822 | 4980 | ||
4823 | vcpu->arch.exception.pending = false; | 4981 | vcpu->arch.exception.pending = false; |
4824 | 4982 | ||
4825 | vcpu_put(vcpu); | ||
4826 | |||
4827 | return 0; | 4983 | return 0; |
4828 | } | 4984 | } |
4829 | 4985 | ||
@@ -4842,8 +4998,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
4842 | { | 4998 | { |
4843 | struct desc_ptr dt; | 4999 | struct desc_ptr dt; |
4844 | 5000 | ||
4845 | vcpu_load(vcpu); | ||
4846 | |||
4847 | kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); | 5001 | kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); |
4848 | kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); | 5002 | kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); |
4849 | kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); | 5003 | kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); |
@@ -4875,32 +5029,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
4875 | set_bit(vcpu->arch.interrupt.nr, | 5029 | set_bit(vcpu->arch.interrupt.nr, |
4876 | (unsigned long *)sregs->interrupt_bitmap); | 5030 | (unsigned long *)sregs->interrupt_bitmap); |
4877 | 5031 | ||
4878 | vcpu_put(vcpu); | ||
4879 | |||
4880 | return 0; | 5032 | return 0; |
4881 | } | 5033 | } |
4882 | 5034 | ||
4883 | int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, | 5035 | int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, |
4884 | struct kvm_mp_state *mp_state) | 5036 | struct kvm_mp_state *mp_state) |
4885 | { | 5037 | { |
4886 | vcpu_load(vcpu); | ||
4887 | mp_state->mp_state = vcpu->arch.mp_state; | 5038 | mp_state->mp_state = vcpu->arch.mp_state; |
4888 | vcpu_put(vcpu); | ||
4889 | return 0; | 5039 | return 0; |
4890 | } | 5040 | } |
4891 | 5041 | ||
4892 | int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, | 5042 | int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, |
4893 | struct kvm_mp_state *mp_state) | 5043 | struct kvm_mp_state *mp_state) |
4894 | { | 5044 | { |
4895 | vcpu_load(vcpu); | ||
4896 | vcpu->arch.mp_state = mp_state->mp_state; | 5045 | vcpu->arch.mp_state = mp_state->mp_state; |
4897 | vcpu_put(vcpu); | ||
4898 | return 0; | 5046 | return 0; |
4899 | } | 5047 | } |
4900 | 5048 | ||
4901 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | 5049 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, |
4902 | bool has_error_code, u32 error_code) | 5050 | bool has_error_code, u32 error_code) |
4903 | { | 5051 | { |
5052 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | ||
4904 | int cs_db, cs_l, ret; | 5053 | int cs_db, cs_l, ret; |
4905 | cache_all_regs(vcpu); | 5054 | cache_all_regs(vcpu); |
4906 | 5055 | ||
@@ -4915,6 +5064,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | |||
4915 | ? X86EMUL_MODE_VM86 : cs_l | 5064 | ? X86EMUL_MODE_VM86 : cs_l |
4916 | ? X86EMUL_MODE_PROT64 : cs_db | 5065 | ? X86EMUL_MODE_PROT64 : cs_db |
4917 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | 5066 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; |
5067 | memset(c, 0, sizeof(struct decode_cache)); | ||
5068 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | ||
4918 | 5069 | ||
4919 | ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, | 5070 | ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, |
4920 | tss_selector, reason, has_error_code, | 5071 | tss_selector, reason, has_error_code, |
@@ -4923,6 +5074,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | |||
4923 | if (ret) | 5074 | if (ret) |
4924 | return EMULATE_FAIL; | 5075 | return EMULATE_FAIL; |
4925 | 5076 | ||
5077 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | ||
5078 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | ||
4926 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | 5079 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); |
4927 | return EMULATE_DONE; | 5080 | return EMULATE_DONE; |
4928 | } | 5081 | } |
@@ -4935,8 +5088,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
4935 | int pending_vec, max_bits; | 5088 | int pending_vec, max_bits; |
4936 | struct desc_ptr dt; | 5089 | struct desc_ptr dt; |
4937 | 5090 | ||
4938 | vcpu_load(vcpu); | ||
4939 | |||
4940 | dt.size = sregs->idt.limit; | 5091 | dt.size = sregs->idt.limit; |
4941 | dt.address = sregs->idt.base; | 5092 | dt.address = sregs->idt.base; |
4942 | kvm_x86_ops->set_idt(vcpu, &dt); | 5093 | kvm_x86_ops->set_idt(vcpu, &dt); |
@@ -4996,8 +5147,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
4996 | !is_protmode(vcpu)) | 5147 | !is_protmode(vcpu)) |
4997 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 5148 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
4998 | 5149 | ||
4999 | vcpu_put(vcpu); | ||
5000 | |||
5001 | return 0; | 5150 | return 0; |
5002 | } | 5151 | } |
5003 | 5152 | ||
@@ -5007,12 +5156,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, | |||
5007 | unsigned long rflags; | 5156 | unsigned long rflags; |
5008 | int i, r; | 5157 | int i, r; |
5009 | 5158 | ||
5010 | vcpu_load(vcpu); | ||
5011 | |||
5012 | if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { | 5159 | if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { |
5013 | r = -EBUSY; | 5160 | r = -EBUSY; |
5014 | if (vcpu->arch.exception.pending) | 5161 | if (vcpu->arch.exception.pending) |
5015 | goto unlock_out; | 5162 | goto out; |
5016 | if (dbg->control & KVM_GUESTDBG_INJECT_DB) | 5163 | if (dbg->control & KVM_GUESTDBG_INJECT_DB) |
5017 | kvm_queue_exception(vcpu, DB_VECTOR); | 5164 | kvm_queue_exception(vcpu, DB_VECTOR); |
5018 | else | 5165 | else |
@@ -5054,34 +5201,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, | |||
5054 | 5201 | ||
5055 | r = 0; | 5202 | r = 0; |
5056 | 5203 | ||
5057 | unlock_out: | 5204 | out: |
5058 | vcpu_put(vcpu); | ||
5059 | 5205 | ||
5060 | return r; | 5206 | return r; |
5061 | } | 5207 | } |
5062 | 5208 | ||
5063 | /* | 5209 | /* |
5064 | * fxsave fpu state. Taken from x86_64/processor.h. To be killed when | ||
5065 | * we have asm/x86/processor.h | ||
5066 | */ | ||
5067 | struct fxsave { | ||
5068 | u16 cwd; | ||
5069 | u16 swd; | ||
5070 | u16 twd; | ||
5071 | u16 fop; | ||
5072 | u64 rip; | ||
5073 | u64 rdp; | ||
5074 | u32 mxcsr; | ||
5075 | u32 mxcsr_mask; | ||
5076 | u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ | ||
5077 | #ifdef CONFIG_X86_64 | ||
5078 | u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ | ||
5079 | #else | ||
5080 | u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ | ||
5081 | #endif | ||
5082 | }; | ||
5083 | |||
5084 | /* | ||
5085 | * Translate a guest virtual address to a guest physical address. | 5210 | * Translate a guest virtual address to a guest physical address. |
5086 | */ | 5211 | */ |
5087 | int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | 5212 | int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, |
@@ -5091,7 +5216,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | |||
5091 | gpa_t gpa; | 5216 | gpa_t gpa; |
5092 | int idx; | 5217 | int idx; |
5093 | 5218 | ||
5094 | vcpu_load(vcpu); | ||
5095 | idx = srcu_read_lock(&vcpu->kvm->srcu); | 5219 | idx = srcu_read_lock(&vcpu->kvm->srcu); |
5096 | gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); | 5220 | gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); |
5097 | srcu_read_unlock(&vcpu->kvm->srcu, idx); | 5221 | srcu_read_unlock(&vcpu->kvm->srcu, idx); |
@@ -5099,16 +5223,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | |||
5099 | tr->valid = gpa != UNMAPPED_GVA; | 5223 | tr->valid = gpa != UNMAPPED_GVA; |
5100 | tr->writeable = 1; | 5224 | tr->writeable = 1; |
5101 | tr->usermode = 0; | 5225 | tr->usermode = 0; |
5102 | vcpu_put(vcpu); | ||
5103 | 5226 | ||
5104 | return 0; | 5227 | return 0; |
5105 | } | 5228 | } |
5106 | 5229 | ||
5107 | int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | 5230 | int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) |
5108 | { | 5231 | { |
5109 | struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; | 5232 | struct i387_fxsave_struct *fxsave = |
5110 | 5233 | &vcpu->arch.guest_fpu.state->fxsave; | |
5111 | vcpu_load(vcpu); | ||
5112 | 5234 | ||
5113 | memcpy(fpu->fpr, fxsave->st_space, 128); | 5235 | memcpy(fpu->fpr, fxsave->st_space, 128); |
5114 | fpu->fcw = fxsave->cwd; | 5236 | fpu->fcw = fxsave->cwd; |
@@ -5119,16 +5241,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | |||
5119 | fpu->last_dp = fxsave->rdp; | 5241 | fpu->last_dp = fxsave->rdp; |
5120 | memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); | 5242 | memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); |
5121 | 5243 | ||
5122 | vcpu_put(vcpu); | ||
5123 | |||
5124 | return 0; | 5244 | return 0; |
5125 | } | 5245 | } |
5126 | 5246 | ||
5127 | int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | 5247 | int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) |
5128 | { | 5248 | { |
5129 | struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; | 5249 | struct i387_fxsave_struct *fxsave = |
5130 | 5250 | &vcpu->arch.guest_fpu.state->fxsave; | |
5131 | vcpu_load(vcpu); | ||
5132 | 5251 | ||
5133 | memcpy(fxsave->st_space, fpu->fpr, 128); | 5252 | memcpy(fxsave->st_space, fpu->fpr, 128); |
5134 | fxsave->cwd = fpu->fcw; | 5253 | fxsave->cwd = fpu->fcw; |
@@ -5139,61 +5258,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | |||
5139 | fxsave->rdp = fpu->last_dp; | 5258 | fxsave->rdp = fpu->last_dp; |
5140 | memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); | 5259 | memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); |
5141 | 5260 | ||
5142 | vcpu_put(vcpu); | ||
5143 | |||
5144 | return 0; | 5261 | return 0; |
5145 | } | 5262 | } |
5146 | 5263 | ||
5147 | void fx_init(struct kvm_vcpu *vcpu) | 5264 | int fx_init(struct kvm_vcpu *vcpu) |
5148 | { | 5265 | { |
5149 | unsigned after_mxcsr_mask; | 5266 | int err; |
5267 | |||
5268 | err = fpu_alloc(&vcpu->arch.guest_fpu); | ||
5269 | if (err) | ||
5270 | return err; | ||
5271 | |||
5272 | fpu_finit(&vcpu->arch.guest_fpu); | ||
5150 | 5273 | ||
5151 | /* | 5274 | /* |
5152 | * Touch the fpu the first time in non atomic context as if | 5275 | * Ensure guest xcr0 is valid for loading |
5153 | * this is the first fpu instruction the exception handler | ||
5154 | * will fire before the instruction returns and it'll have to | ||
5155 | * allocate ram with GFP_KERNEL. | ||
5156 | */ | 5276 | */ |
5157 | if (!used_math()) | 5277 | vcpu->arch.xcr0 = XSTATE_FP; |
5158 | kvm_fx_save(&vcpu->arch.host_fx_image); | ||
5159 | |||
5160 | /* Initialize guest FPU by resetting ours and saving into guest's */ | ||
5161 | preempt_disable(); | ||
5162 | kvm_fx_save(&vcpu->arch.host_fx_image); | ||
5163 | kvm_fx_finit(); | ||
5164 | kvm_fx_save(&vcpu->arch.guest_fx_image); | ||
5165 | kvm_fx_restore(&vcpu->arch.host_fx_image); | ||
5166 | preempt_enable(); | ||
5167 | 5278 | ||
5168 | vcpu->arch.cr0 |= X86_CR0_ET; | 5279 | vcpu->arch.cr0 |= X86_CR0_ET; |
5169 | after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); | 5280 | |
5170 | vcpu->arch.guest_fx_image.mxcsr = 0x1f80; | 5281 | return 0; |
5171 | memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, | ||
5172 | 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); | ||
5173 | } | 5282 | } |
5174 | EXPORT_SYMBOL_GPL(fx_init); | 5283 | EXPORT_SYMBOL_GPL(fx_init); |
5175 | 5284 | ||
5285 | static void fx_free(struct kvm_vcpu *vcpu) | ||
5286 | { | ||
5287 | fpu_free(&vcpu->arch.guest_fpu); | ||
5288 | } | ||
5289 | |||
5176 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) | 5290 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) |
5177 | { | 5291 | { |
5178 | if (vcpu->guest_fpu_loaded) | 5292 | if (vcpu->guest_fpu_loaded) |
5179 | return; | 5293 | return; |
5180 | 5294 | ||
5295 | /* | ||
5296 | * Restore all of the FPU state the guest may use, since the | ||
5297 | * host could have used any of the available state bits. | ||
5298 | * The guest xcr0 itself is loaded later. | ||
5299 | */ | ||
5300 | kvm_put_guest_xcr0(vcpu); | ||
5181 | vcpu->guest_fpu_loaded = 1; | 5301 | vcpu->guest_fpu_loaded = 1; |
5182 | kvm_fx_save(&vcpu->arch.host_fx_image); | 5302 | unlazy_fpu(current); |
5183 | kvm_fx_restore(&vcpu->arch.guest_fx_image); | 5303 | fpu_restore_checking(&vcpu->arch.guest_fpu); |
5184 | trace_kvm_fpu(1); | 5304 | trace_kvm_fpu(1); |
5185 | } | 5305 | } |
5186 | 5306 | ||
5187 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) | 5307 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) |
5188 | { | 5308 | { |
5309 | kvm_put_guest_xcr0(vcpu); | ||
5310 | |||
5189 | if (!vcpu->guest_fpu_loaded) | 5311 | if (!vcpu->guest_fpu_loaded) |
5190 | return; | 5312 | return; |
5191 | 5313 | ||
5192 | vcpu->guest_fpu_loaded = 0; | 5314 | vcpu->guest_fpu_loaded = 0; |
5193 | kvm_fx_save(&vcpu->arch.guest_fx_image); | 5315 | fpu_save_init(&vcpu->arch.guest_fpu); |
5194 | kvm_fx_restore(&vcpu->arch.host_fx_image); | ||
5195 | ++vcpu->stat.fpu_reload; | 5316 | ++vcpu->stat.fpu_reload; |
5196 | set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); | 5317 | kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); |
5197 | trace_kvm_fpu(0); | 5318 | trace_kvm_fpu(0); |
5198 | } | 5319 | } |
5199 | 5320 | ||
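The fx_init()/kvm_load_guest_fpu()/kvm_put_guest_fpu() changes above move the guest FPU image into a dynamically allocated buffer and keep the existing guest_fpu_loaded flag so state is only swapped around actual use. A stripped-down sketch of that lazy load/put bookkeeping; save_fpu() and restore_fpu() are stubs standing in for fpu_save_init() and fpu_restore_checking(), and the struct layout is purely illustrative.

#include <stdbool.h>
#include <stdlib.h>

struct fpu_image { unsigned char bytes[512]; };   /* stand-in for the fxsave area */

struct vcpu {
        struct fpu_image *guest_fpu;
        bool guest_fpu_loaded;
};

static void restore_fpu(struct fpu_image *f) { (void)f; }  /* stub */
static void save_fpu(struct fpu_image *f)    { (void)f; }  /* stub */

static int fx_init_sketch(struct vcpu *v)         /* allocation may fail, like fpu_alloc() */
{
        v->guest_fpu = calloc(1, sizeof(*v->guest_fpu));
        return v->guest_fpu ? 0 : -1;
}

static void load_guest_fpu_sketch(struct vcpu *v) /* before entering the guest */
{
        if (v->guest_fpu_loaded)
                return;                           /* already live in the registers */
        v->guest_fpu_loaded = true;
        restore_fpu(v->guest_fpu);
}

static void put_guest_fpu_sketch(struct vcpu *v)  /* when the host needs the FPU back */
{
        if (!v->guest_fpu_loaded)
                return;
        v->guest_fpu_loaded = false;
        save_fpu(v->guest_fpu);
}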
@@ -5204,6 +5325,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) | |||
5204 | vcpu->arch.time_page = NULL; | 5325 | vcpu->arch.time_page = NULL; |
5205 | } | 5326 | } |
5206 | 5327 | ||
5328 | free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); | ||
5329 | fx_free(vcpu); | ||
5207 | kvm_x86_ops->vcpu_free(vcpu); | 5330 | kvm_x86_ops->vcpu_free(vcpu); |
5208 | } | 5331 | } |
5209 | 5332 | ||
@@ -5217,9 +5340,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | |||
5217 | { | 5340 | { |
5218 | int r; | 5341 | int r; |
5219 | 5342 | ||
5220 | /* We do fxsave: this must be aligned. */ | ||
5221 | BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); | ||
5222 | |||
5223 | vcpu->arch.mtrr_state.have_fixed = 1; | 5343 | vcpu->arch.mtrr_state.have_fixed = 1; |
5224 | vcpu_load(vcpu); | 5344 | vcpu_load(vcpu); |
5225 | r = kvm_arch_vcpu_reset(vcpu); | 5345 | r = kvm_arch_vcpu_reset(vcpu); |
@@ -5241,6 +5361,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | |||
5241 | kvm_mmu_unload(vcpu); | 5361 | kvm_mmu_unload(vcpu); |
5242 | vcpu_put(vcpu); | 5362 | vcpu_put(vcpu); |
5243 | 5363 | ||
5364 | fx_free(vcpu); | ||
5244 | kvm_x86_ops->vcpu_free(vcpu); | 5365 | kvm_x86_ops->vcpu_free(vcpu); |
5245 | } | 5366 | } |
5246 | 5367 | ||
@@ -5334,7 +5455,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
5334 | } | 5455 | } |
5335 | vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; | 5456 | vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; |
5336 | 5457 | ||
5458 | if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) | ||
5459 | goto fail_free_mce_banks; | ||
5460 | |||
5337 | return 0; | 5461 | return 0; |
5462 | fail_free_mce_banks: | ||
5463 | kfree(vcpu->arch.mce_banks); | ||
5338 | fail_free_lapic: | 5464 | fail_free_lapic: |
5339 | kvm_free_lapic(vcpu); | 5465 | kvm_free_lapic(vcpu); |
5340 | fail_mmu_destroy: | 5466 | fail_mmu_destroy: |
@@ -5364,12 +5490,6 @@ struct kvm *kvm_arch_create_vm(void) | |||
5364 | if (!kvm) | 5490 | if (!kvm) |
5365 | return ERR_PTR(-ENOMEM); | 5491 | return ERR_PTR(-ENOMEM); |
5366 | 5492 | ||
5367 | kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); | ||
5368 | if (!kvm->arch.aliases) { | ||
5369 | kfree(kvm); | ||
5370 | return ERR_PTR(-ENOMEM); | ||
5371 | } | ||
5372 | |||
5373 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 5493 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
5374 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | 5494 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); |
5375 | 5495 | ||
@@ -5412,12 +5532,12 @@ static void kvm_free_vcpus(struct kvm *kvm) | |||
5412 | void kvm_arch_sync_events(struct kvm *kvm) | 5532 | void kvm_arch_sync_events(struct kvm *kvm) |
5413 | { | 5533 | { |
5414 | kvm_free_all_assigned_devices(kvm); | 5534 | kvm_free_all_assigned_devices(kvm); |
5535 | kvm_free_pit(kvm); | ||
5415 | } | 5536 | } |
5416 | 5537 | ||
5417 | void kvm_arch_destroy_vm(struct kvm *kvm) | 5538 | void kvm_arch_destroy_vm(struct kvm *kvm) |
5418 | { | 5539 | { |
5419 | kvm_iommu_unmap_guest(kvm); | 5540 | kvm_iommu_unmap_guest(kvm); |
5420 | kvm_free_pit(kvm); | ||
5421 | kfree(kvm->arch.vpic); | 5541 | kfree(kvm->arch.vpic); |
5422 | kfree(kvm->arch.vioapic); | 5542 | kfree(kvm->arch.vioapic); |
5423 | kvm_free_vcpus(kvm); | 5543 | kvm_free_vcpus(kvm); |
@@ -5427,7 +5547,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) | |||
5427 | if (kvm->arch.ept_identity_pagetable) | 5547 | if (kvm->arch.ept_identity_pagetable) |
5428 | put_page(kvm->arch.ept_identity_pagetable); | 5548 | put_page(kvm->arch.ept_identity_pagetable); |
5429 | cleanup_srcu_struct(&kvm->srcu); | 5549 | cleanup_srcu_struct(&kvm->srcu); |
5430 | kfree(kvm->arch.aliases); | ||
5431 | kfree(kvm); | 5550 | kfree(kvm); |
5432 | } | 5551 | } |
5433 | 5552 | ||
@@ -5438,6 +5557,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, | |||
5438 | int user_alloc) | 5557 | int user_alloc) |
5439 | { | 5558 | { |
5440 | int npages = memslot->npages; | 5559 | int npages = memslot->npages; |
5560 | int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; | ||
5561 | |||
5562 | /* Prevent internal slot pages from being moved by fork()/COW. */ | ||
5563 | if (memslot->id >= KVM_MEMORY_SLOTS) | ||
5564 | map_flags = MAP_SHARED | MAP_ANONYMOUS; | ||
5441 | 5565 | ||
5442 | /*To keep backward compatibility with older userspace, | 5566 | /*To keep backward compatibility with older userspace, |
5443 | *x86 needs to handle !user_alloc case. | 5567 | ||
@@ -5450,7 +5574,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, | |||
5450 | userspace_addr = do_mmap(NULL, 0, | 5574 | userspace_addr = do_mmap(NULL, 0, |
5451 | npages * PAGE_SIZE, | 5575 | npages * PAGE_SIZE, |
5452 | PROT_READ | PROT_WRITE, | 5576 | PROT_READ | PROT_WRITE, |
5453 | MAP_PRIVATE | MAP_ANONYMOUS, | 5577 | map_flags, |
5454 | 0); | 5578 | 0); |
5455 | up_write(¤t->mm->mmap_sem); | 5579 | up_write(¤t->mm->mmap_sem); |
5456 | 5580 | ||
@@ -5523,7 +5647,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) | |||
5523 | 5647 | ||
5524 | me = get_cpu(); | 5648 | me = get_cpu(); |
5525 | if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) | 5649 | if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) |
5526 | if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) | 5650 | if (atomic_xchg(&vcpu->guest_mode, 0)) |
5527 | smp_send_reschedule(cpu); | 5651 | smp_send_reschedule(cpu); |
5528 | put_cpu(); | 5652 | put_cpu(); |
5529 | } | 5653 | } |
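The vcpu_enter_guest() and kvm_vcpu_kick() hunks above replace the KVM_REQ_KICK bit with an atomic guest_mode flag: the entry path publishes guest_mode = 1 and then re-checks for late requests, while a kicker uses atomic_xchg() so it only sends the rescheduling IPI if it actually took the flag down from 1. A stand-alone sketch of that handshake using C11 atomics; send_ipi() is a stub, not a kernel call, and the names are illustrative.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int guest_mode;                  /* 1 while (conceptually) in guest mode */
static atomic_bool work_pending;

static void send_ipi(void) { puts("IPI"); }    /* stub for smp_send_reschedule() */

/* Kicker side: only interrupt the CPU if it really was in guest mode. */
static void kick(void)
{
        atomic_store(&work_pending, true);
        if (atomic_exchange(&guest_mode, 0))
                send_ipi();
}

/* Entry side: publish guest_mode first, then re-check for late requests. */
static bool try_enter_guest(void)
{
        atomic_store(&guest_mode, 1);
        if (!atomic_load(&guest_mode) || atomic_load(&work_pending)) {
                atomic_store(&guest_mode, 0);  /* bail out and service the request */
                return false;
        }
        /* ... run the guest; a concurrent kick would clear guest_mode and IPI us ... */
        atomic_store(&guest_mode, 0);
        return true;
}

int main(void)
{
        try_enter_guest();
        kick();                                /* guest_mode is already 0: no IPI sent */
        return 0;
}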
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f4b54458285b..b7a404722d2b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu) | |||
65 | return kvm_read_cr0_bits(vcpu, X86_CR0_PG); | 65 | return kvm_read_cr0_bits(vcpu, X86_CR0_PG); |
66 | } | 66 | } |
67 | 67 | ||
68 | static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm) | ||
69 | { | ||
70 | return rcu_dereference_check(kvm->arch.aliases, | ||
71 | srcu_read_lock_held(&kvm->srcu) | ||
72 | || lockdep_is_held(&kvm->slots_lock)); | ||
73 | } | ||
74 | |||
75 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); | 68 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); |
76 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); | 69 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); |
77 | 70 | ||
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f871e04b6965..e10cf070ede0 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile | |||
@@ -30,6 +30,7 @@ ifeq ($(CONFIG_X86_32),y) | |||
30 | lib-y += checksum_32.o | 30 | lib-y += checksum_32.o |
31 | lib-y += strstr_32.o | 31 | lib-y += strstr_32.o |
32 | lib-y += semaphore_32.o string_32.o | 32 | lib-y += semaphore_32.o string_32.o |
33 | lib-y += cmpxchg.o | ||
33 | ifneq ($(CONFIG_X86_CMPXCHG64),y) | 34 | ifneq ($(CONFIG_X86_CMPXCHG64),y) |
34 | lib-y += cmpxchg8b_emu.o atomic64_386_32.o | 35 | lib-y += cmpxchg8b_emu.o atomic64_386_32.o |
35 | endif | 36 | endif |
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 4a5979aa6883..2cda60a06e65 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S | |||
@@ -25,150 +25,172 @@ | |||
25 | CFI_ADJUST_CFA_OFFSET -4 | 25 | CFI_ADJUST_CFA_OFFSET -4 |
26 | .endm | 26 | .endm |
27 | 27 | ||
28 | .macro BEGIN func reg | 28 | #define BEGIN(op) \ |
29 | $v = \reg | 29 | .macro endp; \ |
30 | 30 | CFI_ENDPROC; \ | |
31 | ENTRY(atomic64_\func\()_386) | 31 | ENDPROC(atomic64_##op##_386); \ |
32 | CFI_STARTPROC | 32 | .purgem endp; \ |
33 | LOCK $v | 33 | .endm; \ |
34 | 34 | ENTRY(atomic64_##op##_386); \ | |
35 | .macro RETURN | 35 | CFI_STARTPROC; \ |
36 | UNLOCK $v | 36 | LOCK v; |
37 | |||
38 | #define ENDP endp | ||
39 | |||
40 | #define RET \ | ||
41 | UNLOCK v; \ | ||
37 | ret | 42 | ret |
38 | .endm | ||
39 | |||
40 | .macro END_ | ||
41 | CFI_ENDPROC | ||
42 | ENDPROC(atomic64_\func\()_386) | ||
43 | .purgem RETURN | ||
44 | .purgem END_ | ||
45 | .purgem END | ||
46 | .endm | ||
47 | |||
48 | .macro END | ||
49 | RETURN | ||
50 | END_ | ||
51 | .endm | ||
52 | .endm | ||
53 | 43 | ||
54 | BEGIN read %ecx | 44 | #define RET_ENDP \ |
55 | movl ($v), %eax | 45 | RET; \ |
56 | movl 4($v), %edx | 46 | ENDP |
57 | END | 47 | |
58 | 48 | #define v %ecx | |
59 | BEGIN set %esi | 49 | BEGIN(read) |
60 | movl %ebx, ($v) | 50 | movl (v), %eax |
61 | movl %ecx, 4($v) | 51 | movl 4(v), %edx |
62 | END | 52 | RET_ENDP |
63 | 53 | #undef v | |
64 | BEGIN xchg %esi | 54 | |
65 | movl ($v), %eax | 55 | #define v %esi |
66 | movl 4($v), %edx | 56 | BEGIN(set) |
67 | movl %ebx, ($v) | 57 | movl %ebx, (v) |
68 | movl %ecx, 4($v) | 58 | movl %ecx, 4(v) |
69 | END | 59 | RET_ENDP |
70 | 60 | #undef v | |
71 | BEGIN add %ecx | 61 | |
72 | addl %eax, ($v) | 62 | #define v %esi |
73 | adcl %edx, 4($v) | 63 | BEGIN(xchg) |
74 | END | 64 | movl (v), %eax |
75 | 65 | movl 4(v), %edx | |
76 | BEGIN add_return %ecx | 66 | movl %ebx, (v) |
77 | addl ($v), %eax | 67 | movl %ecx, 4(v) |
78 | adcl 4($v), %edx | 68 | RET_ENDP |
79 | movl %eax, ($v) | 69 | #undef v |
80 | movl %edx, 4($v) | 70 | |
81 | END | 71 | #define v %ecx |
82 | 72 | BEGIN(add) | |
83 | BEGIN sub %ecx | 73 | addl %eax, (v) |
84 | subl %eax, ($v) | 74 | adcl %edx, 4(v) |
85 | sbbl %edx, 4($v) | 75 | RET_ENDP |
86 | END | 76 | #undef v |
87 | 77 | ||
88 | BEGIN sub_return %ecx | 78 | #define v %ecx |
79 | BEGIN(add_return) | ||
80 | addl (v), %eax | ||
81 | adcl 4(v), %edx | ||
82 | movl %eax, (v) | ||
83 | movl %edx, 4(v) | ||
84 | RET_ENDP | ||
85 | #undef v | ||
86 | |||
87 | #define v %ecx | ||
88 | BEGIN(sub) | ||
89 | subl %eax, (v) | ||
90 | sbbl %edx, 4(v) | ||
91 | RET_ENDP | ||
92 | #undef v | ||
93 | |||
94 | #define v %ecx | ||
95 | BEGIN(sub_return) | ||
89 | negl %edx | 96 | negl %edx |
90 | negl %eax | 97 | negl %eax |
91 | sbbl $0, %edx | 98 | sbbl $0, %edx |
92 | addl ($v), %eax | 99 | addl (v), %eax |
93 | adcl 4($v), %edx | 100 | adcl 4(v), %edx |
94 | movl %eax, ($v) | 101 | movl %eax, (v) |
95 | movl %edx, 4($v) | 102 | movl %edx, 4(v) |
96 | END | 103 | RET_ENDP |
97 | 104 | #undef v | |
98 | BEGIN inc %esi | 105 | |
99 | addl $1, ($v) | 106 | #define v %esi |
100 | adcl $0, 4($v) | 107 | BEGIN(inc) |
101 | END | 108 | addl $1, (v) |
102 | 109 | adcl $0, 4(v) | |
103 | BEGIN inc_return %esi | 110 | RET_ENDP |
104 | movl ($v), %eax | 111 | #undef v |
105 | movl 4($v), %edx | 112 | |
113 | #define v %esi | ||
114 | BEGIN(inc_return) | ||
115 | movl (v), %eax | ||
116 | movl 4(v), %edx | ||
106 | addl $1, %eax | 117 | addl $1, %eax |
107 | adcl $0, %edx | 118 | adcl $0, %edx |
108 | movl %eax, ($v) | 119 | movl %eax, (v) |
109 | movl %edx, 4($v) | 120 | movl %edx, 4(v) |
110 | END | 121 | RET_ENDP |
111 | 122 | #undef v | |
112 | BEGIN dec %esi | 123 | |
113 | subl $1, ($v) | 124 | #define v %esi |
114 | sbbl $0, 4($v) | 125 | BEGIN(dec) |
115 | END | 126 | subl $1, (v) |
116 | 127 | sbbl $0, 4(v) | |
117 | BEGIN dec_return %esi | 128 | RET_ENDP |
118 | movl ($v), %eax | 129 | #undef v |
119 | movl 4($v), %edx | 130 | |
131 | #define v %esi | ||
132 | BEGIN(dec_return) | ||
133 | movl (v), %eax | ||
134 | movl 4(v), %edx | ||
120 | subl $1, %eax | 135 | subl $1, %eax |
121 | sbbl $0, %edx | 136 | sbbl $0, %edx |
122 | movl %eax, ($v) | 137 | movl %eax, (v) |
123 | movl %edx, 4($v) | 138 | movl %edx, 4(v) |
124 | END | 139 | RET_ENDP |
140 | #undef v | ||
125 | 141 | ||
126 | BEGIN add_unless %ecx | 142 | #define v %ecx |
143 | BEGIN(add_unless) | ||
127 | addl %eax, %esi | 144 | addl %eax, %esi |
128 | adcl %edx, %edi | 145 | adcl %edx, %edi |
129 | addl ($v), %eax | 146 | addl (v), %eax |
130 | adcl 4($v), %edx | 147 | adcl 4(v), %edx |
131 | cmpl %eax, %esi | 148 | cmpl %eax, %esi |
132 | je 3f | 149 | je 3f |
133 | 1: | 150 | 1: |
134 | movl %eax, ($v) | 151 | movl %eax, (v) |
135 | movl %edx, 4($v) | 152 | movl %edx, 4(v) |
136 | movl $1, %eax | 153 | movl $1, %eax |
137 | 2: | 154 | 2: |
138 | RETURN | 155 | RET |
139 | 3: | 156 | 3: |
140 | cmpl %edx, %edi | 157 | cmpl %edx, %edi |
141 | jne 1b | 158 | jne 1b |
142 | xorl %eax, %eax | 159 | xorl %eax, %eax |
143 | jmp 2b | 160 | jmp 2b |
144 | END_ | 161 | ENDP |
162 | #undef v | ||
145 | 163 | ||
146 | BEGIN inc_not_zero %esi | 164 | #define v %esi |
147 | movl ($v), %eax | 165 | BEGIN(inc_not_zero) |
148 | movl 4($v), %edx | 166 | movl (v), %eax |
167 | movl 4(v), %edx | ||
149 | testl %eax, %eax | 168 | testl %eax, %eax |
150 | je 3f | 169 | je 3f |
151 | 1: | 170 | 1: |
152 | addl $1, %eax | 171 | addl $1, %eax |
153 | adcl $0, %edx | 172 | adcl $0, %edx |
154 | movl %eax, ($v) | 173 | movl %eax, (v) |
155 | movl %edx, 4($v) | 174 | movl %edx, 4(v) |
156 | movl $1, %eax | 175 | movl $1, %eax |
157 | 2: | 176 | 2: |
158 | RETURN | 177 | RET |
159 | 3: | 178 | 3: |
160 | testl %edx, %edx | 179 | testl %edx, %edx |
161 | jne 1b | 180 | jne 1b |
162 | jmp 2b | 181 | jmp 2b |
163 | END_ | 182 | ENDP |
183 | #undef v | ||
164 | 184 | ||
165 | BEGIN dec_if_positive %esi | 185 | #define v %esi |
166 | movl ($v), %eax | 186 | BEGIN(dec_if_positive) |
167 | movl 4($v), %edx | 187 | movl (v), %eax |
188 | movl 4(v), %edx | ||
168 | subl $1, %eax | 189 | subl $1, %eax |
169 | sbbl $0, %edx | 190 | sbbl $0, %edx |
170 | js 1f | 191 | js 1f |
171 | movl %eax, ($v) | 192 | movl %eax, (v) |
172 | movl %edx, 4($v) | 193 | movl %edx, 4(v) |
173 | 1: | 194 | 1: |
174 | END | 195 | RET_ENDP |
196 | #undef v | ||
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index ebeafcce04a9..aa4326bfb24a 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S | |||
@@ -52,7 +52,7 @@ ENDPROC(clear_page) | |||
52 | .align 8 | 52 | .align 8 |
53 | .quad clear_page | 53 | .quad clear_page |
54 | .quad 1b | 54 | .quad 1b |
55 | .byte X86_FEATURE_REP_GOOD | 55 | .word X86_FEATURE_REP_GOOD |
56 | .byte .Lclear_page_end - clear_page | 56 | .byte .Lclear_page_end - clear_page |
57 | .byte 2b - 1b | 57 | .byte 2b - 1b |
58 | .previous | 58 | .previous |
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/lib/cmpxchg.c index 2056ccf572cc..5d619f6df3ee 100644 --- a/arch/x86/kernel/cpu/cmpxchg.c +++ b/arch/x86/lib/cmpxchg.c | |||
@@ -52,21 +52,3 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) | |||
52 | } | 52 | } |
53 | EXPORT_SYMBOL(cmpxchg_386_u32); | 53 | EXPORT_SYMBOL(cmpxchg_386_u32); |
54 | #endif | 54 | #endif |
55 | |||
56 | #ifndef CONFIG_X86_CMPXCHG64 | ||
57 | unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) | ||
58 | { | ||
59 | u64 prev; | ||
60 | unsigned long flags; | ||
61 | |||
62 | /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ | ||
63 | local_irq_save(flags); | ||
64 | prev = *(u64 *)ptr; | ||
65 | if (prev == old) | ||
66 | *(u64 *)ptr = new; | ||
67 | local_irq_restore(flags); | ||
68 | return prev; | ||
69 | } | ||
70 | EXPORT_SYMBOL(cmpxchg_486_u64); | ||
71 | #endif | ||
72 | |||
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 727a5d46d2fc..6fec2d1cebe1 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S | |||
@@ -113,7 +113,7 @@ ENDPROC(copy_page) | |||
113 | .align 8 | 113 | .align 8 |
114 | .quad copy_page | 114 | .quad copy_page |
115 | .quad 1b | 115 | .quad 1b |
116 | .byte X86_FEATURE_REP_GOOD | 116 | .word X86_FEATURE_REP_GOOD |
117 | .byte .Lcopy_page_end - copy_page | 117 | .byte .Lcopy_page_end - copy_page |
118 | .byte 2b - 1b | 118 | .byte 2b - 1b |
119 | .previous | 119 | .previous |
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 71100c98e337..a460158b5ac5 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S | |||
@@ -29,7 +29,7 @@ | |||
29 | .align 8 | 29 | .align 8 |
30 | .quad 0b | 30 | .quad 0b |
31 | .quad 2b | 31 | .quad 2b |
32 | .byte \feature /* when feature is set */ | 32 | .word \feature /* when feature is set */ |
33 | .byte 5 | 33 | .byte 5 |
34 | .byte 5 | 34 | .byte 5 |
35 | .previous | 35 | .previous |
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index f82e884928af..bcbcd1e0f7d5 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S | |||
@@ -131,7 +131,7 @@ ENDPROC(__memcpy) | |||
131 | .align 8 | 131 | .align 8 |
132 | .quad memcpy | 132 | .quad memcpy |
133 | .quad .Lmemcpy_c | 133 | .quad .Lmemcpy_c |
134 | .byte X86_FEATURE_REP_GOOD | 134 | .word X86_FEATURE_REP_GOOD |
135 | 135 | ||
136 | /* | 136 | /* |
137 | * Replace only beginning, memcpy is used to apply alternatives, | 137 | * Replace only beginning, memcpy is used to apply alternatives, |
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index e88d3b81644a..09d344269652 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S | |||
@@ -121,7 +121,7 @@ ENDPROC(__memset) | |||
121 | .align 8 | 121 | .align 8 |
122 | .quad memset | 122 | .quad memset |
123 | .quad .Lmemset_c | 123 | .quad .Lmemset_c |
124 | .byte X86_FEATURE_REP_GOOD | 124 | .word X86_FEATURE_REP_GOOD |
125 | .byte .Lfinal - memset | 125 | .byte .Lfinal - memset |
126 | .byte .Lmemset_e - .Lmemset_c | 126 | .byte .Lmemset_e - .Lmemset_c |
127 | .previous | 127 | .previous |
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a725b7f760ae..0002a3a33081 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c | |||
@@ -37,6 +37,28 @@ struct addr_marker { | |||
37 | const char *name; | 37 | const char *name; |
38 | }; | 38 | }; |
39 | 39 | ||
40 | /* indices for address_markers; keep sync'd w/ address_markers below */ | ||
41 | enum address_markers_idx { | ||
42 | USER_SPACE_NR = 0, | ||
43 | #ifdef CONFIG_X86_64 | ||
44 | KERNEL_SPACE_NR, | ||
45 | LOW_KERNEL_NR, | ||
46 | VMALLOC_START_NR, | ||
47 | VMEMMAP_START_NR, | ||
48 | HIGH_KERNEL_NR, | ||
49 | MODULES_VADDR_NR, | ||
50 | MODULES_END_NR, | ||
51 | #else | ||
52 | KERNEL_SPACE_NR, | ||
53 | VMALLOC_START_NR, | ||
54 | VMALLOC_END_NR, | ||
55 | # ifdef CONFIG_HIGHMEM | ||
56 | PKMAP_BASE_NR, | ||
57 | # endif | ||
58 | FIXADDR_START_NR, | ||
59 | #endif | ||
60 | }; | ||
61 | |||
40 | /* Address space markers hints */ | 62 | /* Address space markers hints */ |
41 | static struct addr_marker address_markers[] = { | 63 | static struct addr_marker address_markers[] = { |
42 | { 0, "User Space" }, | 64 | { 0, "User Space" }, |
@@ -331,14 +353,12 @@ static int pt_dump_init(void) | |||
331 | 353 | ||
332 | #ifdef CONFIG_X86_32 | 354 | #ifdef CONFIG_X86_32 |
333 | /* Not a compile-time constant on x86-32 */ | 355 | /* Not a compile-time constant on x86-32 */ |
334 | address_markers[2].start_address = VMALLOC_START; | 356 | address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; |
335 | address_markers[3].start_address = VMALLOC_END; | 357 | address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; |
336 | # ifdef CONFIG_HIGHMEM | 358 | # ifdef CONFIG_HIGHMEM |
337 | address_markers[4].start_address = PKMAP_BASE; | 359 | address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; |
338 | address_markers[5].start_address = FIXADDR_START; | ||
339 | # else | ||
340 | address_markers[4].start_address = FIXADDR_START; | ||
341 | # endif | 360 | # endif |
361 | address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; | ||
342 | #endif | 362 | #endif |
343 | 363 | ||
344 | pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, | 364 | pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, |
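
Note on the dump_pagetables.c hunks above: the runtime fixups now index address_markers[] through named enum values instead of magic numbers. A small, self-contained C sketch of that enum-index pattern follows; the marker names and addresses are illustrative only, and the designated initializers are just one way to keep the enum and the table aligned (the kernel hunk keeps them in sync by convention and by the comment it adds).

#include <stdio.h>

enum marker_idx { USER_NR, VMALLOC_NR, FIXADDR_NR, NR_MARKERS };

struct marker {
    unsigned long start;
    const char   *name;
};

/* Each slot is tied to its enum name, so a later fixup cannot hit the wrong row. */
static struct marker markers[NR_MARKERS] = {
    [USER_NR]    = { 0,          "User Space" },
    [VMALLOC_NR] = { 0xc0000000, "vmalloc() Area" },
    [FIXADDR_NR] = { 0xfff00000, "Fixmap Area" },
};

int main(void)
{
    /* runtime fixup by name rather than by a magic array index */
    markers[VMALLOC_NR].start = 0xd0000000;
    printf("%s starts at %#lx\n", markers[VMALLOC_NR].name, markers[VMALLOC_NR].start);
    return 0;
}
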
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f62777940dfb..4c4508e8a204 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -802,8 +802,10 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, | |||
802 | up_read(&mm->mmap_sem); | 802 | up_read(&mm->mmap_sem); |
803 | 803 | ||
804 | /* Kernel mode? Handle exceptions or die: */ | 804 | /* Kernel mode? Handle exceptions or die: */ |
805 | if (!(error_code & PF_USER)) | 805 | if (!(error_code & PF_USER)) { |
806 | no_context(regs, error_code, address); | 806 | no_context(regs, error_code, address); |
807 | return; | ||
808 | } | ||
807 | 809 | ||
808 | /* User-space => ok to do another page fault: */ | 810 | /* User-space => ok to do another page fault: */ |
809 | if (is_prefetch(regs, error_code, address)) | 811 | if (is_prefetch(regs, error_code, address)) |
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 63a6ba66cbe0..5e8fa12ef861 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c | |||
@@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page, enum km_type type) | |||
53 | return kmap_atomic_prot(page, type, kmap_prot); | 53 | return kmap_atomic_prot(page, type, kmap_prot); |
54 | } | 54 | } |
55 | 55 | ||
56 | void kunmap_atomic(void *kvaddr, enum km_type type) | 56 | void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) |
57 | { | 57 | { |
58 | unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; | 58 | unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; |
59 | enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); | 59 | enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); |
@@ -102,7 +102,7 @@ struct page *kmap_atomic_to_page(void *ptr) | |||
102 | EXPORT_SYMBOL(kmap); | 102 | EXPORT_SYMBOL(kmap); |
103 | EXPORT_SYMBOL(kunmap); | 103 | EXPORT_SYMBOL(kunmap); |
104 | EXPORT_SYMBOL(kmap_atomic); | 104 | EXPORT_SYMBOL(kmap_atomic); |
105 | EXPORT_SYMBOL(kunmap_atomic); | 105 | EXPORT_SYMBOL(kunmap_atomic_notypecheck); |
106 | EXPORT_SYMBOL(kmap_atomic_prot); | 106 | EXPORT_SYMBOL(kmap_atomic_prot); |
107 | EXPORT_SYMBOL(kmap_atomic_to_page); | 107 | EXPORT_SYMBOL(kmap_atomic_to_page); |
108 | 108 | ||
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ee41bba315d1..9a6674689a20 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -2,7 +2,7 @@ | |||
2 | * linux/arch/x86_64/mm/init.c | 2 | * linux/arch/x86_64/mm/init.c |
3 | * | 3 | * |
4 | * Copyright (C) 1995 Linus Torvalds | 4 | * Copyright (C) 1995 Linus Torvalds |
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | 5 | * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz> |
6 | * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> | 6 | * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> |
7 | */ | 7 | */ |
8 | 8 | ||
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 12e4d2d3c110..3ba6e0608c55 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
@@ -62,8 +62,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, | |||
62 | static void __iomem *__ioremap_caller(resource_size_t phys_addr, | 62 | static void __iomem *__ioremap_caller(resource_size_t phys_addr, |
63 | unsigned long size, unsigned long prot_val, void *caller) | 63 | unsigned long size, unsigned long prot_val, void *caller) |
64 | { | 64 | { |
65 | unsigned long pfn, offset, vaddr; | 65 | unsigned long offset, vaddr; |
66 | resource_size_t last_addr; | 66 | resource_size_t pfn, last_pfn, last_addr; |
67 | const resource_size_t unaligned_phys_addr = phys_addr; | 67 | const resource_size_t unaligned_phys_addr = phys_addr; |
68 | const unsigned long unaligned_size = size; | 68 | const unsigned long unaligned_size = size; |
69 | struct vm_struct *area; | 69 | struct vm_struct *area; |
@@ -100,10 +100,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
100 | /* | 100 | /* |
101 | * Don't allow anybody to remap normal RAM that we're using.. | 101 | * Don't allow anybody to remap normal RAM that we're using.. |
102 | */ | 102 | */ |
103 | for (pfn = phys_addr >> PAGE_SHIFT; | 103 | last_pfn = last_addr >> PAGE_SHIFT; |
104 | (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK); | 104 | for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { |
105 | pfn++) { | ||
106 | |||
107 | int is_ram = page_is_ram(pfn); | 105 | int is_ram = page_is_ram(pfn); |
108 | 106 | ||
109 | if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) | 107 | if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) |
@@ -115,7 +113,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
115 | * Mappings have to be page-aligned | 113 | * Mappings have to be page-aligned |
116 | */ | 114 | */ |
117 | offset = phys_addr & ~PAGE_MASK; | 115 | offset = phys_addr & ~PAGE_MASK; |
118 | phys_addr &= PAGE_MASK; | 116 | phys_addr &= PHYSICAL_PAGE_MASK; |
119 | size = PAGE_ALIGN(last_addr+1) - phys_addr; | 117 | size = PAGE_ALIGN(last_addr+1) - phys_addr; |
120 | 118 | ||
121 | retval = reserve_memtype(phys_addr, (u64)phys_addr + size, | 119 | retval = reserve_memtype(phys_addr, (u64)phys_addr + size, |
@@ -613,7 +611,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) | |||
613 | return; | 611 | return; |
614 | } | 612 | } |
615 | offset = virt_addr & ~PAGE_MASK; | 613 | offset = virt_addr & ~PAGE_MASK; |
616 | nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; | 614 | nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; |
617 | 615 | ||
618 | idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; | 616 | idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; |
619 | while (nrpages > 0) { | 617 | while (nrpages > 0) { |
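
Note on the early_iounmap() hunk above: the page count changes from PAGE_ALIGN(offset + size - 1) to PAGE_ALIGN(offset + size). The two formulas agree except when offset + size lands exactly one byte past a page boundary, where the old one unmaps one page too few. A tiny stand-alone check of that arithmetic, assuming a 4 KiB page size:

#include <stdio.h>

#define PAGE_SHIFT    12
#define PAGE_SIZE     (1UL << PAGE_SHIFT)
#define PAGE_MASK     (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
    unsigned long offset = 0, size = PAGE_SIZE + 1;   /* mapping spans two pages */

    unsigned long old_nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
    unsigned long new_nrpages = PAGE_ALIGN(offset + size)     >> PAGE_SHIFT;

    /* prints "old formula: 1 page(s), new formula: 2 page(s)" */
    printf("old formula: %lu page(s), new formula: %lu page(s)\n",
           old_nrpages, new_nrpages);
    return 0;
}
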
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 5d0e67fff1a6..e5d5e2ce9f77 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c | |||
@@ -45,6 +45,8 @@ struct kmmio_fault_page { | |||
45 | * Protected by kmmio_lock, when linked into kmmio_page_table. | 45 | * Protected by kmmio_lock, when linked into kmmio_page_table. |
46 | */ | 46 | */ |
47 | int count; | 47 | int count; |
48 | |||
49 | bool scheduled_for_release; | ||
48 | }; | 50 | }; |
49 | 51 | ||
50 | struct kmmio_delayed_release { | 52 | struct kmmio_delayed_release { |
@@ -398,8 +400,11 @@ static void release_kmmio_fault_page(unsigned long page, | |||
398 | BUG_ON(f->count < 0); | 400 | BUG_ON(f->count < 0); |
399 | if (!f->count) { | 401 | if (!f->count) { |
400 | disarm_kmmio_fault_page(f); | 402 | disarm_kmmio_fault_page(f); |
401 | f->release_next = *release_list; | 403 | if (!f->scheduled_for_release) { |
402 | *release_list = f; | 404 | f->release_next = *release_list; |
405 | *release_list = f; | ||
406 | f->scheduled_for_release = true; | ||
407 | } | ||
403 | } | 408 | } |
404 | } | 409 | } |
405 | 410 | ||
@@ -471,8 +476,10 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) | |||
471 | prevp = &f->release_next; | 476 | prevp = &f->release_next; |
472 | } else { | 477 | } else { |
473 | *prevp = f->release_next; | 478 | *prevp = f->release_next; |
479 | f->release_next = NULL; | ||
480 | f->scheduled_for_release = false; | ||
474 | } | 481 | } |
475 | f = f->release_next; | 482 | f = *prevp; |
476 | } | 483 | } |
477 | spin_unlock_irqrestore(&kmmio_lock, flags); | 484 | spin_unlock_irqrestore(&kmmio_lock, flags); |
478 | 485 | ||
@@ -510,6 +517,9 @@ void unregister_kmmio_probe(struct kmmio_probe *p) | |||
510 | kmmio_count--; | 517 | kmmio_count--; |
511 | spin_unlock_irqrestore(&kmmio_lock, flags); | 518 | spin_unlock_irqrestore(&kmmio_lock, flags); |
512 | 519 | ||
520 | if (!release_list) | ||
521 | return; | ||
522 | |||
513 | drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); | 523 | drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); |
514 | if (!drelease) { | 524 | if (!drelease) { |
515 | pr_crit("leaking kmmio_fault_page objects.\n"); | 525 | pr_crit("leaking kmmio_fault_page objects.\n"); |
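
Note on the kmmio.c hunks above: the new scheduled_for_release flag ensures a fault page is linked onto the deferred-release list at most once, even if release_kmmio_fault_page() sees the same page again before the RCU callback has drained the list. A minimal C sketch of that guard-flag pattern follows; the types and names are invented for illustration.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct rel_node {
    struct rel_node *release_next;
    bool             scheduled_for_release;
};

/* Link n onto *list only if it is not already queued. */
static void schedule_release(struct rel_node *n, struct rel_node **list)
{
    if (n->scheduled_for_release)
        return;                      /* already on the list; avoid double-linking */
    n->release_next = *list;
    *list = n;
    n->scheduled_for_release = true;
}

int main(void)
{
    struct rel_node a = { 0 };
    struct rel_node *list = NULL;

    schedule_release(&a, &list);
    schedule_release(&a, &list);     /* second call is a no-op */

    printf("list head == &a: %d, a.release_next == NULL: %d\n",
           list == &a, a.release_next == NULL);
    return 0;
}
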
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 64121a18b8cb..f6ff57b7efa5 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c | |||
@@ -158,7 +158,7 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) | |||
158 | return req_type; | 158 | return req_type; |
159 | } | 159 | } |
160 | 160 | ||
161 | static int pat_pagerange_is_ram(unsigned long start, unsigned long end) | 161 | static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) |
162 | { | 162 | { |
163 | int ram_page = 0, not_rampage = 0; | 163 | int ram_page = 0, not_rampage = 0; |
164 | unsigned long page_nr; | 164 | unsigned long page_nr; |
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index 308e32570d84..38e6d174c497 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c | |||
@@ -40,16 +40,16 @@ static unsigned char prefix_codes[] = { | |||
40 | static unsigned int reg_rop[] = { | 40 | static unsigned int reg_rop[] = { |
41 | 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F | 41 | 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F |
42 | }; | 42 | }; |
43 | static unsigned int reg_wop[] = { 0x88, 0x89 }; | 43 | static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; |
44 | static unsigned int imm_wop[] = { 0xC6, 0xC7 }; | 44 | static unsigned int imm_wop[] = { 0xC6, 0xC7 }; |
45 | /* IA32 Manual 3, 3-432*/ | 45 | /* IA32 Manual 3, 3-432*/ |
46 | static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; | 46 | static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA }; |
47 | static unsigned int rw32[] = { | 47 | static unsigned int rw32[] = { |
48 | 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F | 48 | 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB |
49 | }; | 49 | }; |
50 | static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; | 50 | static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA }; |
51 | static unsigned int mw16[] = { 0xB70F, 0xBF0F }; | 51 | static unsigned int mw16[] = { 0xB70F, 0xBF0F }; |
52 | static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; | 52 | static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB }; |
53 | static unsigned int mw64[] = {}; | 53 | static unsigned int mw64[] = {}; |
54 | #else /* not __i386__ */ | 54 | #else /* not __i386__ */ |
55 | static unsigned char prefix_codes[] = { | 55 | static unsigned char prefix_codes[] = { |
@@ -63,20 +63,20 @@ static unsigned char prefix_codes[] = { | |||
63 | static unsigned int reg_rop[] = { | 63 | static unsigned int reg_rop[] = { |
64 | 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F | 64 | 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F |
65 | }; | 65 | }; |
66 | static unsigned int reg_wop[] = { 0x88, 0x89 }; | 66 | static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; |
67 | static unsigned int imm_wop[] = { 0xC6, 0xC7 }; | 67 | static unsigned int imm_wop[] = { 0xC6, 0xC7 }; |
68 | static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; | 68 | static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA }; |
69 | static unsigned int rw32[] = { | 69 | static unsigned int rw32[] = { |
70 | 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F | 70 | 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB |
71 | }; | 71 | }; |
72 | /* 8 bit only */ | 72 | /* 8 bit only */ |
73 | static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; | 73 | static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA }; |
74 | /* 16 bit only */ | 74 | /* 16 bit only */ |
75 | static unsigned int mw16[] = { 0xB70F, 0xBF0F }; | 75 | static unsigned int mw16[] = { 0xB70F, 0xBF0F }; |
76 | /* 16 or 32 bit */ | 76 | /* 16 or 32 bit */ |
77 | static unsigned int mw32[] = { 0xC7 }; | 77 | static unsigned int mw32[] = { 0xC7 }; |
78 | /* 16, 32 or 64 bit */ | 78 | /* 16, 32 or 64 bit */ |
79 | static unsigned int mw64[] = { 0x89, 0x8B }; | 79 | static unsigned int mw64[] = { 0x89, 0x8B, 0xAB }; |
80 | #endif /* not __i386__ */ | 80 | #endif /* not __i386__ */ |
81 | 81 | ||
82 | struct prefix_bits { | 82 | struct prefix_bits { |
@@ -410,7 +410,6 @@ static unsigned long *get_reg_w32(int no, struct pt_regs *regs) | |||
410 | unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) | 410 | unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) |
411 | { | 411 | { |
412 | unsigned int opcode; | 412 | unsigned int opcode; |
413 | unsigned char mod_rm; | ||
414 | int reg; | 413 | int reg; |
415 | unsigned char *p; | 414 | unsigned char *p; |
416 | struct prefix_bits prf; | 415 | struct prefix_bits prf; |
@@ -437,8 +436,13 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) | |||
437 | goto err; | 436 | goto err; |
438 | 437 | ||
439 | do_work: | 438 | do_work: |
440 | mod_rm = *p; | 439 | /* for STOS, source register is fixed */ |
441 | reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); | 440 | if (opcode == 0xAA || opcode == 0xAB) { |
441 | reg = arg_AX; | ||
442 | } else { | ||
443 | unsigned char mod_rm = *p; | ||
444 | reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); | ||
445 | } | ||
442 | switch (get_ins_reg_width(ins_addr)) { | 446 | switch (get_ins_reg_width(ins_addr)) { |
443 | case 1: | 447 | case 1: |
444 | return *get_reg_w8(reg, prf.rex, regs); | 448 | return *get_reg_w8(reg, prf.rex, regs); |
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 8565d944f7cf..38868adf07ea 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c | |||
@@ -90,6 +90,27 @@ static void do_test(unsigned long size) | |||
90 | iounmap(p); | 90 | iounmap(p); |
91 | } | 91 | } |
92 | 92 | ||
93 | /* | ||
94 | * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in | ||
95 | * a short time. We had a bug in the deferred freeing procedure which tried | ||
96 | * to free this region multiple times (ioremap can reuse the same address | ||
97 | * for many mappings). | ||
98 | */ | ||
99 | static void do_test_bulk_ioremapping(void) | ||
100 | { | ||
101 | void __iomem *p; | ||
102 | int i; | ||
103 | |||
104 | for (i = 0; i < 10; ++i) { | ||
105 | p = ioremap_nocache(mmio_address, PAGE_SIZE); | ||
106 | if (p) | ||
107 | iounmap(p); | ||
108 | } | ||
109 | |||
110 | /* Force freeing. If it crashes, we will know why. */ | ||
111 | synchronize_rcu(); | ||
112 | } | ||
113 | |||
93 | static int __init init(void) | 114 | static int __init init(void) |
94 | { | 115 | { |
95 | unsigned long size = (read_far) ? (8 << 20) : (16 << 10); | 116 | unsigned long size = (read_far) ? (8 << 20) : (16 << 10); |
@@ -104,6 +125,7 @@ static int __init init(void) | |||
104 | "and writing 16 kB of rubbish in there.\n", | 125 | "and writing 16 kB of rubbish in there.\n", |
105 | size >> 10, mmio_address); | 126 | size >> 10, mmio_address); |
106 | do_test(size); | 127 | do_test(size); |
128 | do_test_bulk_ioremapping(); | ||
107 | pr_info("All done.\n"); | 129 | pr_info("All done.\n"); |
108 | return 0; | 130 | return 0; |
109 | } | 131 | } |
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 426f3a1a64d3..c03f14ab6667 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -278,11 +278,9 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) | |||
278 | 278 | ||
279 | static void do_flush_tlb_all(void *info) | 279 | static void do_flush_tlb_all(void *info) |
280 | { | 280 | { |
281 | unsigned long cpu = smp_processor_id(); | ||
282 | |||
283 | __flush_tlb_all(); | 281 | __flush_tlb_all(); |
284 | if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) | 282 | if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) |
285 | leave_mm(cpu); | 283 | leave_mm(smp_processor_id()); |
286 | } | 284 | } |
287 | 285 | ||
288 | void flush_tlb_all(void) | 286 | void flush_tlb_all(void) |
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b28d2f1253bb..f6b48f6c5951 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c | |||
@@ -634,6 +634,18 @@ static int __init ppro_init(char **cpu_type) | |||
634 | if (force_arch_perfmon && cpu_has_arch_perfmon) | 634 | if (force_arch_perfmon && cpu_has_arch_perfmon) |
635 | return 0; | 635 | return 0; |
636 | 636 | ||
637 | /* | ||
638 | * Documentation on identifying Intel processors by CPU family | ||
639 | * and model can be found in the Intel Software Developer's | ||
640 | * Manuals (SDM): | ||
641 | * | ||
642 | * http://www.intel.com/products/processor/manuals/ | ||
643 | * | ||
644 | * As of May 2010 the documentation for this was in the: | ||
645 | * "Intel 64 and IA-32 Architectures Software Developer's | ||
646 | * Manual Volume 3B: System Programming Guide", "Table B-1 | ||
647 | * CPUID Signature Values of DisplayFamily_DisplayModel". | ||
648 | */ | ||
637 | switch (cpu_model) { | 649 | switch (cpu_model) { |
638 | case 0 ... 2: | 650 | case 0 ... 2: |
639 | *cpu_type = "i386/ppro"; | 651 | *cpu_type = "i386/ppro"; |
@@ -655,12 +667,13 @@ static int __init ppro_init(char **cpu_type) | |||
655 | case 15: case 23: | 667 | case 15: case 23: |
656 | *cpu_type = "i386/core_2"; | 668 | *cpu_type = "i386/core_2"; |
657 | break; | 669 | break; |
670 | case 0x1a: | ||
671 | case 0x1e: | ||
658 | case 0x2e: | 672 | case 0x2e: |
659 | case 26: | ||
660 | spec = &op_arch_perfmon_spec; | 673 | spec = &op_arch_perfmon_spec; |
661 | *cpu_type = "i386/core_i7"; | 674 | *cpu_type = "i386/core_i7"; |
662 | break; | 675 | break; |
663 | case 28: | 676 | case 0x1c: |
664 | *cpu_type = "i386/atom"; | 677 | *cpu_type = "i386/atom"; |
665 | break; | 678 | break; |
666 | default: | 679 | default: |
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 2ec04c424a62..15466c096ba5 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c | |||
@@ -34,6 +34,15 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { | |||
34 | DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), | 34 | DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), |
35 | }, | 35 | }, |
36 | }, | 36 | }, |
37 | /* https://bugzilla.kernel.org/show_bug.cgi?id=16007 */ | ||
38 | /* 2006 AMD HT/VIA system with two host bridges */ | ||
39 | { | ||
40 | .callback = set_use_crs, | ||
41 | .ident = "ASRock ALiveSATA2-GLAN", | ||
42 | .matches = { | ||
43 | DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"), | ||
44 | }, | ||
45 | }, | ||
37 | {} | 46 | {} |
38 | }; | 47 | }; |
39 | 48 | ||
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 215a27ae050d..a0772af64efb 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c | |||
@@ -125,6 +125,23 @@ void __init dmi_check_skip_isa_align(void) | |||
125 | static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) | 125 | static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) |
126 | { | 126 | { |
127 | struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; | 127 | struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; |
128 | struct resource *bar_r; | ||
129 | int bar; | ||
130 | |||
131 | if (pci_probe & PCI_NOASSIGN_BARS) { | ||
132 | /* | ||
133 | * If the BIOS did not assign the BAR, zero out the | ||
133 | * resource so the kernel doesn't attempt to assign | ||
135 | * it later on in pci_assign_unassigned_resources | ||
136 | */ | ||
137 | for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) { | ||
138 | bar_r = &dev->resource[bar]; | ||
139 | if (bar_r->start == 0 && bar_r->end != 0) { | ||
140 | bar_r->flags = 0; | ||
141 | bar_r->end = 0; | ||
142 | } | ||
143 | } | ||
144 | } | ||
128 | 145 | ||
129 | if (pci_probe & PCI_NOASSIGN_ROMS) { | 146 | if (pci_probe & PCI_NOASSIGN_ROMS) { |
130 | if (rom_r->parent) | 147 | if (rom_r->parent) |
@@ -509,6 +526,9 @@ char * __devinit pcibios_setup(char *str) | |||
509 | } else if (!strcmp(str, "norom")) { | 526 | } else if (!strcmp(str, "norom")) { |
510 | pci_probe |= PCI_NOASSIGN_ROMS; | 527 | pci_probe |= PCI_NOASSIGN_ROMS; |
511 | return NULL; | 528 | return NULL; |
529 | } else if (!strcmp(str, "nobar")) { | ||
530 | pci_probe |= PCI_NOASSIGN_BARS; | ||
531 | return NULL; | ||
512 | } else if (!strcmp(str, "assign-busses")) { | 532 | } else if (!strcmp(str, "assign-busses")) { |
513 | pci_probe |= PCI_ASSIGN_ALL_BUSSES; | 533 | pci_probe |= PCI_ASSIGN_ALL_BUSSES; |
514 | return NULL; | 534 | return NULL; |
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 9810a0f76c91..f547ee05f715 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c | |||
@@ -989,7 +989,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) | |||
989 | dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq); | 989 | dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq); |
990 | 990 | ||
991 | /* Update IRQ for all devices with the same pirq value */ | 991 | /* Update IRQ for all devices with the same pirq value */ |
992 | while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { | 992 | for_each_pci_dev(dev2) { |
993 | pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); | 993 | pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); |
994 | if (!pin) | 994 | if (!pin) |
995 | continue; | 995 | continue; |
@@ -1028,7 +1028,7 @@ void __init pcibios_fixup_irqs(void) | |||
1028 | u8 pin; | 1028 | u8 pin; |
1029 | 1029 | ||
1030 | DBG(KERN_DEBUG "PCI: IRQ fixup\n"); | 1030 | DBG(KERN_DEBUG "PCI: IRQ fixup\n"); |
1031 | while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { | 1031 | for_each_pci_dev(dev) { |
1032 | /* | 1032 | /* |
1033 | * If the BIOS has set an out of range IRQ number, just | 1033 | * If the BIOS has set an out of range IRQ number, just |
1034 | * ignore it. Also keep track of which IRQ's are | 1034 | * ignore it. Also keep track of which IRQ's are |
@@ -1052,7 +1052,7 @@ void __init pcibios_fixup_irqs(void) | |||
1052 | return; | 1052 | return; |
1053 | 1053 | ||
1054 | dev = NULL; | 1054 | dev = NULL; |
1055 | while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { | 1055 | for_each_pci_dev(dev) { |
1056 | pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); | 1056 | pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); |
1057 | if (!pin) | 1057 | if (!pin) |
1058 | continue; | 1058 | continue; |
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 8d460eaf524f..c89266be6048 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c | |||
@@ -36,7 +36,7 @@ int __init pci_legacy_init(void) | |||
36 | return 0; | 36 | return 0; |
37 | } | 37 | } |
38 | 38 | ||
39 | void pcibios_scan_specific_bus(int busn) | 39 | void __devinit pcibios_scan_specific_bus(int busn) |
40 | { | 40 | { |
41 | int devfn; | 41 | int devfn; |
42 | long node; | 42 | long node; |
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 1290ba54b350..e7e8c5f54956 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * Distribute under GPLv2 | 4 | * Distribute under GPLv2 |
5 | * | 5 | * |
6 | * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> | 6 | * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> |
7 | * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> | 7 | * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz> |
8 | * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> | 8 | * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> |
9 | */ | 9 | */ |
10 | 10 | ||
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index d24f983ba1e5..460f314d13e5 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * Distribute under GPLv2 | 4 | * Distribute under GPLv2 |
5 | * | 5 | * |
6 | * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> | 6 | * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> |
7 | * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> | 7 | * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz> |
8 | * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> | 8 | * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> |
9 | */ | 9 | */ |
10 | 10 | ||
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 6b4ffedb93c9..4a2afa1bac51 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile | |||
@@ -120,7 +120,8 @@ $(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE | |||
120 | quiet_cmd_vdso = VDSO $@ | 120 | quiet_cmd_vdso = VDSO $@ |
121 | cmd_vdso = $(CC) -nostdlib -o $@ \ | 121 | cmd_vdso = $(CC) -nostdlib -o $@ \ |
122 | $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ | 122 | $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ |
123 | -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) | 123 | -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ |
124 | sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' | ||
124 | 125 | ||
125 | VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) | 126 | VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) |
126 | GCOV_PROFILE := n | 127 | GCOV_PROFILE := n |
diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh new file mode 100755 index 000000000000..7ee90a9b549d --- /dev/null +++ b/arch/x86/vdso/checkundef.sh | |||
@@ -0,0 +1,10 @@ | |||
1 | #!/bin/sh | ||
2 | nm="$1" | ||
3 | file="$2" | ||
4 | $nm "$file" | grep '^ *U' > /dev/null 2>&1 | ||
5 | if [ $? -eq 1 ]; then | ||
6 | exit 0 | ||
7 | else | ||
8 | echo "$file: undefined symbols found" >&2 | ||
9 | exit 1 | ||
10 | fi | ||
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 02b442e92007..36df991985b2 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c | |||
@@ -374,7 +374,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) | |||
374 | 374 | ||
375 | #ifdef CONFIG_X86_64 | 375 | #ifdef CONFIG_X86_64 |
376 | 376 | ||
377 | __initcall(sysenter_setup); | 377 | subsys_initcall(sysenter_setup); |
378 | 378 | ||
379 | #ifdef CONFIG_SYSCTL | 379 | #ifdef CONFIG_SYSCTL |
380 | /* Register vsyscall32 into the ABI table */ | 380 | /* Register vsyscall32 into the ABI table */ |
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index ac74869b8140..4b5d26f108bb 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c | |||
@@ -67,6 +67,7 @@ static int __init init_vdso_vars(void) | |||
67 | *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; | 67 | *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; |
68 | #include "vextern.h" | 68 | #include "vextern.h" |
69 | #undef VEXTERN | 69 | #undef VEXTERN |
70 | vunmap(vbase); | ||
70 | return 0; | 71 | return 0; |
71 | 72 | ||
72 | oom: | 73 | oom: |
@@ -74,7 +75,7 @@ static int __init init_vdso_vars(void) | |||
74 | vdso_enabled = 0; | 75 | vdso_enabled = 0; |
75 | return -ENOMEM; | 76 | return -ENOMEM; |
76 | } | 77 | } |
77 | __initcall(init_vdso_vars); | 78 | subsys_initcall(init_vdso_vars); |
78 | 79 | ||
79 | struct linux_binprm; | 80 | struct linux_binprm; |
80 | 81 | ||
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index b83e119fbeb0..68128a1b401a 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig | |||
@@ -13,6 +13,11 @@ config XEN | |||
13 | kernel to boot in a paravirtualized environment under the | 13 | kernel to boot in a paravirtualized environment under the |
14 | Xen hypervisor. | 14 | Xen hypervisor. |
15 | 15 | ||
16 | config XEN_PVHVM | ||
17 | def_bool y | ||
18 | depends on XEN | ||
19 | depends on X86_LOCAL_APIC | ||
20 | |||
16 | config XEN_MAX_DOMAIN_MEMORY | 21 | config XEN_MAX_DOMAIN_MEMORY |
17 | int "Maximum allowed size of a domain in gigabytes" | 22 | int "Maximum allowed size of a domain in gigabytes" |
18 | default 8 if X86_32 | 23 | default 8 if X86_32 |
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3bb4fc21f4f2..779385158915 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile | |||
@@ -12,9 +12,10 @@ CFLAGS_mmu.o := $(nostackp) | |||
12 | 12 | ||
13 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ | 13 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ |
14 | time.o xen-asm.o xen-asm_$(BITS).o \ | 14 | time.o xen-asm.o xen-asm_$(BITS).o \ |
15 | grant-table.o suspend.o | 15 | grant-table.o suspend.o platform-pci-unplug.o |
16 | 16 | ||
17 | obj-$(CONFIG_SMP) += smp.o | 17 | obj-$(CONFIG_SMP) += smp.o |
18 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o | 18 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o |
19 | obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o | 19 | obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o |
20 | 20 | ||
21 | obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o | ||
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 65d8d79b46a8..7d46c8441418 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -11,6 +11,7 @@ | |||
11 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | 11 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/cpu.h> | ||
14 | #include <linux/kernel.h> | 15 | #include <linux/kernel.h> |
15 | #include <linux/init.h> | 16 | #include <linux/init.h> |
16 | #include <linux/smp.h> | 17 | #include <linux/smp.h> |
@@ -35,8 +36,10 @@ | |||
35 | #include <xen/interface/version.h> | 36 | #include <xen/interface/version.h> |
36 | #include <xen/interface/physdev.h> | 37 | #include <xen/interface/physdev.h> |
37 | #include <xen/interface/vcpu.h> | 38 | #include <xen/interface/vcpu.h> |
39 | #include <xen/interface/memory.h> | ||
38 | #include <xen/features.h> | 40 | #include <xen/features.h> |
39 | #include <xen/page.h> | 41 | #include <xen/page.h> |
42 | #include <xen/hvm.h> | ||
40 | #include <xen/hvc-console.h> | 43 | #include <xen/hvc-console.h> |
41 | 44 | ||
42 | #include <asm/paravirt.h> | 45 | #include <asm/paravirt.h> |
@@ -55,7 +58,9 @@ | |||
55 | #include <asm/pgtable.h> | 58 | #include <asm/pgtable.h> |
56 | #include <asm/tlbflush.h> | 59 | #include <asm/tlbflush.h> |
57 | #include <asm/reboot.h> | 60 | #include <asm/reboot.h> |
61 | #include <asm/setup.h> | ||
58 | #include <asm/stackprotector.h> | 62 | #include <asm/stackprotector.h> |
63 | #include <asm/hypervisor.h> | ||
59 | 64 | ||
60 | #include "xen-ops.h" | 65 | #include "xen-ops.h" |
61 | #include "mmu.h" | 66 | #include "mmu.h" |
@@ -76,6 +81,10 @@ struct shared_info xen_dummy_shared_info; | |||
76 | 81 | ||
77 | void *xen_initial_gdt; | 82 | void *xen_initial_gdt; |
78 | 83 | ||
84 | RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); | ||
85 | __read_mostly int xen_have_vector_callback; | ||
86 | EXPORT_SYMBOL_GPL(xen_have_vector_callback); | ||
87 | |||
79 | /* | 88 | /* |
80 | * Point at some empty memory to start with. We map the real shared_info | 89 | * Point at some empty memory to start with. We map the real shared_info |
81 | * page as soon as fixmap is up and running. | 90 | * page as soon as fixmap is up and running. |
@@ -97,6 +106,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; | |||
97 | */ | 106 | */ |
98 | static int have_vcpu_info_placement = 1; | 107 | static int have_vcpu_info_placement = 1; |
99 | 108 | ||
109 | static void clamp_max_cpus(void) | ||
110 | { | ||
111 | #ifdef CONFIG_SMP | ||
112 | if (setup_max_cpus > MAX_VIRT_CPUS) | ||
113 | setup_max_cpus = MAX_VIRT_CPUS; | ||
114 | #endif | ||
115 | } | ||
116 | |||
100 | static void xen_vcpu_setup(int cpu) | 117 | static void xen_vcpu_setup(int cpu) |
101 | { | 118 | { |
102 | struct vcpu_register_vcpu_info info; | 119 | struct vcpu_register_vcpu_info info; |
@@ -104,13 +121,17 @@ static void xen_vcpu_setup(int cpu) | |||
104 | struct vcpu_info *vcpup; | 121 | struct vcpu_info *vcpup; |
105 | 122 | ||
106 | BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); | 123 | BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); |
107 | per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; | ||
108 | 124 | ||
109 | if (!have_vcpu_info_placement) | 125 | if (cpu < MAX_VIRT_CPUS) |
110 | return; /* already tested, not available */ | 126 | per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; |
111 | 127 | ||
112 | vcpup = &per_cpu(xen_vcpu_info, cpu); | 128 | if (!have_vcpu_info_placement) { |
129 | if (cpu >= MAX_VIRT_CPUS) | ||
130 | clamp_max_cpus(); | ||
131 | return; | ||
132 | } | ||
113 | 133 | ||
134 | vcpup = &per_cpu(xen_vcpu_info, cpu); | ||
114 | info.mfn = arbitrary_virt_to_mfn(vcpup); | 135 | info.mfn = arbitrary_virt_to_mfn(vcpup); |
115 | info.offset = offset_in_page(vcpup); | 136 | info.offset = offset_in_page(vcpup); |
116 | 137 | ||
@@ -125,6 +146,7 @@ static void xen_vcpu_setup(int cpu) | |||
125 | if (err) { | 146 | if (err) { |
126 | printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); | 147 | printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); |
127 | have_vcpu_info_placement = 0; | 148 | have_vcpu_info_placement = 0; |
149 | clamp_max_cpus(); | ||
128 | } else { | 150 | } else { |
129 | /* This cpu is using the registered vcpu info, even if | 151 | /* This cpu is using the registered vcpu info, even if |
130 | later ones fail to. */ | 152 | later ones fail to. */ |
@@ -731,7 +753,6 @@ static void set_xen_basic_apic_ops(void) | |||
731 | 753 | ||
732 | #endif | 754 | #endif |
733 | 755 | ||
734 | |||
735 | static void xen_clts(void) | 756 | static void xen_clts(void) |
736 | { | 757 | { |
737 | struct multicall_space mcs; | 758 | struct multicall_space mcs; |
@@ -926,10 +947,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { | |||
926 | .patch = xen_patch, | 947 | .patch = xen_patch, |
927 | }; | 948 | }; |
928 | 949 | ||
929 | static const struct pv_time_ops xen_time_ops __initdata = { | ||
930 | .sched_clock = xen_sched_clock, | ||
931 | }; | ||
932 | |||
933 | static const struct pv_cpu_ops xen_cpu_ops __initdata = { | 950 | static const struct pv_cpu_ops xen_cpu_ops __initdata = { |
934 | .cpuid = xen_cpuid, | 951 | .cpuid = xen_cpuid, |
935 | 952 | ||
@@ -1028,6 +1045,23 @@ static void xen_crash_shutdown(struct pt_regs *regs) | |||
1028 | xen_reboot(SHUTDOWN_crash); | 1045 | xen_reboot(SHUTDOWN_crash); |
1029 | } | 1046 | } |
1030 | 1047 | ||
1048 | static int | ||
1049 | xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) | ||
1050 | { | ||
1051 | xen_reboot(SHUTDOWN_crash); | ||
1052 | return NOTIFY_DONE; | ||
1053 | } | ||
1054 | |||
1055 | static struct notifier_block xen_panic_block = { | ||
1056 | .notifier_call= xen_panic_event, | ||
1057 | }; | ||
1058 | |||
1059 | int xen_panic_handler_init(void) | ||
1060 | { | ||
1061 | atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); | ||
1062 | return 0; | ||
1063 | } | ||
1064 | |||
1031 | static const struct machine_ops __initdata xen_machine_ops = { | 1065 | static const struct machine_ops __initdata xen_machine_ops = { |
1032 | .restart = xen_restart, | 1066 | .restart = xen_restart, |
1033 | .halt = xen_machine_halt, | 1067 | .halt = xen_machine_halt, |
@@ -1067,7 +1101,6 @@ asmlinkage void __init xen_start_kernel(void) | |||
1067 | /* Install Xen paravirt ops */ | 1101 | /* Install Xen paravirt ops */ |
1068 | pv_info = xen_info; | 1102 | pv_info = xen_info; |
1069 | pv_init_ops = xen_init_ops; | 1103 | pv_init_ops = xen_init_ops; |
1070 | pv_time_ops = xen_time_ops; | ||
1071 | pv_cpu_ops = xen_cpu_ops; | 1104 | pv_cpu_ops = xen_cpu_ops; |
1072 | pv_apic_ops = xen_apic_ops; | 1105 | pv_apic_ops = xen_apic_ops; |
1073 | 1106 | ||
@@ -1075,13 +1108,7 @@ asmlinkage void __init xen_start_kernel(void) | |||
1075 | x86_init.oem.arch_setup = xen_arch_setup; | 1108 | x86_init.oem.arch_setup = xen_arch_setup; |
1076 | x86_init.oem.banner = xen_banner; | 1109 | x86_init.oem.banner = xen_banner; |
1077 | 1110 | ||
1078 | x86_init.timers.timer_init = xen_time_init; | 1111 | xen_init_time_ops(); |
1079 | x86_init.timers.setup_percpu_clockev = x86_init_noop; | ||
1080 | x86_cpuinit.setup_percpu_clockev = x86_init_noop; | ||
1081 | |||
1082 | x86_platform.calibrate_tsc = xen_tsc_khz; | ||
1083 | x86_platform.get_wallclock = xen_get_wallclock; | ||
1084 | x86_platform.set_wallclock = xen_set_wallclock; | ||
1085 | 1112 | ||
1086 | /* | 1113 | /* |
1087 | * Set up some pagetable state before starting to set any ptes. | 1114 | * Set up some pagetable state before starting to set any ptes. |
@@ -1145,6 +1172,10 @@ asmlinkage void __init xen_start_kernel(void) | |||
1145 | 1172 | ||
1146 | pgd = (pgd_t *)xen_start_info->pt_base; | 1173 | pgd = (pgd_t *)xen_start_info->pt_base; |
1147 | 1174 | ||
1175 | if (!xen_initial_domain()) | ||
1176 | __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); | ||
1177 | |||
1178 | __supported_pte_mask |= _PAGE_IOMAP; | ||
1148 | /* Don't do the full vcpu_info placement stuff until we have a | 1179 | /* Don't do the full vcpu_info placement stuff until we have a |
1149 | possible map and a non-dummy shared_info. */ | 1180 | possible map and a non-dummy shared_info. */ |
1150 | per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; | 1181 | per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; |
@@ -1206,3 +1237,139 @@ asmlinkage void __init xen_start_kernel(void) | |||
1206 | x86_64_start_reservations((char *)__pa_symbol(&boot_params)); | 1237 | x86_64_start_reservations((char *)__pa_symbol(&boot_params)); |
1207 | #endif | 1238 | #endif |
1208 | } | 1239 | } |
1240 | |||
1241 | static uint32_t xen_cpuid_base(void) | ||
1242 | { | ||
1243 | uint32_t base, eax, ebx, ecx, edx; | ||
1244 | char signature[13]; | ||
1245 | |||
1246 | for (base = 0x40000000; base < 0x40010000; base += 0x100) { | ||
1247 | cpuid(base, &eax, &ebx, &ecx, &edx); | ||
1248 | *(uint32_t *)(signature + 0) = ebx; | ||
1249 | *(uint32_t *)(signature + 4) = ecx; | ||
1250 | *(uint32_t *)(signature + 8) = edx; | ||
1251 | signature[12] = 0; | ||
1252 | |||
1253 | if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) | ||
1254 | return base; | ||
1255 | } | ||
1256 | |||
1257 | return 0; | ||
1258 | } | ||
1259 | |||
1260 | static int init_hvm_pv_info(int *major, int *minor) | ||
1261 | { | ||
1262 | uint32_t eax, ebx, ecx, edx, pages, msr, base; | ||
1263 | u64 pfn; | ||
1264 | |||
1265 | base = xen_cpuid_base(); | ||
1266 | cpuid(base + 1, &eax, &ebx, &ecx, &edx); | ||
1267 | |||
1268 | *major = eax >> 16; | ||
1269 | *minor = eax & 0xffff; | ||
1270 | printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); | ||
1271 | |||
1272 | cpuid(base + 2, &pages, &msr, &ecx, &edx); | ||
1273 | |||
1274 | pfn = __pa(hypercall_page); | ||
1275 | wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); | ||
1276 | |||
1277 | xen_setup_features(); | ||
1278 | |||
1279 | pv_info = xen_info; | ||
1280 | pv_info.kernel_rpl = 0; | ||
1281 | |||
1282 | xen_domain_type = XEN_HVM_DOMAIN; | ||
1283 | |||
1284 | return 0; | ||
1285 | } | ||
1286 | |||
1287 | void xen_hvm_init_shared_info(void) | ||
1288 | { | ||
1289 | int cpu; | ||
1290 | struct xen_add_to_physmap xatp; | ||
1291 | static struct shared_info *shared_info_page = 0; | ||
1292 | |||
1293 | if (!shared_info_page) | ||
1294 | shared_info_page = (struct shared_info *) | ||
1295 | extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
1296 | xatp.domid = DOMID_SELF; | ||
1297 | xatp.idx = 0; | ||
1298 | xatp.space = XENMAPSPACE_shared_info; | ||
1299 | xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; | ||
1300 | if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) | ||
1301 | BUG(); | ||
1302 | |||
1303 | HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; | ||
1304 | |||
1305 | /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info | ||
1306 | * page, we use it in the event channel upcall and in some pvclock | ||
1307 | * related functions. We don't need the vcpu_info placement | ||
1308 | * optimizations because we don't use any pv_mmu or pv_irq op on | ||
1309 | * HVM. | ||
1310 | * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is | ||
1311 | * online but xen_hvm_init_shared_info is run at resume time too and | ||
1312 | * in that case multiple vcpus might be online. */ | ||
1313 | for_each_online_cpu(cpu) { | ||
1314 | per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; | ||
1315 | } | ||
1316 | } | ||
1317 | |||
1318 | #ifdef CONFIG_XEN_PVHVM | ||
1319 | static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, | ||
1320 | unsigned long action, void *hcpu) | ||
1321 | { | ||
1322 | int cpu = (long)hcpu; | ||
1323 | switch (action) { | ||
1324 | case CPU_UP_PREPARE: | ||
1325 | per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; | ||
1326 | break; | ||
1327 | default: | ||
1328 | break; | ||
1329 | } | ||
1330 | return NOTIFY_OK; | ||
1331 | } | ||
1332 | |||
1333 | static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { | ||
1334 | .notifier_call = xen_hvm_cpu_notify, | ||
1335 | }; | ||
1336 | |||
1337 | static void __init xen_hvm_guest_init(void) | ||
1338 | { | ||
1339 | int r; | ||
1340 | int major, minor; | ||
1341 | |||
1342 | r = init_hvm_pv_info(&major, &minor); | ||
1343 | if (r < 0) | ||
1344 | return; | ||
1345 | |||
1346 | xen_hvm_init_shared_info(); | ||
1347 | |||
1348 | if (xen_feature(XENFEAT_hvm_callback_vector)) | ||
1349 | xen_have_vector_callback = 1; | ||
1350 | register_cpu_notifier(&xen_hvm_cpu_notifier); | ||
1351 | xen_unplug_emulated_devices(); | ||
1352 | have_vcpu_info_placement = 0; | ||
1353 | x86_init.irqs.intr_init = xen_init_IRQ; | ||
1354 | xen_hvm_init_time_ops(); | ||
1355 | xen_hvm_init_mmu_ops(); | ||
1356 | } | ||
1357 | |||
1358 | static bool __init xen_hvm_platform(void) | ||
1359 | { | ||
1360 | if (xen_pv_domain()) | ||
1361 | return false; | ||
1362 | |||
1363 | if (!xen_cpuid_base()) | ||
1364 | return false; | ||
1365 | |||
1366 | return true; | ||
1367 | } | ||
1368 | |||
1369 | const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { | ||
1370 | .name = "Xen HVM", | ||
1371 | .detect = xen_hvm_platform, | ||
1372 | .init_platform = xen_hvm_guest_init, | ||
1373 | }; | ||
1374 | EXPORT_SYMBOL(x86_hyper_xen_hvm); | ||
1375 | #endif | ||
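
Note on the enlighten.c hunks above: xen_hvm_platform() decides whether the kernel is running as a Xen HVM guest by scanning the hypervisor CPUID leaves for the "XenVMMXenVMM" signature. The sketch below reproduces that scan as a stand-alone user-space program (GCC's <cpuid.h> is assumed); it is an illustration of the detection logic, not the kernel's code path.

#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t find_xen_cpuid_base(void)
{
    uint32_t base, eax, ebx, ecx, edx;
    char sig[13];

    /* Hypervisors advertise a 12-byte vendor string in EBX/ECX/EDX of
     * CPUID leaves 0x40000000, 0x40000100, ... */
    for (base = 0x40000000; base < 0x40010000; base += 0x100) {
        __cpuid(base, eax, ebx, ecx, edx);
        memcpy(sig + 0, &ebx, 4);
        memcpy(sig + 4, &ecx, 4);
        memcpy(sig + 8, &edx, 4);
        sig[12] = 0;

        /* EAX reports the highest leaf in the range; Xen needs base + 2 */
        if (!strcmp(sig, "XenVMMXenVMM") && (eax - base) >= 2)
            return base;
    }
    return 0;
}

int main(void)
{
    uint32_t base = find_xen_cpuid_base();

    if (base)
        printf("running on Xen, CPUID base %#x\n", base);
    else
        printf("Xen CPUID signature not found\n");
    return 0;
}
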
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 914f04695ce5..42086ac406af 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <linux/highmem.h> | 42 | #include <linux/highmem.h> |
43 | #include <linux/debugfs.h> | 43 | #include <linux/debugfs.h> |
44 | #include <linux/bug.h> | 44 | #include <linux/bug.h> |
45 | #include <linux/vmalloc.h> | ||
45 | #include <linux/module.h> | 46 | #include <linux/module.h> |
46 | #include <linux/gfp.h> | 47 | #include <linux/gfp.h> |
47 | 48 | ||
@@ -51,14 +52,19 @@ | |||
51 | #include <asm/mmu_context.h> | 52 | #include <asm/mmu_context.h> |
52 | #include <asm/setup.h> | 53 | #include <asm/setup.h> |
53 | #include <asm/paravirt.h> | 54 | #include <asm/paravirt.h> |
55 | #include <asm/e820.h> | ||
54 | #include <asm/linkage.h> | 56 | #include <asm/linkage.h> |
57 | #include <asm/page.h> | ||
55 | 58 | ||
56 | #include <asm/xen/hypercall.h> | 59 | #include <asm/xen/hypercall.h> |
57 | #include <asm/xen/hypervisor.h> | 60 | #include <asm/xen/hypervisor.h> |
58 | 61 | ||
62 | #include <xen/xen.h> | ||
59 | #include <xen/page.h> | 63 | #include <xen/page.h> |
60 | #include <xen/interface/xen.h> | 64 | #include <xen/interface/xen.h> |
65 | #include <xen/interface/hvm/hvm_op.h> | ||
61 | #include <xen/interface/version.h> | 66 | #include <xen/interface/version.h> |
67 | #include <xen/interface/memory.h> | ||
62 | #include <xen/hvc-console.h> | 68 | #include <xen/hvc-console.h> |
63 | 69 | ||
64 | #include "multicalls.h" | 70 | #include "multicalls.h" |
@@ -67,6 +73,13 @@ | |||
67 | 73 | ||
68 | #define MMU_UPDATE_HISTO 30 | 74 | #define MMU_UPDATE_HISTO 30 |
69 | 75 | ||
76 | /* | ||
77 | * Protects atomic reservation decrease/increase against concurrent increases. | ||
78 | * Also protects non-atomic updates of current_pages and driver_pages, and | ||
79 | * balloon lists. | ||
80 | */ | ||
81 | DEFINE_SPINLOCK(xen_reservation_lock); | ||
82 | |||
70 | #ifdef CONFIG_XEN_DEBUG_FS | 83 | #ifdef CONFIG_XEN_DEBUG_FS |
71 | 84 | ||
72 | static struct { | 85 | static struct { |
@@ -377,6 +390,28 @@ static bool xen_page_pinned(void *ptr) | |||
377 | return PagePinned(page); | 390 | return PagePinned(page); |
378 | } | 391 | } |
379 | 392 | ||
393 | static bool xen_iomap_pte(pte_t pte) | ||
394 | { | ||
395 | return pte_flags(pte) & _PAGE_IOMAP; | ||
396 | } | ||
397 | |||
398 | static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) | ||
399 | { | ||
400 | struct multicall_space mcs; | ||
401 | struct mmu_update *u; | ||
402 | |||
403 | mcs = xen_mc_entry(sizeof(*u)); | ||
404 | u = mcs.args; | ||
405 | |||
406 | /* ptep might be kmapped when using 32-bit HIGHPTE */ | ||
407 | u->ptr = arbitrary_virt_to_machine(ptep).maddr; | ||
408 | u->val = pte_val_ma(pteval); | ||
409 | |||
410 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); | ||
411 | |||
412 | xen_mc_issue(PARAVIRT_LAZY_MMU); | ||
413 | } | ||
414 | |||
380 | static void xen_extend_mmu_update(const struct mmu_update *update) | 415 | static void xen_extend_mmu_update(const struct mmu_update *update) |
381 | { | 416 | { |
382 | struct multicall_space mcs; | 417 | struct multicall_space mcs; |
@@ -453,6 +488,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) | |||
453 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | 488 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, |
454 | pte_t *ptep, pte_t pteval) | 489 | pte_t *ptep, pte_t pteval) |
455 | { | 490 | { |
491 | if (xen_iomap_pte(pteval)) { | ||
492 | xen_set_iomap_pte(ptep, pteval); | ||
493 | goto out; | ||
494 | } | ||
495 | |||
456 | ADD_STATS(set_pte_at, 1); | 496 | ADD_STATS(set_pte_at, 1); |
457 | // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); | 497 | // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); |
458 | ADD_STATS(set_pte_at_current, mm == current->mm); | 498 | ADD_STATS(set_pte_at_current, mm == current->mm); |
@@ -523,8 +563,25 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) | |||
523 | return val; | 563 | return val; |
524 | } | 564 | } |
525 | 565 | ||
566 | static pteval_t iomap_pte(pteval_t val) | ||
567 | { | ||
568 | if (val & _PAGE_PRESENT) { | ||
569 | unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; | ||
570 | pteval_t flags = val & PTE_FLAGS_MASK; | ||
571 | |||
572 | /* We assume the pte frame number is an MFN, so | ||
573 | just use it as-is. */ | ||
574 | val = ((pteval_t)pfn << PAGE_SHIFT) | flags; | ||
575 | } | ||
576 | |||
577 | return val; | ||
578 | } | ||
579 | |||
526 | pteval_t xen_pte_val(pte_t pte) | 580 | pteval_t xen_pte_val(pte_t pte) |
527 | { | 581 | { |
582 | if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) | ||
583 | return pte.pte; | ||
584 | |||
528 | return pte_mfn_to_pfn(pte.pte); | 585 | return pte_mfn_to_pfn(pte.pte); |
529 | } | 586 | } |
530 | PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); | 587 | PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); |
@@ -537,7 +594,22 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); | |||
537 | 594 | ||
538 | pte_t xen_make_pte(pteval_t pte) | 595 | pte_t xen_make_pte(pteval_t pte) |
539 | { | 596 | { |
540 | pte = pte_pfn_to_mfn(pte); | 597 | phys_addr_t addr = (pte & PTE_PFN_MASK); |
598 | |||
599 | /* | ||
600 | * Unprivileged domains are allowed to do IOMAP mappings for | ||
601 | * PCI passthrough, but not map ISA space. The ISA | ||
602 | * mappings are just dummy local mappings to keep other | ||
603 | * parts of the kernel happy. | ||
604 | */ | ||
605 | if (unlikely(pte & _PAGE_IOMAP) && | ||
606 | (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { | ||
607 | pte = iomap_pte(pte); | ||
608 | } else { | ||
609 | pte &= ~_PAGE_IOMAP; | ||
610 | pte = pte_pfn_to_mfn(pte); | ||
611 | } | ||
612 | |||
541 | return native_make_pte(pte); | 613 | return native_make_pte(pte); |
542 | } | 614 | } |
543 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); | 615 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); |
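The xen_make_pte() hunk above branches on _PAGE_IOMAP: a pte carrying the flag keeps its frame number as a machine frame (allowed for dom0, or for addresses above ISA space), while an ordinary pte has the flag stripped and goes through the usual pfn-to-mfn translation. The standalone sketch below models only that decision; the bit values, the ISA limit and the stub pfn_to_mfn() are illustrative assumptions, not the kernel's definitions.

    /* Standalone model of the xen_make_pte() decision above.
     * Constants and the stub translation are illustrative only. */
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT      12
    #define PTE_PFN_MASK    (~0xfffULL)      /* simplified: low 12 bits are flags */
    #define PAGE_IOMAP      (1ULL << 10)     /* stand-in for _PAGE_IOMAP */
    #define ISA_END_ADDRESS 0x100000ULL      /* 1MB, end of ISA space */

    static int initial_domain = 0;           /* pretend we are a domU */

    static uint64_t stub_pfn_to_mfn(uint64_t pte)
    {
        /* The real code looks the pfn up in the p2m table; just tag it here. */
        return pte | (1ULL << 40);
    }

    static uint64_t model_make_pte(uint64_t pte)
    {
        uint64_t addr = pte & PTE_PFN_MASK;

        if ((pte & PAGE_IOMAP) &&
            (initial_domain || addr >= ISA_END_ADDRESS)) {
            /* IOMAP pte: the frame number is already a machine frame, keep it. */
            return pte;
        }
        /* Ordinary pte: drop the flag and translate pfn -> mfn. */
        pte &= ~PAGE_IOMAP;
        return stub_pfn_to_mfn(pte);
    }

    int main(void)
    {
        printf("iomap above ISA : %#llx\n", (unsigned long long)
               model_make_pte((2 * ISA_END_ADDRESS) | PAGE_IOMAP | 1));
        printf("iomap inside ISA: %#llx\n", (unsigned long long)
               model_make_pte(0x1000ULL | PAGE_IOMAP | 1));
        return 0;
    }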
@@ -593,6 +665,11 @@ void xen_set_pud(pud_t *ptr, pud_t val) | |||
593 | 665 | ||
594 | void xen_set_pte(pte_t *ptep, pte_t pte) | 666 | void xen_set_pte(pte_t *ptep, pte_t pte) |
595 | { | 667 | { |
668 | if (xen_iomap_pte(pte)) { | ||
669 | xen_set_iomap_pte(ptep, pte); | ||
670 | return; | ||
671 | } | ||
672 | |||
596 | ADD_STATS(pte_update, 1); | 673 | ADD_STATS(pte_update, 1); |
597 | // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); | 674 | // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); |
598 | ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | 675 | ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); |
@@ -609,6 +686,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte) | |||
609 | #ifdef CONFIG_X86_PAE | 686 | #ifdef CONFIG_X86_PAE |
610 | void xen_set_pte_atomic(pte_t *ptep, pte_t pte) | 687 | void xen_set_pte_atomic(pte_t *ptep, pte_t pte) |
611 | { | 688 | { |
689 | if (xen_iomap_pte(pte)) { | ||
690 | xen_set_iomap_pte(ptep, pte); | ||
691 | return; | ||
692 | } | ||
693 | |||
612 | set_64bit((u64 *)ptep, native_pte_val(pte)); | 694 | set_64bit((u64 *)ptep, native_pte_val(pte)); |
613 | } | 695 | } |
614 | 696 | ||
@@ -935,8 +1017,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page, | |||
935 | read-only, and can be pinned. */ | 1017 | read-only, and can be pinned. */ |
936 | static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) | 1018 | static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) |
937 | { | 1019 | { |
938 | vm_unmap_aliases(); | ||
939 | |||
940 | xen_mc_batch(); | 1020 | xen_mc_batch(); |
941 | 1021 | ||
942 | if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { | 1022 | if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { |
@@ -1500,7 +1580,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l | |||
1500 | if (PagePinned(virt_to_page(mm->pgd))) { | 1580 | if (PagePinned(virt_to_page(mm->pgd))) { |
1501 | SetPagePinned(page); | 1581 | SetPagePinned(page); |
1502 | 1582 | ||
1503 | vm_unmap_aliases(); | ||
1504 | if (!PageHighMem(page)) { | 1583 | if (!PageHighMem(page)) { |
1505 | make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); | 1584 | make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); |
1506 | if (level == PT_PTE && USE_SPLIT_PTLOCKS) | 1585 | if (level == PT_PTE && USE_SPLIT_PTLOCKS) |
@@ -1811,9 +1890,16 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) | |||
1811 | pte = pfn_pte(phys, prot); | 1890 | pte = pfn_pte(phys, prot); |
1812 | break; | 1891 | break; |
1813 | 1892 | ||
1814 | default: | 1893 | case FIX_PARAVIRT_BOOTMAP: |
1894 | /* This is an MFN, but it isn't an IO mapping from the | ||
1895 | IO domain */ | ||
1815 | pte = mfn_pte(phys, prot); | 1896 | pte = mfn_pte(phys, prot); |
1816 | break; | 1897 | break; |
1898 | |||
1899 | default: | ||
1900 | /* By default, set_fixmap is used for hardware mappings */ | ||
1901 | pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP)); | ||
1902 | break; | ||
1817 | } | 1903 | } |
1818 | 1904 | ||
1819 | __native_set_fixmap(idx, pte); | 1905 | __native_set_fixmap(idx, pte); |
@@ -1939,8 +2025,240 @@ void __init xen_init_mmu_ops(void) | |||
1939 | x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; | 2025 | x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; |
1940 | x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; | 2026 | x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; |
1941 | pv_mmu_ops = xen_mmu_ops; | 2027 | pv_mmu_ops = xen_mmu_ops; |
2028 | |||
2029 | vmap_lazy_unmap = false; | ||
2030 | } | ||
2031 | |||
2032 | /* Protected by xen_reservation_lock. */ | ||
2033 | #define MAX_CONTIG_ORDER 9 /* 2MB */ | ||
2034 | static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; | ||
2035 | |||
2036 | #define VOID_PTE (mfn_pte(0, __pgprot(0))) | ||
2037 | static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, | ||
2038 | unsigned long *in_frames, | ||
2039 | unsigned long *out_frames) | ||
2040 | { | ||
2041 | int i; | ||
2042 | struct multicall_space mcs; | ||
2043 | |||
2044 | xen_mc_batch(); | ||
2045 | for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { | ||
2046 | mcs = __xen_mc_entry(0); | ||
2047 | |||
2048 | if (in_frames) | ||
2049 | in_frames[i] = virt_to_mfn(vaddr); | ||
2050 | |||
2051 | MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); | ||
2052 | set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); | ||
2053 | |||
2054 | if (out_frames) | ||
2055 | out_frames[i] = virt_to_pfn(vaddr); | ||
2056 | } | ||
2057 | xen_mc_issue(0); | ||
2058 | } | ||
2059 | |||
2060 | /* | ||
2061 | * Update the pfn-to-mfn mappings for a virtual address range, either to | ||
2062 | * point to an array of mfns, or contiguously from a single starting | ||
2063 | * mfn. | ||
2064 | */ | ||
2065 | static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, | ||
2066 | unsigned long *mfns, | ||
2067 | unsigned long first_mfn) | ||
2068 | { | ||
2069 | unsigned i, limit; | ||
2070 | unsigned long mfn; | ||
2071 | |||
2072 | xen_mc_batch(); | ||
2073 | |||
2074 | limit = 1u << order; | ||
2075 | for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { | ||
2076 | struct multicall_space mcs; | ||
2077 | unsigned flags; | ||
2078 | |||
2079 | mcs = __xen_mc_entry(0); | ||
2080 | if (mfns) | ||
2081 | mfn = mfns[i]; | ||
2082 | else | ||
2083 | mfn = first_mfn + i; | ||
2084 | |||
2085 | if (i < (limit - 1)) | ||
2086 | flags = 0; | ||
2087 | else { | ||
2088 | if (order == 0) | ||
2089 | flags = UVMF_INVLPG | UVMF_ALL; | ||
2090 | else | ||
2091 | flags = UVMF_TLB_FLUSH | UVMF_ALL; | ||
2092 | } | ||
2093 | |||
2094 | MULTI_update_va_mapping(mcs.mc, vaddr, | ||
2095 | mfn_pte(mfn, PAGE_KERNEL), flags); | ||
2096 | |||
2097 | set_phys_to_machine(virt_to_pfn(vaddr), mfn); | ||
2098 | } | ||
2099 | |||
2100 | xen_mc_issue(0); | ||
2101 | } | ||
2102 | |||
2103 | /* | ||
2104 | * Perform the hypercall to exchange a region of our pfns to point to | ||
2105 | * memory with the required contiguous alignment. Takes the pfns as | ||
2106 | * input, and populates mfns as output. | ||
2107 | * | ||
2108 | * Returns a success code indicating whether the hypervisor was able to | ||
2109 | * satisfy the request or not. | ||
2110 | */ | ||
2111 | static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, | ||
2112 | unsigned long *pfns_in, | ||
2113 | unsigned long extents_out, | ||
2114 | unsigned int order_out, | ||
2115 | unsigned long *mfns_out, | ||
2116 | unsigned int address_bits) | ||
2117 | { | ||
2118 | long rc; | ||
2119 | int success; | ||
2120 | |||
2121 | struct xen_memory_exchange exchange = { | ||
2122 | .in = { | ||
2123 | .nr_extents = extents_in, | ||
2124 | .extent_order = order_in, | ||
2125 | .extent_start = pfns_in, | ||
2126 | .domid = DOMID_SELF | ||
2127 | }, | ||
2128 | .out = { | ||
2129 | .nr_extents = extents_out, | ||
2130 | .extent_order = order_out, | ||
2131 | .extent_start = mfns_out, | ||
2132 | .address_bits = address_bits, | ||
2133 | .domid = DOMID_SELF | ||
2134 | } | ||
2135 | }; | ||
2136 | |||
2137 | BUG_ON(extents_in << order_in != extents_out << order_out); | ||
2138 | |||
2139 | rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); | ||
2140 | success = (exchange.nr_exchanged == extents_in); | ||
2141 | |||
2142 | BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); | ||
2143 | BUG_ON(success && (rc != 0)); | ||
2144 | |||
2145 | return success; | ||
1942 | } | 2146 | } |
1943 | 2147 | ||
2148 | int xen_create_contiguous_region(unsigned long vstart, unsigned int order, | ||
2149 | unsigned int address_bits) | ||
2150 | { | ||
2151 | unsigned long *in_frames = discontig_frames, out_frame; | ||
2152 | unsigned long flags; | ||
2153 | int success; | ||
2154 | |||
2155 | /* | ||
2156 | * Currently an auto-translated guest will not perform I/O, nor will | ||
2157 | * it require PAE page directories below 4GB. Therefore any calls to | ||
2158 | * this function are redundant and can be ignored. | ||
2159 | */ | ||
2160 | |||
2161 | if (xen_feature(XENFEAT_auto_translated_physmap)) | ||
2162 | return 0; | ||
2163 | |||
2164 | if (unlikely(order > MAX_CONTIG_ORDER)) | ||
2165 | return -ENOMEM; | ||
2166 | |||
2167 | memset((void *) vstart, 0, PAGE_SIZE << order); | ||
2168 | |||
2169 | spin_lock_irqsave(&xen_reservation_lock, flags); | ||
2170 | |||
2171 | /* 1. Zap current PTEs, remembering MFNs. */ | ||
2172 | xen_zap_pfn_range(vstart, order, in_frames, NULL); | ||
2173 | |||
2174 | /* 2. Get a new contiguous memory extent. */ | ||
2175 | out_frame = virt_to_pfn(vstart); | ||
2176 | success = xen_exchange_memory(1UL << order, 0, in_frames, | ||
2177 | 1, order, &out_frame, | ||
2178 | address_bits); | ||
2179 | |||
2180 | /* 3. Map the new extent in place of old pages. */ | ||
2181 | if (success) | ||
2182 | xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); | ||
2183 | else | ||
2184 | xen_remap_exchanged_ptes(vstart, order, in_frames, 0); | ||
2185 | |||
2186 | spin_unlock_irqrestore(&xen_reservation_lock, flags); | ||
2187 | |||
2188 | return success ? 0 : -ENOMEM; | ||
2189 | } | ||
2190 | EXPORT_SYMBOL_GPL(xen_create_contiguous_region); | ||
2191 | |||
2192 | void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) | ||
2193 | { | ||
2194 | unsigned long *out_frames = discontig_frames, in_frame; | ||
2195 | unsigned long flags; | ||
2196 | int success; | ||
2197 | |||
2198 | if (xen_feature(XENFEAT_auto_translated_physmap)) | ||
2199 | return; | ||
2200 | |||
2201 | if (unlikely(order > MAX_CONTIG_ORDER)) | ||
2202 | return; | ||
2203 | |||
2204 | memset((void *) vstart, 0, PAGE_SIZE << order); | ||
2205 | |||
2206 | spin_lock_irqsave(&xen_reservation_lock, flags); | ||
2207 | |||
2208 | /* 1. Find start MFN of contiguous extent. */ | ||
2209 | in_frame = virt_to_mfn(vstart); | ||
2210 | |||
2211 | /* 2. Zap current PTEs. */ | ||
2212 | xen_zap_pfn_range(vstart, order, NULL, out_frames); | ||
2213 | |||
2214 | /* 3. Do the exchange for non-contiguous MFNs. */ | ||
2215 | success = xen_exchange_memory(1, order, &in_frame, 1UL << order, | ||
2216 | 0, out_frames, 0); | ||
2217 | |||
2218 | /* 4. Map new pages in place of old pages. */ | ||
2219 | if (success) | ||
2220 | xen_remap_exchanged_ptes(vstart, order, out_frames, 0); | ||
2221 | else | ||
2222 | xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); | ||
2223 | |||
2224 | spin_unlock_irqrestore(&xen_reservation_lock, flags); | ||
2225 | } | ||
2226 | EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); | ||
2227 | |||
2228 | #ifdef CONFIG_XEN_PVHVM | ||
2229 | static void xen_hvm_exit_mmap(struct mm_struct *mm) | ||
2230 | { | ||
2231 | struct xen_hvm_pagetable_dying a; | ||
2232 | int rc; | ||
2233 | |||
2234 | a.domid = DOMID_SELF; | ||
2235 | a.gpa = __pa(mm->pgd); | ||
2236 | rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); | ||
2237 | WARN_ON_ONCE(rc < 0); | ||
2238 | } | ||
2239 | |||
2240 | static int is_pagetable_dying_supported(void) | ||
2241 | { | ||
2242 | struct xen_hvm_pagetable_dying a; | ||
2243 | int rc = 0; | ||
2244 | |||
2245 | a.domid = DOMID_SELF; | ||
2246 | a.gpa = 0x00; | ||
2247 | rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); | ||
2248 | if (rc < 0) { | ||
2249 | printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); | ||
2250 | return 0; | ||
2251 | } | ||
2252 | return 1; | ||
2253 | } | ||
2254 | |||
2255 | void __init xen_hvm_init_mmu_ops(void) | ||
2256 | { | ||
2257 | if (is_pagetable_dying_supported()) | ||
2258 | pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; | ||
2259 | } | ||
2260 | #endif | ||
2261 | |||
1944 | #ifdef CONFIG_XEN_DEBUG_FS | 2262 | #ifdef CONFIG_XEN_DEBUG_FS |
1945 | 2263 | ||
1946 | static struct dentry *d_mmu_debug; | 2264 | static struct dentry *d_mmu_debug; |
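The largest addition to mmu.c above is the contiguous-region support: xen_create_contiguous_region() zaps the PTEs of a range while recording the old MFNs, asks the hypervisor to exchange them for one physically contiguous extent, and maps the result back in; xen_destroy_contiguous_region() runs the same steps in reverse. Below is a toy, self-contained sketch of that three-step flow, with a made-up p2m array and a fake exchange standing in for the XENMEM_exchange hypercall.

    /* Toy model of the zap / exchange / remap sequence above.  The p2m[]
     * array and fake_exchange() are illustrative, not hypervisor state. */
    #include <stdio.h>

    #define ORDER   2
    #define NPAGES  (1 << ORDER)
    #define INVALID (~0UL)

    static unsigned long p2m[NPAGES];        /* pfn -> mfn for our toy region */

    static void zap_range(unsigned long *old_mfns)
    {
        for (int i = 0; i < NPAGES; i++) {
            old_mfns[i] = p2m[i];            /* step 1: remember current MFNs */
            p2m[i] = INVALID;                /*         and unmap them        */
        }
    }

    static int fake_exchange(unsigned long *out_first_mfn)
    {
        *out_first_mfn = 0x1000;             /* step 2: pretend the hypervisor
                                                gave us a contiguous extent */
        return 1;                            /* 1 == success, as in the code above */
    }

    static void remap_range(unsigned long first_mfn)
    {
        for (int i = 0; i < NPAGES; i++)
            p2m[i] = first_mfn + i;          /* step 3: map the new extent in */
    }

    int main(void)
    {
        unsigned long saved[NPAGES], first;

        for (int i = 0; i < NPAGES; i++)
            p2m[i] = 0x500 + 3 * i;          /* deliberately discontiguous */

        zap_range(saved);
        if (fake_exchange(&first))
            remap_range(first);
        else
            for (int i = 0; i < NPAGES; i++) /* failure path: restore old MFNs */
                p2m[i] = saved[i];

        for (int i = 0; i < NPAGES; i++)
            printf("pfn %d -> mfn %#lx\n", i, p2m[i]);
        return 0;
    }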
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 5fe6bc7f5ecf..fa938c4aa2f7 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h | |||
@@ -60,4 +60,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, | |||
60 | unsigned long xen_read_cr2_direct(void); | 60 | unsigned long xen_read_cr2_direct(void); |
61 | 61 | ||
62 | extern void xen_init_mmu_ops(void); | 62 | extern void xen_init_mmu_ops(void); |
63 | extern void xen_hvm_init_mmu_ops(void); | ||
63 | #endif /* _XEN_MMU_H */ | 64 | #endif /* _XEN_MMU_H */ |
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c new file mode 100644 index 000000000000..a013ec9d0c54 --- /dev/null +++ b/arch/x86/xen/pci-swiotlb-xen.c | |||
@@ -0,0 +1,58 @@ | |||
1 | /* Glue code to lib/swiotlb-xen.c */ | ||
2 | |||
3 | #include <linux/dma-mapping.h> | ||
4 | #include <xen/swiotlb-xen.h> | ||
5 | |||
6 | #include <asm/xen/hypervisor.h> | ||
7 | #include <xen/xen.h> | ||
8 | |||
9 | int xen_swiotlb __read_mostly; | ||
10 | |||
11 | static struct dma_map_ops xen_swiotlb_dma_ops = { | ||
12 | .mapping_error = xen_swiotlb_dma_mapping_error, | ||
13 | .alloc_coherent = xen_swiotlb_alloc_coherent, | ||
14 | .free_coherent = xen_swiotlb_free_coherent, | ||
15 | .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, | ||
16 | .sync_single_for_device = xen_swiotlb_sync_single_for_device, | ||
17 | .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, | ||
18 | .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, | ||
19 | .map_sg = xen_swiotlb_map_sg_attrs, | ||
20 | .unmap_sg = xen_swiotlb_unmap_sg_attrs, | ||
21 | .map_page = xen_swiotlb_map_page, | ||
22 | .unmap_page = xen_swiotlb_unmap_page, | ||
23 | .dma_supported = xen_swiotlb_dma_supported, | ||
24 | }; | ||
25 | |||
26 | /* | ||
27 | * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary | ||
28 | * | ||
29 | * This returns non-zero if we are forced to use xen_swiotlb (by the boot | ||
30 | * option). | ||
31 | */ | ||
32 | int __init pci_xen_swiotlb_detect(void) | ||
33 | { | ||
34 | |||
35 | /* If running as a PV guest, either iommu=soft or swiotlb=force will | ||
36 | * activate this IOMMU. If running as the privileged PV domain, activate it | ||
37 | * regardless. | ||
38 | */ | ||
39 | if ((xen_initial_domain() || swiotlb || swiotlb_force) && | ||
40 | (xen_pv_domain())) | ||
41 | xen_swiotlb = 1; | ||
42 | |||
43 | /* If we are running under Xen, we MUST disable the native SWIOTLB. | ||
44 | * Don't worry about the swiotlb_force flag activating the native one, as | ||
45 | * the 'swiotlb' flag is the only one turning it on. */ | ||
46 | if (xen_pv_domain()) | ||
47 | swiotlb = 0; | ||
48 | |||
49 | return xen_swiotlb; | ||
50 | } | ||
51 | |||
52 | void __init pci_xen_swiotlb_init(void) | ||
53 | { | ||
54 | if (xen_swiotlb) { | ||
55 | xen_swiotlb_init(1); | ||
56 | dma_ops = &xen_swiotlb_dma_ops; | ||
57 | } | ||
58 | } | ||
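pci_xen_swiotlb_detect() above decides whether the Xen bounce-buffer DMA ops are needed and, for PV guests, suppresses the native swiotlb so only one of the two gets initialised; pci_xen_swiotlb_init() then installs the ops if the decision was positive. A minimal userspace model of just that decision, with the kernel flags replaced by local ints for illustration:

    /* Decision logic of pci_xen_swiotlb_detect(), modelled with plain flags.
     * The variable names mirror the kernel ones but are local stand-ins. */
    #include <stdio.h>

    static int xen_pv_domain      = 1;   /* running as a Xen PV guest         */
    static int xen_initial_domain = 0;   /* not dom0 in this example          */
    static int swiotlb            = 0;   /* iommu=soft                        */
    static int swiotlb_force      = 1;   /* swiotlb=force on the command line */

    static int xen_swiotlb;

    static int detect(void)
    {
        /* PV guest with iommu=soft or swiotlb=force, or PV dom0: use xen-swiotlb. */
        if ((xen_initial_domain || swiotlb || swiotlb_force) && xen_pv_domain)
            xen_swiotlb = 1;

        /* Under Xen PV the native swiotlb must stay off either way. */
        if (xen_pv_domain)
            swiotlb = 0;

        return xen_swiotlb;
    }

    int main(void)
    {
        int on = detect();

        printf("xen_swiotlb=%d, native swiotlb=%d\n", on, swiotlb);
        return 0;
    }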
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c new file mode 100644 index 000000000000..554c002a1e1a --- /dev/null +++ b/arch/x86/xen/platform-pci-unplug.c | |||
@@ -0,0 +1,137 @@ | |||
1 | /****************************************************************************** | ||
2 | * platform-pci-unplug.c | ||
3 | * | ||
4 | * Xen platform PCI device driver | ||
5 | * Copyright (c) 2010, Citrix | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify it | ||
8 | * under the terms and conditions of the GNU General Public License, | ||
9 | * version 2, as published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
14 | * more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License along with | ||
17 | * this program; if not, write to the Free Software Foundation, Inc., 59 Temple | ||
18 | * Place - Suite 330, Boston, MA 02111-1307 USA. | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #include <linux/init.h> | ||
23 | #include <linux/io.h> | ||
24 | #include <linux/module.h> | ||
25 | |||
26 | #include <xen/platform_pci.h> | ||
27 | |||
28 | #define XEN_PLATFORM_ERR_MAGIC -1 | ||
29 | #define XEN_PLATFORM_ERR_PROTOCOL -2 | ||
30 | #define XEN_PLATFORM_ERR_BLACKLIST -3 | ||
31 | |||
32 | /* store the value of xen_emul_unplug after the unplug is done */ | ||
33 | int xen_platform_pci_unplug; | ||
34 | EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); | ||
35 | #ifdef CONFIG_XEN_PVHVM | ||
36 | static int xen_emul_unplug; | ||
37 | |||
38 | static int __init check_platform_magic(void) | ||
39 | { | ||
40 | short magic; | ||
41 | char protocol; | ||
42 | |||
43 | magic = inw(XEN_IOPORT_MAGIC); | ||
44 | if (magic != XEN_IOPORT_MAGIC_VAL) { | ||
45 | printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n"); | ||
46 | return XEN_PLATFORM_ERR_MAGIC; | ||
47 | } | ||
48 | |||
49 | protocol = inb(XEN_IOPORT_PROTOVER); | ||
50 | |||
51 | printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n", | ||
52 | protocol); | ||
53 | |||
54 | switch (protocol) { | ||
55 | case 1: | ||
56 | outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM); | ||
57 | outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER); | ||
58 | if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) { | ||
59 | printk(KERN_ERR "Xen Platform: blacklisted by host\n"); | ||
60 | return XEN_PLATFORM_ERR_BLACKLIST; | ||
61 | } | ||
62 | break; | ||
63 | default: | ||
64 | printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version\n"); | ||
65 | return XEN_PLATFORM_ERR_PROTOCOL; | ||
66 | } | ||
67 | |||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | void __init xen_unplug_emulated_devices(void) | ||
72 | { | ||
73 | int r; | ||
74 | |||
75 | /* check the version of the xen platform PCI device */ | ||
76 | r = check_platform_magic(); | ||
77 | /* If the version matches, enable the Xen platform PCI driver. | ||
78 | * Also enable the Xen platform PCI driver if the version is really old | ||
79 | * and the user told us to ignore it. */ | ||
80 | if (r && !(r == XEN_PLATFORM_ERR_MAGIC && | ||
81 | (xen_emul_unplug & XEN_UNPLUG_IGNORE))) | ||
82 | return; | ||
83 | /* Set the default value of xen_emul_unplug depending on whether or | ||
84 | * not the Xen PV frontends and the Xen platform PCI driver have | ||
85 | * been compiled for this kernel (modules or built-in are both OK). */ | ||
86 | if (!xen_emul_unplug) { | ||
87 | if (xen_must_unplug_nics()) { | ||
88 | printk(KERN_INFO "Netfront and the Xen platform PCI driver have " | ||
89 | "been compiled for this kernel: unplug emulated NICs.\n"); | ||
90 | xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; | ||
91 | } | ||
92 | if (xen_must_unplug_disks()) { | ||
93 | printk(KERN_INFO "Blkfront and the Xen platform PCI driver have " | ||
94 | "been compiled for this kernel: unplug emulated disks.\n" | ||
95 | "You might have to change the root device\n" | ||
96 | "from /dev/hd[a-d] to /dev/xvd[a-d]\n" | ||
97 | "in your root= kernel command line option\n"); | ||
98 | xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; | ||
99 | } | ||
100 | } | ||
101 | /* Now unplug the emulated devices */ | ||
102 | if (!(xen_emul_unplug & XEN_UNPLUG_IGNORE)) | ||
103 | outw(xen_emul_unplug, XEN_IOPORT_UNPLUG); | ||
104 | xen_platform_pci_unplug = xen_emul_unplug; | ||
105 | } | ||
106 | |||
107 | static int __init parse_xen_emul_unplug(char *arg) | ||
108 | { | ||
109 | char *p, *q; | ||
110 | int l; | ||
111 | |||
112 | for (p = arg; p; p = q) { | ||
113 | q = strchr(p, ','); | ||
114 | if (q) { | ||
115 | l = q - p; | ||
116 | q++; | ||
117 | } else { | ||
118 | l = strlen(p); | ||
119 | } | ||
120 | if (!strncmp(p, "all", l)) | ||
121 | xen_emul_unplug |= XEN_UNPLUG_ALL; | ||
122 | else if (!strncmp(p, "ide-disks", l)) | ||
123 | xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; | ||
124 | else if (!strncmp(p, "aux-ide-disks", l)) | ||
125 | xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS; | ||
126 | else if (!strncmp(p, "nics", l)) | ||
127 | xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; | ||
128 | else if (!strncmp(p, "ignore", l)) | ||
129 | xen_emul_unplug |= XEN_UNPLUG_IGNORE; | ||
130 | else | ||
131 | printk(KERN_WARNING "unrecognised option '%s' " | ||
132 | "in parameter 'xen_emul_unplug'\n", p); | ||
133 | } | ||
134 | return 0; | ||
135 | } | ||
136 | early_param("xen_emul_unplug", parse_xen_emul_unplug); | ||
137 | #endif | ||
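parse_xen_emul_unplug() above turns the comma-separated "xen_emul_unplug=" argument into a bitmask of emulated devices to unplug. A standalone sketch of the same parsing loop; the bit values are arbitrary stand-ins for the XEN_UNPLUG_* constants:

    /* Standalone model of parse_xen_emul_unplug() above: split a comma-separated
     * option string and OR the matching bits together. */
    #include <stdio.h>
    #include <string.h>

    #define UNPLUG_ALL_IDE_DISKS  (1 << 0)
    #define UNPLUG_ALL_NICS       (1 << 1)
    #define UNPLUG_AUX_IDE_DISKS  (1 << 2)
    #define UNPLUG_ALL            (UNPLUG_ALL_IDE_DISKS | UNPLUG_ALL_NICS | \
                                   UNPLUG_AUX_IDE_DISKS)
    #define UNPLUG_IGNORE         (1 << 3)

    static int parse_unplug(const char *arg)
    {
        int mask = 0;
        const char *p, *q;

        for (p = arg; p; p = q) {
            size_t l;

            q = strchr(p, ',');
            if (q) {
                l = (size_t)(q - p);
                q++;
            } else {
                l = strlen(p);
            }
            if (!strncmp(p, "all", l))
                mask |= UNPLUG_ALL;
            else if (!strncmp(p, "ide-disks", l))
                mask |= UNPLUG_ALL_IDE_DISKS;
            else if (!strncmp(p, "aux-ide-disks", l))
                mask |= UNPLUG_AUX_IDE_DISKS;
            else if (!strncmp(p, "nics", l))
                mask |= UNPLUG_ALL_NICS;
            else if (!strncmp(p, "ignore", l))
                mask |= UNPLUG_IGNORE;
            else
                fprintf(stderr, "unrecognised option '%.*s'\n", (int)l, p);
        }
        return mask;
    }

    int main(void)
    {
        printf("mask=%#x\n", parse_unplug("ide-disks,nics,bogus"));
        return 0;
    }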
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ad0047f47cd4..328b00305426 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <xen/page.h> | 20 | #include <xen/page.h> |
21 | #include <xen/interface/callback.h> | 21 | #include <xen/interface/callback.h> |
22 | #include <xen/interface/physdev.h> | 22 | #include <xen/interface/physdev.h> |
23 | #include <xen/interface/memory.h> | ||
23 | #include <xen/features.h> | 24 | #include <xen/features.h> |
24 | 25 | ||
25 | #include "xen-ops.h" | 26 | #include "xen-ops.h" |
@@ -32,6 +33,73 @@ extern void xen_sysenter_target(void); | |||
32 | extern void xen_syscall_target(void); | 33 | extern void xen_syscall_target(void); |
33 | extern void xen_syscall32_target(void); | 34 | extern void xen_syscall32_target(void); |
34 | 35 | ||
36 | static unsigned long __init xen_release_chunk(phys_addr_t start_addr, | ||
37 | phys_addr_t end_addr) | ||
38 | { | ||
39 | struct xen_memory_reservation reservation = { | ||
40 | .address_bits = 0, | ||
41 | .extent_order = 0, | ||
42 | .domid = DOMID_SELF | ||
43 | }; | ||
44 | unsigned long start, end; | ||
45 | unsigned long len = 0; | ||
46 | unsigned long pfn; | ||
47 | int ret; | ||
48 | |||
49 | start = PFN_UP(start_addr); | ||
50 | end = PFN_DOWN(end_addr); | ||
51 | |||
52 | if (end <= start) | ||
53 | return 0; | ||
54 | |||
55 | printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", | ||
56 | start, end); | ||
57 | for(pfn = start; pfn < end; pfn++) { | ||
58 | unsigned long mfn = pfn_to_mfn(pfn); | ||
59 | |||
60 | /* Make sure pfn exists to start with */ | ||
61 | if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) | ||
62 | continue; | ||
63 | |||
64 | set_xen_guest_handle(reservation.extent_start, &mfn); | ||
65 | reservation.nr_extents = 1; | ||
66 | |||
67 | ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, | ||
68 | &reservation); | ||
69 | WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", | ||
70 | start, end, ret); | ||
71 | if (ret == 1) { | ||
72 | set_phys_to_machine(pfn, INVALID_P2M_ENTRY); | ||
73 | len++; | ||
74 | } | ||
75 | } | ||
76 | printk(KERN_CONT "%ld pages freed\n", len); | ||
77 | |||
78 | return len; | ||
79 | } | ||
80 | |||
81 | static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, | ||
82 | const struct e820map *e820) | ||
83 | { | ||
84 | phys_addr_t max_addr = PFN_PHYS(max_pfn); | ||
85 | phys_addr_t last_end = 0; | ||
86 | unsigned long released = 0; | ||
87 | int i; | ||
88 | |||
89 | for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { | ||
90 | phys_addr_t end = e820->map[i].addr; | ||
91 | end = min(max_addr, end); | ||
92 | |||
93 | released += xen_release_chunk(last_end, end); | ||
94 | last_end = e820->map[i].addr + e820->map[i].size; | ||
95 | } | ||
96 | |||
97 | if (last_end < max_addr) | ||
98 | released += xen_release_chunk(last_end, max_addr); | ||
99 | |||
100 | printk(KERN_INFO "released %ld pages of unused memory\n", released); | ||
101 | return released; | ||
102 | } | ||
35 | 103 | ||
36 | /** | 104 | /** |
37 | * machine_specific_memory_setup - Hook for machine specific memory setup. | 105 | * machine_specific_memory_setup - Hook for machine specific memory setup. |
@@ -67,6 +135,8 @@ char * __init xen_memory_setup(void) | |||
67 | 135 | ||
68 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | 136 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
69 | 137 | ||
138 | xen_return_unused_memory(xen_start_info->nr_pages, &e820); | ||
139 | |||
70 | return "Xen"; | 140 | return "Xen"; |
71 | } | 141 | } |
72 | 142 | ||
@@ -156,6 +226,8 @@ void __init xen_arch_setup(void) | |||
156 | struct physdev_set_iopl set_iopl; | 226 | struct physdev_set_iopl set_iopl; |
157 | int rc; | 227 | int rc; |
158 | 228 | ||
229 | xen_panic_handler_init(); | ||
230 | |||
159 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); | 231 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); |
160 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); | 232 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); |
161 | 233 | ||
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a29693fd3138..25f232b18a82 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
@@ -394,6 +394,8 @@ static void stop_self(void *v) | |||
394 | load_cr3(swapper_pg_dir); | 394 | load_cr3(swapper_pg_dir); |
395 | /* should set up a minimal gdt */ | 395 | /* should set up a minimal gdt */ |
396 | 396 | ||
397 | set_cpu_online(cpu, false); | ||
398 | |||
397 | HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); | 399 | HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); |
398 | BUG(); | 400 | BUG(); |
399 | } | 401 | } |
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index a9c661108034..1d789d56877c 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c | |||
@@ -26,6 +26,18 @@ void xen_pre_suspend(void) | |||
26 | BUG(); | 26 | BUG(); |
27 | } | 27 | } |
28 | 28 | ||
29 | void xen_hvm_post_suspend(int suspend_cancelled) | ||
30 | { | ||
31 | int cpu; | ||
32 | xen_hvm_init_shared_info(); | ||
33 | xen_callback_vector(); | ||
34 | if (xen_feature(XENFEAT_hvm_safe_pvclock)) { | ||
35 | for_each_online_cpu(cpu) { | ||
36 | xen_setup_runstate_info(cpu); | ||
37 | } | ||
38 | } | ||
39 | } | ||
40 | |||
29 | void xen_post_suspend(int suspend_cancelled) | 41 | void xen_post_suspend(int suspend_cancelled) |
30 | { | 42 | { |
31 | xen_build_mfn_list_list(); | 43 | xen_build_mfn_list_list(); |
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b3c6c59ed302..1a5353a753fc 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <asm/xen/hypercall.h> | 20 | #include <asm/xen/hypercall.h> |
21 | 21 | ||
22 | #include <xen/events.h> | 22 | #include <xen/events.h> |
23 | #include <xen/features.h> | ||
23 | #include <xen/interface/xen.h> | 24 | #include <xen/interface/xen.h> |
24 | #include <xen/interface/vcpu.h> | 25 | #include <xen/interface/vcpu.h> |
25 | 26 | ||
@@ -155,47 +156,8 @@ static void do_stolen_accounting(void) | |||
155 | account_idle_ticks(ticks); | 156 | account_idle_ticks(ticks); |
156 | } | 157 | } |
157 | 158 | ||
158 | /* | ||
159 | * Xen sched_clock implementation. Returns the number of unstolen | ||
160 | * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED | ||
161 | * states. | ||
162 | */ | ||
163 | unsigned long long xen_sched_clock(void) | ||
164 | { | ||
165 | struct vcpu_runstate_info state; | ||
166 | cycle_t now; | ||
167 | u64 ret; | ||
168 | s64 offset; | ||
169 | |||
170 | /* | ||
171 | * Ideally sched_clock should be called on a per-cpu basis | ||
172 | * anyway, so preempt should already be disabled, but that's | ||
173 | * not current practice at the moment. | ||
174 | */ | ||
175 | preempt_disable(); | ||
176 | |||
177 | now = xen_clocksource_read(); | ||
178 | |||
179 | get_runstate_snapshot(&state); | ||
180 | |||
181 | WARN_ON(state.state != RUNSTATE_running); | ||
182 | |||
183 | offset = now - state.state_entry_time; | ||
184 | if (offset < 0) | ||
185 | offset = 0; | ||
186 | |||
187 | ret = state.time[RUNSTATE_blocked] + | ||
188 | state.time[RUNSTATE_running] + | ||
189 | offset; | ||
190 | |||
191 | preempt_enable(); | ||
192 | |||
193 | return ret; | ||
194 | } | ||
195 | |||
196 | |||
197 | /* Get the TSC speed from Xen */ | 159 | /* Get the TSC speed from Xen */ |
198 | unsigned long xen_tsc_khz(void) | 160 | static unsigned long xen_tsc_khz(void) |
199 | { | 161 | { |
200 | struct pvclock_vcpu_time_info *info = | 162 | struct pvclock_vcpu_time_info *info = |
201 | &HYPERVISOR_shared_info->vcpu_info[0].time; | 163 | &HYPERVISOR_shared_info->vcpu_info[0].time; |
@@ -230,7 +192,7 @@ static void xen_read_wallclock(struct timespec *ts) | |||
230 | put_cpu_var(xen_vcpu); | 192 | put_cpu_var(xen_vcpu); |
231 | } | 193 | } |
232 | 194 | ||
233 | unsigned long xen_get_wallclock(void) | 195 | static unsigned long xen_get_wallclock(void) |
234 | { | 196 | { |
235 | struct timespec ts; | 197 | struct timespec ts; |
236 | 198 | ||
@@ -238,7 +200,7 @@ unsigned long xen_get_wallclock(void) | |||
238 | return ts.tv_sec; | 200 | return ts.tv_sec; |
239 | } | 201 | } |
240 | 202 | ||
241 | int xen_set_wallclock(unsigned long now) | 203 | static int xen_set_wallclock(unsigned long now) |
242 | { | 204 | { |
243 | /* do nothing for domU */ | 205 | /* do nothing for domU */ |
244 | return -1; | 206 | return -1; |
@@ -473,7 +435,11 @@ void xen_timer_resume(void) | |||
473 | } | 435 | } |
474 | } | 436 | } |
475 | 437 | ||
476 | __init void xen_time_init(void) | 438 | static const struct pv_time_ops xen_time_ops __initdata = { |
439 | .sched_clock = xen_clocksource_read, | ||
440 | }; | ||
441 | |||
442 | static __init void xen_time_init(void) | ||
477 | { | 443 | { |
478 | int cpu = smp_processor_id(); | 444 | int cpu = smp_processor_id(); |
479 | struct timespec tp; | 445 | struct timespec tp; |
@@ -497,3 +463,47 @@ __init void xen_time_init(void) | |||
497 | xen_setup_timer(cpu); | 463 | xen_setup_timer(cpu); |
498 | xen_setup_cpu_clockevents(); | 464 | xen_setup_cpu_clockevents(); |
499 | } | 465 | } |
466 | |||
467 | __init void xen_init_time_ops(void) | ||
468 | { | ||
469 | pv_time_ops = xen_time_ops; | ||
470 | |||
471 | x86_init.timers.timer_init = xen_time_init; | ||
472 | x86_init.timers.setup_percpu_clockev = x86_init_noop; | ||
473 | x86_cpuinit.setup_percpu_clockev = x86_init_noop; | ||
474 | |||
475 | x86_platform.calibrate_tsc = xen_tsc_khz; | ||
476 | x86_platform.get_wallclock = xen_get_wallclock; | ||
477 | x86_platform.set_wallclock = xen_set_wallclock; | ||
478 | } | ||
479 | |||
480 | #ifdef CONFIG_XEN_PVHVM | ||
481 | static void xen_hvm_setup_cpu_clockevents(void) | ||
482 | { | ||
483 | int cpu = smp_processor_id(); | ||
484 | xen_setup_runstate_info(cpu); | ||
485 | xen_setup_timer(cpu); | ||
486 | xen_setup_cpu_clockevents(); | ||
487 | } | ||
488 | |||
489 | __init void xen_hvm_init_time_ops(void) | ||
490 | { | ||
491 | /* a vector callback is needed, otherwise we cannot receive interrupts | ||
492 | * on cpu > 0 */ | ||
493 | if (!xen_have_vector_callback && num_present_cpus() > 1) | ||
494 | return; | ||
495 | if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { | ||
496 | printk(KERN_INFO "Xen doesn't support pvclock on HVM, " | ||
497 | "disable pv timer\n"); | ||
498 | return; | ||
499 | } | ||
500 | |||
501 | pv_time_ops = xen_time_ops; | ||
502 | x86_init.timers.setup_percpu_clockev = xen_time_init; | ||
503 | x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; | ||
504 | |||
505 | x86_platform.calibrate_tsc = xen_tsc_khz; | ||
506 | x86_platform.get_wallclock = xen_get_wallclock; | ||
507 | x86_platform.set_wallclock = xen_set_wallclock; | ||
508 | } | ||
509 | #endif | ||
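xen_hvm_init_time_ops() above only installs the PV timer on HVM when a vector callback is available (or the guest has a single CPU) and the hypervisor advertises XENFEAT_hvm_safe_pvclock; otherwise the guest keeps the emulated platform timer. The small sketch below captures just that gate, with the feature flags passed in as plain ints rather than read from xen_have_vector_callback and xen_feature():

    /* Decision logic of xen_hvm_init_time_ops() as a standalone check. */
    #include <stdio.h>

    static int use_pv_timer(int have_vector_callback, int present_cpus,
                            int hvm_safe_pvclock)
    {
        /* Without a vector callback, secondary CPUs cannot receive timer
         * interrupts, so fall back to the emulated timer on SMP guests. */
        if (!have_vector_callback && present_cpus > 1)
            return 0;

        /* The hypervisor must also provide a pvclock that is safe for HVM. */
        if (!hvm_safe_pvclock)
            return 0;

        return 1;
    }

    int main(void)
    {
        printf("smp, no callback  : %d\n", use_pv_timer(0, 4, 1));
        printf("callback + pvclock: %d\n", use_pv_timer(1, 4, 1));
        return 0;
    }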
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f9153a300bce..7c8ab86163e9 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h | |||
@@ -38,6 +38,10 @@ void xen_enable_sysenter(void); | |||
38 | void xen_enable_syscall(void); | 38 | void xen_enable_syscall(void); |
39 | void xen_vcpu_restore(void); | 39 | void xen_vcpu_restore(void); |
40 | 40 | ||
41 | void xen_callback_vector(void); | ||
42 | void xen_hvm_init_shared_info(void); | ||
43 | void __init xen_unplug_emulated_devices(void); | ||
44 | |||
41 | void __init xen_build_dynamic_phys_to_machine(void); | 45 | void __init xen_build_dynamic_phys_to_machine(void); |
42 | 46 | ||
43 | void xen_init_irq_ops(void); | 47 | void xen_init_irq_ops(void); |
@@ -46,11 +50,8 @@ void xen_setup_runstate_info(int cpu); | |||
46 | void xen_teardown_timer(int cpu); | 50 | void xen_teardown_timer(int cpu); |
47 | cycle_t xen_clocksource_read(void); | 51 | cycle_t xen_clocksource_read(void); |
48 | void xen_setup_cpu_clockevents(void); | 52 | void xen_setup_cpu_clockevents(void); |
49 | unsigned long xen_tsc_khz(void); | 53 | void __init xen_init_time_ops(void); |
50 | void __init xen_time_init(void); | 54 | void __init xen_hvm_init_time_ops(void); |
51 | unsigned long xen_get_wallclock(void); | ||
52 | int xen_set_wallclock(unsigned long time); | ||
53 | unsigned long long xen_sched_clock(void); | ||
54 | 55 | ||
55 | irqreturn_t xen_debug_interrupt(int irq, void *dev_id); | 56 | irqreturn_t xen_debug_interrupt(int irq, void *dev_id); |
56 | 57 | ||
@@ -101,4 +102,6 @@ void xen_sysret32(void); | |||
101 | void xen_sysret64(void); | 102 | void xen_sysret64(void); |
102 | void xen_adjust_exception_frame(void); | 103 | void xen_adjust_exception_frame(void); |
103 | 104 | ||
105 | extern int xen_panic_handler_init(void); | ||
106 | |||
104 | #endif /* XEN_OPS_H */ | 107 | #endif /* XEN_OPS_H */ |